nvme: improve performance for virtual NVMe devices
authorHelen Koike <helen.koike@collabora.co.uk>
Mon, 10 Apr 2017 15:51:07 +0000 (12:51 -0300)
committerChristoph Hellwig <hch@lst.de>
Fri, 21 Apr 2017 14:41:47 +0000 (16:41 +0200)
This change provides a mechanism to reduce the number of MMIO doorbell
writes for the NVMe driver. When running in a virtualized environment
like QEMU, the cost of an MMIO is quite hefy here. The main idea for
the patch is provide the device two memory location locations:
 1) to store the doorbell values so they can be lookup without the doorbell
    MMIO write
 2) to store an event index.
I believe the doorbell value is obvious, the event index not so much.
Similar to the virtio specification, the virtual device can tell the
driver (guest OS) not to write MMIO unless you are writing past this
value.

FYI: doorbell values are written by the nvme driver (guest OS) and the
event index is written by the virtual device (host OS).

The patch implements a new admin command that will communicate where
these two memory locations reside. If the command fails, the nvme
driver will work as before without any optimizations.

Contributions:
  Eric Northup <digitaleric@google.com>
  Frank Swiderski <fes@google.com>
  Ted Tso <tytso@mit.edu>
  Keith Busch <keith.busch@intel.com>

Just to give an idea on the performance boost with the vendor
extension: Running fio [1], a stock NVMe driver I get about 200K read
IOPs with my vendor patch I get about 1000K read IOPs. This was
running with a null device i.e. the backing device simply returned
success on every read IO request.

[1] Running on a 4 core machine:
  fio --time_based --name=benchmark --runtime=30
  --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
  --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
  --rw=randread --blocksize=4k --randrepeat=false

Signed-off-by: Rob Nelson <rlnelson@google.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin@kernel.org>
[koike: updated for upstream]
Signed-off-by: Helen Koike <helen.koike@collabora.co.uk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
drivers/nvme/host/pci.c
include/linux/nvme.h

index af783a33e93a8016c508dad9cb5625c74662197f..a363fecb8d8256f16d9ca1a047358fba8856224e 100644 (file)
@@ -103,8 +103,22 @@ struct nvme_dev {
        u32 cmbloc;
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
+       u32 *dbbuf_dbs;
+       dma_addr_t dbbuf_dbs_dma_addr;
+       u32 *dbbuf_eis;
+       dma_addr_t dbbuf_eis_dma_addr;
 };
 
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+       return qid * 2 * stride;
+}
+
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+       return (qid * 2 + 1) * stride;
+}
+
 static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
 {
        return container_of(ctrl, struct nvme_dev, ctrl);
@@ -133,6 +147,10 @@ struct nvme_queue {
        u16 qid;
        u8 cq_phase;
        u8 cqe_seen;
+       u32 *dbbuf_sq_db;
+       u32 *dbbuf_cq_db;
+       u32 *dbbuf_sq_ei;
+       u32 *dbbuf_cq_ei;
 };
 
 /*
@@ -171,6 +189,112 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
        BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+       BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+}
+
+static inline unsigned int nvme_dbbuf_size(u32 stride)
+{
+       return ((num_possible_cpus() + 1) * 8 * stride);
+}
+
+static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+{
+       unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+       if (dev->dbbuf_dbs)
+               return 0;
+
+       dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
+                                           &dev->dbbuf_dbs_dma_addr,
+                                           GFP_KERNEL);
+       if (!dev->dbbuf_dbs)
+               return -ENOMEM;
+       dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
+                                           &dev->dbbuf_eis_dma_addr,
+                                           GFP_KERNEL);
+       if (!dev->dbbuf_eis) {
+               dma_free_coherent(dev->dev, mem_size,
+                                 dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+               dev->dbbuf_dbs = NULL;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
+{
+       unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+       if (dev->dbbuf_dbs) {
+               dma_free_coherent(dev->dev, mem_size,
+                                 dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+               dev->dbbuf_dbs = NULL;
+       }
+       if (dev->dbbuf_eis) {
+               dma_free_coherent(dev->dev, mem_size,
+                                 dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
+               dev->dbbuf_eis = NULL;
+       }
+}
+
+static void nvme_dbbuf_init(struct nvme_dev *dev,
+                           struct nvme_queue *nvmeq, int qid)
+{
+       if (!dev->dbbuf_dbs || !qid)
+               return;
+
+       nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
+       nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
+       nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
+       nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+}
+
+static void nvme_dbbuf_set(struct nvme_dev *dev)
+{
+       struct nvme_command c;
+
+       if (!dev->dbbuf_dbs)
+               return;
+
+       memset(&c, 0, sizeof(c));
+       c.dbbuf.opcode = nvme_admin_dbbuf;
+       c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
+       c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
+
+       if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
+               dev_warn(dev->dev, "unable to set dbbuf\n");
+               /* Free memory and continue on */
+               nvme_dbbuf_dma_free(dev);
+       }
+}
+
+static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+       return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
+}
+
+/* Update dbbuf and return true if an MMIO is required */
+static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
+                                             volatile u32 *dbbuf_ei)
+{
+       if (dbbuf_db) {
+               u16 old_value;
+
+               /*
+                * Ensure that the queue is written before updating
+                * the doorbell in memory
+                */
+               wmb();
+
+               old_value = *dbbuf_db;
+               *dbbuf_db = value;
+
+               if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+                       return false;
+       }
+
+       return true;
 }
 
 /*
@@ -297,7 +421,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 
        if (++tail == nvmeq->q_depth)
                tail = 0;
-       writel(tail, nvmeq->q_db);
+       if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
+                                             nvmeq->dbbuf_sq_ei))
+               writel(tail, nvmeq->q_db);
        nvmeq->sq_tail = tail;
 }
 
@@ -686,7 +812,9 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
                return;
 
        if (likely(nvmeq->cq_vector >= 0))
-               writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+               if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+                                                     nvmeq->dbbuf_cq_ei))
+                       writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
        nvmeq->cq_head = head;
        nvmeq->cq_phase = phase;
 
@@ -1070,6 +1198,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
        nvmeq->cq_phase = 1;
        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
        memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+       nvme_dbbuf_init(dev, nvmeq, qid);
        dev->online_queues++;
        spin_unlock_irq(&nvmeq->q_lock);
 }
@@ -1542,6 +1671,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
                if (blk_mq_alloc_tag_set(&dev->tagset))
                        return 0;
                dev->ctrl.tagset = &dev->tagset;
+
+               nvme_dbbuf_set(dev);
        } else {
                blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
 
@@ -1728,6 +1859,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 {
        struct nvme_dev *dev = to_nvme_dev(ctrl);
 
+       nvme_dbbuf_dma_free(dev);
        put_device(dev->dev);
        if (dev->tagset.tags)
                blk_mq_free_tag_set(&dev->tagset);
@@ -1795,6 +1927,13 @@ static void nvme_reset_work(struct work_struct *work)
                dev->ctrl.opal_dev = NULL;
        }
 
+       if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
+               result = nvme_dbbuf_dma_alloc(dev);
+               if (result)
+                       dev_warn(dev->dev,
+                                "unable to allocate dma for dbbuf\n");
+       }
+
        result = nvme_setup_io_queues(dev);
        if (result)
                goto out;
index 9061780b141ff44c58a0ab5d5429dfbba8f1d2fb..b625bacf37efaabd84e8735b2b304401cdb195fd 100644 (file)
@@ -245,6 +245,7 @@ enum {
        NVME_CTRL_ONCS_WRITE_ZEROES             = 1 << 3,
        NVME_CTRL_VWC_PRESENT                   = 1 << 0,
        NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
+       NVME_CTRL_OACS_DBBUF_SUPP               = 1 << 7,
 };
 
 struct nvme_lbaf {
@@ -603,6 +604,7 @@ enum nvme_admin_opcode {
        nvme_admin_download_fw          = 0x11,
        nvme_admin_ns_attach            = 0x15,
        nvme_admin_keep_alive           = 0x18,
+       nvme_admin_dbbuf                = 0x7C,
        nvme_admin_format_nvm           = 0x80,
        nvme_admin_security_send        = 0x81,
        nvme_admin_security_recv        = 0x82,
@@ -874,6 +876,16 @@ struct nvmf_property_get_command {
        __u8            resv4[16];
 };
 
+struct nvme_dbbuf {
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __u32                   rsvd1[5];
+       __le64                  prp1;
+       __le64                  prp2;
+       __u32                   rsvd12[6];
+};
+
 struct nvme_command {
        union {
                struct nvme_common_command common;
@@ -893,6 +905,7 @@ struct nvme_command {
                struct nvmf_connect_command connect;
                struct nvmf_property_set_command prop_set;
                struct nvmf_property_get_command prop_get;
+               struct nvme_dbbuf dbbuf;
        };
 };