NVMe: Merge the nvme_bio and nvme_prp data structures
authorMatthew Wilcox <matthew.r.wilcox@intel.com>
Tue, 20 Dec 2011 18:34:52 +0000 (13:34 -0500)
committerMatthew Wilcox <matthew.r.wilcox@intel.com>
Tue, 10 Jan 2012 19:51:20 +0000 (14:51 -0500)
The new merged data structure is called nvme_iod.  This improves performance
for mid-sized I/Os (in the 16k range) since we save a memory allocation.
It is also a slightly simpler interface to use.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
drivers/block/nvme.c

index b0e8a6dd33b15515ec014fb299c04d20a2183b91..4517608c068f92cefe7c1b9e493e18dfbea15254 100644 (file)
@@ -290,52 +290,70 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
        return 0;
 }
 
-struct nvme_prps {
-       int npages;             /* 0 means small pool in use */
+/*
+ * The nvme_iod describes the data in an I/O, including the list of PRP
+ * entries.  You can't see it in this data structure because C doesn't let
+ * me express that.  Use nvme_alloc_iod to ensure there's enough space
+ * allocated to store the PRP list.
+ */
+struct nvme_iod {
+       void *private;          /* For the use of the submitter of the I/O */
+       int npages;             /* In the PRP list. 0 means small pool in use */
+       int offset;             /* Of PRP list */
+       int nents;              /* Used in scatterlist */
+       int length;             /* Of data, in bytes */
        dma_addr_t first_dma;
-       __le64 *list[0];
+       struct scatterlist sg[0];
 };
 
-static void nvme_free_prps(struct nvme_dev *dev, struct nvme_prps *prps)
+static __le64 **iod_list(struct nvme_iod *iod)
 {
-       const int last_prp = PAGE_SIZE / 8 - 1;
-       int i;
-       dma_addr_t prp_dma;
+       return ((void *)iod) + iod->offset;
+}
 
-       if (!prps)
-               return;
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size)
+{
+       unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
+       return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
 
-       prp_dma = prps->first_dma;
+static struct nvme_iod *
+nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
+{
+       struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
+                               sizeof(__le64 *) * nvme_npages(nbytes) +
+                               sizeof(struct scatterlist) * nseg, gfp);
 
-       if (prps->npages == 0)
-               dma_pool_free(dev->prp_small_pool, prps->list[0], prp_dma);
-       for (i = 0; i < prps->npages; i++) {
-               __le64 *prp_list = prps->list[i];
-               dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
-               dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
-               prp_dma = next_prp_dma;
+       if (iod) {
+               iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+               iod->npages = -1;
+               iod->length = nbytes;
        }
-       kfree(prps);
-}
 
-struct nvme_bio {
-       struct bio *bio;
-       int nents;
-       struct nvme_prps *prps;
-       struct scatterlist sg[0];
-};
-
-/* XXX: use a mempool */
-static struct nvme_bio *alloc_nbio(unsigned nseg, gfp_t gfp)
-{
-       return kzalloc(sizeof(struct nvme_bio) +
-                       sizeof(struct scatterlist) * nseg, gfp);
+       return iod;
 }
 
-static void free_nbio(struct nvme_dev *dev, struct nvme_bio *nbio)
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
-       nvme_free_prps(dev, nbio->prps);
-       kfree(nbio);
+       const int last_prp = PAGE_SIZE / 8 - 1;
+       int i;
+       __le64 **list = iod_list(iod);
+       dma_addr_t prp_dma = iod->first_dma;
+
+       if (iod->npages == 0)
+               dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+       for (i = 0; i < iod->npages; i++) {
+               __le64 *prp_list = list[i];
+               dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
+               dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
+               prp_dma = next_prp_dma;
+       }
+       kfree(iod);
 }
 
 static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
@@ -351,13 +369,13 @@ static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
 static void bio_completion(struct nvme_dev *dev, void *ctx,
                                                struct nvme_completion *cqe)
 {
-       struct nvme_bio *nbio = ctx;
-       struct bio *bio = nbio->bio;
+       struct nvme_iod *iod = ctx;
+       struct bio *bio = iod->private;
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 
-       dma_unmap_sg(&dev->pci_dev->dev, nbio->sg, nbio->nents,
+       dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
                        bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-       free_nbio(dev, nbio);
+       nvme_free_iod(dev, iod);
        if (status) {
                bio_endio(bio, -EIO);
        } else if (bio->bi_vcnt > bio->bi_idx) {
@@ -368,25 +386,25 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev,
-                                       struct nvme_common_command *cmd,
-                                       struct scatterlist *sg, int *len,
-                                       gfp_t gfp)
+static int nvme_setup_prps(struct nvme_dev *dev,
+                       struct nvme_common_command *cmd, struct nvme_iod *iod,
+                       int total_len, gfp_t gfp)
 {
        struct dma_pool *pool;
-       int length = *len;
+       int length = total_len;
+       struct scatterlist *sg = iod->sg;
        int dma_len = sg_dma_len(sg);
        u64 dma_addr = sg_dma_address(sg);
        int offset = offset_in_page(dma_addr);
        __le64 *prp_list;
+       __le64 **list = iod_list(iod);
        dma_addr_t prp_dma;
-       int nprps, npages, i;
-       struct nvme_prps *prps = NULL;
+       int nprps, i;
 
        cmd->prp1 = cpu_to_le64(dma_addr);
        length -= (PAGE_SIZE - offset);
        if (length <= 0)
-               return prps;
+               return total_len;
 
        dma_len -= (PAGE_SIZE - offset);
        if (dma_len) {
@@ -399,46 +417,35 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev,
 
        if (length <= PAGE_SIZE) {
                cmd->prp2 = cpu_to_le64(dma_addr);
-               return prps;
+               return total_len;
        }
 
        nprps = DIV_ROUND_UP(length, PAGE_SIZE);
-       npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
-       prps = kmalloc(sizeof(*prps) + sizeof(__le64 *) * npages, gfp);
-       if (!prps) {
-               cmd->prp2 = cpu_to_le64(dma_addr);
-               *len = (*len - length) + PAGE_SIZE;
-               return prps;
-       }
-
        if (nprps <= (256 / 8)) {
                pool = dev->prp_small_pool;
-               prps->npages = 0;
+               iod->npages = 0;
        } else {
                pool = dev->prp_page_pool;
-               prps->npages = 1;
+               iod->npages = 1;
        }
 
        prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
        if (!prp_list) {
                cmd->prp2 = cpu_to_le64(dma_addr);
-               *len = (*len - length) + PAGE_SIZE;
-               kfree(prps);
-               return NULL;
+               iod->npages = -1;
+               return (total_len - length) + PAGE_SIZE;
        }
-       prps->list[0] = prp_list;
-       prps->first_dma = prp_dma;
+       list[0] = prp_list;
+       iod->first_dma = prp_dma;
        cmd->prp2 = cpu_to_le64(prp_dma);
        i = 0;
        for (;;) {
                if (i == PAGE_SIZE / 8) {
                        __le64 *old_prp_list = prp_list;
                        prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
-                       if (!prp_list) {
-                               *len = (*len - length);
-                               return prps;
-                       }
-                       prps->list[prps->npages++] = prp_list;
+                       if (!prp_list)
+                               return total_len - length;
+                       list[iod->npages++] = prp_list;
                        prp_list[0] = old_prp_list[i - 1];
                        old_prp_list[i - 1] = cpu_to_le64(prp_dma);
                        i = 1;
@@ -457,21 +464,21 @@ static struct nvme_prps *nvme_setup_prps(struct nvme_dev *dev,
                dma_len = sg_dma_len(sg);
        }
 
-       return prps;
+       return total_len;
 }
 
 /* NVMe scatterlists require no holes in the virtual address */
 #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)  ((vec2)->bv_offset || \
                        (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
 
-static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio,
+static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
                struct bio *bio, enum dma_data_direction dma_dir, int psegs)
 {
        struct bio_vec *bvec, *bvprv = NULL;
        struct scatterlist *sg = NULL;
        int i, old_idx, length = 0, nsegs = 0;
 
-       sg_init_table(nbio->sg, psegs);
+       sg_init_table(iod->sg, psegs);
        old_idx = bio->bi_idx;
        bio_for_each_segment(bvec, bio, i) {
                if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
@@ -479,7 +486,7 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio,
                } else {
                        if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
                                break;
-                       sg = sg ? sg + 1 : nbio->sg;
+                       sg = sg ? sg + 1 : iod->sg;
                        sg_set_page(sg, bvec->bv_page, bvec->bv_len,
                                                        bvec->bv_offset);
                        nsegs++;
@@ -488,9 +495,9 @@ static int nvme_map_bio(struct device *dev, struct nvme_bio *nbio,
                bvprv = bvec;
        }
        bio->bi_idx = i;
-       nbio->nents = nsegs;
+       iod->nents = nsegs;
        sg_mark_end(sg);
-       if (dma_map_sg(dev, nbio->sg, nbio->nents, dma_dir) == 0) {
+       if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
                bio->bi_idx = old_idx;
                return -ENOMEM;
        }
@@ -531,7 +538,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                                                                struct bio *bio)
 {
        struct nvme_command *cmnd;
-       struct nvme_bio *nbio;
+       struct nvme_iod *iod;
        enum dma_data_direction dma_dir;
        int cmdid, length, result = -ENOMEM;
        u16 control;
@@ -544,15 +551,15 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                        return result;
        }
 
-       nbio = alloc_nbio(psegs, GFP_ATOMIC);
-       if (!nbio)
+       iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
+       if (!iod)
                goto nomem;
-       nbio->bio = bio;
+       iod->private = bio;
 
        result = -EBUSY;
-       cmdid = alloc_cmdid(nvmeq, nbio, bio_completion, IO_TIMEOUT);
+       cmdid = alloc_cmdid(nvmeq, iod, bio_completion, IO_TIMEOUT);
        if (unlikely(cmdid < 0))
-               goto free_nbio;
+               goto free_iod;
 
        if ((bio->bi_rw & REQ_FLUSH) && !psegs)
                return nvme_submit_flush(nvmeq, ns, cmdid);
@@ -578,15 +585,15 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                dma_dir = DMA_FROM_DEVICE;
        }
 
-       result = nvme_map_bio(nvmeq->q_dmadev, nbio, bio, dma_dir, psegs);
+       result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
        if (result < 0)
-               goto free_nbio;
+               goto free_iod;
        length = result;
 
        cmnd->rw.command_id = cmdid;
        cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
-       nbio->prps = nvme_setup_prps(nvmeq->dev, &cmnd->common, nbio->sg,
-                                                       &length, GFP_ATOMIC);
+       length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
+                                                               GFP_ATOMIC);
        cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
        cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
        cmnd->rw.control = cpu_to_le16(control);
@@ -600,8 +607,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 
        return 0;
 
- free_nbio:
-       free_nbio(nvmeq->dev, nbio);
+ free_iod:
+       nvme_free_iod(nvmeq->dev, iod);
  nomem:
        return result;
 }
@@ -1005,18 +1012,18 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
        return result;
 }
 
-static int nvme_map_user_pages(struct nvme_dev *dev, int write,
-                               unsigned long addr, unsigned length,
-                               struct scatterlist **sgp)
+static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
+                               unsigned long addr, unsigned length)
 {
        int i, err, count, nents, offset;
        struct scatterlist *sg;
        struct page **pages;
+       struct nvme_iod *iod;
 
        if (addr & 3)
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
        if (!length)
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
 
        offset = offset_in_page(addr);
        count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
@@ -1029,7 +1036,8 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write,
                goto put_pages;
        }
 
-       sg = kcalloc(count, sizeof(*sg), GFP_KERNEL);
+       iod = nvme_alloc_iod(count, length, GFP_KERNEL);
+       sg = iod->sg;
        sg_init_table(sg, count);
        for (i = 0; i < count; i++) {
                sg_set_page(&sg[i], pages[i],
@@ -1042,22 +1050,24 @@ static int nvme_map_user_pages(struct nvme_dev *dev, int write,
        nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
                                write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        if (!nents)
-               goto put_pages;
+               goto free_iod;
 
        kfree(pages);
-       *sgp = sg;
-       return nents;
+       return iod;
 
+ free_iod:
+       kfree(iod);
  put_pages:
        for (i = 0; i < count; i++)
                put_page(pages[i]);
        kfree(pages);
-       return err;
+       return ERR_PTR(err);
 }
 
 static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-                       unsigned long addr, int length, struct scatterlist *sg)
+                       unsigned long addr, int length, struct nvme_iod *iod)
 {
+       struct scatterlist *sg = iod->sg;
        int i, count;
 
        count = DIV_ROUND_UP(offset_in_page(addr) + length, PAGE_SIZE);
@@ -1074,9 +1084,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        struct nvme_user_io io;
        struct nvme_command c;
        unsigned length;
-       int nents, status;
-       struct scatterlist *sg;
-       struct nvme_prps *prps;
+       int status;
+       struct nvme_iod *iod;
 
        if (copy_from_user(&io, uio, sizeof(io)))
                return -EFAULT;
@@ -1086,15 +1095,14 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        case nvme_cmd_write:
        case nvme_cmd_read:
        case nvme_cmd_compare:
-               nents = nvme_map_user_pages(dev, io.opcode & 1, io.addr,
-                                                               length, &sg);
+               iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
                break;
        default:
                return -EINVAL;
        }
 
-       if (nents < 0)
-               return nents;
+       if (IS_ERR(iod))
+               return PTR_ERR(iod);
 
        memset(&c, 0, sizeof(c));
        c.rw.opcode = io.opcode;
@@ -1108,7 +1116,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        c.rw.apptag = io.apptag;
        c.rw.appmask = io.appmask;
        /* XXX: metadata */
-       prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL);
+       length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
 
        nvmeq = get_nvmeq(dev);
        /*
@@ -1123,8 +1131,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        else
                status = nvme_submit_sync_cmd(nvmeq, &c, NULL, IO_TIMEOUT);
 
-       nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, sg);
-       nvme_free_prps(dev, prps);
+       nvme_unmap_user_pages(dev, io.opcode & 1, io.addr, length, iod);
+       nvme_free_iod(dev, iod);
        return status;
 }
 
@@ -1134,9 +1142,8 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns,
        struct nvme_dev *dev = ns->dev;
        struct nvme_admin_cmd cmd;
        struct nvme_command c;
-       int status, length, nents = 0;
-       struct scatterlist *sg;
-       struct nvme_prps *prps = NULL;
+       int status, length;
+       struct nvme_iod *iod;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
@@ -1158,19 +1165,21 @@ static int nvme_user_admin_cmd(struct nvme_ns *ns,
 
        length = cmd.data_len;
        if (cmd.data_len) {
-               nents = nvme_map_user_pages(dev, 1, cmd.addr, length, &sg);
-               if (nents < 0)
-                       return nents;
-               prps = nvme_setup_prps(dev, &c.common, sg, &length, GFP_KERNEL);
+               iod = nvme_map_user_pages(dev, 1, cmd.addr, length);
+               if (IS_ERR(iod))
+                       return PTR_ERR(iod);
+               length = nvme_setup_prps(dev, &c.common, iod, length,
+                                                               GFP_KERNEL);
        }
 
        if (length != cmd.data_len)
                status = -ENOMEM;
        else
                status = nvme_submit_admin_cmd(dev, &c, NULL);
+
        if (cmd.data_len) {
-               nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, sg);
-               nvme_free_prps(dev, prps);
+               nvme_unmap_user_pages(dev, 0, cmd.addr, cmd.data_len, iod);
+               nvme_free_iod(dev, iod);
        }
        return status;
 }