#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT (60 * HZ)
+#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT)
unsigned char io_timeout = 30;
module_param(io_timeout, byte, 0644);
wait_queue_head_t sq_full;
wait_queue_t sq_cong_wait;
struct bio_list sq_cong;
+ struct list_head iod_bio;
u32 __iomem *q_db;
u16 q_depth;
u16 cq_vector;
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}
-typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
+typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
struct nvme_completion *);
struct nvme_cmd_info {
#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE)
-static void special_completion(struct nvme_dev *dev, void *ctx,
+static void special_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
if (ctx == CMD_CTX_CANCELLED)
if (ctx == CMD_CTX_FLUSH)
return;
if (ctx == CMD_CTX_ABORT) {
- ++dev->abort_limit;
+ ++nvmeq->dev->abort_limit;
return;
}
if (ctx == CMD_CTX_COMPLETED) {
- dev_warn(&dev->pci_dev->dev,
+ dev_warn(nvmeq->q_dmadev,
"completed id %d twice on queue %d\n",
cqe->command_id, le16_to_cpup(&cqe->sq_id));
return;
}
if (ctx == CMD_CTX_INVALID) {
- dev_warn(&dev->pci_dev->dev,
+ dev_warn(nvmeq->q_dmadev,
"invalid id %d completed on queue %d\n",
cqe->command_id, le16_to_cpup(&cqe->sq_id));
return;
}
- dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
+ dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
}
-static void async_completion(struct nvme_dev *dev, void *ctx,
+static void async_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct async_cmd_info *cmdinfo = ctx;
iod->npages = -1;
iod->length = nbytes;
iod->nents = 0;
+ iod->first_dma = 0ULL;
iod->start_time = jiffies;
}
part_stat_unlock();
}
-static void bio_completion(struct nvme_dev *dev, void *ctx,
+static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
struct bio *bio = iod->private;
u16 status = le16_to_cpup(&cqe->status) >> 1;
+ if (unlikely(status)) {
+ if (!(status & NVME_SC_DNR ||
+ bio->bi_rw & REQ_FAILFAST_MASK) &&
+ (jiffies - iod->start_time) < IOD_TIMEOUT) {
+ if (!waitqueue_active(&nvmeq->sq_full))
+ add_wait_queue(&nvmeq->sq_full,
+ &nvmeq->sq_cong_wait);
+ list_add_tail(&iod->node, &nvmeq->iod_bio);
+ wake_up(&nvmeq->sq_full);
+ return;
+ }
+ }
if (iod->nents) {
- dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+ dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
nvme_end_io_acct(bio, iod->start_time);
}
- nvme_free_iod(dev, iod);
+ nvme_free_iod(nvmeq->dev, iod);
if (status)
bio_endio(bio, -EIO);
else
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
- struct nvme_iod *iod, int total_len, gfp_t gfp)
+int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
+ gfp_t gfp)
{
struct dma_pool *pool;
int length = total_len;
dma_addr_t prp_dma;
int nprps, i;
- cmd->prp1 = cpu_to_le64(dma_addr);
length -= (PAGE_SIZE - offset);
if (length <= 0)
return total_len;
}
if (length <= PAGE_SIZE) {
- cmd->prp2 = cpu_to_le64(dma_addr);
+ iod->first_dma = dma_addr;
return total_len;
}
prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
if (!prp_list) {
- cmd->prp2 = cpu_to_le64(dma_addr);
+ iod->first_dma = dma_addr;
iod->npages = -1;
return (total_len - length) + PAGE_SIZE;
}
list[0] = prp_list;
iod->first_dma = prp_dma;
- cmd->prp2 = cpu_to_le64(prp_dma);
i = 0;
for (;;) {
if (i == PAGE_SIZE / 8) {
bio_chain(split, bio);
- if (bio_list_empty(&nvmeq->sq_cong))
+ if (!waitqueue_active(&nvmeq->sq_full))
add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
bio_list_add(&nvmeq->sq_cong, split);
bio_list_add(&nvmeq->sq_cong, bio);
+ wake_up(&nvmeq->sq_full);
return 0;
}
return length;
}
-/*
- * We reuse the small pool to allocate the 16-byte range here as it is not
- * worth having a special pool for these or additional cases to handle freeing
- * the iod.
- */
static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
struct bio *bio, struct nvme_iod *iod, int cmdid)
{
- struct nvme_dsm_range *range;
+ struct nvme_dsm_range *range =
+ (struct nvme_dsm_range *)iod_list(iod)[0];
struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
- range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
- &iod->first_dma);
- if (!range)
- return -ENOMEM;
-
- iod_list(iod)[0] = (__le64 *)range;
- iod->npages = 0;
-
range->cattr = cpu_to_le32(0);
range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
return nvme_submit_flush(nvmeq, ns, cmdid);
}
-/*
- * Called with local interrupts disabled and the q_lock held. May not sleep.
- */
-static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
- struct bio *bio)
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
{
+ struct bio *bio = iod->private;
+ struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
struct nvme_command *cmnd;
- struct nvme_iod *iod;
- enum dma_data_direction dma_dir;
- int cmdid, length, result;
+ int cmdid;
u16 control;
u32 dsmgmt;
- int psegs = bio_phys_segments(ns->queue, bio);
-
- if ((bio->bi_rw & REQ_FLUSH) && psegs) {
- result = nvme_submit_flush_data(nvmeq, ns);
- if (result)
- return result;
- }
-
- result = -ENOMEM;
- iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
- if (!iod)
- goto nomem;
- iod->private = bio;
- result = -EBUSY;
cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
if (unlikely(cmdid < 0))
- goto free_iod;
+ return cmdid;
- if (bio->bi_rw & REQ_DISCARD) {
- result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
- if (result)
- goto free_cmdid;
- return result;
- }
- if ((bio->bi_rw & REQ_FLUSH) && !psegs)
+ if (bio->bi_rw & REQ_DISCARD)
+ return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
+ if ((bio->bi_rw & REQ_FLUSH) && !iod->nents)
return nvme_submit_flush(nvmeq, ns, cmdid);
control = 0;
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
-
memset(cmnd, 0, sizeof(*cmnd));
- if (bio_data_dir(bio)) {
- cmnd->rw.opcode = nvme_cmd_write;
- dma_dir = DMA_TO_DEVICE;
- } else {
- cmnd->rw.opcode = nvme_cmd_read;
- dma_dir = DMA_FROM_DEVICE;
- }
-
- result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
- if (result <= 0)
- goto free_cmdid;
- length = result;
+ cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read;
cmnd->rw.command_id = cmdid;
cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
- length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
- GFP_ATOMIC);
+ cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
- cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
+ cmnd->rw.length =
+ cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1);
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
- nvme_start_io_acct(bio);
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
writel(nvmeq->sq_tail, nvmeq->q_db);
return 0;
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held. May not sleep.
+ */
+static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct bio *bio)
+{
+ struct nvme_iod *iod;
+ int psegs = bio_phys_segments(ns->queue, bio);
+ int result;
+
+ if ((bio->bi_rw & REQ_FLUSH) && psegs) {
+ result = nvme_submit_flush_data(nvmeq, ns);
+ if (result)
+ return result;
+ }
+
+ iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
+ if (!iod)
+ return -ENOMEM;
+
+ iod->private = bio;
+ if (bio->bi_rw & REQ_DISCARD) {
+ void *range;
+ /*
+ * We reuse the small pool to allocate the 16-byte range here
+ * as it is not worth having a special pool for these or
+ * additional cases to handle freeing the iod.
+ */
+ range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
+ GFP_ATOMIC,
+ &iod->first_dma);
+ if (!range) {
+ result = -ENOMEM;
+ goto free_iod;
+ }
+ iod_list(iod)[0] = (__le64 *)range;
+ iod->npages = 0;
+ } else if (psegs) {
+ result = nvme_map_bio(nvmeq, iod, bio,
+ bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+ psegs);
+ if (result <= 0)
+ goto free_iod;
+ if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) !=
+ result) {
+ result = -ENOMEM;
+ goto free_iod;
+ }
+ nvme_start_io_acct(bio);
+ }
+ if (unlikely(nvme_submit_iod(nvmeq, iod))) {
+ if (!waitqueue_active(&nvmeq->sq_full))
+ add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+ list_add_tail(&iod->node, &nvmeq->iod_bio);
+ }
+ return 0;
- free_cmdid:
- free_cmdid(nvmeq, cmdid, NULL);
free_iod:
nvme_free_iod(nvmeq->dev, iod);
- nomem:
return result;
}
}
ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
- fn(nvmeq->dev, ctx, &cqe);
+ fn(nvmeq, ctx, &cqe);
}
/* If the controller ignores the cq head doorbell and continuously
if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
result = nvme_submit_bio_queue(nvmeq, ns, bio);
if (unlikely(result)) {
- if (bio_list_empty(&nvmeq->sq_cong))
+ if (!waitqueue_active(&nvmeq->sq_full))
add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
bio_list_add(&nvmeq->sq_cong, bio);
}
int status;
};
-static void sync_completion(struct nvme_dev *dev, void *ctx,
+static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct sync_cmd_info *cmdinfo = ctx;
dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
nvmeq->qid);
ctx = cancel_cmdid(nvmeq, cmdid, &fn);
- fn(nvmeq->dev, ctx, &cqe);
+ fn(nvmeq, ctx, &cqe);
}
}
struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
bio_endio(bio, -EIO);
}
+ while (!list_empty(&nvmeq->iod_bio)) {
+ static struct nvme_completion cqe = {
+ .status = cpu_to_le16(
+ (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1),
+ };
+ struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio,
+ struct nvme_iod,
+ node);
+ list_del(&iod->node);
+ bio_completion(nvmeq, iod, &cqe);
+ }
spin_unlock_irq(&nvmeq->q_lock);
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
init_waitqueue_head(&nvmeq->sq_full);
init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
bio_list_init(&nvmeq->sq_cong);
+ INIT_LIST_HEAD(&nvmeq->iod_bio);
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->cq_vector = vector;
c.rw.metadata = cpu_to_le64(meta_dma_addr);
}
- length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
+ length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+ c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ c.rw.prp2 = cpu_to_le64(iod->first_dma);
if (length != (io.nblocks + 1) << ns->lba_shift)
status = -ENOMEM;
length);
if (IS_ERR(iod))
return PTR_ERR(iod);
- length = nvme_setup_prps(dev, &c.common, iod, length,
- GFP_KERNEL);
+ length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+ c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ c.common.prp2 = cpu_to_le64(iod->first_dma);
}
timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
.getgeo = nvme_getgeo,
};
+static void nvme_resubmit_iods(struct nvme_queue *nvmeq)
+{
+ struct nvme_iod *iod, *next;
+
+ list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) {
+ if (unlikely(nvme_submit_iod(nvmeq, iod)))
+ break;
+ list_del(&iod->node);
+ if (bio_list_empty(&nvmeq->sq_cong) &&
+ list_empty(&nvmeq->iod_bio))
+ remove_wait_queue(&nvmeq->sq_full,
+ &nvmeq->sq_cong_wait);
+ }
+}
+
static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
while (bio_list_peek(&nvmeq->sq_cong)) {
struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
- if (bio_list_empty(&nvmeq->sq_cong))
+ if (bio_list_empty(&nvmeq->sq_cong) &&
+ list_empty(&nvmeq->iod_bio))
remove_wait_queue(&nvmeq->sq_full,
&nvmeq->sq_cong_wait);
if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
- if (bio_list_empty(&nvmeq->sq_cong))
+ if (!waitqueue_active(&nvmeq->sq_full))
add_wait_queue(&nvmeq->sq_full,
&nvmeq->sq_cong_wait);
bio_list_add_head(&nvmeq->sq_cong, bio);
nvme_process_cq(nvmeq);
nvme_cancel_ios(nvmeq, true);
nvme_resubmit_bios(nvmeq);
+ nvme_resubmit_iods(nvmeq);
unlock:
spin_unlock_irq(&nvmeq->q_lock);
}