blk-mq-sched: add framework for MQ capable IO schedulers
author	Jens Axboe <axboe@fb.com>
Tue, 17 Jan 2017 13:03:22 +0000 (06:03 -0700)
committer	Jens Axboe <axboe@fb.com>
Tue, 17 Jan 2017 17:04:20 +0000 (10:04 -0700)
This adds a set of hooks that intercept the blk-mq path of
allocating/inserting/issuing/completing requests, allowing
us to develop a scheduler within that framework.
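
For illustration only (not part of this change), a minimal scheduler
built on these hooks might look like the sketch below. The fifo_*
names and the single per-hctx FIFO list are made up; the hook
signatures are the ops.mq hooks added by this patch, and merging and
restart handling are left out for brevity.

#include <linux/blk-mq.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct fifo_hctx_data {
        spinlock_t lock;
        struct list_head rqs;
};

/* Queue new requests on the scheduler's own per-hctx list */
static void fifo_insert_requests(struct blk_mq_hw_ctx *hctx,
                                 struct list_head *list, bool at_head)
{
        struct fifo_hctx_data *fd = hctx->sched_data;

        spin_lock(&fd->lock);
        if (at_head)
                list_splice_init(list, &fd->rqs);
        else
                list_splice_tail_init(list, &fd->rqs);
        spin_unlock(&fd->lock);
}

/* Hand everything we hold to the dispatch path when asked */
static void fifo_dispatch_requests(struct blk_mq_hw_ctx *hctx,
                                   struct list_head *rq_list)
{
        struct fifo_hctx_data *fd = hctx->sched_data;

        spin_lock(&fd->lock);
        list_splice_init(&fd->rqs, rq_list);
        spin_unlock(&fd->lock);
}

static bool fifo_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct fifo_hctx_data *fd = hctx->sched_data;

        return !list_empty_careful(&fd->rqs);
}

Per-hctx sched_data for hooks like this can be allocated with the
blk_mq_sched_init_hctx_data()/blk_mq_sched_free_hctx_data() helpers
added below.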

We reuse the existing elevator scheduler API on the registration
side, but augment it with scheduler flagging support for the blk-mq
interface, and with a separate set of ops hooks for MQ devices.
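
As a rough sketch of that registration side (again not part of this
patch, and reusing the fifo_* hooks from the sketch above), an MQ
scheduler sets uses_mq and fills in ops.mq instead of ops.sq; the
init_sched/exit_sched bodies are omitted here:

#include <linux/elevator.h>
#include <linux/module.h>

/* Bodies omitted: set up / tear down the scheduler's queue data */
static int fifo_init_sched(struct request_queue *q, struct elevator_type *e);
static void fifo_exit_sched(struct elevator_queue *e);

static struct elevator_type mq_fifo_sched = {
        .ops.mq = {
                .init_sched             = fifo_init_sched,
                .exit_sched             = fifo_exit_sched,
                .insert_requests        = fifo_insert_requests,
                .dispatch_requests      = fifo_dispatch_requests,
                .has_work               = fifo_has_work,
        },
        .uses_mq        = true,
        .elevator_name  = "mq-fifo",
        .elevator_owner = THIS_MODULE,
};

static int __init mq_fifo_module_init(void)
{
        return elv_register(&mq_fifo_sched);
}

static void __exit mq_fifo_module_exit(void)
{
        elv_unregister(&mq_fifo_sched);
}

module_init(mq_fifo_module_init);
module_exit(mq_fifo_module_exit);
MODULE_LICENSE("GPL");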

We split driver and scheduler tags, so that scheduling can run
independently of the device queue depth.
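
Concretely, a request can now carry two tags at once. Continuing the
illustration above (these helpers are not in the patch), the split
looks like this from a request's point of view:

/* Scheduler tag: indexes hctx->sched_tags, sized by q->nr_requests */
static inline bool rq_has_sched_tag(struct request *rq)
{
        return rq->internal_tag != -1;
}

/*
 * Driver tag: indexes hctx->tags, bounded by the device queue depth,
 * and only acquired when the request is actually sent to the driver.
 */
static inline bool rq_has_driver_tag(struct request *rq)
{
        return rq->tag != -1;
}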

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
17 files changed:
block/Makefile
block/blk-cgroup.c
block/blk-core.c
block/blk-exec.c
block/blk-flush.c
block/blk-ioc.c
block/blk-merge.c
block/blk-mq-sched.c [new file with mode: 0644]
block/blk-mq-sched.h [new file with mode: 0644]
block/blk-mq-sysfs.c
block/blk-mq.c
block/blk-mq.h
block/blk-tag.c
block/elevator.c
include/linux/blk-mq.h
include/linux/blkdev.h
include/linux/elevator.h

index a827f988c4e67435a90513bd65d498eebcf8ff6f..2eee9e1bb6db3b46ac7138c553ec7b784ab78eef 100644 (file)
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                        blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
-                       blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+                       blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
                        genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
                        badblocks.o partitions/
 
index 8ba0af780e880e1c6ed72d772ba77dea5a79f436..2630f64bed197f549561b8f09c68143f397f3882 100644 (file)
@@ -1223,7 +1223,11 @@ int blkcg_activate_policy(struct request_queue *q,
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       blk_queue_bypass_start(q);
+       if (q->mq_ops) {
+               blk_mq_freeze_queue(q);
+               blk_mq_quiesce_queue(q);
+       } else
+               blk_queue_bypass_start(q);
 pd_prealloc:
        if (!pd_prealloc) {
                pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1261,7 +1265,10 @@ pd_prealloc:
 
        spin_unlock_irq(q->queue_lock);
 out_bypass_end:
-       blk_queue_bypass_end(q);
+       if (q->mq_ops)
+               blk_mq_unfreeze_queue(q);
+       else
+               blk_queue_bypass_end(q);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;
@@ -1284,7 +1291,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
        if (!blkcg_policy_enabled(q, pol))
                return;
 
-       blk_queue_bypass_start(q);
+       if (q->mq_ops) {
+               blk_mq_freeze_queue(q);
+               blk_mq_quiesce_queue(q);
+       } else
+               blk_queue_bypass_start(q);
+
        spin_lock_irq(q->queue_lock);
 
        __clear_bit(pol->plid, q->blkcg_pols);
@@ -1304,7 +1316,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
        }
 
        spin_unlock_irq(q->queue_lock);
-       blk_queue_bypass_end(q);
+
+       if (q->mq_ops)
+               blk_mq_unfreeze_queue(q);
+       else
+               blk_queue_bypass_end(q);
 }
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
index 92baea07acbc662c3559f0f54360127f2ea7e2b3..a61f1407f4f6e5e159c4441f0d25ccea33719a77 100644 (file)
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -134,6 +135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        rq->cmd = rq->__cmd;
        rq->cmd_len = BLK_MAX_CDB;
        rq->tag = -1;
+       rq->internal_tag = -1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
        rq->part = NULL;
@@ -2127,7 +2129,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
        if (q->mq_ops) {
                if (blk_queue_io_stat(q))
                        blk_account_io_start(rq, true);
-               blk_mq_insert_request(rq, false, true, false);
+               blk_mq_sched_insert_request(rq, false, true, false);
                return 0;
        }
 
index 3ecb00a6cf45dd5463331a18dd124128a7eec94d..86656fdfa6378e07143d4f58bffc127c56f839ae 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 /*
  * for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
         * be reused after dying flag is set
         */
        if (q->mq_ops) {
-               blk_mq_insert_request(rq, at_head, true, false);
+               blk_mq_sched_insert_request(rq, at_head, true, false);
                return;
        }
 
index 20b7c7a02f1cbdfe5a63c5918d3abd356bd4265b..d7de34ee39c2082d9fdd2f224dd0c9c49eae732e 100644 (file)
@@ -74,6 +74,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
         * the comment in flush_end_io().
         */
        spin_lock_irqsave(&fq->mq_flush_lock, flags);
-       if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
-               blk_mq_run_hw_queue(hctx, true);
+       blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
        spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+
+       blk_mq_run_hw_queue(hctx, true);
 }
 
 /**
@@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq)
         */
        if ((policy & REQ_FSEQ_DATA) &&
            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-               if (q->mq_ops) {
-                       blk_mq_insert_request(rq, false, true, false);
-               } else
+               if (q->mq_ops)
+                       blk_mq_sched_insert_request(rq, false, true, false);
+               else
                        list_add_tail(&rq->queuelist, &q->queue_head);
                return;
        }
index ab372092a57d4e6714b8beb8bfaa8d95f49a0b04..fe186a9eade98bf65f38070f72508619bf2d3f2b 100644 (file)
@@ -43,7 +43,9 @@ static void ioc_exit_icq(struct io_cq *icq)
        if (icq->flags & ICQ_EXITED)
                return;
 
-       if (et->ops.sq.elevator_exit_icq_fn)
+       if (et->uses_mq && et->ops.mq.exit_icq)
+               et->ops.mq.exit_icq(icq);
+       else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
                et->ops.sq.elevator_exit_icq_fn(icq);
 
        icq->flags |= ICQ_EXITED;
@@ -383,7 +385,9 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
        if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
                hlist_add_head(&icq->ioc_node, &ioc->icq_list);
                list_add(&icq->q_node, &q->icq_list);
-               if (et->ops.sq.elevator_init_icq_fn)
+               if (et->uses_mq && et->ops.mq.init_icq)
+                       et->ops.mq.init_icq(icq);
+               else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
                        et->ops.sq.elevator_init_icq_fn(icq);
        } else {
                kmem_cache_free(et->icq_cache, icq);
index 480570b691dc6ffe2d2902e66b5fc7c48511a163..6aa43dec5af49ad6ed11c4bd3c71e26dad61d2b4 100644 (file)
@@ -763,7 +763,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.sq.elevator_allow_rq_merge_fn)
+       if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
                if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
                        return 0;
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644 (file)
index 0000000..2675979
--- /dev/null
@@ -0,0 +1,368 @@
+/*
+ * blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+                                void (*exit)(struct blk_mq_hw_ctx *))
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (exit && hctx->sched_data)
+                       exit(hctx);
+               kfree(hctx->sched_data);
+               hctx->sched_data = NULL;
+       }
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+                               int (*init)(struct blk_mq_hw_ctx *),
+                               void (*exit)(struct blk_mq_hw_ctx *))
+{
+       struct blk_mq_hw_ctx *hctx;
+       int ret;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+               if (!hctx->sched_data) {
+                       ret = -ENOMEM;
+                       goto error;
+               }
+
+               if (init) {
+                       ret = init(hctx);
+                       if (ret) {
+                               /*
+                                * We don't want to give exit() a partially
+                                * initialized sched_data. init() must clean up
+                                * if it fails.
+                                */
+                               kfree(hctx->sched_data);
+                               hctx->sched_data = NULL;
+                               goto error;
+                       }
+               }
+       }
+
+       return 0;
+error:
+       blk_mq_sched_free_hctx_data(q, exit);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+static void __blk_mq_sched_assign_ioc(struct request_queue *q,
+                                     struct request *rq, struct io_context *ioc)
+{
+       struct io_cq *icq;
+
+       spin_lock_irq(q->queue_lock);
+       icq = ioc_lookup_icq(ioc, q);
+       spin_unlock_irq(q->queue_lock);
+
+       if (!icq) {
+               icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
+               if (!icq)
+                       return;
+       }
+
+       rq->elv.icq = icq;
+       if (!blk_mq_sched_get_rq_priv(q, rq)) {
+               rq->rq_flags |= RQF_ELVPRIV;
+               get_io_context(icq->ioc);
+               return;
+       }
+
+       rq->elv.icq = NULL;
+}
+
+static void blk_mq_sched_assign_ioc(struct request_queue *q,
+                                   struct request *rq, struct bio *bio)
+{
+       struct io_context *ioc;
+
+       ioc = rq_ioc(bio);
+       if (ioc)
+               __blk_mq_sched_assign_ioc(q, rq, ioc);
+}
+
+struct request *blk_mq_sched_get_request(struct request_queue *q,
+                                        struct bio *bio,
+                                        unsigned int op,
+                                        struct blk_mq_alloc_data *data)
+{
+       struct elevator_queue *e = q->elevator;
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_ctx *ctx;
+       struct request *rq;
+       const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);
+
+       blk_queue_enter_live(q);
+       ctx = blk_mq_get_ctx(q);
+       hctx = blk_mq_map_queue(q, ctx->cpu);
+
+       blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+
+       if (e) {
+               data->flags |= BLK_MQ_REQ_INTERNAL;
+
+               /*
+                * Flush requests are special and go directly to the
+                * dispatch list.
+                */
+               if (!is_flush && e->type->ops.mq.get_request) {
+                       rq = e->type->ops.mq.get_request(q, op, data);
+                       if (rq)
+                               rq->rq_flags |= RQF_QUEUED;
+               } else
+                       rq = __blk_mq_alloc_request(data, op);
+       } else {
+               rq = __blk_mq_alloc_request(data, op);
+               data->hctx->tags->rqs[rq->tag] = rq;
+       }
+
+       if (rq) {
+               if (!is_flush) {
+                       rq->elv.icq = NULL;
+                       if (e && e->type->icq_cache)
+                               blk_mq_sched_assign_ioc(q, rq, bio);
+               }
+               data->hctx->queued++;
+               return rq;
+       }
+
+       blk_queue_exit(q);
+       return NULL;
+}
+
+void blk_mq_sched_put_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+
+       if (rq->rq_flags & RQF_ELVPRIV) {
+               blk_mq_sched_put_rq_priv(rq->q, rq);
+               if (rq->elv.icq) {
+                       put_io_context(rq->elv.icq->ioc);
+                       rq->elv.icq = NULL;
+               }
+       }
+
+       if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
+               e->type->ops.mq.put_request(rq);
+       else
+               blk_mq_finish_request(rq);
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+       struct elevator_queue *e = hctx->queue->elevator;
+       LIST_HEAD(rq_list);
+
+       if (unlikely(blk_mq_hctx_stopped(hctx)))
+               return;
+
+       hctx->run++;
+
+       /*
+        * If we have previous entries on our dispatch list, grab them first for
+        * more fair dispatch.
+        */
+       if (!list_empty_careful(&hctx->dispatch)) {
+               spin_lock(&hctx->lock);
+               if (!list_empty(&hctx->dispatch))
+                       list_splice_init(&hctx->dispatch, &rq_list);
+               spin_unlock(&hctx->lock);
+       }
+
+       /*
+        * Only ask the scheduler for requests, if we didn't have residual
+        * requests from the dispatch list. This is to avoid the case where
+        * we only ever dispatch a fraction of the requests available because
+        * of low device queue depth. Once we pull requests out of the IO
+        * scheduler, we can no longer merge or sort them. So it's best to
+        * leave them there for as long as we can. Mark the hw queue as
+        * needing a restart in that case.
+        */
+       if (list_empty(&rq_list)) {
+               if (e && e->type->ops.mq.dispatch_requests)
+                       e->type->ops.mq.dispatch_requests(hctx, &rq_list);
+               else
+                       blk_mq_flush_busy_ctxs(hctx, &rq_list);
+       } else
+               blk_mq_sched_mark_restart(hctx);
+
+       blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
+
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+                                  struct list_head *rq_list,
+                                  struct request *(*get_rq)(struct blk_mq_hw_ctx *))
+{
+       do {
+               struct request *rq;
+
+               rq = get_rq(hctx);
+               if (!rq)
+                       break;
+
+               list_add_tail(&rq->queuelist, rq_list);
+       } while (1);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
+{
+       struct request *rq;
+       int ret;
+
+       ret = elv_merge(q, &rq, bio);
+       if (ret == ELEVATOR_BACK_MERGE) {
+               if (!blk_mq_sched_allow_merge(q, rq, bio))
+                       return false;
+               if (bio_attempt_back_merge(q, rq, bio)) {
+                       if (!attempt_back_merge(q, rq))
+                               elv_merged_request(q, rq, ret);
+                       return true;
+               }
+       } else if (ret == ELEVATOR_FRONT_MERGE) {
+               if (!blk_mq_sched_allow_merge(q, rq, bio))
+                       return false;
+               if (bio_attempt_front_merge(q, rq, bio)) {
+                       if (!attempt_front_merge(q, rq))
+                               elv_merged_request(q, rq, ret);
+                       return true;
+               }
+       }
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
+
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e->type->ops.mq.bio_merge) {
+               struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+               struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+               blk_mq_put_ctx(ctx);
+               return e->type->ops.mq.bio_merge(hctx, bio);
+       }
+
+       return false;
+}
+
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+{
+       return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
+
+void blk_mq_sched_request_inserted(struct request *rq)
+{
+       trace_block_rq_insert(rq->q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
+
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+       if (rq->tag == -1) {
+               rq->rq_flags |= RQF_SORTED;
+               return false;
+       }
+
+       /*
+        * If we already have a real request tag, send directly to
+        * the dispatch list.
+        */
+       spin_lock(&hctx->lock);
+       list_add(&rq->queuelist, &hctx->dispatch);
+       spin_unlock(&hctx->lock);
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
+
+static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
+                                  struct blk_mq_hw_ctx *hctx,
+                                  unsigned int hctx_idx)
+{
+       if (hctx->sched_tags) {
+               blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
+               blk_mq_free_rq_map(hctx->sched_tags);
+               hctx->sched_tags = NULL;
+       }
+}
+
+int blk_mq_sched_setup(struct request_queue *q)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+       struct blk_mq_hw_ctx *hctx;
+       int ret, i;
+
+       /*
+        * Default to 256, since we don't split into sync/async like the
+        * old code did. Additionally, this is a per-hw queue depth.
+        */
+       q->nr_requests = 2 * BLKDEV_MAX_RQ;
+
+       /*
+        * We're switching to using an IO scheduler, so setup the hctx
+        * scheduler tags and switch the request map from the regular
+        * tags to scheduler tags. First allocate what we need, so we
+        * can safely fail and fallback, if needed.
+        */
+       ret = 0;
+       queue_for_each_hw_ctx(q, hctx, i) {
+               hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+               if (!hctx->sched_tags) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
+               if (ret)
+                       break;
+       }
+
+       /*
+        * If we failed, free what we did allocate
+        */
+       if (ret) {
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (!hctx->sched_tags)
+                               continue;
+                       blk_mq_sched_free_tags(set, hctx, i);
+               }
+
+               return ret;
+       }
+
+       return 0;
+}
+
+void blk_mq_sched_teardown(struct request_queue *q)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_sched_free_tags(set, hctx, i);
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644 (file)
index 0000000..35c49e2
--- /dev/null
@@ -0,0 +1,170 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+                               int (*init)(struct blk_mq_hw_ctx *),
+                               void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+                                void (*exit)(struct blk_mq_hw_ctx *));
+
+struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
+void blk_mq_sched_put_request(struct request *rq);
+
+void blk_mq_sched_request_inserted(struct request *rq);
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+                       struct list_head *rq_list,
+                       struct request *(*get_rq)(struct blk_mq_hw_ctx *));
+
+int blk_mq_sched_setup(struct request_queue *q);
+void blk_mq_sched_teardown(struct request_queue *q);
+
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
+               return false;
+
+       return __blk_mq_sched_bio_merge(q, bio);
+}
+
+static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
+                                          struct request *rq)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.get_rq_priv)
+               return e->type->ops.mq.get_rq_priv(q, rq);
+
+       return 0;
+}
+
+static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
+                                           struct request *rq)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.put_rq_priv)
+               e->type->ops.mq.put_rq_priv(q, rq);
+}
+
+static inline void
+blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
+                           bool async)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+       if (e && e->type->ops.mq.insert_requests) {
+               LIST_HEAD(list);
+
+               list_add(&rq->queuelist, &list);
+               e->type->ops.mq.insert_requests(hctx, &list, at_head);
+       } else {
+               spin_lock(&ctx->lock);
+               __blk_mq_insert_request(hctx, rq, at_head);
+               spin_unlock(&ctx->lock);
+       }
+
+       if (run_queue)
+               blk_mq_run_hw_queue(hctx, async);
+}
+
+static inline void
+blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
+                            struct list_head *list, bool run_queue_async)
+{
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+       struct elevator_queue *e = hctx->queue->elevator;
+
+       if (e && e->type->ops.mq.insert_requests)
+               e->type->ops.mq.insert_requests(hctx, list, false);
+       else
+               blk_mq_insert_requests(hctx, ctx, list);
+
+       blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+                        struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.allow_merge)
+               return e->type->ops.mq.allow_merge(q, rq, bio);
+
+       return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+       struct elevator_queue *e = hctx->queue->elevator;
+
+       if (e && e->type->ops.mq.completed_request)
+               e->type->ops.mq.completed_request(hctx, rq);
+
+       BUG_ON(rq->internal_tag == -1);
+
+       blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+
+       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+               clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+               blk_mq_run_hw_queue(hctx, true);
+       }
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.started_request)
+               e->type->ops.mq.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.requeue_request)
+               e->type->ops.mq.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+       struct elevator_queue *e = hctx->queue->elevator;
+
+       if (e && e->type->ops.mq.has_work)
+               return e->type->ops.mq.has_work(hctx);
+
+       return false;
+}
+
+static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
+{
+       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+               set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
+{
+       return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+#endif
index eacd3af72099018c82e0a41fb9460b670d3e2845..2caecaa98e40bb57dd3c0f43e8f75e3fab7b3701 100644 (file)
@@ -231,6 +231,14 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
        return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+       if (hctx->sched_tags)
+               return blk_mq_tag_sysfs_show(hctx->sched_tags, page);
+
+       return 0;
+}
+
 static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
        return blk_mq_tag_sysfs_show(hctx->tags, page);
@@ -345,6 +353,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
        .attr = {.name = "pending", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_rq_list_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = {
+       .attr = {.name = "sched_tags", .mode = S_IRUGO },
+       .show = blk_mq_hw_sysfs_sched_tags_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
        .attr = {.name = "tags", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_tags_show,
@@ -370,6 +382,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
        &blk_mq_hw_sysfs_dispatched.attr,
        &blk_mq_hw_sysfs_pending.attr,
        &blk_mq_hw_sysfs_tags.attr,
+       &blk_mq_hw_sysfs_sched_tags.attr,
        &blk_mq_hw_sysfs_cpus.attr,
        &blk_mq_hw_sysfs_active.attr,
        &blk_mq_hw_sysfs_poll.attr,
index 89b81254201bc7679ae992898792207c85900b9d..45e1707a9f8652027eaa59347cb4006df1de5a79 100644 (file)
@@ -32,6 +32,7 @@
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -41,7 +42,9 @@ static LIST_HEAD(all_q_list);
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-       return sbitmap_any_bit_set(&hctx->ctx_map);
+       return sbitmap_any_bit_set(&hctx->ctx_map) ||
+                       !list_empty_careful(&hctx->dispatch) ||
+                       blk_mq_sched_has_work(hctx);
 }
 
 /*
@@ -223,15 +226,23 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 
        tag = blk_mq_get_tag(data);
        if (tag != BLK_MQ_TAG_FAIL) {
-               rq = data->hctx->tags->static_rqs[tag];
+               struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+
+               rq = tags->static_rqs[tag];
 
                if (blk_mq_tag_busy(data->hctx)) {
                        rq->rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
 
-               rq->tag = tag;
-               data->hctx->tags->rqs[tag] = rq;
+               if (data->flags & BLK_MQ_REQ_INTERNAL) {
+                       rq->tag = -1;
+                       rq->internal_tag = tag;
+               } else {
+                       rq->tag = tag;
+                       rq->internal_tag = -1;
+               }
+
                blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
                return rq;
        }
@@ -243,26 +254,21 @@ EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                unsigned int flags)
 {
-       struct blk_mq_ctx *ctx;
-       struct blk_mq_hw_ctx *hctx;
-       struct request *rq;
        struct blk_mq_alloc_data alloc_data;
+       struct request *rq;
        int ret;
 
        ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
        if (ret)
                return ERR_PTR(ret);
 
-       ctx = blk_mq_get_ctx(q);
-       hctx = blk_mq_map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw);
-       blk_mq_put_ctx(ctx);
+       rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-       if (!rq) {
-               blk_queue_exit(q);
+       blk_mq_put_ctx(alloc_data.ctx);
+       blk_queue_exit(q);
+
+       if (!rq)
                return ERR_PTR(-EWOULDBLOCK);
-       }
 
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
@@ -322,10 +328,10 @@ out_queue_exit:
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-                          struct request *rq)
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                            struct request *rq)
 {
-       const int tag = rq->tag;
+       const int sched_tag = rq->internal_tag;
        struct request_queue *q = rq->q;
 
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -336,22 +342,30 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-       blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+       if (rq->tag != -1)
+               blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+       if (sched_tag != -1)
+               blk_mq_sched_completed_request(hctx, rq);
        blk_queue_exit(q);
 }
 
-static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
                                     struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
 
        ctx->rq_completed[rq_is_sync(rq)]++;
-       __blk_mq_free_request(hctx, ctx, rq);
+       __blk_mq_finish_request(hctx, ctx, rq);
+}
+
+void blk_mq_finish_request(struct request *rq)
+{
+       blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
 
 void blk_mq_free_request(struct request *rq)
 {
-       blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
+       blk_mq_sched_put_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -469,6 +483,8 @@ void blk_mq_start_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
+       blk_mq_sched_started_request(rq);
+
        trace_block_rq_issue(q, rq);
 
        rq->resid_len = blk_rq_bytes(rq);
@@ -517,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 
        trace_block_rq_requeue(q, rq);
        wbt_requeue(q->rq_wb, &rq->issue_stat);
+       blk_mq_sched_requeue_request(rq);
 
        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
                if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -551,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
-               blk_mq_insert_request(rq, true, false, false);
+               blk_mq_sched_insert_request(rq, true, false, false);
        }
 
        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
-               blk_mq_insert_request(rq, false, false, false);
+               blk_mq_sched_insert_request(rq, false, false, false);
        }
 
        blk_mq_run_hw_queues(q, false);
@@ -765,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
                        continue;
 
                el_ret = blk_try_merge(rq, bio);
+               if (el_ret == ELEVATOR_NO_MERGE)
+                       continue;
+
+               if (!blk_mq_sched_allow_merge(q, rq, bio))
+                       break;
+
                if (el_ret == ELEVATOR_BACK_MERGE) {
                        if (bio_attempt_back_merge(q, rq, bio)) {
                                ctx->rq_merged++;
@@ -824,6 +847,59 @@ static inline unsigned int queued_to_index(unsigned int queued)
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+static bool blk_mq_get_driver_tag(struct request *rq,
+                                 struct blk_mq_hw_ctx **hctx, bool wait)
+{
+       struct blk_mq_alloc_data data = {
+               .q = rq->q,
+               .ctx = rq->mq_ctx,
+               .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+               .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+       };
+
+       if (blk_mq_hctx_stopped(data.hctx))
+               return false;
+
+       if (rq->tag != -1) {
+done:
+               if (hctx)
+                       *hctx = data.hctx;
+               return true;
+       }
+
+       rq->tag = blk_mq_get_tag(&data);
+       if (rq->tag >= 0) {
+               data.hctx->tags->rqs[rq->tag] = rq;
+               goto done;
+       }
+
+       return false;
+}
+
+/*
+ * If we fail getting a driver tag because all the driver tags are already
+ * assigned and on the dispatch list, BUT the first entry does not have a
+ * tag, then we could deadlock. For that case, move entries with assigned
+ * driver tags to the front, leaving the set of tagged requests in the
+ * same order, and the untagged set in the same order.
+ */
+static bool reorder_tags_to_front(struct list_head *list)
+{
+       struct request *rq, *tmp, *first = NULL;
+
+       list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
+               if (rq == first)
+                       break;
+               if (rq->tag != -1) {
+                       list_move(&rq->queuelist, list);
+                       if (!first)
+                               first = rq;
+               }
+       }
+
+       return first != NULL;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct request_queue *q = hctx->queue;
@@ -846,6 +922,12 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                struct blk_mq_queue_data bd;
 
                rq = list_first_entry(list, struct request, queuelist);
+               if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+                       if (!queued && reorder_tags_to_front(list))
+                               continue;
+                       blk_mq_sched_mark_restart(hctx);
+                       break;
+               }
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
@@ -899,48 +981,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                 * the requests in rq_list might get lost.
                 *
                 * blk_mq_run_hw_queue() already checks the STOPPED bit
-                **/
-               blk_mq_run_hw_queue(hctx, true);
+                *
+                * If RESTART is set, then let completion restart the queue
+                * instead of potentially looping here.
+                */
+               if (!blk_mq_sched_needs_restart(hctx))
+                       blk_mq_run_hw_queue(hctx, true);
        }
 
        return ret != BLK_MQ_RQ_QUEUE_BUSY;
 }
 
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
-{
-       LIST_HEAD(rq_list);
-       LIST_HEAD(driver_list);
-
-       if (unlikely(blk_mq_hctx_stopped(hctx)))
-               return;
-
-       hctx->run++;
-
-       /*
-        * Touch any software queue that has pending entries.
-        */
-       blk_mq_flush_busy_ctxs(hctx, &rq_list);
-
-       /*
-        * If we have previous entries on our dispatch list, grab them
-        * and stuff them at the front for more fair dispatch.
-        */
-       if (!list_empty_careful(&hctx->dispatch)) {
-               spin_lock(&hctx->lock);
-               if (!list_empty(&hctx->dispatch))
-                       list_splice_init(&hctx->dispatch, &rq_list);
-               spin_unlock(&hctx->lock);
-       }
-
-       blk_mq_dispatch_rq_list(hctx, &rq_list);
-}
-
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        int srcu_idx;
@@ -950,11 +1001,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
                rcu_read_lock();
-               blk_mq_process_rq_list(hctx);
+               blk_mq_sched_dispatch_requests(hctx);
                rcu_read_unlock();
        } else {
                srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-               blk_mq_process_rq_list(hctx);
+               blk_mq_sched_dispatch_requests(hctx);
                srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
        }
 }
@@ -1010,8 +1061,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               if ((!blk_mq_hctx_has_pending(hctx) &&
-                   list_empty_careful(&hctx->dispatch)) ||
+               if (!blk_mq_hctx_has_pending(hctx) ||
                    blk_mq_hctx_stopped(hctx))
                        continue;
 
@@ -1148,32 +1198,10 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
        blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-                          bool async)
-{
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       spin_lock(&ctx->lock);
-       __blk_mq_insert_request(hctx, rq, at_head);
-       spin_unlock(&ctx->lock);
-
-       if (run_queue)
-               blk_mq_run_hw_queue(hctx, async);
-}
-
-static void blk_mq_insert_requests(struct request_queue *q,
-                                    struct blk_mq_ctx *ctx,
-                                    struct list_head *list,
-                                    int depth,
-                                    bool from_schedule)
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                           struct list_head *list)
 
 {
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       trace_block_unplug(q, depth, !from_schedule);
-
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
@@ -1189,8 +1217,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
        }
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
-
-       blk_mq_run_hw_queue(hctx, from_schedule);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1226,9 +1252,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                BUG_ON(!rq->q);
                if (rq->mq_ctx != this_ctx) {
                        if (this_ctx) {
-                               blk_mq_insert_requests(this_q, this_ctx,
-                                                       &ctx_list, depth,
-                                                       from_schedule);
+                               trace_block_unplug(this_q, depth, from_schedule);
+                               blk_mq_sched_insert_requests(this_q, this_ctx,
+                                                               &ctx_list,
+                                                               from_schedule);
                        }
 
                        this_ctx = rq->mq_ctx;
@@ -1245,8 +1272,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         * on 'ctx_list'. Do those.
         */
        if (this_ctx) {
-               blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
-                                      from_schedule);
+               trace_block_unplug(this_q, depth, from_schedule);
+               blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+                                               from_schedule);
        }
 }
 
@@ -1284,51 +1312,39 @@ insert_rq:
                }
 
                spin_unlock(&ctx->lock);
-               __blk_mq_free_request(hctx, ctx, rq);
+               __blk_mq_finish_request(hctx, ctx, rq);
                return true;
        }
 }
 
-static struct request *blk_mq_map_request(struct request_queue *q,
-                                         struct bio *bio,
-                                         struct blk_mq_alloc_data *data)
-{
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-       struct request *rq;
-
-       blk_queue_enter_live(q);
-       ctx = blk_mq_get_ctx(q);
-       hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       trace_block_getrq(q, bio, bio->bi_opf);
-       blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-       rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
-       data->hctx->queued++;
-       return rq;
-}
-
 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-       return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
+       if (rq->tag != -1)
+               return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
+
+       return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
-       int ret;
        struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .list = NULL,
                .last = 1
        };
-       blk_qc_t new_cookie = request_to_qc_t(hctx, rq);
+       struct blk_mq_hw_ctx *hctx;
+       blk_qc_t new_cookie;
+       int ret;
 
-       if (blk_mq_hctx_stopped(hctx))
+       if (q->elevator)
                goto insert;
 
+       if (!blk_mq_get_driver_tag(rq, &hctx, false))
+               goto insert;
+
+       new_cookie = request_to_qc_t(hctx, rq);
+
        /*
         * For OK queue, we are done. For error, kill it. Any other
         * error (busy), just add it to our list as we previously
@@ -1350,7 +1366,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
        }
 
 insert:
-       blk_mq_insert_request(rq, false, true, true);
+       blk_mq_sched_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1383,9 +1399,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
                return BLK_QC_T_NONE;
 
+       if (blk_mq_sched_bio_merge(q, bio))
+               return BLK_QC_T_NONE;
+
        wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-       rq = blk_mq_map_request(q, bio, &data);
+       trace_block_getrq(q, bio, bio->bi_opf);
+
+       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
                __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
@@ -1397,6 +1418,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        if (unlikely(is_flush_fua)) {
                blk_mq_bio_to_request(rq, bio);
+               blk_mq_get_driver_tag(rq, NULL, true);
                blk_insert_flush(rq);
                goto run_queue;
        }
@@ -1447,6 +1469,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                goto done;
        }
 
+       if (q->elevator) {
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true, true);
+               goto done;
+       }
        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
                /*
                 * For a SYNC request, send it to the hardware immediately. For
@@ -1492,9 +1520,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
        } else
                request_count = blk_plug_queued_count(q);
 
+       if (blk_mq_sched_bio_merge(q, bio))
+               return BLK_QC_T_NONE;
+
        wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-       rq = blk_mq_map_request(q, bio, &data);
+       trace_block_getrq(q, bio, bio->bi_opf);
+
+       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
                __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
@@ -1506,6 +1539,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 
        if (unlikely(is_flush_fua)) {
                blk_mq_bio_to_request(rq, bio);
+               blk_mq_get_driver_tag(rq, NULL, true);
                blk_insert_flush(rq);
                goto run_queue;
        }
@@ -1544,6 +1578,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
                return cookie;
        }
 
+       if (q->elevator) {
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true, true);
+               goto done;
+       }
        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
                /*
                 * For a SYNC request, send it to the hardware immediately. For
@@ -1556,6 +1596,7 @@ run_queue:
        }
 
        blk_mq_put_ctx(data.ctx);
+done:
        return cookie;
 }
 
@@ -1925,9 +1966,11 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
                                         unsigned int hctx_idx)
 {
-       blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-       blk_mq_free_rq_map(set->tags[hctx_idx]);
-       set->tags[hctx_idx] = NULL;
+       if (set->tags[hctx_idx]) {
+               blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+               blk_mq_free_rq_map(set->tags[hctx_idx]);
+               set->tags[hctx_idx] = NULL;
+       }
 }
 
 static void blk_mq_map_swqueue(struct request_queue *q,
@@ -2084,6 +2127,8 @@ void blk_mq_release(struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;
 
+       blk_mq_sched_teardown(q);
+
        /* hctx kobj stays in hctx */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx)
@@ -2504,14 +2549,22 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        struct blk_mq_hw_ctx *hctx;
        int i, ret;
 
-       if (!set || nr > set->queue_depth)
+       if (!set)
                return -EINVAL;
 
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->tags)
                        continue;
-               ret = blk_mq_tag_update_depth(hctx->tags, nr);
+               /*
+                * If we're using an MQ scheduler, just update the scheduler
+                * queue depth. This is similar to what the old code would do.
+                */
+               if (!hctx->sched_tags)
+                       ret = blk_mq_tag_update_depth(hctx->tags,
+                                                       min(nr, set->queue_depth));
+               else
+                       ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
                if (ret)
                        break;
        }
@@ -2704,7 +2757,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
                blk_flush_plug_list(plug, false);
 
        hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-       rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       if (!blk_qc_t_is_internal(cookie))
+               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       else
+               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
 
        return __blk_mq_poll(hctx, rq);
 }
index 1b279b02d0f6a5554c8493e40300dbf32dac6a5c..0c7c034d9ddde2e7a211326b36762b9612af0ec2 100644 (file)
@@ -52,6 +52,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                                bool at_head);
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                               struct list_head *list);
 /*
  * CPU hotplug helpers
  */
@@ -124,6 +126,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
+       if (data->flags & BLK_MQ_REQ_INTERNAL)
+               return data->hctx->sched_tags;
+
        return data->hctx->tags;
 }
 
@@ -132,8 +137,9 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
  */
 void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
                        struct request *rq, unsigned int op);
-void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                                struct request *rq);
+void blk_mq_finish_request(struct request *rq);
 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
                                        unsigned int op);
 
index bae1decb6ec39c7c9032ee9379aad43464e75edf..07cc329fa4b0aa9f01fc32b4f56211df8dff7107 100644 (file)
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
        list_del_init(&rq->queuelist);
        rq->rq_flags &= ~RQF_QUEUED;
        rq->tag = -1;
+       rq->internal_tag = -1;
 
        if (unlikely(bqt->tag_index[tag] == NULL))
                printk(KERN_ERR "%s: tag %d is missing\n",
index 022a26830297f3ee94bb14ef1da65f943af2f3da..0e1ccddab8a26db70513b7fa566b1d0f39ff4859 100644 (file)
@@ -40,6 +40,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -58,7 +59,9 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.sq.elevator_allow_bio_merge_fn)
+       if (e->uses_mq && e->type->ops.mq.allow_merge)
+               return e->type->ops.mq.allow_merge(q, rq, bio);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
                return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
        return 1;
@@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
        kobject_init(&eq->kobj, &elv_ktype);
        mutex_init(&eq->sysfs_lock);
        hash_init(eq->hash);
+       eq->uses_mq = e->uses_mq;
 
        return eq;
 }
@@ -219,14 +223,26 @@ int elevator_init(struct request_queue *q, char *name)
                if (!e) {
                        printk(KERN_ERR
                                "Default I/O scheduler not found. " \
-                               "Using noop.\n");
+                               "Using noop/none.\n");
+                       if (q->mq_ops) {
+                               elevator_put(e);
+                               return 0;
+                       }
                        e = elevator_get("noop", false);
                }
        }
 
-       err = e->ops.sq.elevator_init_fn(q, e);
-       if (err)
+       if (e->uses_mq) {
+               err = blk_mq_sched_setup(q);
+               if (!err)
+                       err = e->ops.mq.init_sched(q, e);
+       } else
+               err = e->ops.sq.elevator_init_fn(q, e);
+       if (err) {
+               if (e->uses_mq)
+                       blk_mq_sched_teardown(q);
                elevator_put(e);
+       }
        return err;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -234,7 +250,9 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
        mutex_lock(&e->sysfs_lock);
-       if (e->type->ops.sq.elevator_exit_fn)
+       if (e->uses_mq && e->type->ops.mq.exit_sched)
+               e->type->ops.mq.exit_sched(e);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
                e->type->ops.sq.elevator_exit_fn(e);
        mutex_unlock(&e->sysfs_lock);
 
@@ -253,6 +271,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq)
        if (ELV_ON_HASH(rq))
                __elv_rqhash_del(rq);
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_del);
 
 void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
@@ -262,6 +281,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq)
        hash_add(e->hash, &rq->hash, rq_hash_key(rq));
        rq->rq_flags |= RQF_HASHED;
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_add);
 
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
@@ -443,7 +463,9 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
                return ELEVATOR_BACK_MERGE;
        }
 
-       if (e->type->ops.sq.elevator_merge_fn)
+       if (e->uses_mq && e->type->ops.mq.request_merge)
+               return e->type->ops.mq.request_merge(q, req, bio);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
                return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
        return ELEVATOR_NO_MERGE;
@@ -456,8 +478,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
  *
  * Returns true if we merged, false otherwise
  */
-static bool elv_attempt_insert_merge(struct request_queue *q,
-                                    struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 {
        struct request *__rq;
        bool ret;
@@ -495,7 +516,9 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.sq.elevator_merged_fn)
+       if (e->uses_mq && e->type->ops.mq.request_merged)
+               e->type->ops.mq.request_merged(q, rq, type);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
                e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
        if (type == ELEVATOR_BACK_MERGE)
@@ -508,10 +531,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
                             struct request *next)
 {
        struct elevator_queue *e = q->elevator;
-       const int next_sorted = next->rq_flags & RQF_SORTED;
-
-       if (next_sorted && e->type->ops.sq.elevator_merge_req_fn)
-               e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+       bool next_sorted = false;
+
+       if (e->uses_mq && e->type->ops.mq.requests_merged)
+               e->type->ops.mq.requests_merged(q, rq, next);
+       else if (e->type->ops.sq.elevator_merge_req_fn) {
+               next_sorted = next->rq_flags & RQF_SORTED;
+               if (next_sorted)
+                       e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+       }
 
        elv_rqhash_reposition(q, rq);
 
@@ -528,6 +556,9 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
        struct elevator_queue *e = q->elevator;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
        if (e->type->ops.sq.elevator_bio_merged_fn)
                e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
@@ -574,11 +605,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 
 void elv_drain_elevator(struct request_queue *q)
 {
+       struct elevator_queue *e = q->elevator;
        static int printed;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
        lockdep_assert_held(q->queue_lock);
 
-       while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1))
+       while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
                ;
        if (q->nr_sorted && printed++ < 10) {
                printk(KERN_ERR "%s: forced dispatching is broken "
@@ -682,8 +717,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.sq.elevator_latter_req_fn)
+       if (e->uses_mq && e->type->ops.mq.next_request)
+               return e->type->ops.mq.next_request(q, rq);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
                return e->type->ops.sq.elevator_latter_req_fn(q, rq);
+
        return NULL;
 }
 
@@ -691,7 +729,9 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.sq.elevator_former_req_fn)
+       if (e->uses_mq && e->type->ops.mq.former_request)
+               return e->type->ops.mq.former_request(q, rq);
+       if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
                return e->type->ops.sq.elevator_former_req_fn(q, rq);
        return NULL;
 }
@@ -701,6 +741,9 @@ int elv_set_request(struct request_queue *q, struct request *rq,
 {
        struct elevator_queue *e = q->elevator;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return 0;
+
        if (e->type->ops.sq.elevator_set_req_fn)
                return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
        return 0;
@@ -710,6 +753,9 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
        if (e->type->ops.sq.elevator_put_req_fn)
                e->type->ops.sq.elevator_put_req_fn(rq);
 }
@@ -718,6 +764,9 @@ int elv_may_queue(struct request_queue *q, unsigned int op)
 {
        struct elevator_queue *e = q->elevator;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return 0;
+
        if (e->type->ops.sq.elevator_may_queue_fn)
                return e->type->ops.sq.elevator_may_queue_fn(q, op);
 
@@ -728,6 +777,9 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
        /*
         * request is released from the driver, io must be done
         */
@@ -803,7 +855,7 @@ int elv_register_queue(struct request_queue *q)
                }
                kobject_uevent(&e->kobj, KOBJ_ADD);
                e->registered = 1;
-               if (e->type->ops.sq.elevator_registered_fn)
+               if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
                        e->type->ops.sq.elevator_registered_fn(q);
        }
        return error;
@@ -891,9 +943,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
        struct elevator_queue *old = q->elevator;
-       bool registered = old->registered;
+       bool old_registered = false;
        int err;
 
+       if (q->mq_ops) {
+               blk_mq_freeze_queue(q);
+               blk_mq_quiesce_queue(q);
+       }
+
        /*
         * Turn on BYPASS and drain all requests w/ elevator private data.
         * Block layer doesn't call into a quiesced elevator - all requests
@@ -901,42 +958,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
         * using INSERT_BACK.  All requests have SOFTBARRIER set and no
         * merge happens either.
         */
-       blk_queue_bypass_start(q);
+       if (old) {
+               old_registered = old->registered;
+
+               if (old->uses_mq)
+                       blk_mq_sched_teardown(q);
 
-       /* unregister and clear all auxiliary data of the old elevator */
-       if (registered)
-               elv_unregister_queue(q);
+               if (!q->mq_ops)
+                       blk_queue_bypass_start(q);
 
-       spin_lock_irq(q->queue_lock);
-       ioc_clear_queue(q);
-       spin_unlock_irq(q->queue_lock);
+               /* unregister and clear all auxiliary data of the old elevator */
+               if (old_registered)
+                       elv_unregister_queue(q);
+
+               spin_lock_irq(q->queue_lock);
+               ioc_clear_queue(q);
+               spin_unlock_irq(q->queue_lock);
+       }
 
        /* allocate, init and register new elevator */
-       err = new_e->ops.sq.elevator_init_fn(q, new_e);
-       if (err)
-               goto fail_init;
+       if (new_e) {
+               if (new_e->uses_mq) {
+                       err = blk_mq_sched_setup(q);
+                       if (!err)
+                               err = new_e->ops.mq.init_sched(q, new_e);
+               } else
+                       err = new_e->ops.sq.elevator_init_fn(q, new_e);
+               if (err)
+                       goto fail_init;
 
-       if (registered) {
                err = elv_register_queue(q);
                if (err)
                        goto fail_register;
-       }
+       } else
+               q->elevator = NULL;
 
        /* done, kill the old one and finish */
-       elevator_exit(old);
-       blk_queue_bypass_end(q);
+       if (old) {
+               elevator_exit(old);
+               if (!q->mq_ops)
+                       blk_queue_bypass_end(q);
+       }
+
+       if (q->mq_ops) {
+               blk_mq_unfreeze_queue(q);
+               blk_mq_start_stopped_hw_queues(q, true);
+       }
 
-       blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+       if (new_e)
+               blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+       else
+               blk_add_trace_msg(q, "elv switch: none");
 
        return 0;
 
 fail_register:
+       if (q->mq_ops)
+               blk_mq_sched_teardown(q);
        elevator_exit(q->elevator);
 fail_init:
        /* switch failed, restore and re-register old elevator */
-       q->elevator = old;
-       elv_register_queue(q);
-       blk_queue_bypass_end(q);
+       if (old) {
+               q->elevator = old;
+               elv_register_queue(q);
+               if (!q->mq_ops)
+                       blk_queue_bypass_end(q);
+       }
+       if (q->mq_ops) {
+               blk_mq_unfreeze_queue(q);
+               blk_mq_start_stopped_hw_queues(q, true);
+       }
 
        return err;
 }
@@ -949,8 +1040,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
        char elevator_name[ELV_NAME_MAX];
        struct elevator_type *e;
 
-       if (!q->elevator)
-               return -ENXIO;
+       /*
+        * Special case for mq, turn off scheduling
+        */
+       if (q->mq_ops && !strncmp(name, "none", 4))
+               return elevator_switch(q, NULL);
 
        strlcpy(elevator_name, name, sizeof(elevator_name));
        e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1053,21 @@ static int __elevator_change(struct request_queue *q, const char *name)
                return -EINVAL;
        }
 
-       if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+       if (q->elevator &&
+           !strcmp(elevator_name, q->elevator->type->elevator_name)) {
                elevator_put(e);
                return 0;
        }
 
+       if (!e->uses_mq && q->mq_ops) {
+               elevator_put(e);
+               return -EINVAL;
+       }
+       if (e->uses_mq && !q->mq_ops) {
+               elevator_put(e);
+               return -EINVAL;
+       }
+
        return elevator_switch(q, e);
 }
 
@@ -985,7 +1089,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
        int ret;
 
-       if (!q->elevator)
+       if (!(q->mq_ops || q->request_fn))
                return count;
 
        ret = __elevator_change(q, name);
@@ -999,24 +1103,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
        struct elevator_queue *e = q->elevator;
-       struct elevator_type *elv;
+       struct elevator_type *elv = NULL;
        struct elevator_type *__e;
        int len = 0;
 
-       if (!q->elevator || !blk_queue_stackable(q))
+       if (!blk_queue_stackable(q))
                return sprintf(name, "none\n");
 
-       elv = e->type;
+       if (!q->elevator)
+               len += sprintf(name+len, "[none] ");
+       else
+               elv = e->type;
 
        spin_lock(&elv_list_lock);
        list_for_each_entry(__e, &elv_list, list) {
-               if (!strcmp(elv->elevator_name, __e->elevator_name))
+               if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
                        len += sprintf(name+len, "[%s] ", elv->elevator_name);
-               else
+                       continue;
+               }
+               if (__e->uses_mq && q->mq_ops)
+                       len += sprintf(name+len, "%s ", __e->elevator_name);
+               else if (!__e->uses_mq && !q->mq_ops)
                        len += sprintf(name+len, "%s ", __e->elevator_name);
        }
        spin_unlock(&elv_list_lock);
 
+       if (q->mq_ops && q->elevator)
+               len += sprintf(name+len, "none");
+
        len += sprintf(len+name, "\n");
        return len;
 }
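
With the elv_iosched_show()/elv_iosched_store() changes above (block/elevator.c), the queue's "scheduler" sysfs attribute now also works for blk-mq queues: reads list "[none]" plus any registered mq schedulers, and writing "none" to an mq queue takes the new shortcut in __elevator_change() and detaches the scheduler. Below is a small userspace sketch of that interface; the device path is only an example, and no actual mq scheduler is assumed to exist yet.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Example path; substitute a real blk-mq device. */
	const char *attr = "/sys/block/nvme0n1/queue/scheduler";
	char buf[256];
	ssize_t n;
	int fd;

	fd = open(attr, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);	/* e.g. "[none] ...\n" */
	close(fd);
	if (n <= 0)
		return 1;
	buf[n] = '\0';
	printf("available: %s", buf);

	/* Writing "none" on an mq queue hits the special case in
	 * __elevator_change() and switches to no scheduler. */
	fd = open(attr, O_WRONLY);
	if (fd < 0)
		return 1;
	n = write(fd, "none", 4);
	close(fd);
	return n == 4 ? 0 : 1;
}
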
index 2686f9e7302a5e6e624e1c7d645bf8ab3fb36b3c..63569eb46d150abb6aa9b4283a1320d2a1b02bf0 100644 (file)
@@ -22,6 +22,7 @@ struct blk_mq_hw_ctx {
 
        unsigned long           flags;          /* BLK_MQ_F_* flags */
 
+       void                    *sched_data;
        struct request_queue    *queue;
        struct blk_flush_queue  *fq;
 
@@ -35,6 +36,7 @@ struct blk_mq_hw_ctx {
        atomic_t                wait_index;
 
        struct blk_mq_tags      *tags;
+       struct blk_mq_tags      *sched_tags;
 
        struct srcu_struct      queue_rq_srcu;
 
@@ -156,6 +158,7 @@ enum {
 
        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE     = 1,
+       BLK_MQ_S_SCHED_RESTART  = 2,
 
        BLK_MQ_MAX_DEPTH        = 10240,
 
@@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
        BLK_MQ_REQ_NOWAIT       = (1 << 0), /* return when out of requests */
        BLK_MQ_REQ_RESERVED     = (1 << 1), /* allocate from reserved pool */
+       BLK_MQ_REQ_INTERNAL     = (1 << 2), /* allocate internal/sched tag */
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
index 2e99d659b0f16786dd64ed489b79bb0431caa0bf..25564857f5f8de216c7cef8ed77b1f3c6c57075b 100644 (file)
@@ -154,6 +154,7 @@ struct request {
 
        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
+       int tag;
        sector_t __sector;              /* sector cursor */
 
        struct bio *bio;
@@ -220,9 +221,10 @@ struct request {
 
        unsigned short ioprio;
 
+       int internal_tag;
+
        void *special;          /* opaque pointer available for LLD use */
 
-       int tag;
        int errors;
 
        /*
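
The struct request change above (include/linux/blkdev.h) is the driver/scheduler tag split mentioned in the commit message: internal_tag identifies the request in the scheduler's (larger) tag space, while tag remains the driver tag handed to the hardware queue. The toy program below is only a conceptual model of that two-level allocation, with made-up depths; it is not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_DEPTH	256	/* made-up scheduler tag space */
#define DRIVER_DEPTH	32	/* made-up hardware queue depth */

struct toy_request {
	int internal_tag;	/* scheduler tag, valid from allocation */
	int tag;		/* driver tag, -1 until dispatch */
};

static bool sched_tags[SCHED_DEPTH];
static bool driver_tags[DRIVER_DEPTH];

/* Grab the first free tag from a pool, or -1 if the pool is exhausted. */
static int get_tag(bool *pool, int depth)
{
	for (int i = 0; i < depth; i++) {
		if (!pool[i]) {
			pool[i] = true;
			return i;
		}
	}
	return -1;
}

int main(void)
{
	struct toy_request rq = { .internal_tag = -1, .tag = -1 };

	rq.internal_tag = get_tag(sched_tags, SCHED_DEPTH);	/* at allocation */
	/* ... request sits in the scheduler, can be merged/sorted ... */
	rq.tag = get_tag(driver_tags, DRIVER_DEPTH);		/* at dispatch */

	printf("internal_tag=%d tag=%d\n", rq.internal_tag, rq.tag);
	return 0;
}
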
index 2a9e966eed03daac81cf2d8a850d15ea4e5eacc1..ecb96fd67c6d477c042173a14c0765582201db5a 100644 (file)
@@ -77,6 +77,34 @@ struct elevator_ops
        elevator_registered_fn *elevator_registered_fn;
 };
 
+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+       int (*init_sched)(struct request_queue *, struct elevator_type *);
+       void (*exit_sched)(struct elevator_queue *);
+
+       bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+       bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+       int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
+       void (*request_merged)(struct request_queue *, struct request *, int);
+       void (*requests_merged)(struct request_queue *, struct request *, struct request *);
+       struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
+       void (*put_request)(struct request *);
+       void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
+       void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *);
+       bool (*has_work)(struct blk_mq_hw_ctx *);
+       void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+       void (*started_request)(struct request *);
+       void (*requeue_request)(struct request *);
+       struct request *(*former_request)(struct request_queue *, struct request *);
+       struct request *(*next_request)(struct request_queue *, struct request *);
+       int (*get_rq_priv)(struct request_queue *, struct request *);
+       void (*put_rq_priv)(struct request_queue *, struct request *);
+       void (*init_icq)(struct io_cq *);
+       void (*exit_icq)(struct io_cq *);
+};
+
 #define ELV_NAME_MAX   (16)
 
 struct elv_fs_entry {
@@ -96,12 +124,14 @@ struct elevator_type
        /* fields provided by elevator implementation */
        union {
                struct elevator_ops sq;
+               struct elevator_mq_ops mq;
        } ops;
        size_t icq_size;        /* see iocontext.h */
        size_t icq_align;       /* ditto */
        struct elv_fs_entry *elevator_attrs;
        char elevator_name[ELV_NAME_MAX];
        struct module *elevator_owner;
+       bool uses_mq;
 
        /* managed by elevator core */
        char icq_cache_name[ELV_NAME_MAX + 5];  /* elvname + "_io_cq" */
@@ -125,6 +155,7 @@ struct elevator_queue
        struct kobject kobj;
        struct mutex sysfs_lock;
        unsigned int registered:1;
+       unsigned int uses_mq:1;
        DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
@@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
                                struct bio *);
+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
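
The new elevator_mq_ops and the uses_mq flags above (include/linux/elevator.h) are what an MQ-capable scheduler fills in instead of struct elevator_ops. The sketch below shows how a hypothetical scheduler would plug into the framework; the "example_mq" names are invented, this commit only adds the hooks, and a real scheduler would also implement insert_requests(), dispatch_requests(), has_work() and friends.

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/module.h>

static int example_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct elevator_queue *eq;

	/* elevator_alloc() does the generic elevator_queue setup. */
	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	eq->elevator_data = NULL;	/* per-scheduler state would hang here */
	q->elevator = eq;
	return 0;
}

static void example_mq_exit_sched(struct elevator_queue *e)
{
	/* free whatever init_sched stored in e->elevator_data */
}

static struct elevator_type example_mq_sched = {
	.ops.mq = {
		.init_sched	= example_mq_init_sched,
		.exit_sched	= example_mq_exit_sched,
	},
	.uses_mq	= true,
	.elevator_name	= "example-mq",
	.elevator_owner	= THIS_MODULE,
};

static int __init example_mq_init(void)
{
	return elv_register(&example_mq_sched);
}

static void __exit example_mq_exit(void)
{
	elv_unregister(&example_mq_sched);
}

module_init(example_mq_init);
module_exit(example_mq_exit);
MODULE_LICENSE("GPL");

Because example_mq_sched has uses_mq set, __elevator_change() will refuse to attach it to a legacy request_fn queue, and elv_iosched_show() will only list it for blk-mq queues.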