blk-mq-sched: fix starvation for multiple hardware queues and shared tags
author Jens Axboe <axboe@fb.com>
Thu, 26 Jan 2017 21:42:34 +0000 (14:42 -0700)
committer Jens Axboe <axboe@fb.com>
Fri, 27 Jan 2017 15:20:34 +0000 (08:20 -0700)
If we have both multiple hardware queues and shared tag map between
devices, we need to ensure that we propagate the hardware queue
restart bit higher up. This is because we can get into a situation
where we don't have any IO pending on a hardware queue, yet we fail
getting a tag to start new IO. If that happens, it's not enough to
mark the hardware queue as needing a restart, we need to bubble
that up to the higher level queue as well.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Tested-by: Hannes Reinecke <hare@suse.com>
block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq.c
block/blk-mq.h
include/linux/blkdev.h

index 4cee060a292dff406d93c724da85a9e07f1a74ec..fcc0e893d6870a76ab0c338a4018e09438ad542b 100644 (file)
@@ -301,6 +301,34 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
 
+static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+{
+       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+               clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+               if (blk_mq_hctx_has_pending(hctx))
+                       blk_mq_run_hw_queue(hctx, true);
+       }
+}
+
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
+{
+       unsigned int i;
+
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+               blk_mq_sched_restart_hctx(hctx);
+       else {
+               struct request_queue *q = hctx->queue;
+
+               if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+                       return;
+
+               clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+
+               queue_for_each_hw_ctx(q, hctx, i)
+                       blk_mq_sched_restart_hctx(hctx);
+       }
+}
+
 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
index 6b465bc7014c38b1b5d789e1b331125eb7f8d968..becbc78403643609f3c9f10ba31c4762be4ab95c 100644 (file)
@@ -19,6 +19,7 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
@@ -123,11 +124,6 @@ blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
        BUG_ON(rq->internal_tag == -1);
 
        blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
-
-       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
-               clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-               blk_mq_run_hw_queue(hctx, true);
-       }
 }
 
 static inline void blk_mq_sched_started_request(struct request *rq)
@@ -160,8 +156,15 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
 
 static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
 {
-       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+               if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+                       struct request_queue *q = hctx->queue;
+
+                       if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+                               set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+               }
+       }
 }
 
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
index 711883384585490ba489346758fc33f334d71103..21795c6575bc61fff24e900188950d61eb0da28f 100644 (file)
@@ -40,7 +40,7 @@ static LIST_HEAD(all_q_list);
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
-static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
        return sbitmap_any_bit_set(&hctx->ctx_map) ||
                        !list_empty_careful(&hctx->dispatch) ||
@@ -345,6 +345,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
                blk_mq_sched_completed_request(hctx, rq);
+       blk_mq_sched_restart_queues(hctx);
        blk_queue_exit(q);
 }
 
index 6c24b901acd764e0c614830884cc91af88b109fd..077a4003f1fd0947238bb1ac7104ec9f88f2f818 100644 (file)
@@ -33,6 +33,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 
 /*
  * Internal helpers for allocating/freeing the request map
index 0ee283f3cffe7657cb58bcb65c6c56050f49a570..883b8abe43052d6c7ab57f1bc0488888b634b037 100644 (file)
@@ -607,6 +607,7 @@ struct request_queue {
 #define QUEUE_FLAG_FLUSH_NQ    25      /* flush not queueuable */
 #define QUEUE_FLAG_DAX         26      /* device supports DAX */
 #define QUEUE_FLAG_STATS       27      /* track rq completion times */
+#define QUEUE_FLAG_RESTART     28      /* queue needs restart at completion */
 
 #define QUEUE_FLAG_DEFAULT     ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \