blk-mq: implement hybrid poll mode for sync O_DIRECT
author Jens Axboe <axboe@fb.com>
Mon, 14 Nov 2016 20:01:59 +0000 (13:01 -0700)
committer Jens Axboe <axboe@fb.com>
Thu, 17 Nov 2016 20:34:51 +0000 (13:34 -0700)
This patch enables a hybrid polling mode. Instead of polling after IO
submission, we can induce an artificial delay, and then poll after that.
For example, if the IO is presumed to complete in 8 usecs from now, we
can sleep for 4 usecs, wake up, and then do our polling. This still puts
a sleep/wakeup cycle in the IO path, but instead of the wakeup happening
after the IO has completed, it'll happen before. With this hybrid
scheme, we can achieve big latency reductions while still using the same
(or less) amount of CPU.

Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-by: Stephen Bates <sbates@raithlin.com>
Reviewed-by: Stephen Bates <sbates@raithlin.com>
block/blk-mq.c
block/blk-sysfs.c
block/blk.h
include/linux/blkdev.h

index f39e69c732cc628c7fa54802160a2c495b28e87d..8cb248fb6a68308d948d2162b95b3a4fe9b5cd8f 100644 (file)
@@ -332,6 +332,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
        rq->rq_flags = 0;
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
        blk_mq_put_tag(hctx, ctx, tag);
        blk_queue_exit(q);
 }
@@ -2468,11 +2469,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+                                    struct request *rq)
+{
+       struct hrtimer_sleeper hs;
+       enum hrtimer_mode mode;
+       ktime_t kt;
+
+       if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+               return false;
+
+       set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+
+       /*
+        * This will be replaced with the stats tracking code, using
+        * 'avg_completion_time / 2' as the pre-sleep target.
+        */
+       kt = ktime_set(0, q->poll_nsec);
+
+       mode = HRTIMER_MODE_REL;
+       hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+       hrtimer_set_expires(&hs.timer, kt);
+
+       hrtimer_init_sleeper(&hs, current);
+       do {
+               if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+                       break;
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               hrtimer_start_expires(&hs.timer, mode);
+               if (hs.task)
+                       io_schedule();
+               hrtimer_cancel(&hs.timer);
+               mode = HRTIMER_MODE_ABS;
+       } while (hs.task && !signal_pending(current));
+
+       __set_current_state(TASK_RUNNING);
+       destroy_hrtimer_on_stack(&hs.timer);
+       return true;
+}
+
 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
        struct request_queue *q = hctx->queue;
        long state;
 
+       /*
+        * If we sleep, have the caller restart the poll loop to reset
+        * the state. Like for the other success return cases, the
+        * caller is responsible for checking if the IO completed. If
+        * the IO isn't complete, we'll get called again and will go
+        * straight to the busy poll loop.
+        */
+       if (blk_mq_poll_hybrid_sleep(q, rq))
+               return true;
+
        hctx->poll_considered++;
 
        state = current->state;
index 415e764807d03de33c55cafa582c0ebf957295ee..dcdfcaa126539c3e79dbf7f55834397631a82bac 100644 (file)
@@ -350,6 +350,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
        return ret;
 }
 
+static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(q->poll_nsec / 1000, page);
+}
+
+static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
+                               size_t count)
+{
+       unsigned long poll_usec;
+       ssize_t ret;
+
+       if (!q->mq_ops || !q->mq_ops->poll)
+               return -EINVAL;
+
+       ret = queue_var_store(&poll_usec, page, count);
+       if (ret < 0)
+               return ret;
+
+       q->poll_nsec = poll_usec * 1000;
+       return ret;
+}
+
 static ssize_t queue_poll_show(struct request_queue *q, char *page)
 {
        return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -602,6 +624,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
        .store = queue_poll_store,
 };
 
+static struct queue_sysfs_entry queue_poll_delay_entry = {
+       .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_poll_delay_show,
+       .store = queue_poll_delay_store,
+};
+
 static struct queue_sysfs_entry queue_wc_entry = {
        .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
        .show = queue_wc_show,
@@ -655,6 +683,7 @@ static struct attribute *default_attrs[] = {
        &queue_dax_entry.attr,
        &queue_stats_entry.attr,
        &queue_wb_lat_entry.attr,
+       &queue_poll_delay_entry.attr,
        NULL,
 };
 
index aa132dea598c5e14a253fc91c5cae063e0981da0..041185e5f12994dc146528db2627707e42a700ee 100644 (file)
@@ -111,6 +111,7 @@ void blk_account_io_done(struct request *req);
 enum rq_atomic_flags {
        REQ_ATOM_COMPLETE = 0,
        REQ_ATOM_STARTED,
+       REQ_ATOM_POLL_SLEPT,
 };
 
 /*
index bab18ee5810d3ac40c2d51ace9b80e947164769d..37ed4ea705c8401b411c7bcb47bd586635357362 100644 (file)
@@ -509,6 +509,7 @@ struct request_queue {
        unsigned int            request_fn_active;
 
        unsigned int            rq_timeout;
+       unsigned int            poll_nsec;
        struct timer_list       timeout;
        struct work_struct      timeout_work;
        struct list_head        timeout_list;