block: add support for REQ_OP_WRITE_ZEROES
authorChaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Wed, 30 Nov 2016 20:28:59 +0000 (12:28 -0800)
committerJens Axboe <axboe@fb.com>
Thu, 1 Dec 2016 14:58:40 +0000 (07:58 -0700)
This adds a new block layer operation to zero out a range of
LBAs. This allows implementing zeroing for devices that don't use
either discard with a predictable zero pattern or WRITE SAME of zeroes.
The prominent example of that is NVMe with the Write Zeroes command,
but in the future, this should also help with improving the way
zeroing discards work. For this operation, a suitable entry is exported
in sysfs which indicates the maximum number of bytes allowed in one
write zeroes operation by the device.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
Documentation/ABI/testing/sysfs-block
block/bio.c
block/blk-core.c
block/blk-lib.c
block/blk-merge.c
block/blk-settings.c
block/blk-sysfs.c
block/blk-wbt.c
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h

index ee2d5cd26bfe2d3b3ee49350361ea3b4a00c8b61..2da04ce6aeef482645bfd9edd924ce195275ea26 100644 (file)
@@ -235,6 +235,19 @@ Description:
                write_same_max_bytes is 0, write same is not supported
                by the device.
 
+What:          /sys/block/<disk>/queue/write_zeroes_max_bytes
+Date:          November 2016
+Contact:       Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Description:
+               Devices that support write zeroes operation in which a
+               single request can be issued to zero out the range of
+               contiguous blocks on storage without having any payload
+               in the request. This can be used to optimize writing zeroes
+               to the devices. write_zeroes_max_bytes indicates how many
+               bytes can be written in a single write zeroes command. If
+               write_zeroes_max_bytes is 0, write zeroes is not supported
+               by the device.
+
 What:          /sys/block/<disk>/queue/zoned
 Date:          September 2016
 Contact:       Damien Le Moal <damien.lemoal@hgst.com>
index de257ced69b1df41f98169aaccd59e113070245a..83db1f37fd0bfbe4ec4c64fc008fecd91e92e637 100644 (file)
@@ -674,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
                break;
        case REQ_OP_WRITE_SAME:
                bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
index 6c4a425690fc7fe0d4b74ae81c652ba96d28bfb8..3f2eb8d8018979e6b7c1ef1c2f90d894d55bae29 100644 (file)
@@ -1950,6 +1950,10 @@ generic_make_request_checks(struct bio *bio)
                if (!bdev_is_zoned(bio->bi_bdev))
                        goto not_supported;
                break;
+       case REQ_OP_WRITE_ZEROES:
+               if (!bdev_write_zeroes_sectors(bio->bi_bdev))
+                       goto not_supported;
+               break;
        default:
                break;
        }
index bfb28b03765ee372ac69b4dc7c721e3f7222d7a2..510a6fb1531822e10765184733066614b6740a36 100644 (file)
@@ -226,6 +226,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
+/**
+ * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
+ * @bdev:      blockdev to issue
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to write
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ * @biop:      pointer to anchor bio
+ *
+ * Description:
+ *  Generate and issue a number of bios (REQ_OP_WRITE_ZEROES) to zero out the
+ *  block range; these bios carry no data payload.
+ */
+static int __blkdev_issue_write_zeroes(struct block_device *bdev,
+               sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+               struct bio **biop)
+{
+       struct bio *bio = *biop;
+       unsigned int max_write_zeroes_sectors;
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       if (!q)
+               return -ENXIO;
+
+       /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
+       max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
+
+       if (max_write_zeroes_sectors == 0)
+               return -EOPNOTSUPP;
+
+       while (nr_sects) {
+               bio = next_bio(bio, 0, gfp_mask);
+               bio->bi_iter.bi_sector = sector;
+               bio->bi_bdev = bdev;
+               bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+
+               if (nr_sects > max_write_zeroes_sectors) {
+                       bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
+                       nr_sects -= max_write_zeroes_sectors;
+                       sector += max_write_zeroes_sectors;
+               } else {
+                       bio->bi_iter.bi_size = nr_sects << 9;
+                       nr_sects = 0;
+               }
+               cond_resched();
+       }
+
+       *biop = bio;
+       return 0;
+}
+
 /**
  * __blkdev_issue_zeroout - generate number of zero filed write bios
  * @bdev:      blockdev to issue
@@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                        goto out;
        }
 
+       ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
+                       biop);
+       if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+               goto out;
+
        ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
                        ZERO_PAGE(0), biop);
        if (ret == 0 || (ret && ret != -EOPNOTSUPP))
@@ -304,8 +358,8 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
  *  the discard request fail, if the discard flag is not set, or if
  *  discard_zeroes_data is not supported, this function will resort to
  *  zeroing the blocks manually, thus provisioning (allocating,
- *  anchoring) them. If the block device supports the WRITE SAME command
- *  blkdev_issue_zeroout() will use it to optimize the process of
+ *  anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
+ *  command(s), blkdev_issue_zeroout() will use it to optimize the process of
  *  clearing the block range. Otherwise the zeroing will be performed
  *  using regular WRITE calls.
  */
index fda6a12fc776b16bdfccbb21724250c04ecadc65..cf2848cb91d8037d9eccdb4d0fdf84b2563975e9 100644 (file)
@@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
        case REQ_OP_SECURE_ERASE:
                split = blk_bio_discard_split(q, *bio, bs, &nsegs);
                break;
+       case REQ_OP_WRITE_ZEROES:
+               split = NULL;
+               nsegs = (*bio)->bi_phys_segments;
+               break;
        case REQ_OP_WRITE_SAME:
                split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
                break;
@@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
         * This should probably be returning 0, but blk_add_request_payload()
         * (Christoph!!!!)
         */
-       if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
-               return 1;
-
-       if (bio_op(bio) == REQ_OP_WRITE_SAME)
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_SAME:
+       case REQ_OP_WRITE_ZEROES:
                return 1;
+       default:
+               break;
+       }
 
        fbio = bio;
        cluster = blk_queue_cluster(q);
@@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
                /*
                 * This is a hack - drivers should be neither modifying the
                 * biovec, nor relying on bi_vcnt - but because of
index c7ccabc0ec3ea6eebc40f2a9585f9148d277889f..8a2bc124a6840f2542355bdd01b543404a333796 100644 (file)
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
        lim->max_dev_sectors = 0;
        lim->chunk_sectors = 0;
        lim->max_write_same_sectors = 0;
+       lim->max_write_zeroes_sectors = 0;
        lim->max_discard_sectors = 0;
        lim->max_hw_discard_sectors = 0;
        lim->discard_granularity = 0;
@@ -132,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
        lim->max_sectors = UINT_MAX;
        lim->max_dev_sectors = UINT_MAX;
        lim->max_write_same_sectors = UINT_MAX;
+       lim->max_write_zeroes_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
 
@@ -299,6 +301,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
 
+/**
+ * blk_queue_max_write_zeroes_sectors - set max sectors for a single
+ *                                      write zeroes
+ * @q:  the request queue for the device
+ * @max_write_zeroes_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+               unsigned int max_write_zeroes_sectors)
+{
+       q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+
 /**
  * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
@@ -527,6 +542,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
        t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
        t->max_write_same_sectors = min(t->max_write_same_sectors,
                                        b->max_write_same_sectors);
+       t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
+                                       b->max_write_zeroes_sectors);
        t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
 
        t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
index a9784149176902beab4b25223989c2d702819f40..706b27bd73a1dfee8d678215fe97b4074d4a4496 100644 (file)
@@ -211,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
                (unsigned long long)q->limits.max_write_same_sectors << 9);
 }
 
+static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
+{
+       return sprintf(page, "%llu\n",
+               (unsigned long long)q->limits.max_write_zeroes_sectors << 9);
+}
 
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
@@ -611,6 +616,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
        .show = queue_write_same_max_show,
 };
 
+static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
+       .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+       .show = queue_write_zeroes_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
        .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
        .show = queue_show_nonrot,
@@ -700,6 +710,7 @@ static struct attribute *default_attrs[] = {
        &queue_discard_max_hw_entry.attr,
        &queue_discard_zeroes_data_entry.attr,
        &queue_write_same_max_entry.attr,
+       &queue_write_zeroes_max_entry.attr,
        &queue_nonrot_entry.attr,
        &queue_zoned_entry.attr,
        &queue_nomerges_entry.attr,
index b8647343141f8996704c85d1c0dc88df3c2dff10..d500e43da5d9ccdf2476107ac6fd69da3e2636c7 100644 (file)
@@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
        const int op = bio_op(bio);
 
        /*
-        * If not a WRITE (or a discard), do nothing
+        * If not a WRITE (or a discard or write zeroes), do nothing
         */
-       if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+       if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+                               op == REQ_OP_WRITE_ZEROES))
                return false;
 
        /*
index 70a7244f08a72e7c01ff00945003c69a55a8a810..b15323934a298e8e9c386f4af534176d927007f1 100644 (file)
@@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio)
        if (bio &&
            bio->bi_iter.bi_size &&
            bio_op(bio) != REQ_OP_DISCARD &&
-           bio_op(bio) != REQ_OP_SECURE_ERASE)
+           bio_op(bio) != REQ_OP_SECURE_ERASE &&
+           bio_op(bio) != REQ_OP_WRITE_ZEROES)
                return true;
 
        return false;
@@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio)
 {
        return bio_op(bio) == REQ_OP_DISCARD ||
               bio_op(bio) == REQ_OP_SECURE_ERASE ||
-              bio_op(bio) == REQ_OP_WRITE_SAME;
+              bio_op(bio) == REQ_OP_WRITE_SAME ||
+              bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
 static inline bool bio_mergeable(struct bio *bio)
@@ -188,18 +190,19 @@ static inline unsigned bio_segments(struct bio *bio)
        struct bvec_iter iter;
 
        /*
-        * We special case discard/write same, because they interpret bi_size
-        * differently:
+        * We special case discard/write same/write zeroes, because they
+        * interpret bi_size differently:
         */
 
-       if (bio_op(bio) == REQ_OP_DISCARD)
-               return 1;
-
-       if (bio_op(bio) == REQ_OP_SECURE_ERASE)
-               return 1;
-
-       if (bio_op(bio) == REQ_OP_WRITE_SAME)
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_SAME:
+       case REQ_OP_WRITE_ZEROES:
                return 1;
+       default:
+               break;
+       }
 
        bio_for_each_segment(bv, bio, iter)
                segs++;
index f57458a6a93bc3cebf146fda6d339b0b38bfdfe4..519ea2c9df612becf997cf8fafbc1411ce6cd4be 100644 (file)
@@ -159,6 +159,8 @@ enum req_opf {
        REQ_OP_ZONE_RESET       = 6,
        /* write the same sector many times */
        REQ_OP_WRITE_SAME       = 7,
+       /* write the zero filled sector many times */
+       REQ_OP_WRITE_ZEROES     = 8,
 
        REQ_OP_LAST,
 };
index 7e9d8a0895beb7b47d82a610285920d635f79557..ebeef2b79c5adac3a26c5386e50dc7e2816eb16c 100644 (file)
@@ -323,6 +323,7 @@ struct queue_limits {
        unsigned int            max_discard_sectors;
        unsigned int            max_hw_discard_sectors;
        unsigned int            max_write_same_sectors;
+       unsigned int            max_write_zeroes_sectors;
        unsigned int            discard_granularity;
        unsigned int            discard_alignment;
 
@@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq)
        if (req_op(rq) == REQ_OP_FLUSH)
                return false;
 
+       if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+               return false;
+
        if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
                return false;
        if (rq->rq_flags & RQF_NOMERGE_FLAGS)
@@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
        if (unlikely(op == REQ_OP_WRITE_SAME))
                return q->limits.max_write_same_sectors;
 
+       if (unlikely(op == REQ_OP_WRITE_ZEROES))
+               return q->limits.max_write_zeroes_sectors;
+
        return q->limits.max_sectors;
 }
 
@@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
 extern void blk_queue_max_write_same_sectors(struct request_queue *q,
                unsigned int max_write_same_sectors);
+extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+               unsigned int max_write_zeroes_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
@@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev)
        return 0;
 }
 
+static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       if (q)
+               return q->limits.max_write_zeroes_sectors;
+
+       return 0;
+}
+
 static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
 {
        struct request_queue *q = bdev_get_queue(bdev);