From c230e7e53526c223a3e1caf40747d6e37c0e4394 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 5 Apr 2017 14:05:50 +1000
Subject: [PATCH] md/raid1: simplify the splitting of requests.

raid1 currently splits requests in two different ways for
two different reasons.

First, bio_split() is used to ensure the bio fits within a
resync accounting region.
Second, multiple r1bios are allocated for each bio to handle
the possiblity of known bad blocks on some devices.

This can be simplified to just use bio_split() once, and not
use multiple r1bios.
We delay the split until we know a maximum bio size that can
be handled with a single r1bio, and then split the bio and
queue the remainder for later handling.

This avoids all loops inside raid1.c request handling.  Just
a single read, or a single set of writes, is submitted to
lower-level devices for each bio that comes from
generic_make_request().

When the bio needs to be split, generic_make_request() will
do the necessary looping and call md_make_request() multiple
times.

raid1_make_request() no longer queues request for raid1 to handle,
so we can remove that branch from the 'if'.

This patch also creates a new private bio_set
(conf->bio_split) for splitting bios.  Using fs_bio_set
is wrong, as it is meant to be used by filesystems, not
block devices.  Using it inside md can lead to deadlocks
under high memory pressure.

Delete unused variable in raid1_write_request() (Shaohua)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 141 ++++++++++++++++-----------------------------
 drivers/md/raid1.h |   2 +
 2 files changed, 52 insertions(+), 91 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 70278b9a3c43..94f1d754aa09 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1202,7 +1202,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
 	return r1_bio;
 }
 
-static void raid1_read_request(struct mddev *mddev, struct bio *bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio,
+			       int max_read_sectors)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
@@ -1211,7 +1212,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 	struct bitmap *bitmap = mddev->bitmap;
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-	int sectors_handled;
 	int max_sectors;
 	int rdisk;
 
@@ -1222,12 +1222,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 	wait_read_barrier(conf, bio->bi_iter.bi_sector);
 
 	r1_bio = alloc_r1bio(mddev, bio, 0);
+	r1_bio->sectors = max_read_sectors;
 
 	/*
 	 * make_request() can abort the operation when read-ahead is being
 	 * used and no empty request is available.
 	 */
-read_again:
 	rdisk = read_balance(conf, r1_bio, &max_sectors);
 
 	if (rdisk < 0) {
@@ -1247,11 +1247,20 @@ read_again:
 		wait_event(bitmap->behind_wait,
 			   atomic_read(&bitmap->behind_writes) == 0);
 	}
+
+	if (max_sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, max_sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = max_sectors;
+	}
+
 	r1_bio->read_disk = rdisk;
 
 	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
-		 max_sectors);
 
 	r1_bio->bios[rdisk] = read_bio;
 
@@ -1270,30 +1279,11 @@ read_again:
 	                              read_bio, disk_devt(mddev->gendisk),
 	                              r1_bio->sector);
 
-	if (max_sectors < r1_bio->sectors) {
-		/*
-		 * could not read all from this device, so we will need another
-		 * r1_bio.
-		 */
-		sectors_handled = (r1_bio->sector + max_sectors
-				   - bio->bi_iter.bi_sector);
-		r1_bio->sectors = max_sectors;
-		bio_inc_remaining(bio);
-
-		/*
-		 * Cannot call generic_make_request directly as that will be
-		 * queued in __make_request and subsequent mempool_alloc might
-		 * block waiting for it.  So hand bio over to raid1d.
-		 */
-		reschedule_retry(r1_bio);
-
-		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
-		goto read_again;
-	} else
-		generic_make_request(read_bio);
+	generic_make_request(read_bio);
 }
 
-static void raid1_write_request(struct mddev *mddev, struct bio *bio)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio,
+				int max_write_sectors)
 {
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
@@ -1304,9 +1294,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
-	int sectors_handled;
 	int max_sectors;
-	sector_t offset;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1345,6 +1333,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	wait_barrier(conf, bio->bi_iter.bi_sector);
 
 	r1_bio = alloc_r1bio(mddev, bio, 0);
+	r1_bio->sectors = max_write_sectors;
 
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
@@ -1443,17 +1432,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		goto retry_write;
 	}
 
-	if (max_sectors < r1_bio->sectors)
+	if (max_sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, max_sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r1_bio->master_bio = bio;
 		r1_bio->sectors = max_sectors;
-
-	sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
+	}
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
 	first_clone = 1;
 
-	offset = r1_bio->sector - bio->bi_iter.bi_sector;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio = NULL;
 		if (!r1_bio->bios[i])
@@ -1470,7 +1463,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 			     < mddev->bitmap_info.max_write_behind) &&
 			    !waitqueue_active(&bitmap->behind_wait)) {
 				mbio = alloc_behind_master_bio(r1_bio, bio,
-							       offset << 9,
+							       0,
 							       max_sectors << 9);
 			}
 
@@ -1486,10 +1479,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 				mbio = bio_clone_fast(r1_bio->behind_master_bio,
 						      GFP_NOIO,
 						      mddev->bio_set);
-			else {
+			else
 				mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-				bio_trim(mbio, offset, max_sectors);
-			}
 		}
 
 		if (r1_bio->behind_master_bio) {
@@ -1536,19 +1527,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		if (!plug)
 			md_wakeup_thread(mddev->thread);
 	}
-	/* Mustn't call r1_bio_write_done before this next test,
-	 * as it could result in the bio being freed.
-	 */
-	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r1_bio, which must be counted */
-		sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
-
-		inc_pending(conf, sect);
-		bio_inc_remaining(bio);
-		r1_bio_write_done(r1_bio);
-		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
-		goto retry_write;
-	}
 
 	r1_bio_write_done(r1_bio);
 
@@ -1558,7 +1536,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 
 static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
-	struct bio *split;
 	sector_t sectors;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
@@ -1566,43 +1543,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
-	/* if bio exceeds barrier unit boundary, split it */
-	do {
-		sectors = align_to_barrier_unit_end(
-				bio->bi_iter.bi_sector, bio_sectors(bio));
-		if (sectors < bio_sectors(bio)) {
-			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
-
-		if (bio_data_dir(split) == READ) {
-			raid1_read_request(mddev, split);
+	/*
+	 * There is a limit to the maximum size, but
+	 * the read/write handler might find a lower limit
+	 * due to bad blocks.  To avoid multiple splits,
+	 * we pass the maximum number of sectors down
+	 * and let the lower level perform the split.
+	 */
+	sectors = align_to_barrier_unit_end(
+		bio->bi_iter.bi_sector, bio_sectors(bio));
 
-			/*
-			 * If a bio is splitted, the first part of bio will
-			 * pass barrier but the bio is queued in
-			 * current->bio_list (see generic_make_request). If
-			 * there is a raise_barrier() called here, the second
-			 * part of bio can't pass barrier. But since the first
-			 * part bio isn't dispatched to underlaying disks yet,
-			 * the barrier is never released, hence raise_barrier
-			 * will alays wait. We have a deadlock.
-			 * Note, this only happens in read path. For write
-			 * path, the first part of bio is dispatched in a
-			 * schedule() call (because of blk plug) or offloaded
-			 * to raid10d.
-			 * Quitting from the function immediately can change
-			 * the bio order queued in bio_list and avoid the deadlock.
-			 */
-			if (split != bio) {
-				generic_make_request(bio);
-				break;
-			}
-		} else
-			raid1_write_request(mddev, split);
-	} while (split != bio);
+	if (bio_data_dir(bio) == READ)
+		raid1_read_request(mddev, bio, sectors);
+	else
+		raid1_write_request(mddev, bio, sectors);
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -2647,10 +2601,7 @@ static void raid1d(struct md_thread *thread)
 		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
 			handle_read_error(conf, r1_bio);
 		else
-			/* just a partial read to be scheduled from separate
-			 * context
-			 */
-			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
+			WARN_ON_ONCE(1);
 
 		cond_resched();
 		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3038,6 +2989,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf->r1bio_pool)
 		goto abort;
 
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto abort;
+
 	conf->poolinfo->mddev = mddev;
 
 	err = -EINVAL;
@@ -3119,6 +3074,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		kfree(conf->nr_waiting);
 		kfree(conf->nr_queued);
 		kfree(conf->barrier);
+		if (conf->bio_split)
+			bioset_free(conf->bio_split);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3224,6 +3181,8 @@ static void raid1_free(struct mddev *mddev, void *priv)
 	kfree(conf->nr_waiting);
 	kfree(conf->nr_queued);
 	kfree(conf->barrier);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf);
 }
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 4271cd7ac2de..b0ab0da6e39e 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -107,6 +107,8 @@ struct r1conf {
 	mempool_t		*r1bio_pool;
 	mempool_t		*r1buf_pool;
 
+	struct bio_set		*bio_split;
+
 	/* temporary buffer to synchronous IO when attempting to repair
 	 * a read error.
 	 */
-- 
2.20.1