*/
#define NR_RAID1_BIOS 256
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
/* When there are this many requests queue to be written by
* the raid1 thread, we become 'congested' to provide back-pressure
* for writeback.
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
- int start_disk;
- int best_disk;
- int i;
+ int best_disk, best_dist_disk, best_pending_disk;
+ int has_nonrot_disk;
+ int disk;
sector_t best_dist;
+ unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
+ int choose_next_idle;
rcu_read_lock();
/*
retry:
sectors = r1_bio->sectors;
best_disk = -1;
+ best_dist_disk = -1;
best_dist = MaxSector;
+ best_pending_disk = -1;
+ min_pending = UINT_MAX;
best_good_sectors = 0;
+ has_nonrot_disk = 0;
+ choose_next_idle = 0;
if (conf->mddev->recovery_cp < MaxSector &&
- (this_sector + sectors >= conf->next_resync)) {
+ (this_sector + sectors >= conf->next_resync))
choose_first = 1;
- start_disk = 0;
- } else {
+ else
choose_first = 0;
- start_disk = conf->last_used;
- }
- for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
-
- int disk = start_disk + i;
- if (disk >= conf->raid_disks * 2)
- disk -= conf->raid_disks * 2;
+ unsigned int pending;
+ bool nonrot;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
} else
best_good_sectors = sectors;
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
- if (choose_first
- /* Don't change to another disk for sequential reads */
- || conf->next_seq_sect == this_sector
- || dist == 0
- /* If device is idle, use it */
- || atomic_read(&rdev->nr_pending) == 0) {
+ if (choose_first) {
+ best_disk = disk;
+ break;
+ }
+ /* Don't change to another disk for sequential reads */
+ if (conf->mirrors[disk].next_seq_sect == this_sector
+ || dist == 0) {
+ int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+ struct raid1_info *mirror = &conf->mirrors[disk];
+
+ best_disk = disk;
+ /*
+ * If buffered sequential IO size exceeds optimal
+ * iosize, check if there is idle disk. If yes, choose
+ * the idle disk. read_balance could already choose an
+ * idle disk before noticing it's a sequential IO in
+ * this disk. This doesn't matter because this disk
+ * will idle, next time it will be utilized after the
+ * first disk has IO size exceeds optimal iosize. In
+ * this way, iosize of the first disk will be optimal
+ * iosize at least. iosize of the second disk might be
+ * small, but not a big deal since when the second disk
+ * starts IO, the first disk is likely still busy.
+ */
+ if (nonrot && opt_iosize > 0 &&
+ mirror->seq_start != MaxSector &&
+ mirror->next_seq_sect > opt_iosize &&
+ mirror->next_seq_sect - opt_iosize >=
+ mirror->seq_start) {
+ choose_next_idle = 1;
+ continue;
+ }
+ break;
+ }
+ /* If device is idle, use it */
+ if (pending == 0) {
best_disk = disk;
break;
}
+
+ if (choose_next_idle)
+ continue;
+
+ if (min_pending > pending) {
+ min_pending = pending;
+ best_pending_disk = disk;
+ }
+
if (dist < best_dist) {
best_dist = dist;
- best_disk = disk;
+ best_dist_disk = disk;
}
}
+ /*
+ * If all disks are rotational, choose the closest disk. If any disk is
+ * non-rotational, choose the disk with less pending request even the
+ * disk is rotational, which might/might not be optimal for raids with
+ * mixed ratation/non-rotational disks depending on workload.
+ */
+ if (best_disk == -1) {
+ if (has_nonrot_disk)
+ best_disk = best_pending_disk;
+ else
+ best_disk = best_dist_disk;
+ }
+
if (best_disk >= 0) {
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
goto retry;
}
sectors = best_good_sectors;
- conf->next_seq_sect = this_sector + sectors;
- conf->last_used = best_disk;
+
+ if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+ conf->mirrors[best_disk].seq_start = this_sector;
+
+ conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
rcu_read_unlock();
*max_sectors = sectors;
static void make_request(struct mddev *mddev, struct bio * bio)
{
struct r1conf *conf = mddev->private;
- struct mirror_info *mirror;
+ struct raid1_info *mirror;
struct r1bio *r1_bio;
struct bio *read_bio;
int i, disks;
struct r1conf *conf = mddev->private;
int err = -EEXIST;
int mirror = 0;
- struct mirror_info *p;
+ struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r1conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct mirror_info *p = conf->mirrors+ number;
+ struct raid1_info *p = conf->mirrors + number;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
if (atomic_dec_and_test(&r1_bio->remaining)) {
/* if we're here, all write(s) have completed, so clean up */
- md_done_sync(mddev, r1_bio->sectors, 1);
- put_buf(r1_bio);
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, 1);
+ }
}
}
/* There is nowhere to write, so all non-sync
* drives must be failed - so we are finished
*/
- sector_t rv = max_sector - sector_nr;
+ sector_t rv;
+ if (min_bad > 0)
+ max_sector = sector_nr + min_bad;
+ rv = max_sector - sector_nr;
*skipped = 1;
put_buf(r1_bio);
return rv;
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
- for (i = 0; i < conf->raid_disks * 2; i++) {
+ for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
+ read_targets--;
md_sync_acct(bio->bi_bdev, nr_sectors);
generic_make_request(bio);
}
{
struct r1conf *conf;
int i;
- struct mirror_info *disk;
+ struct raid1_info *disk;
struct md_rdev *rdev;
int err = -ENOMEM;
if (!conf)
goto abort;
- conf->mirrors = kzalloc(sizeof(struct mirror_info)
+ conf->mirrors = kzalloc(sizeof(struct raid1_info)
* mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
mddev->merge_check_needed = 1;
disk->head_position = 0;
+ disk->seq_start = MaxSector;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
- conf->last_used = -1;
for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
if (disk->rdev &&
(disk->rdev->saved_raid_disk < 0))
conf->fullsync = 1;
- } else if (conf->last_used < 0)
- /*
- * The first working device is used as a
- * starting point to read balancing.
- */
- conf->last_used = i;
+ }
}
- if (conf->last_used < 0) {
- printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
- mdname(mddev));
- goto abort;
- }
err = -ENOMEM;
conf->thread = md_register_thread(raid1d, mddev, "raid1");
if (!conf->thread) {
*/
mempool_t *newpool, *oldpool;
struct pool_info *newpoolinfo;
- struct mirror_info *newmirrors;
+ struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
kfree(newpoolinfo);
return -ENOMEM;
}
- newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+ newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
- conf->last_used = 0; /* just make sure it is in-range */
lower_barrier(conf);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);