md/raid1: don't abort a resync on the first badblock.
index a9c7981ddd245c6955cbed390b617a9172abde31..d2361b162de59b316fe8fba4d1a2d3808dc416a6 100644
  */
 #define        NR_RAID1_BIOS 256
 
+/* When we get a read error on a read-only array, we redirect the read to
+ * another device without failing the first device or trying to over-write
+ * to correct the error.  To keep track of bad blocks at a per-bio level,
+ * we store IO_BLOCKED in the appropriate 'bios' pointer.
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad block, we need to remove the
+ * bad-block marking, which must be done from process context.  So we
+ * record the success by setting r1_bio->bios[n] to IO_MADE_GOOD.
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queued to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
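The IO_BLOCKED/IO_MADE_GOOD values added above are sentinel pointers: small non-NULL integers cast to `struct bio *` that can never alias a real allocation, so a single `bios[]` slot can encode per-device state without extra fields (note that BIO_SPECIAL() also covers NULL). A minimal, hypothetical userspace sketch of the pattern, not kernel code:

```c
#include <stdio.h>

struct bio;                         /* opaque stand-in for struct bio */

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static const char *classify(struct bio *b)
{
	if (b == NULL)
		return "empty slot";
	if (b == IO_BLOCKED)
		return "read redirected, device not failed";
	if (b == IO_MADE_GOOD)
		return "write fixed a bad block, ack in process context";
	return "real in-flight bio";
}

int main(void)
{
	struct bio *slots[] = { NULL, IO_BLOCKED, IO_MADE_GOOD };

	for (unsigned int i = 0; i < 3; i++)
		printf("special=%d: %s\n", BIO_SPECIAL(slots[i]),
		       classify(slots[i]));
	return 0;
}
```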
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
        const sector_t this_sector = r1_bio->sector;
        int sectors;
        int best_good_sectors;
-       int start_disk;
-       int best_disk;
-       int i;
+       int best_disk, best_dist_disk, best_pending_disk;
+       int has_nonrot_disk;
+       int disk;
        sector_t best_dist;
+       unsigned int min_pending;
        struct md_rdev *rdev;
        int choose_first;
+       int choose_next_idle;
 
        rcu_read_lock();
        /*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
        sectors = r1_bio->sectors;
        best_disk = -1;
+       best_dist_disk = -1;
        best_dist = MaxSector;
+       best_pending_disk = -1;
+       min_pending = UINT_MAX;
        best_good_sectors = 0;
+       has_nonrot_disk = 0;
+       choose_next_idle = 0;
 
        if (conf->mddev->recovery_cp < MaxSector &&
-           (this_sector + sectors >= conf->next_resync)) {
+           (this_sector + sectors >= conf->next_resync))
                choose_first = 1;
-               start_disk = 0;
-       } else {
+       else
                choose_first = 0;
-               start_disk = conf->last_used;
-       }
 
-       for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+       for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
                sector_t dist;
                sector_t first_bad;
                int bad_sectors;
-
-               int disk = start_disk + i;
-               if (disk >= conf->raid_disks)
-                       disk -= conf->raid_disks;
+               unsigned int pending;
+               bool nonrot;
 
                rdev = rcu_dereference(conf->mirrors[disk].rdev);
                if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                } else
                        best_good_sectors = sectors;
 
+               nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+               has_nonrot_disk |= nonrot;
+               pending = atomic_read(&rdev->nr_pending);
                dist = abs(this_sector - conf->mirrors[disk].head_position);
-               if (choose_first
-                   /* Don't change to another disk for sequential reads */
-                   || conf->next_seq_sect == this_sector
-                   || dist == 0
-                   /* If device is idle, use it */
-                   || atomic_read(&rdev->nr_pending) == 0) {
+               if (choose_first) {
                        best_disk = disk;
                        break;
                }
+               /* Don't change to another disk for sequential reads */
+               if (conf->mirrors[disk].next_seq_sect == this_sector
+                   || dist == 0) {
+                       int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+                       struct raid1_info *mirror = &conf->mirrors[disk];
+
+                       best_disk = disk;
+                       /*
+                        * If the buffered sequential IO size exceeds the
+                        * optimal iosize, check whether there is an idle
+                        * disk and, if so, choose it. read_balance may
+                        * already have chosen an idle disk before noticing
+                        * this is a sequential IO on the current disk. That
+                        * doesn't matter: the current disk will go idle and
+                        * be used again once the first disk's IO size
+                        * exceeds the optimal iosize, so the first disk's
+                        * IO size ends up at least the optimal iosize. The
+                        * second disk's IO size might be small, but that is
+                        * not a big deal since by the time the second disk
+                        * starts IO, the first disk is likely still busy.
+                        */
+                       if (nonrot && opt_iosize > 0 &&
+                           mirror->seq_start != MaxSector &&
+                           mirror->next_seq_sect > opt_iosize &&
+                           mirror->next_seq_sect - opt_iosize >=
+                           mirror->seq_start) {
+                               choose_next_idle = 1;
+                               continue;
+                       }
+                       break;
+               }
+               /* If device is idle, use it */
+               if (pending == 0) {
+                       best_disk = disk;
+                       break;
+               }
+
+               if (choose_next_idle)
+                       continue;
+
+               if (min_pending > pending) {
+                       min_pending = pending;
+                       best_pending_disk = disk;
+               }
+
                if (dist < best_dist) {
                        best_dist = dist;
-                       best_disk = disk;
+                       best_dist_disk = disk;
                }
        }
 
+       /*
+        * If all disks are rotational, choose the closest disk. If any disk
+        * is non-rotational, choose the disk with the fewest pending
+        * requests even if that disk is rotational, which may or may not be
+        * optimal for arrays with mixed rotational/non-rotational disks,
+        * depending on the workload.
+        */
+       if (best_disk == -1) {
+               if (has_nonrot_disk)
+                       best_disk = best_pending_disk;
+               else
+                       best_disk = best_dist_disk;
+       }
+
        if (best_disk >= 0) {
                rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
                if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                        goto retry;
                }
                sectors = best_good_sectors;
-               conf->next_seq_sect = this_sector + sectors;
-               conf->last_used = best_disk;
+
+               if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+                       conf->mirrors[best_disk].seq_start = this_sector;
+
+               conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
        }
        rcu_read_unlock();
        *max_sectors = sectors;
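Taken together, the read_balance() hunks above replace the global conf->last_used/conf->next_seq_sect pair with per-mirror sequential tracking (seq_start, next_seq_sect) and a two-tier fallback driven by pending IO and rotational state; the opt_iosize test additionally moves a long sequential stream off a busy non-rotational disk once it has read at least the optimal IO size. A condensed, hypothetical sketch of the core selection policy (struct mirror and pick_disk are stand-ins; choose_first, bad-block handling and the opt_iosize escape are omitted):

```c
#include <limits.h>
#include <stdbool.h>

struct mirror {                     /* stand-in for struct raid1_info */
	long long head_position;
	long long next_seq_sect;    /* sector following our last read */
	unsigned int pending;       /* in-flight requests on this disk */
	bool nonrot;                /* non-rotational (SSD-like) device */
};

static int pick_disk(const struct mirror *m, int ndisks, long long sector)
{
	int best_dist_disk = -1, best_pending_disk = -1;
	long long best_dist = LLONG_MAX;
	unsigned int min_pending = UINT_MAX;
	bool has_nonrot = false;

	for (int d = 0; d < ndisks; d++) {
		long long dist = m[d].head_position >= sector ?
			m[d].head_position - sector :
			sector - m[d].head_position;

		has_nonrot |= m[d].nonrot;
		/* don't change disks for a sequential read */
		if (m[d].next_seq_sect == sector || dist == 0)
			return d;
		/* an idle disk is always a good choice */
		if (m[d].pending == 0)
			return d;
		if (m[d].pending < min_pending) {
			min_pending = m[d].pending;
			best_pending_disk = d;
		}
		if (dist < best_dist) {
			best_dist = dist;
			best_dist_disk = d;
		}
	}
	/* all rotational: shortest seek; any SSD: least loaded */
	return has_nonrot ? best_pending_disk : best_dist_disk;
}
```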
@@ -873,7 +947,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
        struct r1conf *conf = mddev->private;
-       struct mirror_info *mirror;
+       struct raid1_info *mirror;
        struct r1bio *r1_bio;
        struct bio *read_bio;
        int i, disks;
@@ -883,7 +957,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
        struct md_rdev *blocked_rdev;
-       int plugged;
        int first_clone;
        int sectors_handled;
        int max_sectors;
@@ -1034,7 +1107,6 @@ read_again:
         * the bad blocks.  Each set of writes gets its own r1bio
         * with a set of bios attached.
         */
-       plugged = mddev_check_plugged(mddev);
 
        disks = conf->raid_disks * 2;
  retry_write:
@@ -1191,6 +1263,8 @@ read_again:
                bio_list_add(&conf->pending_bio_list, mbio);
                conf->pending_count++;
                spin_unlock_irqrestore(&conf->device_lock, flags);
+               if (!mddev_check_plugged(mddev))
+                       md_wakeup_thread(mddev->thread);
        }
        /* Mustn't call r1_bio_write_done before this next test,
         * as it could result in the bio being freed.
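The two make_request() hunks above move the raid1d wakeup from the tail of the function into the queuing loop: the thread is kicked immediately only when no block plug is active; otherwise the wakeup happens once when the plug is flushed, batching the queued writes. A hedged sketch of that deferred-wakeup pattern (queue, submit and unplug are hypothetical names, not the md plugging API):

```c
#include <stdbool.h>

struct queue {
	bool plugged;           /* a batching window is open */
	int pending;            /* items queued for the worker */
};

static void wake_worker(struct queue *q)
{
	(void)q;                /* e.g. signal the worker's condvar */
}

static void submit(struct queue *q)
{
	q->pending++;
	/* mirrors the !mddev_check_plugged() test above */
	if (!q->plugged)
		wake_worker(q); /* no batch open: kick the worker now */
}

static void unplug(struct queue *q)
{
	q->plugged = false;
	if (q->pending)
		wake_worker(q); /* one wakeup for the whole batch */
}
```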
@@ -1213,9 +1287,6 @@ read_again:
 
        /* In case raid1d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);
-
-       if (do_sync || !bitmap || !plugged)
-               md_wakeup_thread(mddev->thread);
 }
 
 static void status(struct seq_file *seq, struct mddev *mddev)
@@ -1367,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        struct r1conf *conf = mddev->private;
        int err = -EEXIST;
        int mirror = 0;
-       struct mirror_info *p;
+       struct raid1_info *p;
        int first = 0;
        int last = conf->raid_disks - 1;
        struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1436,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
        struct r1conf *conf = mddev->private;
        int err = 0;
        int number = rdev->raid_disk;
-       struct mirror_info *p = conf->mirrors+ number;
+       struct raid1_info *p = conf->mirrors + number;
 
        if (rdev != p->rdev)
                p = conf->mirrors + conf->raid_disks + number;
@@ -1821,8 +1892,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 
        if (atomic_dec_and_test(&r1_bio->remaining)) {
                /* if we're here, all write(s) have completed, so clean up */
-               md_done_sync(mddev, r1_bio->sectors, 1);
-               put_buf(r1_bio);
+               int s = r1_bio->sectors;
+               if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+                   test_bit(R1BIO_WriteError, &r1_bio->state))
+                       reschedule_retry(r1_bio);
+               else {
+                       put_buf(r1_bio);
+                       md_done_sync(mddev, s, 1);
+               }
        }
 }
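In sync_request_write() above, the last completing write now either finishes the resync inline or, when R1BIO_MadeGood/R1BIO_WriteError is set, hands the r1bio back to raid1d, because updating bad-block records must happen in process context. Note that r1_bio->sectors is saved in s and put_buf() runs before md_done_sync(), since the r1bio may be recycled once its buffer is released. A rough stdatomic sketch of the dec-and-test handoff, with hypothetical types:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct r1req {
	atomic_int remaining;   /* outstanding writes */
	bool needs_fixup;       /* analogue of MadeGood/WriteError */
	int sectors;
};

static void hand_to_worker(struct r1req *r)
{
	(void)r;
	puts("deferred to worker for bad-block bookkeeping");
}

static void complete_sync(struct r1req *r)
{
	/* capture the count, release the buffer, then account it */
	int s = r->sectors;

	printf("resync advanced by %d sectors\n", s);
}

static void write_done(struct r1req *r)
{
	if (atomic_fetch_sub(&r->remaining, 1) != 1)
		return;                 /* writes still in flight */
	if (r->needs_fixup)
		hand_to_worker(r);      /* needs process context */
	else
		complete_sync(r);
}

int main(void)
{
	struct r1req r = { .needs_fixup = false, .sectors = 64 };

	atomic_init(&r.remaining, 2);
	write_done(&r);                 /* first completion: no-op */
	write_done(&r);                 /* last completion: finishes */
	return 0;
}
```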
 
@@ -2425,7 +2502,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                /* There is nowhere to write, so all non-sync
                 * drives must be failed - so we are finished
                 */
-               sector_t rv = max_sector - sector_nr;
+               sector_t rv;
+               if (min_bad > 0)
+                       max_sector = sector_nr + min_bad;
+               rv = max_sector - sector_nr;
                *skipped = 1;
                put_buf(r1_bio);
                return rv;
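This hunk is the behaviour named in the subject line: when there is nowhere to write but the bad-block table says only the first min_bad sectors of this range are unreadable, the resync skips just that range instead of aborting to the end of the device. A worked example of the clamp, with illustrative numbers:

```c
typedef unsigned long long sector_t;

static sector_t skip_len(sector_t sector_nr, sector_t max_sector,
			 sector_t min_bad)
{
	if (min_bad > 0)
		max_sector = sector_nr + min_bad;   /* clamp to bad range */
	return max_sector - sector_nr;              /* sectors to skip */
}

/*
 * skip_len(1000, 2048, 8) == 8: only the 8 known-bad sectors are
 * skipped and the resync resumes at sector 1008, where the old code
 * would have skipped all 1048 remaining sectors.
 */
```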
@@ -2488,9 +2568,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
         */
        if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                atomic_set(&r1_bio->remaining, read_targets);
-               for (i = 0; i < conf->raid_disks * 2; i++) {
+               for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
                        bio = r1_bio->bios[i];
                        if (bio->bi_end_io == end_sync_read) {
+                               read_targets--;
                                md_sync_acct(bio->bi_bdev, nr_sectors);
                                generic_make_request(bio);
                        }
@@ -2517,7 +2598,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
        struct r1conf *conf;
        int i;
-       struct mirror_info *disk;
+       struct raid1_info *disk;
        struct md_rdev *rdev;
        int err = -ENOMEM;
 
@@ -2525,7 +2606,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        if (!conf)
                goto abort;
 
-       conf->mirrors = kzalloc(sizeof(struct mirror_info)
+       conf->mirrors = kzalloc(sizeof(struct raid1_info)
                                * mddev->raid_disks * 2,
                                 GFP_KERNEL);
        if (!conf->mirrors)
@@ -2568,6 +2649,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                        mddev->merge_check_needed = 1;
 
                disk->head_position = 0;
+               disk->seq_start = MaxSector;
        }
        conf->raid_disks = mddev->raid_disks;
        conf->mddev = mddev;
@@ -2581,7 +2663,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
        err = -EIO;
-       conf->last_used = -1;
        for (i = 0; i < conf->raid_disks * 2; i++) {
 
                disk = conf->mirrors + i;
@@ -2607,21 +2688,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                        if (disk->rdev &&
                            (disk->rdev->saved_raid_disk < 0))
                                conf->fullsync = 1;
-               } else if (conf->last_used < 0)
-                       /*
-                        * The first working device is used as a
-                        * starting point to read balancing.
-                        */
-                       conf->last_used = i;
+               }
        }
 
-       if (conf->last_used < 0) {
-               printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-                      mdname(mddev));
-               goto abort;
-       }
        err = -ENOMEM;
-       conf->thread = md_register_thread(raid1d, mddev, NULL);
+       conf->thread = md_register_thread(raid1d, mddev, "raid1");
        if (!conf->thread) {
                printk(KERN_ERR
                       "md/raid1:%s: couldn't allocate thread\n",
@@ -2794,7 +2865,7 @@ static int raid1_reshape(struct mddev *mddev)
         */
        mempool_t *newpool, *oldpool;
        struct pool_info *newpoolinfo;
-       struct mirror_info *newmirrors;
+       struct raid1_info *newmirrors;
        struct r1conf *conf = mddev->private;
        int cnt, raid_disks;
        unsigned long flags;
@@ -2837,7 +2908,7 @@ static int raid1_reshape(struct mddev *mddev)
                kfree(newpoolinfo);
                return -ENOMEM;
        }
-       newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+       newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
                             GFP_KERNEL);
        if (!newmirrors) {
                kfree(newpoolinfo);
@@ -2876,7 +2947,6 @@ static int raid1_reshape(struct mddev *mddev)
        conf->raid_disks = mddev->raid_disks = raid_disks;
        mddev->delta_disks = 0;
 
-       conf->last_used = 0; /* just make sure it is in-range */
        lower_barrier(conf);
 
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);