md/raid5: make sure to_read and to_write never go negative.
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / drivers / md / raid5.c
index adda94df5eb2352775e64fb7fae4e88c6e89a98b..ab613efbbeadfc3b458f19c7681c20a6b4051a32 100644 (file)
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                rw = WRITE_FUA;
                        else
                                rw = WRITE;
+                       if (test_bit(R5_Discard, &sh->dev[i].flags))
+                               rw |= REQ_DISCARD;
                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                        rw = READ;
                else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                                        set_bit(R5_WantFUA, &dev->flags);
                                if (wbi->bi_rw & REQ_SYNC)
                                        set_bit(R5_SyncIO, &dev->flags);
-                               tx = async_copy_data(1, wbi, dev->page,
-                                       dev->sector, tx);
+                               if (wbi->bi_rw & REQ_DISCARD)
+                                       set_bit(R5_Discard, &dev->flags);
+                               else
+                                       tx = async_copy_data(1, wbi, dev->page,
+                                               dev->sector, tx);
                                wbi = r5_next_bio(wbi, dev->sector);
                        }
                }
@@ -1187,7 +1192,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
        int i;
-       bool fua = false, sync = false;
+       bool fua = false, sync = false, discard = false;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
@@ -1195,13 +1200,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
        for (i = disks; i--; ) {
                fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
                sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+               discard |= test_bit(R5_Discard, &sh->dev[i].flags);
        }
 
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
 
                if (dev->written || i == pd_idx || i == qd_idx) {
-                       set_bit(R5_UPTODATE, &dev->flags);
+                       if (!discard)
+                               set_bit(R5_UPTODATE, &dev->flags);
                        if (fua)
                                set_bit(R5_WantFUA, &dev->flags);
                        if (sync)
@@ -1237,6 +1244,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
+       for (i = 0; i < sh->disks; i++) {
+               if (pd_idx == i)
+                       continue;
+               if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                       break;
+       }
+       if (i >= sh->disks) {
+               atomic_inc(&sh->count);
+               set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+               ops_complete_reconstruct(sh);
+               return;
+       }
        /* check if prexor is active which means only process blocks
         * that are part of a read-modify-write (written)
         */
@@ -1281,10 +1300,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
        struct async_submit_ctl submit;
        struct page **blocks = percpu->scribble;
-       int count;
+       int count, i;
 
        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+       for (i = 0; i < sh->disks; i++) {
+               if (sh->pd_idx == i || sh->qd_idx == i)
+                       continue;
+               if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                       break;
+       }
+       if (i >= sh->disks) {
+               atomic_inc(&sh->count);
+               set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+               set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+               ops_complete_reconstruct(sh);
+               return;
+       }
+
        count = set_syndrome_sources(blocks, sh);
 
        atomic_inc(&sh->count);
@@ -2403,11 +2436,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
        }
-       spin_unlock_irq(&sh->stripe_lock);
 
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
                (unsigned long long)(*bip)->bi_sector,
                (unsigned long long)sh->sector, dd_idx);
+       spin_unlock_irq(&sh->stripe_lock);
 
        if (conf->mddev->bitmap && firstwrite) {
                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2474,10 +2507,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                bi = sh->dev[i].towrite;
                sh->dev[i].towrite = NULL;
                spin_unlock_irq(&sh->stripe_lock);
-               if (bi) {
-                       s->to_write--;
+               if (bi)
                        bitmap_end = 1;
-               }
 
                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                        wake_up(&conf->wait_for_overlap);
@@ -2519,11 +2550,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
                    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
                      test_bit(R5_ReadError, &sh->dev[i].flags))) {
+                       spin_lock_irq(&sh->stripe_lock);
                        bi = sh->dev[i].toread;
                        sh->dev[i].toread = NULL;
+                       spin_unlock_irq(&sh->stripe_lock);
                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                wake_up(&conf->wait_for_overlap);
-                       if (bi) s->to_read--;
                        while (bi && bi->bi_sector <
                               sh->dev[i].sector + STRIPE_SECTORS) {
                                struct bio *nextbi =
@@ -2736,7 +2768,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                if (sh->dev[i].written) {
                        dev = &sh->dev[i];
                        if (!test_bit(R5_LOCKED, &dev->flags) &&
-                               test_bit(R5_UPTODATE, &dev->flags)) {
+                           (test_bit(R5_UPTODATE, &dev->flags) ||
+                            test_and_clear_bit(R5_Discard, &dev->flags))) {
                                /* We can return any write requests */
                                struct bio *wbi, *wbi2;
                                pr_debug("Return write for disc %d\n", i);
@@ -2770,12 +2803,25 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                   int disks)
 {
        int rmw = 0, rcw = 0, i;
-       if (conf->max_degraded == 2) {
-               /* RAID6 requires 'rcw' in current implementation
-                * Calculate the real rcw later - for now fake it
+       sector_t recovery_cp = conf->mddev->recovery_cp;
+
+       /* RAID6 requires 'rcw' in current implementation.
+        * Otherwise, check whether resync is now happening or should start.
+        * If yes, then the array is dirty (after unclean shutdown or
+        * initial creation), so parity in some stripes might be inconsistent.
+        * In this case, we need to always do reconstruct-write, to ensure
+        * that in case of drive failure or read-error correction, we
+        * generate correct data from the parity.
+        */
+       if (conf->max_degraded == 2 ||
+           (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+               /* Calculate the real rcw later - for now make it
                 * look like rcw is cheaper
                 */
                rcw = 1; rmw = 2;
+               pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
+                        conf->max_degraded, (unsigned long long)recovery_cp,
+                        (unsigned long long)sh->sector);
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
@@ -3454,10 +3500,12 @@ static void handle_stripe(struct stripe_head *sh)
        if (s.written &&
            (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
                             && !test_bit(R5_LOCKED, &pdev->flags)
-                            && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+                            && (test_bit(R5_UPTODATE, &pdev->flags) ||
+                                test_bit(R5_Discard, &pdev->flags))))) &&
            (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
                             && !test_bit(R5_LOCKED, &qdev->flags)
-                            && test_bit(R5_UPTODATE, &qdev->flags)))))
+                            && (test_bit(R5_UPTODATE, &qdev->flags) ||
+                                test_bit(R5_Discard, &qdev->flags))))))
                handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
        /* Now we might consider reading some blocks, either to check/generate
@@ -3484,9 +3532,11 @@ static void handle_stripe(struct stripe_head *sh)
                /* All the 'written' buffers and the parity block are ready to
                 * be written back to disk
                 */
-               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+                      !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
                BUG_ON(sh->qd_idx >= 0 &&
-                      !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+                      !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+                      !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -4067,6 +4117,88 @@ static void release_stripe_plug(struct mddev *mddev,
                release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{      /* Attach discard bio 'bi' to every whole stripe it covers and queue those stripes for handling. */
+       struct r5conf *conf = mddev->private;
+       sector_t logical_sector, last_sector;
+       struct stripe_head *sh;
+       int remaining; /* active-stripe count left on bi after our own count is dropped */
+       int stripe_sectors; /* chunk_sectors times (raid_disks - max_degraded) */
+
+       if (mddev->reshape_position != MaxSector)
+               /* Skip discard while reshape is happening */
+               return; /* NOTE(review): bi is not completed on this path within this function -- confirm who ends it */
+
+       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); /* round the start down to a STRIPE_SECTORS boundary */
+       last_sector = bi->bi_sector + (bi->bi_size>>9); /* end of the request; >>9 presumably converts bytes to 512-byte sectors */
+
+       bi->bi_next = NULL;
+       bi->bi_phys_segments = 1; /* over-loaded to count active stripes; starts at 1, which is dropped at the end of this function */
+
+       stripe_sectors = conf->chunk_sectors *
+               (conf->raid_disks - conf->max_degraded);
+       logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+                                              stripe_sectors); /* start becomes a stripe_sectors-unit index, rounded up */
+       sector_div(last_sector, stripe_sectors); /* presumably divides last_sector in place (rounding down), so partial stripes at either end are left out */
+
+       logical_sector *= conf->chunk_sectors; /* scale both indexes by chunk_sectors -- presumably a per-device sector offset; confirm */
+       last_sector *= conf->chunk_sectors;
+
+       for (; logical_sector < last_sector;
+            logical_sector += STRIPE_SECTORS) { /* one stripe_head per STRIPE_SECTORS step */
+               DEFINE_WAIT(w);
+               int d;
+       again: /* re-entered after sleeping on a conflict */
+               sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+               prepare_to_wait(&conf->wait_for_overlap, &w,
+                               TASK_UNINTERRUPTIBLE); /* queue on wait_for_overlap before looking for conflicts */
+               spin_lock_irq(&sh->stripe_lock);
+               for (d = 0; d < conf->raid_disks; d++) {
+                       if (d == sh->pd_idx || d == sh->qd_idx)
+                               continue; /* pd_idx/qd_idx devices are not checked */
+                       if (sh->dev[d].towrite || sh->dev[d].toread) { /* this device already has a pending bio in the stripe */
+                               set_bit(R5_Overlap, &sh->dev[d].flags); /* mark the conflict before giving the stripe back */
+                               spin_unlock_irq(&sh->stripe_lock);
+                               release_stripe(sh);
+                               schedule(); /* sleep, then retry this stripe from the top */
+                               goto again;
+                       }
+               }
+               finish_wait(&conf->wait_for_overlap, &w); /* no conflict found; stripe_lock is still held here */
+               for (d = 0; d < conf->raid_disks; d++) {
+                       if (d == sh->pd_idx || d == sh->qd_idx)
+                               continue; /* pd_idx/qd_idx devices get no bio attached */
+                       sh->dev[d].towrite = bi; /* the same bio is attached to every remaining device */
+                       set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+                       raid5_inc_bi_active_stripes(bi); /* one count per device the bio is attached to */
+               }
+               spin_unlock_irq(&sh->stripe_lock);
+               if (conf->mddev->bitmap) {
+                       for (d = 0;
+                            d < conf->raid_disks - conf->max_degraded;
+                            d++)
+                               bitmap_startwrite(mddev->bitmap,
+                                                 sh->sector,
+                                                 STRIPE_SECTORS,
+                                                 0); /* called with identical arguments on every iteration */
+                       sh->bm_seq = conf->seq_flush + 1;
+                       set_bit(STRIPE_BIT_DELAY, &sh->state);
+               }
+
+               set_bit(STRIPE_HANDLE, &sh->state);
+               clear_bit(STRIPE_DELAYED, &sh->state);
+               if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) /* bump the counter only if the bit was not already set */
+                       atomic_inc(&conf->preread_active_stripes);
+               release_stripe_plug(mddev, sh);
+       }
+
+       remaining = raid5_dec_bi_active_stripes(bi); /* drop the initial count of 1 set above */
+       if (remaining == 0) { /* no stripe still counts against the bio: finish it here */
+               md_write_end(mddev);
+               bio_endio(bi, 0);
+       }
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
        struct r5conf *conf = mddev->private;
@@ -4089,6 +4221,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
             chunk_aligned_read(mddev,bi))
                return;
 
+       if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+               make_discard_request(mddev, bi);
+               return;
+       }
+
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bi->bi_sector + (bi->bi_size>>9);
        bi->bi_next = NULL;
@@ -4625,8 +4762,9 @@ static int handle_active_stripes(struct r5conf *conf)
  * During the scan, completed stripes are saved for us by the interrupt
  * handler, so that they will not have to wait for our next wakeup.
  */
-static void raid5d(struct mddev *mddev)
+static void raid5d(struct md_thread *thread)
 {
+       struct mddev *mddev = thread->mddev;
        struct r5conf *conf = mddev->private;
        int handled;
        struct blk_plug plug;
@@ -5361,6 +5499,7 @@ static int run(struct mddev *mddev)
 
        if (mddev->queue) {
                int chunk_size;
+               bool discard_supported = true;
                /* read-ahead size must cover two whole stripes, which
                 * is 2 * (datadisks) * chunksize where 'n' is the
                 * number of raid devices
@@ -5380,13 +5519,48 @@ static int run(struct mddev *mddev)
                blk_queue_io_min(mddev->queue, chunk_size);
                blk_queue_io_opt(mddev->queue, chunk_size *
                                 (conf->raid_disks - conf->max_degraded));
+               /*
+                * We can only discard a whole stripe. It doesn't make sense to
+                * discard the data disks but still write the parity disk.
+                */
+               stripe = stripe * PAGE_SIZE;
+               mddev->queue->limits.discard_alignment = stripe;
+               mddev->queue->limits.discard_granularity = stripe;
+               /*
+                * The unaligned part of a discard request is ignored, so we
+                * can't guarantee discard_zeroes_data.
+                */
+               mddev->queue->limits.discard_zeroes_data = 0;
 
                rdev_for_each(rdev, mddev) {
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->new_data_offset << 9);
+                       /*
+                        * discard_zeroes_data is required, otherwise data
+                        * could be lost. Consider a scenario: discard a stripe
+                        * (the stripe could be inconsistent if
+                        * discard_zeroes_data is 0); write one disk of the
+                        * stripe (the stripe could be inconsistent again
+                        * depending on which disks are used to calculate
+                        * parity); the disk is broken; The stripe data of this
+                        * disk is lost.
+                        */
+                       if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+                           !bdev_get_queue(rdev->bdev)->
+                                               limits.discard_zeroes_data)
+                               discard_supported = false;
                }
+
+               if (discard_supported &&
+                  mddev->queue->limits.max_discard_sectors >= stripe &&
+                  mddev->queue->limits.discard_granularity >= stripe)
+                       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+                                               mddev->queue);
+               else
+                       queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+                                               mddev->queue);
        }
 
        return 0;