md/raid5: make sure to_read and to_write never go negative.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 04348d76bb30fa8831964ea980ec2df912a45f92..ab613efbbeadfc3b458f19c7681c20a6b4051a32 100644
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
  */
-static inline int raid5_bi_phys_segments(struct bio *bio)
+static inline int raid5_bi_processed_stripes(struct bio *bio)
 {
-       return bio->bi_phys_segments & 0xffff;
+       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+       return (atomic_read(segments) >> 16) & 0xffff;
 }
 
-static inline int raid5_bi_hw_segments(struct bio *bio)
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
 {
-       return (bio->bi_phys_segments >> 16) & 0xffff;
+       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+       return atomic_sub_return(1, segments) & 0xffff;
 }
 
-static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
 {
-       --bio->bi_phys_segments;
-       return raid5_bi_phys_segments(bio);
+       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+       atomic_inc(segments);
 }
 
-static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+       unsigned int cnt)
 {
-       unsigned short val = raid5_bi_hw_segments(bio);
+       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+       int old, new;
 
-       --val;
-       bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
-       return val;
+       do {
+               old = atomic_read(segments);
+               new = (old & 0xffff) | (cnt << 16);
+       } while (atomic_cmpxchg(segments, old, new) != old);
 }
 
-static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
 {
-       bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
+       atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+       atomic_set(segments, cnt);
 }
 
 /* Find first data disk in a raid6 stripe */
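
The helpers above pack two 16-bit counters into the 32-bit bi_phys_segments
word: the biased count of active stripes lives in the low half, the count of
processed stripes in the high half. Plain atomic_inc()/atomic_sub_return()
suffice for the low half, but rewriting the high half has to preserve a
concurrently changing low half, which is what the cmpxchg loop is for. A
minimal user-space sketch of the same scheme, assuming C11 atomics in place
of the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdint.h>

    /* active stripes in the low 16 bits, processed stripes in the high 16 */

    static inline int dec_active_stripes(_Atomic uint32_t *ctr)
    {
            /* like atomic_sub_return(1, segments) & 0xffff above */
            return (atomic_fetch_sub(ctr, 1) - 1) & 0xffff;
    }

    static inline void set_processed_stripes(_Atomic uint32_t *ctr,
                                             unsigned int cnt)
    {
            uint32_t old = atomic_load(ctr);
            uint32_t new;

            /* CAS loop: replace only the high half; the live low half may
             * be changed underneath us by inc/dec of the active count */
            do {
                    new = (old & 0xffffu) | (cnt << 16);
            } while (!atomic_compare_exchange_weak(ctr, &old, new));
    }

One word thus carries both the reference count that keeps the bio alive and
the retry cursor that retry_aligned_read() consults.
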
@@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
               test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
-       if (atomic_dec_and_test(&sh->count)) {
-               BUG_ON(!list_empty(&sh->lru));
-               BUG_ON(atomic_read(&conf->active_stripes)==0);
-               if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                       if (test_bit(STRIPE_DELAYED, &sh->state) &&
-                           !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-                               list_add_tail(&sh->lru, &conf->delayed_list);
-                       else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                  sh->bm_seq - conf->seq_write > 0)
-                               list_add_tail(&sh->lru, &conf->bitmap_list);
-                       else {
-                               clear_bit(STRIPE_DELAYED, &sh->state);
-                               clear_bit(STRIPE_BIT_DELAY, &sh->state);
-                               list_add_tail(&sh->lru, &conf->handle_list);
-                       }
-                       md_wakeup_thread(conf->mddev->thread);
-               } else {
-                       BUG_ON(stripe_operations_active(sh));
-                       if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-                               if (atomic_dec_return(&conf->preread_active_stripes)
-                                   < IO_THRESHOLD)
-                                       md_wakeup_thread(conf->mddev->thread);
-                       atomic_dec(&conf->active_stripes);
-                       if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-                               list_add_tail(&sh->lru, &conf->inactive_list);
-                               wake_up(&conf->wait_for_stripe);
-                               if (conf->retry_read_aligned)
-                                       md_wakeup_thread(conf->mddev->thread);
-                       }
+       BUG_ON(!list_empty(&sh->lru));
+       BUG_ON(atomic_read(&conf->active_stripes)==0);
+       if (test_bit(STRIPE_HANDLE, &sh->state)) {
+               if (test_bit(STRIPE_DELAYED, &sh->state) &&
+                   !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                       list_add_tail(&sh->lru, &conf->delayed_list);
+               else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                          sh->bm_seq - conf->seq_write > 0)
+                       list_add_tail(&sh->lru, &conf->bitmap_list);
+               else {
+                       clear_bit(STRIPE_DELAYED, &sh->state);
+                       clear_bit(STRIPE_BIT_DELAY, &sh->state);
+                       list_add_tail(&sh->lru, &conf->handle_list);
+               }
+               md_wakeup_thread(conf->mddev->thread);
+       } else {
+               BUG_ON(stripe_operations_active(sh));
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                       if (atomic_dec_return(&conf->preread_active_stripes)
+                           < IO_THRESHOLD)
+                               md_wakeup_thread(conf->mddev->thread);
+               atomic_dec(&conf->active_stripes);
+               if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+                       list_add_tail(&sh->lru, &conf->inactive_list);
+                       wake_up(&conf->wait_for_stripe);
+                       if (conf->retry_read_aligned)
+                               md_wakeup_thread(conf->mddev->thread);
                }
        }
 }
 
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+       if (atomic_dec_and_test(&sh->count))
+               do_release_stripe(conf, sh);
+}
+
 static void release_stripe(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
        unsigned long flags;
 
-       spin_lock_irqsave(&conf->device_lock, flags);
-       __release_stripe(conf, sh);
-       spin_unlock_irqrestore(&conf->device_lock, flags);
+       local_irq_save(flags);
+       if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+               do_release_stripe(conf, sh);
+               spin_unlock(&conf->device_lock);
+       }
+       local_irq_restore(flags);
 }
 
 static inline void remove_hash(struct stripe_head *sh)
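
release_stripe() now stays off device_lock in the common case:
atomic_dec_and_lock() decrements the count and takes the spinlock only when
the count actually reaches zero. The pattern in isolation, as a sketch
(struct object and do_release() are hypothetical placeholders):

    struct object {
            atomic_t count;
            /* ... payload ... */
    };

    static void put_object(struct object *obj, spinlock_t *lock)
    {
            unsigned long flags;

            local_irq_save(flags);
            /* returns nonzero, with the lock held, only on the final
             * reference drop; otherwise the lock is never touched */
            if (atomic_dec_and_lock(&obj->count, lock)) {
                    do_release(obj);        /* runs under the lock */
                    spin_unlock(lock);
            }
            local_irq_restore(flags);
    }
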
@@ -471,7 +484,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                } else {
                        if (atomic_read(&sh->count)) {
                                BUG_ON(!list_empty(&sh->lru)
-                                   && !test_bit(STRIPE_EXPANDING, &sh->state));
+                                   && !test_bit(STRIPE_EXPANDING, &sh->state)
+                                   && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
@@ -533,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                rw = WRITE_FUA;
                        else
                                rw = WRITE;
+                       if (test_bit(R5_Discard, &sh->dev[i].flags))
+                               rw |= REQ_DISCARD;
                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                        rw = READ;
                else if (test_and_clear_bit(R5_WantReplace,
@@ -640,6 +656,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        else
                                bi->bi_sector = (sh->sector
                                                 + rdev->data_offset);
+                       /* a flush request is never merged by the block
+                        * layer, so REQ_FLUSH here keeps this retried
+                        * read from being merged with its neighbours */
+                       if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+                               bi->bi_rw |= REQ_FLUSH;
+
                        bi->bi_flags = 1 << BIO_UPTODATE;
                        bi->bi_idx = 0;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -749,14 +768,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
 {
        struct stripe_head *sh = stripe_head_ref;
        struct bio *return_bi = NULL;
-       struct r5conf *conf = sh->raid_conf;
        int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
        /* clear completed biofills */
-       spin_lock_irq(&conf->device_lock);
        for (i = sh->disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
 
@@ -774,7 +791,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        while (rbi && rbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
-                               if (!raid5_dec_bi_phys_segments(rbi)) {
+                               if (!raid5_dec_bi_active_stripes(rbi)) {
                                        rbi->bi_next = return_bi;
                                        return_bi = rbi;
                                }
@@ -782,7 +799,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        }
                }
        }
-       spin_unlock_irq(&conf->device_lock);
        clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
        return_io(return_bi);
@@ -794,7 +810,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 static void ops_run_biofill(struct stripe_head *sh)
 {
        struct dma_async_tx_descriptor *tx = NULL;
-       struct r5conf *conf = sh->raid_conf;
        struct async_submit_ctl submit;
        int i;
 
@@ -805,10 +820,10 @@ static void ops_run_biofill(struct stripe_head *sh)
                struct r5dev *dev = &sh->dev[i];
                if (test_bit(R5_Wantfill, &dev->flags)) {
                        struct bio *rbi;
-                       spin_lock_irq(&conf->device_lock);
+                       spin_lock_irq(&sh->stripe_lock);
                        dev->read = rbi = dev->toread;
                        dev->toread = NULL;
-                       spin_unlock_irq(&conf->device_lock);
+                       spin_unlock_irq(&sh->stripe_lock);
                        while (rbi && rbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                tx = async_copy_data(0, rbi, dev->page,
@@ -1144,12 +1159,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
                        struct bio *wbi;
 
-                       spin_lock_irq(&sh->raid_conf->device_lock);
+                       spin_lock_irq(&sh->stripe_lock);
                        chosen = dev->towrite;
                        dev->towrite = NULL;
                        BUG_ON(dev->written);
                        wbi = dev->written = chosen;
-                       spin_unlock_irq(&sh->raid_conf->device_lock);
+                       spin_unlock_irq(&sh->stripe_lock);
 
                        while (wbi && wbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
@@ -1157,8 +1172,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                                        set_bit(R5_WantFUA, &dev->flags);
                                if (wbi->bi_rw & REQ_SYNC)
                                        set_bit(R5_SyncIO, &dev->flags);
-                               tx = async_copy_data(1, wbi, dev->page,
-                                       dev->sector, tx);
+                               if (wbi->bi_rw & REQ_DISCARD)
+                                       set_bit(R5_Discard, &dev->flags);
+                               else
+                                       tx = async_copy_data(1, wbi, dev->page,
+                                               dev->sector, tx);
                                wbi = r5_next_bio(wbi, dev->sector);
                        }
                }
@@ -1174,7 +1192,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
        int i;
-       bool fua = false, sync = false;
+       bool fua = false, sync = false, discard = false;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
@@ -1182,13 +1200,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
        for (i = disks; i--; ) {
                fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
                sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+               discard |= test_bit(R5_Discard, &sh->dev[i].flags);
        }
 
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
 
                if (dev->written || i == pd_idx || i == qd_idx) {
-                       set_bit(R5_UPTODATE, &dev->flags);
+                       if (!discard)
+                               set_bit(R5_UPTODATE, &dev->flags);
                        if (fua)
                                set_bit(R5_WantFUA, &dev->flags);
                        if (sync)
@@ -1224,6 +1244,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
+       for (i = 0; i < sh->disks; i++) {
+               if (pd_idx == i)
+                       continue;
+               if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                       break;
+       }
+       if (i >= sh->disks) {
+               atomic_inc(&sh->count);
+               set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+               ops_complete_reconstruct(sh);
+               return;
+       }
        /* check if prexor is active which means only process blocks
         * that are part of a read-modify-write (written)
         */
@@ -1268,10 +1300,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
        struct async_submit_ctl submit;
        struct page **blocks = percpu->scribble;
-       int count;
+       int count, i;
 
        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+       for (i = 0; i < sh->disks; i++) {
+               if (sh->pd_idx == i || sh->qd_idx == i)
+                       continue;
+               if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                       break;
+       }
+       if (i >= sh->disks) {
+               atomic_inc(&sh->count);
+               set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+               set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+               ops_complete_reconstruct(sh);
+               return;
+       }
+
        count = set_syndrome_sources(blocks, sh);
 
        atomic_inc(&sh->count);
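
Both reconstruct paths open with the same check: if every data block in the
stripe carries R5_Discard, there is no parity to compute, so the parity
block(s) are flagged R5_Discard as well and completion runs immediately.
The check as a standalone predicate, a sketch only (this helper is not part
of the patch):

    static bool all_data_discarded(struct stripe_head *sh)
    {
            int i;

            for (i = 0; i < sh->disks; i++) {
                    if (i == sh->pd_idx || i == sh->qd_idx)
                            continue;
                    /* one data block not discarded => compute parity
                     * normally */
                    if (!test_bit(R5_Discard, &sh->dev[i].flags))
                            return false;
            }
            return true;    /* whole stripe discarded: skip xor/syndrome */
    }
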
@@ -1454,6 +1500,8 @@ static int grow_one_stripe(struct r5conf *conf)
        init_waitqueue_head(&sh->ops.wait_for_ops);
        #endif
 
+       spin_lock_init(&sh->stripe_lock);
+
        if (grow_buffers(sh)) {
                shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
@@ -1739,7 +1787,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
                        atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
                        clear_bit(R5_ReadError, &sh->dev[i].flags);
                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
-               }
+               } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+                       clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+
                if (atomic_read(&rdev->read_errors))
                        atomic_set(&rdev->read_errors, 0);
        } else {
@@ -1784,7 +1834,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
                else
                        retry = 1;
                if (retry)
-                       set_bit(R5_ReadError, &sh->dev[i].flags);
+                       if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+                               set_bit(R5_ReadError, &sh->dev[i].flags);
+                               clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+                       } else
+                               set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
                else {
                        clear_bit(R5_ReadError, &sh->dev[i].flags);
                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
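
The retry path escalates in two steps: a failed merged read is first retried
with merging disabled, since the error may belong to a neighbouring bio that
was merged into the same request, and only a second failure sets R5_ReadError
for the normal recovery path. The transition as a sketch over the dev->flags
word (hypothetical helper):

    static void escalate_read_retry(unsigned long *flags)
    {
            if (test_bit(R5_ReadNoMerge, flags)) {
                    /* the unmerged retry failed too: this block really
                     * is bad, use the read-error recovery path */
                    set_bit(R5_ReadError, flags);
                    clear_bit(R5_ReadNoMerge, flags);
            } else {
                    /* first failure: retry once without merging */
                    set_bit(R5_ReadNoMerge, flags);
            }
    }
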
@@ -2340,11 +2394,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                (unsigned long long)bi->bi_sector,
                (unsigned long long)sh->sector);
 
-
-       spin_lock_irq(&conf->device_lock);
+       /*
+        * If several bios share a stripe, each bio's bi_phys_segments acts
+        * as a reference count to avoid races. The reference count should
+        * already have been increased before this function is called (for
+        * example, in make_request()), so other bios sharing this stripe
+        * cannot free it. If the stripe is used by only one bio, the
+        * per-stripe lock protects it.
+        */
+       spin_lock_irq(&sh->stripe_lock);
        if (forwrite) {
                bip = &sh->dev[dd_idx].towrite;
-               if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+               if (*bip == NULL)
                        firstwrite = 1;
        } else
                bip = &sh->dev[dd_idx].toread;
@@ -2360,7 +2421,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        if (*bip)
                bi->bi_next = *bip;
        *bip = bi;
-       bi->bi_phys_segments++;
+       raid5_inc_bi_active_stripes(bi);
 
        if (forwrite) {
                /* check if page is covered */
@@ -2375,11 +2436,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
        }
-       spin_unlock_irq(&conf->device_lock);
 
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
                (unsigned long long)(*bip)->bi_sector,
                (unsigned long long)sh->sector, dd_idx);
+       spin_unlock_irq(&sh->stripe_lock);
 
        if (conf->mddev->bitmap && firstwrite) {
                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2391,7 +2452,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
  overlap:
        set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-       spin_unlock_irq(&conf->device_lock);
+       spin_unlock_irq(&sh->stripe_lock);
        return 0;
 }
 
@@ -2441,14 +2502,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                                rdev_dec_pending(rdev, conf->mddev);
                        }
                }
-               spin_lock_irq(&conf->device_lock);
+               spin_lock_irq(&sh->stripe_lock);
                /* fail all writes first */
                bi = sh->dev[i].towrite;
                sh->dev[i].towrite = NULL;
-               if (bi) {
-                       s->to_write--;
+               spin_unlock_irq(&sh->stripe_lock);
+               if (bi)
                        bitmap_end = 1;
-               }
 
                if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                        wake_up(&conf->wait_for_overlap);
@@ -2457,13 +2517,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                        sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                       if (!raid5_dec_bi_phys_segments(bi)) {
+                       if (!raid5_dec_bi_active_stripes(bi)) {
                                md_write_end(conf->mddev);
                                bi->bi_next = *return_bi;
                                *return_bi = bi;
                        }
                        bi = nextbi;
                }
+               if (bitmap_end)
+                       bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                               STRIPE_SECTORS, 0, 0);
+               bitmap_end = 0;
                /* and fail all 'written' */
                bi = sh->dev[i].written;
                sh->dev[i].written = NULL;
@@ -2472,7 +2536,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                       sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                       if (!raid5_dec_bi_phys_segments(bi)) {
+                       if (!raid5_dec_bi_active_stripes(bi)) {
                                md_write_end(conf->mddev);
                                bi->bi_next = *return_bi;
                                *return_bi = bi;
@@ -2486,24 +2550,24 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
                if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
                    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
                      test_bit(R5_ReadError, &sh->dev[i].flags))) {
+                       spin_lock_irq(&sh->stripe_lock);
                        bi = sh->dev[i].toread;
                        sh->dev[i].toread = NULL;
+                       spin_unlock_irq(&sh->stripe_lock);
                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                wake_up(&conf->wait_for_overlap);
-                       if (bi) s->to_read--;
                        while (bi && bi->bi_sector <
                               sh->dev[i].sector + STRIPE_SECTORS) {
                                struct bio *nextbi =
                                        r5_next_bio(bi, sh->dev[i].sector);
                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                               if (!raid5_dec_bi_phys_segments(bi)) {
+                               if (!raid5_dec_bi_active_stripes(bi)) {
                                        bi->bi_next = *return_bi;
                                        *return_bi = bi;
                                }
                                bi = nextbi;
                        }
                }
-               spin_unlock_irq(&conf->device_lock);
                if (bitmap_end)
                        bitmap_endwrite(conf->mddev->bitmap, sh->sector,
                                        STRIPE_SECTORS, 0, 0);
@@ -2704,33 +2768,27 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                if (sh->dev[i].written) {
                        dev = &sh->dev[i];
                        if (!test_bit(R5_LOCKED, &dev->flags) &&
-                               test_bit(R5_UPTODATE, &dev->flags)) {
+                           (test_bit(R5_UPTODATE, &dev->flags) ||
+                            test_and_clear_bit(R5_Discard, &dev->flags))) {
                                /* We can return any write requests */
                                struct bio *wbi, *wbi2;
-                               int bitmap_end = 0;
                                pr_debug("Return write for disc %d\n", i);
-                               spin_lock_irq(&conf->device_lock);
                                wbi = dev->written;
                                dev->written = NULL;
                                while (wbi && wbi->bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
-                                       if (!raid5_dec_bi_phys_segments(wbi)) {
+                                       if (!raid5_dec_bi_active_stripes(wbi)) {
                                                md_write_end(conf->mddev);
                                                wbi->bi_next = *return_bi;
                                                *return_bi = wbi;
                                        }
                                        wbi = wbi2;
                                }
-                               if (dev->towrite == NULL)
-                                       bitmap_end = 1;
-                               spin_unlock_irq(&conf->device_lock);
-                               if (bitmap_end)
-                                       bitmap_endwrite(conf->mddev->bitmap,
-                                                       sh->sector,
-                                                       STRIPE_SECTORS,
+                               bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                                               STRIPE_SECTORS,
                                         !test_bit(STRIPE_DEGRADED, &sh->state),
-                                                       0);
+                                               0);
                        }
                }
 
@@ -2745,12 +2803,25 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                   int disks)
 {
        int rmw = 0, rcw = 0, i;
-       if (conf->max_degraded == 2) {
-               /* RAID6 requires 'rcw' in current implementation
-                * Calculate the real rcw later - for now fake it
+       sector_t recovery_cp = conf->mddev->recovery_cp;
+
+       /* RAID6 requires 'rcw' in current implementation.
+        * Otherwise, check whether resync is now happening or should start.
+        * If yes, then the array is dirty (after unclean shutdown or
+        * initial creation), so parity in some stripes might be inconsistent.
+        * In this case, we need to always do reconstruct-write, to ensure
+        * that in case of drive failure or read-error correction, we
+        * generate correct data from the parity.
+        */
+       if (conf->max_degraded == 2 ||
+           (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+               /* Calculate the real rcw later - for now make it
                 * look like rcw is cheaper
                 */
                rcw = 1; rmw = 2;
+               pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
+                        conf->max_degraded, (unsigned long long)recovery_cp,
+                        (unsigned long long)sh->sector);
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
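
When RCW is not forced, the loop that follows weighs the two strategies by
the number of reads each would need. A simplified sketch of that cost model
(hypothetical helper; the real loop further distinguishes R5_Insync and
R5_Wantcompute devices):

    /* rmw: read each block we are about to modify, plus parity, unless
     *      already available.
     * rcw: read each block we are NOT fully overwriting, except parity,
     *      so the whole stripe can be reconstructed. */
    static void count_rmw_rcw(struct stripe_head *sh, int disks,
                              int *rmw, int *rcw)
    {
            int i;

            *rmw = *rcw = 0;
            for (i = disks; i--; ) {
                    struct r5dev *dev = &sh->dev[i];
                    bool have = test_bit(R5_UPTODATE, &dev->flags) ||
                                test_bit(R5_LOCKED, &dev->flags);

                    if ((dev->towrite || i == sh->pd_idx) && !have)
                            (*rmw)++;
                    if (!test_bit(R5_OVERWRITE, &dev->flags) &&
                        i != sh->pd_idx && !have)
                            (*rcw)++;
            }
    }
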
@@ -3182,7 +3253,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
        /* Now to look around and see what can be done */
        rcu_read_lock();
-       spin_lock_irq(&conf->device_lock);
        for (i=disks; i--; ) {
                struct md_rdev *rdev;
                sector_t first_bad;
@@ -3328,7 +3398,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                                do_recovery = 1;
                }
        }
-       spin_unlock_irq(&conf->device_lock);
        if (test_bit(STRIPE_SYNCING, &sh->state)) {
                /* If there is a failed device being replaced,
                 *     we must be recovering.
@@ -3431,10 +3500,12 @@ static void handle_stripe(struct stripe_head *sh)
        if (s.written &&
            (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
                             && !test_bit(R5_LOCKED, &pdev->flags)
-                            && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+                            && (test_bit(R5_UPTODATE, &pdev->flags) ||
+                                test_bit(R5_Discard, &pdev->flags))))) &&
            (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
                             && !test_bit(R5_LOCKED, &qdev->flags)
-                            && test_bit(R5_UPTODATE, &qdev->flags)))))
+                            && (test_bit(R5_UPTODATE, &qdev->flags) ||
+                                test_bit(R5_Discard, &qdev->flags))))))
                handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
        /* Now we might consider reading some blocks, either to check/generate
@@ -3461,9 +3532,11 @@ static void handle_stripe(struct stripe_head *sh)
                /* All the 'written' buffers and the parity block are ready to
                 * be written back to disk
                 */
-               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+                      !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
                BUG_ON(sh->qd_idx >= 0 &&
-                      !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+                      !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+                      !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -3791,7 +3864,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
                 * this sets the active stripe count to 1 and the processed
                 * stripe count to zero (upper 16 bits)
                 */
-               bi->bi_phys_segments = 1; /* biased count of active stripes */
+               raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
        }
 
        return bi;
@@ -3988,6 +4061,144 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
        return sh;
 }
 
+struct raid5_plug_cb {
+       struct blk_plug_cb      cb;
+       struct list_head        list;
+};
+
+static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
+{
+       struct raid5_plug_cb *cb = container_of(
+               blk_cb, struct raid5_plug_cb, cb);
+       struct stripe_head *sh;
+       struct mddev *mddev = cb->cb.data;
+       struct r5conf *conf = mddev->private;
+
+       if (cb->list.next && !list_empty(&cb->list)) {
+               spin_lock_irq(&conf->device_lock);
+               while (!list_empty(&cb->list)) {
+                       sh = list_first_entry(&cb->list, struct stripe_head, lru);
+                       list_del_init(&sh->lru);
+                       /*
+                        * avoid the race where release_stripe_plug() sees
+                        * STRIPE_ON_UNPLUG_LIST cleared while the stripe
+                        * is still on our list
+                        */
+                       smp_mb__before_clear_bit();
+                       clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
+                       __release_stripe(conf, sh);
+               }
+               spin_unlock_irq(&conf->device_lock);
+       }
+       kfree(cb);
+}
+
+static void release_stripe_plug(struct mddev *mddev,
+                               struct stripe_head *sh)
+{
+       struct blk_plug_cb *blk_cb = blk_check_plugged(
+               raid5_unplug, mddev,
+               sizeof(struct raid5_plug_cb));
+       struct raid5_plug_cb *cb;
+
+       if (!blk_cb) {
+               release_stripe(sh);
+               return;
+       }
+
+       cb = container_of(blk_cb, struct raid5_plug_cb, cb);
+
+       if (cb->list.next == NULL)
+               INIT_LIST_HEAD(&cb->list);
+
+       if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
+               list_add_tail(&sh->lru, &cb->list);
+       else
+               release_stripe(sh);
+}
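
release_stripe_plug() parks stripes on a per-task list hung off the current
blk_plug; raid5_unplug() later moves the whole batch back under a single
device_lock acquisition. Roughly how a submitter ends up driving this, as a
sketch against the 3.x block API (bio setup and error handling omitted):

    static void submit_batch(struct bio **bios, int n)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);
            for (i = 0; i < n; i++)
                    /* make_request() calls release_stripe_plug(), which
                     * attaches raid5_unplug() to this plug via
                     * blk_check_plugged() and queues stripes on cb->list */
                    submit_bio(WRITE, bios[i]);
            blk_finish_plug(&plug); /* fires raid5_unplug(cb, false);
                                       a schedule() would fire it with
                                       from_schedule == true */
    }
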
+
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+       struct r5conf *conf = mddev->private;
+       sector_t logical_sector, last_sector;
+       struct stripe_head *sh;
+       int remaining;
+       int stripe_sectors;
+
+       if (mddev->reshape_position != MaxSector)
+               /* Skip discard while reshape is happening */
+               return;
+
+       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+       bi->bi_next = NULL;
+       bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+       stripe_sectors = conf->chunk_sectors *
+               (conf->raid_disks - conf->max_degraded);
+       logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+                                              stripe_sectors);
+       sector_div(last_sector, stripe_sectors);
+
+       logical_sector *= conf->chunk_sectors;
+       last_sector *= conf->chunk_sectors;
+
+       for (; logical_sector < last_sector;
+            logical_sector += STRIPE_SECTORS) {
+               DEFINE_WAIT(w);
+               int d;
+       again:
+               sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+               prepare_to_wait(&conf->wait_for_overlap, &w,
+                               TASK_UNINTERRUPTIBLE);
+               spin_lock_irq(&sh->stripe_lock);
+               for (d = 0; d < conf->raid_disks; d++) {
+                       if (d == sh->pd_idx || d == sh->qd_idx)
+                               continue;
+                       if (sh->dev[d].towrite || sh->dev[d].toread) {
+                               set_bit(R5_Overlap, &sh->dev[d].flags);
+                               spin_unlock_irq(&sh->stripe_lock);
+                               release_stripe(sh);
+                               schedule();
+                               goto again;
+                       }
+               }
+               finish_wait(&conf->wait_for_overlap, &w);
+               for (d = 0; d < conf->raid_disks; d++) {
+                       if (d == sh->pd_idx || d == sh->qd_idx)
+                               continue;
+                       sh->dev[d].towrite = bi;
+                       set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+                       raid5_inc_bi_active_stripes(bi);
+               }
+               spin_unlock_irq(&sh->stripe_lock);
+               if (conf->mddev->bitmap) {
+                       for (d = 0;
+                            d < conf->raid_disks - conf->max_degraded;
+                            d++)
+                               bitmap_startwrite(mddev->bitmap,
+                                                 sh->sector,
+                                                 STRIPE_SECTORS,
+                                                 0);
+                       sh->bm_seq = conf->seq_flush + 1;
+                       set_bit(STRIPE_BIT_DELAY, &sh->state);
+               }
+
+               set_bit(STRIPE_HANDLE, &sh->state);
+               clear_bit(STRIPE_DELAYED, &sh->state);
+               if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                       atomic_inc(&conf->preread_active_stripes);
+               release_stripe_plug(mddev, sh);
+       }
+
+       remaining = raid5_dec_bi_active_stripes(bi);
+       if (remaining == 0) {
+               md_write_end(mddev);
+               bio_endio(bi, 0);
+       }
+}
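
Only whole stripes can be discarded, so the range is rounded inward: the
head up and the tail down to stripe boundaries, with the clipped remainders
silently dropped (hence discard_zeroes_data = 0 for the array). The rounding
in array-logical sectors, as a sketch with hypothetical geometry; the code
above then switches to per-device addressing by multiplying stripe numbers
by chunk_sectors rather than stripe_sectors:

    #include <stdint.h>

    /* e.g. chunk_sectors = 128, data_disks = 3 => stripe_sectors = 384;
     * a discard of [500, 2000) yields first = 768, last = 1920, and the
     * partial head [500, 768) and tail [1920, 2000) are ignored */
    static void discard_stripe_range(uint64_t start, uint64_t end,
                                     unsigned int chunk_sectors,
                                     int data_disks,
                                     uint64_t *first, uint64_t *last)
    {
            uint64_t stripe_sectors = (uint64_t)chunk_sectors * data_disks;

            /* round the head up, the tail down, to full stripes */
            *first = ((start + stripe_sectors - 1) / stripe_sectors) *
                     stripe_sectors;
            *last = (end / stripe_sectors) * stripe_sectors;
    }
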
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
        struct r5conf *conf = mddev->private;
@@ -4010,6 +4221,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
             chunk_aligned_read(mddev,bi))
                return;
 
+       if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+               make_discard_request(mddev, bi);
+               return;
+       }
+
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bi->bi_sector + (bi->bi_size>>9);
        bi->bi_next = NULL;
@@ -4113,11 +4329,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                        finish_wait(&conf->wait_for_overlap, &w);
                        set_bit(STRIPE_HANDLE, &sh->state);
                        clear_bit(STRIPE_DELAYED, &sh->state);
-                       if ((bi->bi_rw & REQ_SYNC) &&
+                       if ((bi->bi_rw & REQ_NOIDLE) &&
                            !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
-                       mddev_check_plugged(mddev);
-                       release_stripe(sh);
+                       release_stripe_plug(mddev, sh);
                } else {
                        /* cannot get stripe for read-ahead, just give-up */
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -4126,9 +4341,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                }
        }
 
-       spin_lock_irq(&conf->device_lock);
-       remaining = raid5_dec_bi_phys_segments(bi);
-       spin_unlock_irq(&conf->device_lock);
+       remaining = raid5_dec_bi_active_stripes(bi);
        if (remaining == 0) {
 
                if ( rw == WRITE )
@@ -4484,7 +4697,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
                     sector += STRIPE_SECTORS,
                     scnt++) {
 
-               if (scnt < raid5_bi_hw_segments(raid_bio))
+               if (scnt < raid5_bi_processed_stripes(raid_bio))
                        /* already done this stripe */
                        continue;
 
@@ -4492,25 +4705,24 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
                if (!sh) {
                        /* failed to get a stripe - must wait */
-                       raid5_set_bi_hw_segments(raid_bio, scnt);
+                       raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
                        return handled;
                }
 
                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
                        release_stripe(sh);
-                       raid5_set_bi_hw_segments(raid_bio, scnt);
+                       raid5_set_bi_processed_stripes(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
                        return handled;
                }
 
+               set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
                handle_stripe(sh);
                release_stripe(sh);
                handled++;
        }
-       spin_lock_irq(&conf->device_lock);
-       remaining = raid5_dec_bi_phys_segments(raid_bio);
-       spin_unlock_irq(&conf->device_lock);
+       remaining = raid5_dec_bi_active_stripes(raid_bio);
        if (remaining == 0)
                bio_endio(raid_bio, 0);
        if (atomic_dec_and_test(&conf->active_aligned_reads))
@@ -4518,6 +4730,30 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
        return handled;
 }
 
+#define MAX_STRIPE_BATCH 8
+static int handle_active_stripes(struct r5conf *conf)
+{
+       struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
+       int i, batch_size = 0;
+
+       while (batch_size < MAX_STRIPE_BATCH &&
+                       (sh = __get_priority_stripe(conf)) != NULL)
+               batch[batch_size++] = sh;
+
+       if (batch_size == 0)
+               return batch_size;
+       spin_unlock_irq(&conf->device_lock);
+
+       for (i = 0; i < batch_size; i++)
+               handle_stripe(batch[i]);
+
+       cond_resched();
+
+       spin_lock_irq(&conf->device_lock);
+       for (i = 0; i < batch_size; i++)
+               __release_stripe(conf, batch[i]);
+       return batch_size;
+}
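
handle_active_stripes() amortizes device_lock traffic: raid5d calls it with
the lock held, up to MAX_STRIPE_BATCH stripes are pulled off under the lock,
handled with the lock dropped, then all released in one relock. The generic
shape of the pattern, as a sketch (grab_batch(), process(), release_locked()
and struct item are hypothetical):

    static void drain_work(spinlock_t *lock, struct pending *q)
    {
            struct item *batch[MAX_BATCH];
            int i, n;

            spin_lock_irq(lock);
            while ((n = grab_batch(q, batch, MAX_BATCH)) != 0) {
                    spin_unlock_irq(lock);          /* heavy work unlocked */
                    for (i = 0; i < n; i++)
                            process(batch[i]);
                    cond_resched();
                    spin_lock_irq(lock);            /* cheap releases batched */
                    for (i = 0; i < n; i++)
                            release_locked(batch[i]);
            }
            spin_unlock_irq(lock);
    }
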
 
 /*
  * This is our raid5 kernel thread.
@@ -4526,9 +4762,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
  * During the scan, completed stripes are saved for us by the interrupt
  * handler, so that they will not have to wait for our next wakeup.
  */
-static void raid5d(struct mddev *mddev)
+static void raid5d(struct md_thread *thread)
 {
-       struct stripe_head *sh;
+       struct mddev *mddev = thread->mddev;
        struct r5conf *conf = mddev->private;
        int handled;
        struct blk_plug plug;
@@ -4542,8 +4778,9 @@ static void raid5d(struct mddev *mddev)
        spin_lock_irq(&conf->device_lock);
        while (1) {
                struct bio *bio;
+               int batch_size;
 
-               if (atomic_read(&mddev->plug_cnt) == 0 &&
-                   !list_empty(&conf->bitmap_list)) {
+               if (!list_empty(&conf->bitmap_list)) {
                        /* Now is a good time to flush some bitmap updates */
                        conf->seq_flush++;
@@ -4553,8 +4790,7 @@ static void raid5d(struct mddev *mddev)
                        conf->seq_write = conf->seq_flush;
                        activate_bit_delay(conf);
                }
-               if (atomic_read(&mddev->plug_cnt) == 0)
-                       raid5_activate_delayed(conf);
+               raid5_activate_delayed(conf);
 
                while ((bio = remove_bio_from_retry(conf))) {
                        int ok;
@@ -4566,21 +4802,16 @@ static void raid5d(struct mddev *mddev)
                        handled++;
                }
 
-               sh = __get_priority_stripe(conf);
-
-               if (!sh)
+               batch_size = handle_active_stripes(conf);
+               if (!batch_size)
                        break;
-               spin_unlock_irq(&conf->device_lock);
-               
-               handled++;
-               handle_stripe(sh);
-               release_stripe(sh);
-               cond_resched();
+               handled += batch_size;
 
-               if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+               if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+                       spin_unlock_irq(&conf->device_lock);
                        md_check_recovery(mddev);
-
-               spin_lock_irq(&conf->device_lock);
+                       spin_lock_irq(&conf->device_lock);
+               }
        }
        pr_debug("%d stripes handled\n", handled);
 
@@ -5268,6 +5499,7 @@ static int run(struct mddev *mddev)
 
        if (mddev->queue) {
                int chunk_size;
+               bool discard_supported = true;
                /* read-ahead size must cover two whole stripes, which
                 * is 2 * (datadisks) * chunksize, where datadisks is
                 * the number of raid devices minus the parity disks
@@ -5287,13 +5519,48 @@ static int run(struct mddev *mddev)
                blk_queue_io_min(mddev->queue, chunk_size);
                blk_queue_io_opt(mddev->queue, chunk_size *
                                 (conf->raid_disks - conf->max_degraded));
+               /*
+                * We can only discard a whole stripe: it makes no sense to
+                * discard the data disks but still write the parity disk
+                */
+               stripe = stripe * PAGE_SIZE;
+               mddev->queue->limits.discard_alignment = stripe;
+               mddev->queue->limits.discard_granularity = stripe;
+               /*
+                * the unaligned part of a discard request is ignored, so we
+                * can't guarantee discard_zeroes_data
+                */
+               mddev->queue->limits.discard_zeroes_data = 0;
 
                rdev_for_each(rdev, mddev) {
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->new_data_offset << 9);
+                       /*
+                        * discard_zeroes_data is required, otherwise data
+                        * could be lost. Consider a scenario: discard a stripe
+                        * (the stripe could be inconsistent if
+                        * discard_zeroes_data is 0); write to one disk of the
+                        * stripe (the stripe could become inconsistent again,
+                        * depending on which disks are used to calculate
+                        * parity); then that disk fails, and the stripe data
+                        * on it is lost.
+                        */
+                       if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+                           !bdev_get_queue(rdev->bdev)->
+                                               limits.discard_zeroes_data)
+                               discard_supported = false;
                }
+
+               if (discard_supported &&
+                  mddev->queue->limits.max_discard_sectors >= stripe &&
+                  mddev->queue->limits.discard_granularity >= stripe)
+                       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+                                               mddev->queue);
+               else
+                       queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+                                               mddev->queue);
        }
 
        return 0;
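
A worked example of the discard limits configured above, assuming a
hypothetical 4-disk RAID5 with a 512 KiB chunk:

    /* data disks        = raid_disks - max_degraded = 4 - 1 = 3
     * stripe (bytes)    = 3 * 512 KiB = 1536 KiB
     * discard_alignment = discard_granularity = 1536 KiB
     * Only a discard covering a whole 1.5 MiB data stripe ever reaches
     * the member devices; smaller requests are clipped away entirely by
     * make_discard_request(). */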