X-Git-Url: https://git.stricted.de/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid5.c;h=ab613efbbeadfc3b458f19c7681c20a6b4051a32;hb=1ed850f356a0a422013846b5291acff08815008b;hp=04348d76bb30fa8831964ea980ec2df912a45f92;hpb=2e3ee613480563a6d5c01b57d342e65cc58c06df;p=GitHub%2Fmt8127%2Fandroid_kernel_alcatel_ttab.git diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 04348d76bb30..ab613efbbead 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits */ -static inline int raid5_bi_phys_segments(struct bio *bio) +static inline int raid5_bi_processed_stripes(struct bio *bio) { - return bio->bi_phys_segments & 0xffff; + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + return (atomic_read(segments) >> 16) & 0xffff; } -static inline int raid5_bi_hw_segments(struct bio *bio) +static inline int raid5_dec_bi_active_stripes(struct bio *bio) { - return (bio->bi_phys_segments >> 16) & 0xffff; + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + return atomic_sub_return(1, segments) & 0xffff; } -static inline int raid5_dec_bi_phys_segments(struct bio *bio) +static inline void raid5_inc_bi_active_stripes(struct bio *bio) { - --bio->bi_phys_segments; - return raid5_bi_phys_segments(bio); + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + atomic_inc(segments); } -static inline int raid5_dec_bi_hw_segments(struct bio *bio) +static inline void raid5_set_bi_processed_stripes(struct bio *bio, + unsigned int cnt) { - unsigned short val = raid5_bi_hw_segments(bio); + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + int old, new; - --val; - bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); - return val; + do { + old = atomic_read(segments); + new = (old & 0xffff) | (cnt << 16); + } while (atomic_cmpxchg(segments, old, new) != old); } -static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) +static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) { - bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + atomic_set(segments, cnt); } /* Find first data disk in a raid6 stripe */ @@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) { - if (atomic_dec_and_test(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); - BUG_ON(atomic_read(&conf->active_stripes)==0); - if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - sh->bm_seq - conf->seq_write > 0) - list_add_tail(&sh->lru, &conf->bitmap_list); - else { - clear_bit(STRIPE_DELAYED, &sh->state); - clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); - } - md_wakeup_thread(conf->mddev->thread); - } else { - BUG_ON(stripe_operations_active(sh)); - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - if (atomic_dec_return(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } + BUG_ON(!list_empty(&sh->lru)); + BUG_ON(atomic_read(&conf->active_stripes)==0); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_DELAYED, &sh->state); + clear_bit(STRIPE_BIT_DELAY, &sh->state); + list_add_tail(&sh->lru, &conf->handle_list); + } + md_wakeup_thread(conf->mddev->thread); + } else { + BUG_ON(stripe_operations_active(sh)); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + atomic_dec(&conf->active_stripes); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + list_add_tail(&sh->lru, &conf->inactive_list); + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); } } } +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + if (atomic_dec_and_test(&sh->count)) + do_release_stripe(conf, sh); +} + static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + local_irq_save(flags); + if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { + do_release_stripe(conf, sh); + spin_unlock(&conf->device_lock); + } + local_irq_restore(flags); } static inline void remove_hash(struct stripe_head *sh) @@ -471,7 +484,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } else { if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) - && !test_bit(STRIPE_EXPANDING, &sh->state)); + && !test_bit(STRIPE_EXPANDING, &sh->state) + && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -533,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; + if (test_bit(R5_Discard, &sh->dev[i].flags)) + rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else if (test_and_clear_bit(R5_WantReplace, @@ -640,6 +656,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else bi->bi_sector = (sh->sector + rdev->data_offset); + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + bi->bi_rw |= REQ_FLUSH; + bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_idx = 0; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -749,14 +768,12 @@ static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; struct bio *return_bi = NULL; - struct r5conf *conf = sh->raid_conf; int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* clear completed biofills */ - spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -774,7 +791,7 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_phys_segments(rbi)) { + if (!raid5_dec_bi_active_stripes(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -782,7 +799,6 @@ static void ops_complete_biofill(void *stripe_head_ref) } } } - spin_unlock_irq(&conf->device_lock); clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -794,7 +810,6 @@ static void ops_complete_biofill(void *stripe_head_ref) static void ops_run_biofill(struct stripe_head *sh) { struct dma_async_tx_descriptor *tx = NULL; - struct r5conf *conf = sh->raid_conf; struct async_submit_ctl submit; int i; @@ -805,10 +820,10 @@ static void ops_run_biofill(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi; - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&sh->stripe_lock); dev->read = rbi = dev->toread; dev->toread = NULL; - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { tx = async_copy_data(0, rbi, dev->page, @@ -1144,12 +1159,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; - spin_lock_irq(&sh->raid_conf->device_lock); + spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; BUG_ON(dev->written); wbi = dev->written = chosen; - spin_unlock_irq(&sh->raid_conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1157,8 +1172,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + if (wbi->bi_rw & REQ_DISCARD) + set_bit(R5_Discard, &dev->flags); + else + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); } } @@ -1174,7 +1192,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false, sync = false; + bool fua = false, sync = false, discard = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1182,13 +1200,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - set_bit(R5_UPTODATE, &dev->flags); + if (!discard) + set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -1224,6 +1244,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (pd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ @@ -1268,10 +1300,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks = percpu->scribble; - int count; + int count, i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + count = set_syndrome_sources(blocks, sh); atomic_inc(&sh->count); @@ -1454,6 +1500,8 @@ static int grow_one_stripe(struct r5conf *conf) init_waitqueue_head(&sh->ops.wait_for_ops); #endif + spin_lock_init(&sh->stripe_lock); + if (grow_buffers(sh)) { shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); @@ -1739,7 +1787,9 @@ static void raid5_end_read_request(struct bio * bi, int error) atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - } + } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + if (atomic_read(&rdev->read_errors)) atomic_set(&rdev->read_errors, 0); } else { @@ -1784,7 +1834,11 @@ static void raid5_end_read_request(struct bio * bi, int error) else retry = 1; if (retry) - set_bit(R5_ReadError, &sh->dev[i].flags); + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { + set_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + } else + set_bit(R5_ReadNoMerge, &sh->dev[i].flags); else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); @@ -2340,11 +2394,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); - - spin_lock_irq(&conf->device_lock); + /* + * If several bio share a stripe. The bio bi_phys_segments acts as a + * reference count to avoid race. The reference count should already be + * increased before this function is called (for example, in + * make_request()), so other bio sharing this stripe will not free the + * stripe. If a stripe is owned by one stripe, the stripe lock will + * protect it. + */ + spin_lock_irq(&sh->stripe_lock); if (forwrite) { bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL && sh->dev[dd_idx].written == NULL) + if (*bip == NULL) firstwrite = 1; } else bip = &sh->dev[dd_idx].toread; @@ -2360,7 +2421,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (*bip) bi->bi_next = *bip; *bip = bi; - bi->bi_phys_segments++; + raid5_inc_bi_active_stripes(bi); if (forwrite) { /* check if page is covered */ @@ -2375,11 +2436,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&conf->device_lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, (unsigned long long)sh->sector, dd_idx); + spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { bitmap_startwrite(conf->mddev->bitmap, sh->sector, @@ -2391,7 +2452,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); return 0; } @@ -2441,14 +2502,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, rdev_dec_pending(rdev, conf->mddev); } } - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&sh->stripe_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) { - s->to_write--; + spin_unlock_irq(&sh->stripe_lock); + if (bi) bitmap_end = 1; - } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -2457,13 +2517,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { + if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); + bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; @@ -2472,7 +2536,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { + if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -2486,24 +2550,24 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); bi = sh->dev[i].toread; sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - if (bi) s->to_read--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { + if (!raid5_dec_bi_active_stripes(bi)) { bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } } - spin_unlock_irq(&conf->device_lock); if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0, 0); @@ -2704,33 +2768,27 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_and_clear_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; - int bitmap_end = 0; pr_debug("Return write for disc %d\n", i); - spin_lock_irq(&conf->device_lock); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (!raid5_dec_bi_phys_segments(wbi)) { + if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; } wbi = wbi2; } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, - sh->sector, - STRIPE_SECTORS, + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + 0); } } @@ -2745,12 +2803,25 @@ static void handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* RAID6 requires 'rcw' in current implementation. + * Otherwise, check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. + */ + if (conf->max_degraded == 2 || + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->max_degraded, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -3182,7 +3253,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Now to look around and see what can be done */ rcu_read_lock(); - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { struct md_rdev *rdev; sector_t first_bad; @@ -3328,7 +3398,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) do_recovery = 1; } } - spin_unlock_irq(&conf->device_lock); if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, * we must be recovering. @@ -3431,10 +3500,12 @@ static void handle_stripe(struct stripe_head *sh) if (s.written && (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); /* Now we might consider reading some blocks, either to check/generate @@ -3461,9 +3532,11 @@ static void handle_stripe(struct stripe_head *sh) /* All the 'written' buffers and the parity block are ready to * be written back to disk */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && @@ -3791,7 +3864,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * this sets the active strip count to 1 and the processed * strip count to zero (upper 8 bits) */ - bi->bi_phys_segments = 1; /* biased count of active stripes */ + raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ } return bi; @@ -3988,6 +4061,144 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf) return sh; } +struct raid5_plug_cb { + struct blk_plug_cb cb; + struct list_head list; +}; + +static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) +{ + struct raid5_plug_cb *cb = container_of( + blk_cb, struct raid5_plug_cb, cb); + struct stripe_head *sh; + struct mddev *mddev = cb->cb.data; + struct r5conf *conf = mddev->private; + + if (cb->list.next && !list_empty(&cb->list)) { + spin_lock_irq(&conf->device_lock); + while (!list_empty(&cb->list)) { + sh = list_first_entry(&cb->list, struct stripe_head, lru); + list_del_init(&sh->lru); + /* + * avoid race release_stripe_plug() sees + * STRIPE_ON_UNPLUG_LIST clear but the stripe + * is still in our list + */ + smp_mb__before_clear_bit(); + clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); + __release_stripe(conf, sh); + } + spin_unlock_irq(&conf->device_lock); + } + kfree(cb); +} + +static void release_stripe_plug(struct mddev *mddev, + struct stripe_head *sh) +{ + struct blk_plug_cb *blk_cb = blk_check_plugged( + raid5_unplug, mddev, + sizeof(struct raid5_plug_cb)); + struct raid5_plug_cb *cb; + + if (!blk_cb) { + release_stripe(sh); + return; + } + + cb = container_of(blk_cb, struct raid5_plug_cb, cb); + + if (cb->list.next == NULL) + INIT_LIST_HEAD(&cb->list); + + if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) + list_add_tail(&sh->lru, &cb->list); + else + release_stripe(sh); +} + +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int remaining; + int stripe_sectors; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int d; + again: + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + schedule(); + goto again; + } + } + finish_wait(&conf->wait_for_overlap, &w); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + raid5_inc_bi_active_stripes(bi); + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + remaining = raid5_dec_bi_active_stripes(bi); + if (remaining == 0) { + md_write_end(mddev); + bio_endio(bi, 0); + } +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4010,6 +4221,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) chunk_aligned_read(mddev,bi)) return; + if (unlikely(bi->bi_rw & REQ_DISCARD)) { + make_discard_request(mddev, bi); + return; + } + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -4113,11 +4329,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - if ((bi->bi_rw & REQ_SYNC) && + if ((bi->bi_rw & REQ_NOIDLE) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - mddev_check_plugged(mddev); - release_stripe(sh); + release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -4126,9 +4341,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) } } - spin_lock_irq(&conf->device_lock); - remaining = raid5_dec_bi_phys_segments(bi); - spin_unlock_irq(&conf->device_lock); + remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { if ( rw == WRITE ) @@ -4484,7 +4697,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid5_bi_hw_segments(raid_bio)) + if (scnt < raid5_bi_processed_stripes(raid_bio)) /* already done this stripe */ continue; @@ -4492,25 +4705,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) if (!sh) { /* failed to get a stripe - must wait */ - raid5_set_bi_hw_segments(raid_bio, scnt); + raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { release_stripe(sh); - raid5_set_bi_hw_segments(raid_bio, scnt); + raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } + set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); handle_stripe(sh); release_stripe(sh); handled++; } - spin_lock_irq(&conf->device_lock); - remaining = raid5_dec_bi_phys_segments(raid_bio); - spin_unlock_irq(&conf->device_lock); + remaining = raid5_dec_bi_active_stripes(raid_bio); if (remaining == 0) bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) @@ -4518,6 +4730,30 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } +#define MAX_STRIPE_BATCH 8 +static int handle_active_stripes(struct r5conf *conf) +{ + struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; + int i, batch_size = 0; + + while (batch_size < MAX_STRIPE_BATCH && + (sh = __get_priority_stripe(conf)) != NULL) + batch[batch_size++] = sh; + + if (batch_size == 0) + return batch_size; + spin_unlock_irq(&conf->device_lock); + + for (i = 0; i < batch_size; i++) + handle_stripe(batch[i]); + + cond_resched(); + + spin_lock_irq(&conf->device_lock); + for (i = 0; i < batch_size; i++) + __release_stripe(conf, batch[i]); + return batch_size; +} /* * This is our raid5 kernel thread. @@ -4526,9 +4762,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(struct mddev *mddev) +static void raid5d(struct md_thread *thread) { - struct stripe_head *sh; + struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; @@ -4542,8 +4778,9 @@ static void raid5d(struct mddev *mddev) spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; + int batch_size; - if (atomic_read(&mddev->plug_cnt) == 0 && + if ( !list_empty(&conf->bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; @@ -4553,8 +4790,7 @@ static void raid5d(struct mddev *mddev) conf->seq_write = conf->seq_flush; activate_bit_delay(conf); } - if (atomic_read(&mddev->plug_cnt) == 0) - raid5_activate_delayed(conf); + raid5_activate_delayed(conf); while ((bio = remove_bio_from_retry(conf))) { int ok; @@ -4566,21 +4802,16 @@ static void raid5d(struct mddev *mddev) handled++; } - sh = __get_priority_stripe(conf); - - if (!sh) + batch_size = handle_active_stripes(conf); + if (!batch_size) break; - spin_unlock_irq(&conf->device_lock); - - handled++; - handle_stripe(sh); - release_stripe(sh); - cond_resched(); + handled += batch_size; - if (mddev->flags & ~(1<flags & ~(1<device_lock); md_check_recovery(mddev); - - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&conf->device_lock); + } } pr_debug("%d stripes handled\n", handled); @@ -5268,6 +5499,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { int chunk_size; + bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -5287,13 +5519,48 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + mddev->queue->limits.discard_alignment = stripe; + mddev->queue->limits.discard_granularity = stripe; + /* + * unaligned part of discard request will be ignored, so can't + * guarantee discard_zerors_data + */ + mddev->queue->limits.discard_zeroes_data = 0; rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); + /* + * discard_zeroes_data is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + */ + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || + !bdev_get_queue(rdev->bdev)-> + limits.discard_zeroes_data) + discard_supported = false; } + + if (discard_supported && + mddev->queue->limits.max_discard_sectors >= stripe && + mddev->queue->limits.discard_granularity >= stripe) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } return 0;