* We maintain a biased count of active stripes in the bottom 16 bits of
* bi_phys_segments, and a count of processed stripes in the upper 16 bits
*/
-static inline int raid5_bi_phys_segments(struct bio *bio)
+static inline int raid5_bi_processed_stripes(struct bio *bio)
{
- return bio->bi_phys_segments & 0xffff;
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ return (atomic_read(segments) >> 16) & 0xffff;
}
-static inline int raid5_bi_hw_segments(struct bio *bio)
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
- return (bio->bi_phys_segments >> 16) & 0xffff;
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ return atomic_sub_return(1, segments) & 0xffff;
}
-static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
- --bio->bi_phys_segments;
- return raid5_bi_phys_segments(bio);
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ atomic_inc(segments);
}
-static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+ unsigned int cnt)
{
- unsigned short val = raid5_bi_hw_segments(bio);
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ int old, new;
- --val;
- bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
- return val;
+ do {
+ old = atomic_read(segments);
+ new = (old & 0xffff) | (cnt << 16);
+ } while (atomic_cmpxchg(segments, old, new) != old);
}
-static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
- bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ atomic_set(segments, cnt);
}
/* Find first data disk in a raid6 stripe */
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
- if (atomic_dec_and_test(&sh->count)) {
- BUG_ON(!list_empty(&sh->lru));
- BUG_ON(atomic_read(&conf->active_stripes)==0);
- if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- sh->bm_seq - conf->seq_write > 0)
- list_add_tail(&sh->lru, &conf->bitmap_list);
- else {
- clear_bit(STRIPE_DELAYED, &sh->state);
- clear_bit(STRIPE_BIT_DELAY, &sh->state);
- list_add_tail(&sh->lru, &conf->handle_list);
- }
- md_wakeup_thread(conf->mddev->thread);
- } else {
- BUG_ON(stripe_operations_active(sh));
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- if (atomic_dec_return(&conf->preread_active_stripes)
- < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- atomic_dec(&conf->active_stripes);
- if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
- list_add_tail(&sh->lru, &conf->inactive_list);
- wake_up(&conf->wait_for_stripe);
- if (conf->retry_read_aligned)
- md_wakeup_thread(conf->mddev->thread);
- }
+ BUG_ON(!list_empty(&sh->lru));
+ BUG_ON(atomic_read(&conf->active_stripes)==0);
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
+ if (test_bit(STRIPE_DELAYED, &sh->state) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ list_add_tail(&sh->lru, &conf->delayed_list);
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0)
+ list_add_tail(&sh->lru, &conf->bitmap_list);
+ else {
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ clear_bit(STRIPE_BIT_DELAY, &sh->state);
+ list_add_tail(&sh->lru, &conf->handle_list);
+ }
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+ BUG_ON(stripe_operations_active(sh));
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ atomic_dec(&conf->active_stripes);
+ if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+ list_add_tail(&sh->lru, &conf->inactive_list);
+ wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
}
}
}
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+ if (atomic_dec_and_test(&sh->count))
+ do_release_stripe(conf, sh);
+}
+
static void release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- __release_stripe(conf, sh);
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ local_irq_save(flags);
+ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+ do_release_stripe(conf, sh);
+ spin_unlock(&conf->device_lock);
+ }
+ local_irq_restore(flags);
}
static inline void remove_hash(struct stripe_head *sh)
} else {
if (atomic_read(&sh->count)) {
BUG_ON(!list_empty(&sh->lru)
- && !test_bit(STRIPE_EXPANDING, &sh->state));
+ && !test_bit(STRIPE_EXPANDING, &sh->state)
+ && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
rw = WRITE_FUA;
else
rw = WRITE;
+ if (test_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else if (test_and_clear_bit(R5_WantReplace,
else
bi->bi_sector = (sh->sector
+ rdev->data_offset);
+ if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ bi->bi_rw |= REQ_FLUSH;
+
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
{
struct stripe_head *sh = stripe_head_ref;
struct bio *return_bi = NULL;
- struct r5conf *conf = sh->raid_conf;
int i;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
/* clear completed biofills */
- spin_lock_irq(&conf->device_lock);
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_phys_segments(rbi)) {
+ if (!raid5_dec_bi_active_stripes(rbi)) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
}
}
}
- spin_unlock_irq(&conf->device_lock);
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
return_io(return_bi);
static void ops_run_biofill(struct stripe_head *sh)
{
struct dma_async_tx_descriptor *tx = NULL;
- struct r5conf *conf = sh->raid_conf;
struct async_submit_ctl submit;
int i;
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_Wantfill, &dev->flags)) {
struct bio *rbi;
- spin_lock_irq(&conf->device_lock);
+ spin_lock_irq(&sh->stripe_lock);
dev->read = rbi = dev->toread;
dev->toread = NULL;
- spin_unlock_irq(&conf->device_lock);
+ spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
tx = async_copy_data(0, rbi, dev->page,
if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi;
- spin_lock_irq(&sh->raid_conf->device_lock);
+ spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
BUG_ON(dev->written);
wbi = dev->written = chosen;
- spin_unlock_irq(&sh->raid_conf->device_lock);
+ spin_unlock_irq(&sh->stripe_lock);
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ if (wbi->bi_rw & REQ_DISCARD)
+ set_bit(R5_Discard, &dev->flags);
+ else
+ tx = async_copy_data(1, wbi, dev->page,
+ dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
}
}
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
int i;
- bool fua = false, sync = false;
+ bool fua = false, sync = false, discard = false;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+ discard |= test_bit(R5_Discard, &sh->dev[i].flags);
}
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- set_bit(R5_UPTODATE, &dev->flags);
+ if (!discard)
+ set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
if (sync)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
{
struct async_submit_ctl submit;
struct page **blocks = percpu->scribble;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
init_waitqueue_head(&sh->ops.wait_for_ops);
#endif
+ spin_lock_init(&sh->stripe_lock);
+
if (grow_buffers(sh)) {
shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh);
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
- }
+ } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+
if (atomic_read(&rdev->read_errors))
atomic_set(&rdev->read_errors, 0);
} else {
else
retry = 1;
if (retry)
- set_bit(R5_ReadError, &sh->dev[i].flags);
+ if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+ } else
+ set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
else {
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector);
-
- spin_lock_irq(&conf->device_lock);
+ /*
+ * If several bio share a stripe. The bio bi_phys_segments acts as a
+ * reference count to avoid race. The reference count should already be
+ * increased before this function is called (for example, in
+ * make_request()), so other bio sharing this stripe will not free the
+ * stripe. If a stripe is owned by one stripe, the stripe lock will
+ * protect it.
+ */
+ spin_lock_irq(&sh->stripe_lock);
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
- if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+ if (*bip == NULL)
firstwrite = 1;
} else
bip = &sh->dev[dd_idx].toread;
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- bi->bi_phys_segments++;
+ raid5_inc_bi_active_stripes(bi);
if (forwrite) {
/* check if page is covered */
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
- spin_unlock_irq(&conf->device_lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)(*bip)->bi_sector,
(unsigned long long)sh->sector, dd_idx);
+ spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap && firstwrite) {
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
overlap:
set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
- spin_unlock_irq(&conf->device_lock);
+ spin_unlock_irq(&sh->stripe_lock);
return 0;
}
rdev_dec_pending(rdev, conf->mddev);
}
}
- spin_lock_irq(&conf->device_lock);
+ spin_lock_irq(&sh->stripe_lock);
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
- if (bi) {
- s->to_write--;
+ spin_unlock_irq(&sh->stripe_lock);
+ if (bi)
bitmap_end = 1;
- }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (!raid5_dec_bi_phys_segments(bi)) {
+ if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
*return_bi = bi;
}
bi = nextbi;
}
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0, 0);
+ bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (!raid5_dec_bi_phys_segments(bi)) {
+ if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
*return_bi = bi;
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
+ spin_lock_irq(&sh->stripe_lock);
bi = sh->dev[i].toread;
sh->dev[i].toread = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (bi) s->to_read--;
while (bi && bi->bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (!raid5_dec_bi_phys_segments(bi)) {
+ if (!raid5_dec_bi_active_stripes(bi)) {
bi->bi_next = *return_bi;
*return_bi = bi;
}
bi = nextbi;
}
}
- spin_unlock_irq(&conf->device_lock);
if (bitmap_end)
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0, 0);
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags)) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_and_clear_bit(R5_Discard, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
- int bitmap_end = 0;
pr_debug("Return write for disc %d\n", i);
- spin_lock_irq(&conf->device_lock);
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_phys_segments(wbi)) {
+ if (!raid5_dec_bi_active_stripes(wbi)) {
md_write_end(conf->mddev);
wbi->bi_next = *return_bi;
*return_bi = wbi;
}
wbi = wbi2;
}
- if (dev->towrite == NULL)
- bitmap_end = 1;
- spin_unlock_irq(&conf->device_lock);
- if (bitmap_end)
- bitmap_endwrite(conf->mddev->bitmap,
- sh->sector,
- STRIPE_SECTORS,
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
+ 0);
}
}
int disks)
{
int rmw = 0, rcw = 0, i;
- if (conf->max_degraded == 2) {
- /* RAID6 requires 'rcw' in current implementation
- * Calculate the real rcw later - for now fake it
+ sector_t recovery_cp = conf->mddev->recovery_cp;
+
+ /* RAID6 requires 'rcw' in current implementation.
+ * Otherwise, check whether resync is now happening or should start.
+ * If yes, then the array is dirty (after unclean shutdown or
+ * initial creation), so parity in some stripes might be inconsistent.
+ * In this case, we need to always do reconstruct-write, to ensure
+ * that in case of drive failure or read-error correction, we
+ * generate correct data from the parity.
+ */
+ if (conf->max_degraded == 2 ||
+ (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+ /* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
rcw = 1; rmw = 2;
+ pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
+ conf->max_degraded, (unsigned long long)recovery_cp,
+ (unsigned long long)sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
/* Now to look around and see what can be done */
rcu_read_lock();
- spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) {
struct md_rdev *rdev;
sector_t first_bad;
do_recovery = 1;
}
}
- spin_unlock_irq(&conf->device_lock);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced,
* we must be recovering.
if (s.written &&
(s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
&& !test_bit(R5_LOCKED, &pdev->flags)
- && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+ && (test_bit(R5_UPTODATE, &pdev->flags) ||
+ test_bit(R5_Discard, &pdev->flags))))) &&
(s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
- && test_bit(R5_UPTODATE, &qdev->flags)))))
+ && (test_bit(R5_UPTODATE, &qdev->flags) ||
+ test_bit(R5_Discard, &qdev->flags))))))
handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
/* Now we might consider reading some blocks, either to check/generate
/* All the 'written' buffers and the parity block are ready to
* be written back to disk
*/
- BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
BUG_ON(sh->qd_idx >= 0 &&
- !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+ !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_LOCKED, &dev->flags) &&
* this sets the active strip count to 1 and the processed
* strip count to zero (upper 8 bits)
*/
- bi->bi_phys_segments = 1; /* biased count of active stripes */
+ raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
}
return bi;
return sh;
}
+struct raid5_plug_cb {
+ struct blk_plug_cb cb;
+ struct list_head list;
+};
+
+static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
+{
+ struct raid5_plug_cb *cb = container_of(
+ blk_cb, struct raid5_plug_cb, cb);
+ struct stripe_head *sh;
+ struct mddev *mddev = cb->cb.data;
+ struct r5conf *conf = mddev->private;
+
+ if (cb->list.next && !list_empty(&cb->list)) {
+ spin_lock_irq(&conf->device_lock);
+ while (!list_empty(&cb->list)) {
+ sh = list_first_entry(&cb->list, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ /*
+ * avoid race release_stripe_plug() sees
+ * STRIPE_ON_UNPLUG_LIST clear but the stripe
+ * is still in our list
+ */
+ smp_mb__before_clear_bit();
+ clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
+ __release_stripe(conf, sh);
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+ kfree(cb);
+}
+
+static void release_stripe_plug(struct mddev *mddev,
+ struct stripe_head *sh)
+{
+ struct blk_plug_cb *blk_cb = blk_check_plugged(
+ raid5_unplug, mddev,
+ sizeof(struct raid5_plug_cb));
+ struct raid5_plug_cb *cb;
+
+ if (!blk_cb) {
+ release_stripe(sh);
+ return;
+ }
+
+ cb = container_of(blk_cb, struct raid5_plug_cb, cb);
+
+ if (cb->list.next == NULL)
+ INIT_LIST_HEAD(&cb->list);
+
+ if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
+ list_add_tail(&sh->lru, &cb->list);
+ else
+ release_stripe(sh);
+}
+
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+ int remaining;
+ int stripe_sectors;
+
+ if (mddev->reshape_position != MaxSector)
+ /* Skip discard while reshape is happening */
+ return;
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= conf->chunk_sectors;
+ last_sector *= conf->chunk_sectors;
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS) {
+ DEFINE_WAIT(w);
+ int d;
+ again:
+ sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock_irq(&sh->stripe_lock);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ if (sh->dev[d].towrite || sh->dev[d].toread) {
+ set_bit(R5_Overlap, &sh->dev[d].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ sh->dev[d].towrite = bi;
+ set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+ raid5_inc_bi_active_stripes(bi);
+ }
+ spin_unlock_irq(&sh->stripe_lock);
+ if (conf->mddev->bitmap) {
+ for (d = 0;
+ d < conf->raid_disks - conf->max_degraded;
+ d++)
+ bitmap_startwrite(mddev->bitmap,
+ sh->sector,
+ STRIPE_SECTORS,
+ 0);
+ sh->bm_seq = conf->seq_flush + 1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe_plug(mddev, sh);
+ }
+
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+ md_write_end(mddev);
+ bio_endio(bi, 0);
+ }
+}
+
static void make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
chunk_aligned_read(mddev,bi))
return;
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ make_discard_request(mddev, bi);
+ return;
+ }
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
finish_wait(&conf->wait_for_overlap, &w);
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
- if ((bi->bi_rw & REQ_SYNC) &&
+ if ((bi->bi_rw & REQ_NOIDLE) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
- mddev_check_plugged(mddev);
- release_stripe(sh);
+ release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
clear_bit(BIO_UPTODATE, &bi->bi_flags);
}
}
- spin_lock_irq(&conf->device_lock);
- remaining = raid5_dec_bi_phys_segments(bi);
- spin_unlock_irq(&conf->device_lock);
+ remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
if ( rw == WRITE )
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < raid5_bi_hw_segments(raid_bio))
+ if (scnt < raid5_bi_processed_stripes(raid_bio))
/* already done this stripe */
continue;
if (!sh) {
/* failed to get a stripe - must wait */
- raid5_set_bi_hw_segments(raid_bio, scnt);
+ raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
release_stripe(sh);
- raid5_set_bi_hw_segments(raid_bio, scnt);
+ raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
}
+ set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
handle_stripe(sh);
release_stripe(sh);
handled++;
}
- spin_lock_irq(&conf->device_lock);
- remaining = raid5_dec_bi_phys_segments(raid_bio);
- spin_unlock_irq(&conf->device_lock);
+ remaining = raid5_dec_bi_active_stripes(raid_bio);
if (remaining == 0)
bio_endio(raid_bio, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
return handled;
}
+#define MAX_STRIPE_BATCH 8
+static int handle_active_stripes(struct r5conf *conf)
+{
+ struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
+ int i, batch_size = 0;
+
+ while (batch_size < MAX_STRIPE_BATCH &&
+ (sh = __get_priority_stripe(conf)) != NULL)
+ batch[batch_size++] = sh;
+
+ if (batch_size == 0)
+ return batch_size;
+ spin_unlock_irq(&conf->device_lock);
+
+ for (i = 0; i < batch_size; i++)
+ handle_stripe(batch[i]);
+
+ cond_resched();
+
+ spin_lock_irq(&conf->device_lock);
+ for (i = 0; i < batch_size; i++)
+ __release_stripe(conf, batch[i]);
+ return batch_size;
+}
/*
* This is our raid5 kernel thread.
* During the scan, completed stripes are saved for us by the interrupt
* handler, so that they will not have to wait for our next wakeup.
*/
-static void raid5d(struct mddev *mddev)
+static void raid5d(struct md_thread *thread)
{
- struct stripe_head *sh;
+ struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
int handled;
struct blk_plug plug;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
+ int batch_size;
- if (atomic_read(&mddev->plug_cnt) == 0 &&
+ if (
!list_empty(&conf->bitmap_list)) {
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf);
}
- if (atomic_read(&mddev->plug_cnt) == 0)
- raid5_activate_delayed(conf);
+ raid5_activate_delayed(conf);
while ((bio = remove_bio_from_retry(conf))) {
int ok;
handled++;
}
- sh = __get_priority_stripe(conf);
-
- if (!sh)
+ batch_size = handle_active_stripes(conf);
+ if (!batch_size)
break;
- spin_unlock_irq(&conf->device_lock);
-
- handled++;
- handle_stripe(sh);
- release_stripe(sh);
- cond_resched();
+ handled += batch_size;
- if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+ spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
-
- spin_lock_irq(&conf->device_lock);
+ spin_lock_irq(&conf->device_lock);
+ }
}
pr_debug("%d stripes handled\n", handled);
if (mddev->queue) {
int chunk_size;
+ bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * unaligned part of discard request will be ignored, so can't
+ * guarantee discard_zerors_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->
+ limits.discard_zeroes_data)
+ discard_supported = false;
}
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
}
return 0;