Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted
authorMiao Xie <miaox@cn.fujitsu.com>
Thu, 23 Oct 2014 06:42:50 +0000 (14:42 +0800)
committerMiao Xie <miaox@cn.fujitsu.com>
Wed, 3 Dec 2014 02:18:45 +0000 (10:18 +0800)
This patch implement the RAID5/6 common data repair function, the
implementation is similar to the scrub on the other RAID such as
RAID1, the differentia is that we don't read the data from the
mirror, we use the data repair function of RAID5/6.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
fs/btrfs/raid56.c
fs/btrfs/raid56.h
fs/btrfs/scrub.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index c54b0e64c5906ca9eb682624d96dc40d53bc9328..95053a9034749aa85d8a7480ab6671650204906e 100644 (file)
  */
 #define RBIO_CACHE_READY_BIT   3
 
+/*
+ * bbio and raid_map is managed by the caller, so we shouldn't free
+ * them here. And besides that, all rbios with this flag should not
+ * be cached, because we need raid_map to check the rbios' stripe
+ * is the same or not, but it is very likely that the caller has
+ * free raid_map, so don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT 4
+
 #define RBIO_CACHE_SIZE 1024
 
 struct btrfs_raid_bio {
@@ -799,6 +808,21 @@ done_nolock:
                remove_rbio_from_cache(rbio);
 }
 
+static inline void
+__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
+{
+       if (need) {
+               kfree(raid_map);
+               kfree(bbio);
+       }
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+       __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
+                       !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
+}
+
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 {
        int i;
@@ -817,8 +841,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
                        rbio->stripe_pages[i] = NULL;
                }
        }
-       kfree(rbio->raid_map);
-       kfree(rbio->bbio);
+
+       free_bbio_and_raid_map(rbio);
+
        kfree(rbio);
 }
 
@@ -933,11 +958,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 
        rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
                        GFP_NOFS);
-       if (!rbio) {
-               kfree(raid_map);
-               kfree(bbio);
+       if (!rbio)
                return ERR_PTR(-ENOMEM);
-       }
 
        bio_list_init(&rbio->bio_list);
        INIT_LIST_HEAD(&rbio->plug_list);
@@ -1692,8 +1714,10 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
        struct blk_plug_cb *cb;
 
        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-       if (IS_ERR(rbio))
+       if (IS_ERR(rbio)) {
+               __free_bbio_and_raid_map(bbio, raid_map, 1);
                return PTR_ERR(rbio);
+       }
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_iter.bi_size;
 
@@ -1888,7 +1912,8 @@ cleanup:
 cleanup_io:
 
        if (rbio->read_rebuild) {
-               if (err == 0)
+               if (err == 0 &&
+                   !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
                        cache_rbio_pages(rbio);
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2038,15 +2063,19 @@ cleanup:
  */
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                          struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num)
+                         u64 stripe_len, int mirror_num, int hold_bbio)
 {
        struct btrfs_raid_bio *rbio;
        int ret;
 
        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-       if (IS_ERR(rbio))
+       if (IS_ERR(rbio)) {
+               __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
                return PTR_ERR(rbio);
+       }
 
+       if (hold_bbio)
+               set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
        rbio->read_rebuild = 1;
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_iter.bi_size;
@@ -2054,8 +2083,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                BUG();
-               kfree(raid_map);
-               kfree(bbio);
+               __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
                kfree(rbio);
                return -EIO;
        }
index ea5d73bfdfbe4c6c6486f4bd2eb4b18370c39564..b310e8c830d1ed595e7c5966e82c776cf442ad1f 100644 (file)
@@ -41,7 +41,7 @@ static inline int nr_data_stripes(struct map_lookup *map)
 
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                                 struct btrfs_bio *bbio, u64 *raid_map,
-                                u64 stripe_len, int mirror_num);
+                                u64 stripe_len, int mirror_num, int hold_bbio);
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                               struct btrfs_bio *bbio, u64 *raid_map,
                               u64 stripe_len);
index efa08311382725c6770f3899033778ba8e3129ac..ca4b9eb8b5daa4f0d8ca50b209c26bf62cb58482 100644 (file)
@@ -63,6 +63,13 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_PAGES_PER_BLOCK      16      /* 64k per node/leaf/sector */
 
+struct scrub_recover {
+       atomic_t                refs;
+       struct btrfs_bio        *bbio;
+       u64                     *raid_map;
+       u64                     map_length;
+};
+
 struct scrub_page {
        struct scrub_block      *sblock;
        struct page             *page;
@@ -79,6 +86,8 @@ struct scrub_page {
                unsigned int    io_error:1;
        };
        u8                      csum[BTRFS_CSUM_SIZE];
+
+       struct scrub_recover    *recover;
 };
 
 struct scrub_bio {
@@ -196,7 +205,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock, int is_metadata,
                                int have_csum, u8 *csum, u64 generation,
-                               u16 csum_size);
+                               u16 csum_size, int retry_failed_mirror);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                                         struct scrub_block *sblock,
                                         int is_metadata, int have_csum,
@@ -790,6 +799,20 @@ out:
        scrub_pending_trans_workers_dec(sctx);
 }
 
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+       atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+       if (atomic_dec_and_test(&recover->refs)) {
+               kfree(recover->bbio);
+               kfree(recover->raid_map);
+               kfree(recover);
+       }
+}
+
 /*
  * scrub_handle_errored_block gets called when either verification of the
  * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +929,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
        /* build and submit the bios for the failed mirror, check checksums */
        scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-                           csum, generation, sctx->csum_size);
+                           csum, generation, sctx->csum_size, 1);
 
        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
            sblock_bad->no_io_error_seen) {
@@ -1019,7 +1042,7 @@ nodatasum_case:
                /* build and submit the bios, check checksums */
                scrub_recheck_block(fs_info, sblock_other, is_metadata,
                                    have_csum, csum, generation,
-                                   sctx->csum_size);
+                                   sctx->csum_size, 0);
 
                if (!sblock_other->header_error &&
                    !sblock_other->checksum_error &&
@@ -1169,7 +1192,7 @@ nodatasum_case:
                         */
                        scrub_recheck_block(fs_info, sblock_bad,
                                            is_metadata, have_csum, csum,
-                                           generation, sctx->csum_size);
+                                           generation, sctx->csum_size, 1);
                        if (!sblock_bad->header_error &&
                            !sblock_bad->checksum_error &&
                            sblock_bad->no_io_error_seen)
@@ -1201,11 +1224,18 @@ out:
                     mirror_index++) {
                        struct scrub_block *sblock = sblocks_for_recheck +
                                                     mirror_index;
+                       struct scrub_recover *recover;
                        int page_index;
 
                        for (page_index = 0; page_index < sblock->page_count;
                             page_index++) {
                                sblock->pagev[page_index]->sblock = NULL;
+                               recover = sblock->pagev[page_index]->recover;
+                               if (recover) {
+                                       scrub_put_recover(recover);
+                                       sblock->pagev[page_index]->recover =
+                                                                       NULL;
+                               }
                                scrub_page_put(sblock->pagev[page_index]);
                        }
                }
@@ -1215,14 +1245,63 @@ out:
        return 0;
 }
 
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+{
+       if (raid_map) {
+               if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+                       return 3;
+               else
+                       return 2;
+       } else {
+               return (int)bbio->num_stripes;
+       }
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+                                                u64 mapped_length,
+                                                int nstripes, int mirror,
+                                                int *stripe_index,
+                                                u64 *stripe_offset)
+{
+       int i;
+
+       if (raid_map) {
+               /* RAID5/6 */
+               for (i = 0; i < nstripes; i++) {
+                       if (raid_map[i] == RAID6_Q_STRIPE ||
+                           raid_map[i] == RAID5_P_STRIPE)
+                               continue;
+
+                       if (logical >= raid_map[i] &&
+                           logical < raid_map[i] + mapped_length)
+                               break;
+               }
+
+               *stripe_index = i;
+               *stripe_offset = logical - raid_map[i];
+       } else {
+               /* The other RAID type */
+               *stripe_index = mirror;
+               *stripe_offset = 0;
+       }
+}
+
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
                                     struct btrfs_fs_info *fs_info,
                                     struct scrub_block *original_sblock,
                                     u64 length, u64 logical,
                                     struct scrub_block *sblocks_for_recheck)
 {
+       struct scrub_recover *recover;
+       struct btrfs_bio *bbio;
+       u64 *raid_map;
+       u64 sublen;
+       u64 mapped_length;
+       u64 stripe_offset;
+       int stripe_index;
        int page_index;
        int mirror_index;
+       int nmirrors;
        int ret;
 
        /*
@@ -1233,23 +1312,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 
        page_index = 0;
        while (length > 0) {
-               u64 sublen = min_t(u64, length, PAGE_SIZE);
-               u64 mapped_length = sublen;
-               struct btrfs_bio *bbio = NULL;
+               sublen = min_t(u64, length, PAGE_SIZE);
+               mapped_length = sublen;
+               bbio = NULL;
+               raid_map = NULL;
 
                /*
                 * with a length of PAGE_SIZE, each returned stripe
                 * represents one mirror
                 */
-               ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
-                                     &mapped_length, &bbio, 0);
+               ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+                                      &mapped_length, &bbio, 0, &raid_map);
                if (ret || !bbio || mapped_length < sublen) {
                        kfree(bbio);
+                       kfree(raid_map);
                        return -EIO;
                }
 
+               recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+               if (!recover) {
+                       kfree(bbio);
+                       kfree(raid_map);
+                       return -ENOMEM;
+               }
+
+               atomic_set(&recover->refs, 1);
+               recover->bbio = bbio;
+               recover->raid_map = raid_map;
+               recover->map_length = mapped_length;
+
                BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
-               for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+
+               nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+               for (mirror_index = 0; mirror_index < nmirrors;
                     mirror_index++) {
                        struct scrub_block *sblock;
                        struct scrub_page *page;
@@ -1265,26 +1360,38 @@ leave_nomem:
                                spin_lock(&sctx->stat_lock);
                                sctx->stat.malloc_errors++;
                                spin_unlock(&sctx->stat_lock);
-                               kfree(bbio);
+                               scrub_put_recover(recover);
                                return -ENOMEM;
                        }
                        scrub_page_get(page);
                        sblock->pagev[page_index] = page;
                        page->logical = logical;
-                       page->physical = bbio->stripes[mirror_index].physical;
+
+                       scrub_stripe_index_and_offset(logical, raid_map,
+                                                     mapped_length,
+                                                     bbio->num_stripes,
+                                                     mirror_index,
+                                                     &stripe_index,
+                                                     &stripe_offset);
+                       page->physical = bbio->stripes[stripe_index].physical +
+                                        stripe_offset;
+                       page->dev = bbio->stripes[stripe_index].dev;
+
                        BUG_ON(page_index >= original_sblock->page_count);
                        page->physical_for_dev_replace =
                                original_sblock->pagev[page_index]->
                                physical_for_dev_replace;
                        /* for missing devices, dev->bdev is NULL */
-                       page->dev = bbio->stripes[mirror_index].dev;
                        page->mirror_num = mirror_index + 1;
                        sblock->page_count++;
                        page->page = alloc_page(GFP_NOFS);
                        if (!page->page)
                                goto leave_nomem;
+
+                       scrub_get_recover(recover);
+                       page->recover = recover;
                }
-               kfree(bbio);
+               scrub_put_recover(recover);
                length -= sublen;
                logical += sublen;
                page_index++;
@@ -1293,6 +1400,51 @@ leave_nomem:
        return 0;
 }
 
+struct scrub_bio_ret {
+       struct completion event;
+       int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+       struct scrub_bio_ret *ret = bio->bi_private;
+
+       ret->error = error;
+       complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+       return page->recover && page->recover->raid_map;
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+                                       struct bio *bio,
+                                       struct scrub_page *page)
+{
+       struct scrub_bio_ret done;
+       int ret;
+
+       init_completion(&done.event);
+       done.error = 0;
+       bio->bi_iter.bi_sector = page->logical >> 9;
+       bio->bi_private = &done;
+       bio->bi_end_io = scrub_bio_wait_endio;
+
+       ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+                                   page->recover->raid_map,
+                                   page->recover->map_length,
+                                   page->mirror_num, 1);
+       if (ret)
+               return ret;
+
+       wait_for_completion(&done.event);
+       if (done.error)
+               return -EIO;
+
+       return 0;
+}
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1455,7 @@ leave_nomem:
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock, int is_metadata,
                                int have_csum, u8 *csum, u64 generation,
-                               u16 csum_size)
+                               u16 csum_size, int retry_failed_mirror)
 {
        int page_num;
 
@@ -1329,11 +1481,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                        continue;
                }
                bio->bi_bdev = page->dev->bdev;
-               bio->bi_iter.bi_sector = page->physical >> 9;
 
                bio_add_page(bio, page->page, PAGE_SIZE, 0);
-               if (btrfsic_submit_bio_wait(READ, bio))
-                       sblock->no_io_error_seen = 0;
+               if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+                       if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+                               sblock->no_io_error_seen = 0;
+               } else {
+                       bio->bi_iter.bi_sector = page->physical >> 9;
+
+                       if (btrfsic_submit_bio_wait(READ, bio))
+                               sblock->no_io_error_seen = 0;
+               }
 
                bio_put(bio);
        }
index 6f5b302a08cf80f42f59599a95f4cb3cac3963f2..217c42ea90b020c7feee46803bbba10f274a0d16 100644 (file)
@@ -5161,7 +5161,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                BTRFS_BLOCK_GROUP_RAID6)) {
                u64 tmp;
 
-               if (raid_map_ret && ((rw & REQ_WRITE) || mirror_num > 1)) {
+               if (raid_map_ret &&
+                   ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+                    mirror_num > 1)) {
                        int i, rot;
 
                        /* push stripe_nr back to the start of the full stripe */
@@ -5440,6 +5442,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                 mirror_num, NULL);
 }
 
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+                    u64 logical, u64 *length,
+                    struct btrfs_bio **bbio_ret, int mirror_num,
+                    u64 **raid_map_ret)
+{
+       return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+                                mirror_num, raid_map_ret);
+}
+
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len)
@@ -5809,7 +5821,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                } else {
                        ret = raid56_parity_recover(root, bio, bbio,
                                                    raid_map, map_length,
-                                                   mirror_num);
+                                                   mirror_num, 0);
                }
                /*
                 * FIXME, replace dosen't support raid56 yet, please fix
index 08980fa2303916ee08c0f2b6bd30a6b9a6f0a711..01094bb804c75497e4360f09f0e7a56a5f8dcae7 100644 (file)
@@ -393,6 +393,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                    u64 logical, u64 *length,
                    struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+                    u64 logical, u64 *length,
+                    struct btrfs_bio **bbio_ret, int mirror_num,
+                    u64 **raid_map_ret);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len);