Btrfs: Handle write errors on raid1 and raid10
author Chris Mason <chris.mason@oracle.com>
Mon, 12 May 2008 17:39:03 +0000 (13:39 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:03 +0000 (11:04 -0400)
When duplicate copies exist, writes are allowed to fail to one of those
copies.  This changeset includes a few changes that allow the FS to
continue even when some IOs fail.

It also adds verification of the parent generation number for btree blocks.
This generation is stored in the pointer to a block, and it ensures
that missed writes are detected.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c
fs/btrfs/tree-defrag.c
fs/btrfs/volumes.c

index 289d71d8653a9caf19a51fbe0e51b07a2033efcb..02e571e6ee620efacb5529a67d262f81759ab153 100644 (file)
@@ -379,7 +379,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
                cur = btrfs_find_tree_block(root, blocknr, blocksize);
                if (cur)
-                       uptodate = btrfs_buffer_uptodate(cur);
+                       uptodate = btrfs_buffer_uptodate(cur, gen);
                else
                        uptodate = 0;
                if (!cur || !uptodate) {
index edee7a44f861dda2a05fac98dfb7387b88b126f3..574b1245964eff19947425c293e406ac4e15a16f 100644 (file)
@@ -205,6 +205,33 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
        return 0;
 }
 
+/* returns 0 if eb matches parent_transid (0 = skip check), 1 otherwise */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+                                struct extent_buffer *eb, u64 parent_transid)
+{
+       int ret;
+
+       if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+               return 0;
+
+       lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+       if (extent_buffer_uptodate(io_tree, eb) &&
+           btrfs_header_generation(eb) == parent_transid) {
+               ret = 0;
+               goto out;
+       }
+       printk("parent transid verify failed on %llu wanted %llu found %llu\n",
+              (unsigned long long)eb->start,
+              (unsigned long long)parent_transid,
+              (unsigned long long)btrfs_header_generation(eb));
+       ret = 1;
+       /* only a failed check may invalidate the cached buffer */
+       clear_extent_buffer_uptodate(io_tree, eb);
+out:
+       unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+       return ret;
+}
+
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                                          struct extent_buffer *eb,
                                          u64 start, u64 parent_transid)
@@ -218,7 +245,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
        while (1) {
                ret = read_extent_buffer_pages(io_tree, eb, start, 1,
                                               btree_get_extent, mirror_num);
-               if (!ret)
+               if (!ret &&
+                   !verify_parent_transid(io_tree, eb, parent_transid))
                        return ret;
 
                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
@@ -330,6 +358,13 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                ret = -EIO;
                goto err;
        }
+       if (memcmp_extent_buffer(eb, root->fs_info->fsid,
+                                (unsigned long)btrfs_header_fsid(eb),
+                                BTRFS_FSID_SIZE)) {
+               printk("bad fsid on block %Lu\n", eb->start);
+               ret = -EIO;
+               goto err;
+       }
        found_level = btrfs_header_level(eb);
 
        ret = csum_tree_block(root, eb, 1);
@@ -1363,7 +1398,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
                }
-               set_buffer_write_io_error(bh);
+               /* note, we don't set_buffer_write_io_error because we have
+                * our own ways of dealing with the IO errors
+                */
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
@@ -1459,7 +1496,8 @@ int write_all_supers(struct btrfs_root *root)
                                ret = submit_bh(WRITE, bh);
                                BUG_ON(ret);
                                wait_on_buffer(bh);
-                               BUG_ON(!buffer_uptodate(bh));
+                               if (!buffer_uptodate(bh))
+                                       total_errors++;
                        } else {
                                total_errors++;
                        }
@@ -1607,10 +1645,18 @@ int close_ctree(struct btrfs_root *root)
        return 0;
 }
 
-int btrfs_buffer_uptodate(struct extent_buffer *buf)
+/* nonzero only if buf is uptodate and passes the parent transid check */
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 {
+       struct extent_io_tree *io_tree;
        struct inode *btree_inode = buf->first_page->mapping->host;
-       return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+
+       io_tree = &BTRFS_I(btree_inode)->io_tree;
+       if (!extent_buffer_uptodate(io_tree, buf))
+               return 0;
+
+       /* verify_parent_transid() returns 0 when the generation is fine */
+       return !verify_parent_transid(io_tree, buf, parent_transid);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
index e29c895d52309e31bda8e802e22e29624b29b022..30d1ed293c25e0b6f495a52c0dced3e3126147b2 100644 (file)
@@ -56,7 +56,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
                                 struct extent_buffer *buf);
index db07dde4a87057d39713e67ebc2577df1597207c..605018c6045cbeeb910ce1a4d147106c6df72375 100644 (file)
@@ -1366,7 +1366,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
        if (!pending) {
                buf = btrfs_find_tree_block(root, bytenr, num_bytes);
                if (buf) {
-                       if (btrfs_buffer_uptodate(buf)) {
+                       if (btrfs_buffer_uptodate(buf, 0)) {
                                u64 transid =
                                    root->fs_info->running_transaction->transid;
                                u64 header_transid =
@@ -2151,7 +2151,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
                        continue;
                }
                next = btrfs_find_tree_block(root, bytenr, blocksize);
-               if (!next || !btrfs_buffer_uptodate(next)) {
+               if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
                        free_extent_buffer(next);
                        reada_walk_down(root, cur, path->slots[*level]);
 
index dd403b426ff5dc37c379f2dc2d77ed4d81f78728..2a3624adc0cfaaec555defb95ff642927ffb0c8c 100644 (file)
@@ -1366,7 +1366,7 @@ static int end_bio_extent_writepage(struct bio *bio,
                                   unsigned int bytes_done, int err)
 #endif
 {
-       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+       int uptodate = err == 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct extent_state *state = bio->bi_private;
        struct extent_io_tree *tree = state->tree;
@@ -1375,6 +1375,7 @@ static int end_bio_extent_writepage(struct bio *bio,
        u64 end;
        u64 cur;
        int whole_page;
+       int ret;
        unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -1395,17 +1396,30 @@ static int end_bio_extent_writepage(struct bio *bio,
                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);
 
+               if (tree->ops && tree->ops->writepage_end_io_hook) {
+                       ret = tree->ops->writepage_end_io_hook(page, start,
+                                                      end, state);
+                       if (ret)
+                               uptodate = 0;
+               }
+
+               if (!uptodate && tree->ops &&
+                   tree->ops->writepage_io_failed_hook) {
+                       ret = tree->ops->writepage_io_failed_hook(bio, page,
+                                                        start, end, state);
+                       if (ret == 0) {
+                               state = NULL;
+                               uptodate = (err == 0);
+                               continue;
+                       }
+               }
+
                if (!uptodate) {
                        clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
                        ClearPageUptodate(page);
                        SetPageError(page);
                }
 
-               if (tree->ops && tree->ops->writepage_end_io_hook) {
-                       tree->ops->writepage_end_io_hook(page, start, end,
-                                                        state);
-               }
-
                /*
                 * bios can get merged in funny ways, and so we need to
                 * be careful with the state variable.  We know the
@@ -2073,9 +2087,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                } else {
                        ret = 0;
                }
-               if (ret)
+               if (ret) {
                        SetPageError(page);
-               else {
+               } else {
                        unsigned long max_nr = end_index + 1;
                        set_range_writeback(tree, cur, cur + iosize - 1);
                        if (!PageWriteback(page)) {
@@ -2948,6 +2962,25 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
+/* drop the uptodate state of eb: its flag, its range in tree, its pages */
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+                               struct extent_buffer *eb)
+{
+       unsigned long nr = num_extent_pages(eb->start, eb->len);
+       unsigned long idx = 0;
+
+       eb->flags &= ~EXTENT_UPTODATE;
+       clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+                             GFP_NOFS);
+
+       /* also clear the uptodate bit on every page backing eb */
+       while (idx < nr) {
+               ClearPageUptodate(extent_buffer_page(eb, idx));
+               idx++;
+       }
+       return 0;
+}
+
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
                                struct extent_buffer *eb)
 {
index e483461475144c4603e8076881982603c473c9d5..f1960dafaa19d80d7b803dff2a5a1f01db218daf 100644 (file)
@@ -36,9 +36,12 @@ struct extent_io_ops {
        int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
                                       u64 start, u64 end,
                                       struct extent_state *state);
+       int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
+                                       u64 start, u64 end,
+                                      struct extent_state *state);
        int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
                                    struct extent_state *state);
-       void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+       int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
                                      struct extent_state *state);
        int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
                            unsigned long old, unsigned long bits);
@@ -212,6 +215,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
                             struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
                               struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+                               struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
                           struct extent_buffer *eb);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
index a492fd238c887a145af5a9634d33c7417e275861..08760ff9bab757bb330c358a010823972099d6b5 100644 (file)
@@ -430,9 +430,9 @@ struct io_failure_record {
        int last_mirror;
 };
 
-int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
-                                 struct page *page, u64 start, u64 end,
-                                 struct extent_state *state)
+int btrfs_io_failed_hook(struct bio *failed_bio,
+                        struct page *page, u64 start, u64 end,
+                        struct extent_state *state)
 {
        struct io_failure_record *failrec = NULL;
        u64 private;
@@ -443,6 +443,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
        struct bio *bio;
        int num_copies;
        int ret;
+       int rw;
        u64 logical;
 
        ret = get_state_private(failure_tree, start, &private);
@@ -505,7 +506,41 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
        bio->bi_bdev = failed_bio->bi_bdev;
        bio->bi_size = 0;
        bio_add_page(bio, page, failrec->len, start - page_offset(page));
-       btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
+       if (failed_bio->bi_rw & (1 << BIO_RW))
+               rw = WRITE;
+       else
+               rw = READ;
+
+       BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+                                                     failrec->last_mirror);
+       return 0;
+}
+
+/*
+ * if the io failure tree for this inode is non-empty and a failure record
+ * is stored for start, drop the record's range from the tree and free it
+ */
+int btrfs_clean_io_failures(struct inode *inode, u64 start)
+{
+       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+       struct io_failure_record *rec;
+       u64 state_private;
+       u64 from = 0;
+
+       if (!count_range_bits(failure_tree, &from, (u64)-1, 1, EXTENT_DIRTY))
+               return 0;
+
+       if (get_state_private(failure_tree, start, &state_private))
+               return 0;
+
+       /* the private value stored for start is the failure record pointer */
+       rec = (struct io_failure_record *)(unsigned long)state_private;
+       set_state_private(failure_tree, rec->start, 0);
+       clear_extent_bits(failure_tree, rec->start,
+                         rec->start + rec->len - 1,
+                         EXTENT_DIRTY | EXTENT_LOCKED,
+                         GFP_NOFS);
+       kfree(rec);
        return 0;
 }
 
@@ -547,26 +582,7 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        /* if the io failure tree for this inode is non-empty,
         * check to see if we've recovered from a failed IO
         */
-       private = 0;
-       if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-                            (u64)-1, 1, EXTENT_DIRTY)) {
-               u64 private_failure;
-               struct io_failure_record *failure;
-               ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-                                       start, &private_failure);
-               if (ret == 0) {
-                       failure = (struct io_failure_record *)(unsigned long)
-                                  private_failure;
-                       set_state_private(&BTRFS_I(inode)->io_failure_tree,
-                                         failure->start, 0);
-                       clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-                                         failure->start,
-                                         failure->start + failure->len - 1,
-                                         EXTENT_DIRTY | EXTENT_LOCKED,
-                                         GFP_NOFS);
-                       kfree(failure);
-               }
-       }
+       btrfs_clean_io_failures(inode, start);
        return 0;
 
 zeroit:
@@ -3657,7 +3673,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
        .merge_bio_hook = btrfs_merge_bio_hook,
        .readpage_io_hook = btrfs_readpage_io_hook,
        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-       .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
+       .readpage_io_failed_hook = btrfs_io_failed_hook,
        .set_bit_hook = btrfs_set_bit_hook,
        .clear_bit_hook = btrfs_clear_bit_hook,
 };
index 5085e9e693b931942ce838dd929a314f2e404673..c02e2bf2f02828ed39c120f74c8f0360316b8f23 100644 (file)
@@ -51,6 +51,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
        struct extent_buffer *next;
        struct extent_buffer *cur;
        u64 bytenr;
+       u64 ptr_gen;
        int ret = 0;
        int is_extent = 0;
 
@@ -93,11 +94,12 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
                        break;
                }
                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 
                if (cache_only) {
                        next = btrfs_find_tree_block(root, bytenr,
                                           btrfs_level_size(root, *level - 1));
-                       if (!next || !btrfs_buffer_uptodate(next) ||
+                       if (!next || !btrfs_buffer_uptodate(next, ptr_gen) ||
                            !btrfs_buffer_defrag(next)) {
                                free_extent_buffer(next);
                                path->slots[*level]++;
@@ -106,8 +108,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
                } else {
                        next = read_tree_block(root, bytenr,
                                       btrfs_level_size(root, *level - 1),
-                                      btrfs_node_ptr_generation(cur,
-                                                        path->slots[*level]));
+                                      ptr_gen);
                }
                ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
                                      path->slots[*level], &next);
index b5d7bd1915b47d4f048eb200027ca3360939fe03..5fc7fb4814740bdc8aa7428da5cc41266c3668c2 100644 (file)
@@ -1807,14 +1807,19 @@ static int end_bio_multi_stripe(struct bio *bio,
        if (atomic_dec_and_test(&multi->stripes_pending)) {
                bio->bi_private = multi->private;
                bio->bi_end_io = multi->end_io;
-
                /* only send an error to the higher layers if it is
                 * beyond the tolerance of the multi-bio
                 */
-               if (atomic_read(&multi->error) > multi->max_errors)
+               if (atomic_read(&multi->error) > multi->max_errors) {
                        err = -EIO;
-               else
+               } else if (err) {
+                       /*
+                        * this bio is actually up to date, we didn't
+                        * go over the max number of errors
+                        */
+                       set_bit(BIO_UPTODATE, &bio->bi_flags);
                        err = 0;
+               }
                kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)