btrfs: scrub
author Arne Jansen <sensille@gmx.net>
Tue, 8 Mar 2011 13:14:00 +0000 (14:14 +0100)
committer Arne Jansen <sensille@gmx.net>
Thu, 12 May 2011 12:45:20 +0000 (14:45 +0200)
This adds an initial implementation for scrub. It works quite
straightforwardly: userspace issues an ioctl for each device in the
fs. For each device, the allocated device chunks are enumerated. For
each chunk, the contained extents are enumerated and the data
checksums fetched. The extents are read sequentially and the checksums
verified. If an error occurs (checksum or EIO), a good copy is
searched for. If one is found, the bad copy is rewritten.
All enumerations happen from the commit roots. During a transaction
commit, the scrubs get paused and afterwards continue from the new
roots.
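
For illustration, the per-device entry point added by this patch boils
down to the following sketch; scrub_one_device and the ioctl wiring are
placeholders, only btrfs_scrub_dev() and struct btrfs_scrub_progress
come from this patch:

    static int scrub_one_device(struct btrfs_root *root, u64 devid)
    {
            struct btrfs_scrub_progress progress;

            /* scrub the whole device and collect the statistics */
            return btrfs_scrub_dev(root, devid, 0, (u64)-1, &progress);
    }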

This commit is based on the series originally posted to linux-btrfs
with some improvements that resulted from comments from David Sterba,
Ilya Dryomov and Jan Schmidt.

Signed-off-by: Arne Jansen <sensille@gmx.net>
12 files changed:
fs/btrfs/Makefile
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/file-item.c
fs/btrfs/inode.c
fs/btrfs/ioctl.h
fs/btrfs/relocation.c
fs/btrfs/scrub.c [new file with mode: 0644]
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 31610ea73aec2bff3410ec7c65b3b52e2c7a14ba..8fda3133c1b81ca3703d41001ae570149d6c8a8a 100644
@@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
-          compression.o delayed-ref.o relocation.o
+          compression.o delayed-ref.o relocation.o scrub.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2e61fe1b6b8cc76111f62c3f4b4aded03f7a0cce..31141ba6072d0530d39dff58c1e3014cd27f2a10 100644
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
+#include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
@@ -33,6 +34,7 @@
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
+#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -510,6 +512,12 @@ struct btrfs_extent_item_v0 {
 /* use full backrefs for extent pointers in the block */
 #define BTRFS_BLOCK_FLAG_FULL_BACKREF  (1ULL << 8)
 
+/*
+ * this flag is only used internally by scrub and may be changed at any
+ * time; it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER                (1ULL << 48)
+
 struct btrfs_tree_block_info {
        struct btrfs_disk_key key;
        u8 level;
@@ -1077,6 +1085,17 @@ struct btrfs_fs_info {
 
        void *bdev_holder;
 
+       /* private scrub information */
+       struct mutex scrub_lock;
+       atomic_t scrubs_running;
+       atomic_t scrub_pause_req;
+       atomic_t scrubs_paused;
+       atomic_t scrub_cancel_req;
+       wait_queue_head_t scrub_pause_wait;
+       struct rw_semaphore scrub_super_lock;
+       int scrub_workers_refcnt;
+       struct btrfs_workers scrub_workers;
+
        /* filesystem state */
        u64 fs_state;
 };
@@ -2472,8 +2491,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *path,
                        u64 isize);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
-                            u64 end, struct list_head *list);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+                            struct list_head *list, int search_commit);
 /* inode.c */
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@@ -2637,4 +2656,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
                              u64 *bytes_to_reserve);
 void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
                              struct btrfs_pending_snapshot *pending);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+                   struct btrfs_scrub_progress *progress);
+int btrfs_scrub_pause(struct btrfs_root *root);
+int btrfs_scrub_pause_super(struct btrfs_root *root);
+int btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_continue_super(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_root *root);
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+                        struct btrfs_scrub_progress *progress);
+
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fe5aec9b39249dc20b3e0e28a53990b9cbc53fe4..e48e8095c61f88766709307d13b621cf660483dd 100644
@@ -1773,6 +1773,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->ordered_extents);
        spin_lock_init(&fs_info->ordered_extent_lock);
 
+       mutex_init(&fs_info->scrub_lock);
+       atomic_set(&fs_info->scrubs_running, 0);
+       atomic_set(&fs_info->scrub_pause_req, 0);
+       atomic_set(&fs_info->scrubs_paused, 0);
+       atomic_set(&fs_info->scrub_cancel_req, 0);
+       init_waitqueue_head(&fs_info->scrub_pause_wait);
+       init_rwsem(&fs_info->scrub_super_lock);
+       fs_info->scrub_workers_refcnt = 0;
+       btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+                          fs_info->thread_pool_size, &fs_info->generic_worker);
+
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
        sb->s_bdi = &fs_info->bdi;
@@ -2599,6 +2610,7 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
 
+       btrfs_scrub_cancel(root);
        btrfs_put_block_group_cache(fs_info);
 
        /*
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a6a9d4e8b491eee488316c97f6f975e167bf13db..39ca7c1250e7b4328404603722929b49b84250da 100644
@@ -266,7 +266,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
-                            struct list_head *list)
+                            struct list_head *list, int search_commit)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
@@ -283,6 +283,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       if (search_commit) {
+               path->skip_locking = 1;
+               path->reada = 2;
+               path->search_commit_root = 1;
+       }
+
        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        key.offset = start;
        key.type = BTRFS_EXTENT_CSUM_KEY;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 870869aab0b89ce92ef7c33412e2abc6e408bdd8..27142446b30af6867b789840296273af2ad4ac66 100644
@@ -1007,7 +1007,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
        LIST_HEAD(list);
 
        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
-                                      bytenr + num_bytes - 1, &list);
+                                      bytenr + num_bytes - 1, &list, 0);
        if (ret == 0 && list_empty(&list))
                return 0;
 
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8fb382167b13b55670e6411785f006defccc9d0d..37ac030d64b4f307b2e4dbed2c363d5e691851c5 100644
@@ -42,6 +42,43 @@ struct btrfs_ioctl_vol_args_v2 {
        char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 
+/*
+ * structure to report errors and progress to userspace, either as a
+ * result of a finished scrub, a canceled scrub or a progress inquiry
+ */
+struct btrfs_scrub_progress {
+       __u64 data_extents_scrubbed;    /* # of data extents scrubbed */
+       __u64 tree_extents_scrubbed;    /* # of tree extents scrubbed */
+       __u64 data_bytes_scrubbed;      /* # of data bytes scrubbed */
+       __u64 tree_bytes_scrubbed;      /* # of tree bytes scrubbed */
+       __u64 read_errors;              /* # of read errors encountered (EIO) */
+       __u64 csum_errors;              /* # of failed csum checks */
+       __u64 verify_errors;            /* # of occurrences where the metadata
+                                        * of a tree block did not match the
+                                        * expected values, like generation or
+                                        * logical address */
+       __u64 no_csum;                  /* # of 4k data blocks for which no csum
+                                        * is present, probably the result of
+                                        * data written with nodatasum */
+       __u64 csum_discards;            /* # of csums for which no data was
+                                        * found in the extent tree. */
+       __u64 super_errors;             /* # of bad super blocks encountered */
+       __u64 malloc_errors;            /* # of internal kmalloc errors. These
+                                        * will likely cause an incomplete
+                                        * scrub */
+       __u64 uncorrectable_errors;     /* # of errors where either no intact
+                                        * copy was found or the writeback
+                                        * failed */
+       __u64 corrected_errors;         /* # of errors corrected */
+       __u64 last_physical;            /* last physical address scrubbed. In
+                                        * case a scrub was aborted, this can
+                                        * be used to restart the scrub */
+       __u64 unverified_errors;        /* # of occurrences where a read for a
+                                        * full (64k) bio failed, but the re-
+                                        * check succeeded for each 4k piece.
+                                        * Intermittent error. */
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
        __u64 treeid;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 58250e09eb056213944db6703d78679fbb94d35f..db1dffa9952b6acfd9e59c6cd4379be3e79b9143 100644
@@ -4242,7 +4242,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 
        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
-                                      disk_bytenr + len - 1, &list);
+                                      disk_bytenr + len - 1, &list, 0);
 
        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 0000000..70f9fa7
--- /dev/null
@@ -0,0 +1,1492 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+
+/*
+ * This is only the first step towards a full-featured scrub. It reads all
+ * extents and super blocks and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ *  - To enhance the performance, better read-ahead strategies for the
+ *    extent-tree can be employed.
+ *  - In case an unrepairable extent is encountered, track which files are
+ *    affected and report them
+ *  - In case of a read error on files with nodatasum, map the file and read
+ *    the extent to trigger a writeback of the good copy
+ *  - track and record media errors, throw out bad devices
+ *  - add a readonly mode
+ *  - add a mode to also read unallocated space
+ *  - make the prefetch cancellable
+ */
+
+struct scrub_bio;
+struct scrub_page;
+struct scrub_dev;
+struct scrub_fixup;
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_checksum(struct btrfs_work *work);
+static int scrub_checksum_data(struct scrub_dev *sdev,
+                              struct scrub_page *spag, void *buffer);
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+                                    struct scrub_page *spag, u64 logical,
+                                    void *buffer);
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
+static void scrub_recheck_end_io(struct bio *bio, int err);
+static void scrub_fixup_worker(struct btrfs_work *work);
+static void scrub_fixup(struct scrub_fixup *fixup);
+
+#define SCRUB_PAGES_PER_BIO    16      /* 64k per bio */
+#define SCRUB_BIOS_PER_DEV     16      /* 1 MB per device in flight */
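+
+/*
+ * each scrub_dev owns SCRUB_BIOS_PER_DEV scrub_bios, kept on a free
+ * list threaded through next_free/first_free. sdev->curr indexes the
+ * bio currently being filled by scrub_page, in_flight counts bios that
+ * have been submitted but not yet processed by the checksum worker
+ */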
+
+struct scrub_page {
+       u64                     flags;  /* extent flags */
+       u64                     generation;
+       u64                     mirror_num;
+       int                     have_csum;
+       u8                      csum[BTRFS_CSUM_SIZE];
+};
+
+struct scrub_bio {
+       int                     index;
+       struct scrub_dev        *sdev;
+       struct bio              *bio;
+       int                     err;
+       u64                     logical;
+       u64                     physical;
+       struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
+       u64                     count;
+       int                     next_free;
+       struct btrfs_work       work;
+};
+
+struct scrub_dev {
+       struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
+       struct btrfs_device     *dev;
+       int                     first_free;
+       int                     curr;
+       atomic_t                in_flight;
+       spinlock_t              list_lock;
+       wait_queue_head_t       list_wait;
+       u16                     csum_size;
+       struct list_head        csum_list;
+       atomic_t                cancel_req;
+       /*
+        * statistics
+        */
+       struct btrfs_scrub_progress stat;
+       spinlock_t              stat_lock;
+};
+
+struct scrub_fixup {
+       struct scrub_dev        *sdev;
+       struct bio              *bio;
+       u64                     logical;
+       u64                     physical;
+       struct scrub_page       spag;
+       struct btrfs_work       work;
+       int                     err;
+       int                     recheck;
+};
+
+static void scrub_free_csums(struct scrub_dev *sdev)
+{
+       while (!list_empty(&sdev->csum_list)) {
+               struct btrfs_ordered_sum *sum;
+               sum = list_first_entry(&sdev->csum_list,
+                                      struct btrfs_ordered_sum, list);
+               list_del(&sum->list);
+               kfree(sum);
+       }
+}
+
+static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+{
+       int i;
+       int j;
+       struct page *last_page;
+
+       if (!sdev)
+               return;
+
+       for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+               struct scrub_bio *sbio = sdev->bios[i];
+               struct bio *bio;
+
+               if (!sbio)
+                       break;
+
+               bio = sbio->bio;
+               if (bio) {
+                       last_page = NULL;
+                       for (j = 0; j < bio->bi_vcnt; ++j) {
+                               if (bio->bi_io_vec[j].bv_page == last_page)
+                                       continue;
+                               last_page = bio->bi_io_vec[j].bv_page;
+                               __free_page(last_page);
+                       }
+                       bio_put(bio);
+               }
+               kfree(sbio);
+       }
+
+       scrub_free_csums(sdev);
+       kfree(sdev);
+}
+
+static noinline_for_stack
+struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+{
+       struct scrub_dev *sdev;
+       int             i;
+       int             j;
+       int             ret;
+       struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+
+       sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
+       if (!sdev)
+               goto nomem;
+       sdev->dev = dev;
+       for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+               struct bio *bio;
+               struct scrub_bio *sbio;
+
+               sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+               if (!sbio)
+                       goto nomem;
+               sdev->bios[i] = sbio;
+
+               bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+               if (!bio)
+                       goto nomem;
+
+               sbio->index = i;
+               sbio->sdev = sdev;
+               sbio->bio = bio;
+               sbio->count = 0;
+               sbio->work.func = scrub_checksum;
+               bio->bi_private = sdev->bios[i];
+               bio->bi_end_io = scrub_bio_end_io;
+               bio->bi_sector = 0;
+               bio->bi_bdev = dev->bdev;
+               bio->bi_size = 0;
+
+               for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
+                       struct page *page;
+                       page = alloc_page(GFP_NOFS);
+                       if (!page)
+                               goto nomem;
+
+                       ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+                       if (!ret)
+                               goto nomem;
+               }
+               WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
+
+               if (i != SCRUB_BIOS_PER_DEV-1)
+                       sdev->bios[i]->next_free = i + 1;
+               else
+                       sdev->bios[i]->next_free = -1;
+       }
+       sdev->first_free = 0;
+       sdev->curr = -1;
+       atomic_set(&sdev->in_flight, 0);
+       atomic_set(&sdev->cancel_req, 0);
+       sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+       INIT_LIST_HEAD(&sdev->csum_list);
+
+       spin_lock_init(&sdev->list_lock);
+       spin_lock_init(&sdev->stat_lock);
+       init_waitqueue_head(&sdev->list_wait);
+       return sdev;
+
+nomem:
+       scrub_free_dev(sdev);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * scrub_recheck_error gets called when either verification of the page
+ * failed or the bio failed to read, e.g. with EIO. In the latter case,
+ * recheck_error gets called for every page in the bio, even though only
+ * one may be bad
+ */
+static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+{
+       struct scrub_dev *sdev = sbio->sdev;
+       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       struct bio *bio = NULL;
+       struct page *page = NULL;
+       struct scrub_fixup *fixup = NULL;
+       int ret;
+
+       /*
+        * while we're in here we do not want the transaction to commit.
+        * To prevent it, we increment scrubs_running. scrub_pause will
+        * have to wait until we're finished.
+        * We can safely increment scrubs_running here, because we're
+        * in the context of the original bio which is still marked in_flight
+        */
+       atomic_inc(&fs_info->scrubs_running);
+
+       fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+       if (!fixup)
+               goto malloc_error;
+
+       fixup->logical = sbio->logical + ix * PAGE_SIZE;
+       fixup->physical = sbio->physical + ix * PAGE_SIZE;
+       fixup->spag = sbio->spag[ix];
+       fixup->sdev = sdev;
+
+       bio = bio_alloc(GFP_NOFS, 1);
+       if (!bio)
+               goto malloc_error;
+       bio->bi_private = fixup;
+       bio->bi_size = 0;
+       bio->bi_bdev = sdev->dev->bdev;
+       fixup->bio = bio;
+       fixup->recheck = 0;
+
+       page = alloc_page(GFP_NOFS);
+       if (!page)
+               goto malloc_error;
+
+       ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+       if (!ret)
+               goto malloc_error;
+
+       if (!sbio->err) {
+               /*
+                * shorter path: just a checksum error, go ahead and correct it
+                */
+               scrub_fixup_worker(&fixup->work);
+               return;
+       }
+
+       /*
+        * an I/O error occurred for one of the blocks in the bio, not
+        * necessarily for this one, so first try to read it separately
+        */
+       fixup->work.func = scrub_fixup_worker;
+       fixup->recheck = 1;
+       bio->bi_end_io = scrub_recheck_end_io;
+       bio->bi_sector = fixup->physical >> 9;
+       bio->bi_bdev = sdev->dev->bdev;
+       submit_bio(0, bio);
+
+       return;
+
+malloc_error:
+       if (bio)
+               bio_put(bio);
+       if (page)
+               __free_page(page);
+       kfree(fixup);
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.malloc_errors;
+       spin_unlock(&sdev->stat_lock);
+       atomic_dec(&fs_info->scrubs_running);
+       wake_up(&fs_info->scrub_pause_wait);
+}
+
+static void scrub_recheck_end_io(struct bio *bio, int err)
+{
+       struct scrub_fixup *fixup = bio->bi_private;
+       struct btrfs_fs_info *fs_info = fixup->sdev->dev->dev_root->fs_info;
+
+       fixup->err = err;
+       btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+}
+
+static int scrub_fixup_check(struct scrub_fixup *fixup)
+{
+       int ret = 1;
+       struct page *page;
+       void *buffer;
+       u64 flags = fixup->spag.flags;
+
+       page = fixup->bio->bi_io_vec[0].bv_page;
+       buffer = kmap_atomic(page, KM_USER0);
+       if (flags & BTRFS_EXTENT_FLAG_DATA) {
+               ret = scrub_checksum_data(fixup->sdev,
+                                         &fixup->spag, buffer);
+       } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               ret = scrub_checksum_tree_block(fixup->sdev,
+                                               &fixup->spag,
+                                               fixup->logical,
+                                               buffer);
+       } else {
+               WARN_ON(1);
+       }
+       kunmap_atomic(buffer, KM_USER0);
+
+       return ret;
+}
+
+static void scrub_fixup_worker(struct btrfs_work *work)
+{
+       struct scrub_fixup *fixup;
+       struct btrfs_fs_info *fs_info;
+       u64 flags;
+       int ret = 1;
+
+       fixup = container_of(work, struct scrub_fixup, work);
+       fs_info = fixup->sdev->dev->dev_root->fs_info;
+       flags = fixup->spag.flags;
+
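+       /*
+        * in the I/O error case the page was re-read above; if that
+        * read succeeded, verify its checksum. Only pages that are
+        * (still) bad or whose recheck read failed go through
+        * scrub_fixup
+        */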
+       if (fixup->recheck && fixup->err == 0)
+               ret = scrub_fixup_check(fixup);
+
+       if (ret || fixup->err)
+               scrub_fixup(fixup);
+
+       __free_page(fixup->bio->bi_io_vec[0].bv_page);
+       bio_put(fixup->bio);
+
+       atomic_dec(&fs_info->scrubs_running);
+       wake_up(&fs_info->scrub_pause_wait);
+
+       kfree(fixup);
+}
+
+static void scrub_fixup_end_io(struct bio *bio, int err)
+{
+       complete((struct completion *)bio->bi_private);
+}
+
+static void scrub_fixup(struct scrub_fixup *fixup)
+{
+       struct scrub_dev *sdev = fixup->sdev;
+       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+       struct btrfs_multi_bio *multi = NULL;
+       struct bio *bio = fixup->bio;
+       u64 length;
+       int i;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(complete);
+
+       if ((fixup->spag.flags & BTRFS_EXTENT_FLAG_DATA) &&
+           (fixup->spag.have_csum == 0)) {
+               /*
+                * nodatasum, don't try to fix anything
+                * FIXME: we can do better, open the inode and trigger a
+                * writeback
+                */
+               goto uncorrectable;
+       }
+
+       length = PAGE_SIZE;
+       ret = btrfs_map_block(map_tree, REQ_WRITE, fixup->logical, &length,
+                             &multi, 0);
+       if (ret || !multi || length < PAGE_SIZE) {
+               printk(KERN_ERR
+                      "scrub_fixup: btrfs_map_block failed us for %llu\n",
+                      (unsigned long long)fixup->logical);
+               WARN_ON(1);
+               return;
+       }
+
+       if (multi->num_stripes == 1) {
+               /* there aren't any replicas */
+               goto uncorrectable;
+       }
+
+       /*
+        * first find a good copy
+        */
+       for (i = 0; i < multi->num_stripes; ++i) {
+               if (i == fixup->spag.mirror_num)
+                       continue;
+
+               bio->bi_sector = multi->stripes[i].physical >> 9;
+               bio->bi_bdev = multi->stripes[i].dev->bdev;
+               bio->bi_size = PAGE_SIZE;
+               bio->bi_next = NULL;
+               bio->bi_flags |= 1 << BIO_UPTODATE;
+               bio->bi_comp_cpu = -1;
+               bio->bi_end_io = scrub_fixup_end_io;
+               bio->bi_private = &complete;
+
+               submit_bio(0, bio);
+
+               wait_for_completion(&complete);
+
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+                       /* I/O-error, this is not a good copy */
+                       continue;
+
+               ret = scrub_fixup_check(fixup);
+               if (ret == 0)
+                       break;
+       }
+       if (i == multi->num_stripes)
+               goto uncorrectable;
+
+       /*
+        * the bio now contains good data, write it back
+        */
+       bio->bi_sector = fixup->physical >> 9;
+       bio->bi_bdev = sdev->dev->bdev;
+       bio->bi_size = PAGE_SIZE;
+       bio->bi_next = NULL;
+       bio->bi_flags |= 1 << BIO_UPTODATE;
+       bio->bi_comp_cpu = -1;
+       bio->bi_end_io = scrub_fixup_end_io;
+       bio->bi_private = &complete;
+
+       submit_bio(REQ_WRITE, bio);
+
+       wait_for_completion(&complete);
+
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+               /* I/O-error, writeback failed, give up */
+               goto uncorrectable;
+
+       kfree(multi);
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.corrected_errors;
+       spin_unlock(&sdev->stat_lock);
+
+       if (printk_ratelimit())
+               printk(KERN_ERR "btrfs: fixed up at %llu\n",
+                      (unsigned long long)fixup->logical);
+       return;
+
+uncorrectable:
+       kfree(multi);
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.uncorrectable_errors;
+       spin_unlock(&sdev->stat_lock);
+
+       if (printk_ratelimit())
+               printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
+                        (unsigned long long)fixup->logical);
+}
+
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+       struct scrub_bio *sbio = bio->bi_private;
+       struct scrub_dev *sdev = sbio->sdev;
+       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+
+       sbio->err = err;
+
+       btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
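+/*
+ * worker function, called once per completed bio. On a read error, all
+ * pages are handed to scrub_recheck_error and the bio is replaced;
+ * otherwise each page is verified in place and only the failures are
+ * rechecked. Finally the sbio is returned to the free list
+ */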
+static void scrub_checksum(struct btrfs_work *work)
+{
+       struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+       struct scrub_dev *sdev = sbio->sdev;
+       struct page *page;
+       void *buffer;
+       int i;
+       u64 flags;
+       u64 logical;
+       int ret;
+
+       if (sbio->err) {
+               struct bio *bio;
+               struct bio *old_bio;
+
+               for (i = 0; i < sbio->count; ++i)
+                       scrub_recheck_error(sbio, i);
+               spin_lock(&sdev->stat_lock);
+               ++sdev->stat.read_errors;
+               spin_unlock(&sdev->stat_lock);
+
+               /*
+                * FIXME: allocate a new bio after a media error. I haven't
+                * figured out how to reuse this one
+                */
+               old_bio = sbio->bio;
+               bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+               if (!bio) {
+                       /*
+                        * alloc failed. cancel the scrub and don't requeue
+                        * this sbio
+                        */
+                       printk(KERN_ERR "btrfs scrub: allocation failure, "
+                                       "cancelling scrub\n");
+                       atomic_inc(&sdev->dev->dev_root->fs_info->
+                                  scrub_cancel_req);
+                       goto out_no_enqueue;
+               }
+               sbio->bio = bio;
+               bio->bi_private = sbio;
+               bio->bi_end_io = scrub_bio_end_io;
+               bio->bi_sector = 0;
+               bio->bi_bdev = sbio->sdev->dev->bdev;
+               bio->bi_size = 0;
+               for (i = 0; i < SCRUB_PAGES_PER_BIO; ++i) {
+                       struct page *page;
+                       page = old_bio->bi_io_vec[i].bv_page;
+                       bio_add_page(bio, page, PAGE_SIZE, 0);
+               }
+               bio_put(old_bio);
+               goto out;
+       }
+       for (i = 0; i < sbio->count; ++i) {
+               page = sbio->bio->bi_io_vec[i].bv_page;
+               buffer = kmap_atomic(page, KM_USER0);
+               flags = sbio->spag[i].flags;
+               logical = sbio->logical + i * PAGE_SIZE;
+               ret = 0;
+               if (flags & BTRFS_EXTENT_FLAG_DATA) {
+                       ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
+               } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+                       ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
+                                                       logical, buffer);
+               } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
+                       BUG_ON(i);
+                       (void)scrub_checksum_super(sbio, buffer);
+               } else {
+                       WARN_ON(1);
+               }
+               kunmap_atomic(buffer, KM_USER0);
+               if (ret)
+                       scrub_recheck_error(sbio, i);
+       }
+
+out:
+       spin_lock(&sdev->list_lock);
+       sbio->next_free = sdev->first_free;
+       sdev->first_free = sbio->index;
+       spin_unlock(&sdev->list_lock);
+out_no_enqueue:
+       atomic_dec(&sdev->in_flight);
+       wake_up(&sdev->list_wait);
+}
+
+static int scrub_checksum_data(struct scrub_dev *sdev,
+                              struct scrub_page *spag, void *buffer)
+{
+       u8 csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       int fail = 0;
+       struct btrfs_root *root = sdev->dev->dev_root;
+
+       if (!spag->have_csum)
+               return 0;
+
+       crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+       btrfs_csum_final(crc, csum);
+       if (memcmp(csum, spag->csum, sdev->csum_size))
+               fail = 1;
+
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.data_extents_scrubbed;
+       sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
+       if (fail)
+               ++sdev->stat.csum_errors;
+       spin_unlock(&sdev->stat_lock);
+
+       return fail;
+}
+
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+                                    struct scrub_page *spag, u64 logical,
+                                    void *buffer)
+{
+       struct btrfs_header *h;
+       struct btrfs_root *root = sdev->dev->dev_root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u8 csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       int fail = 0;
+       int crc_fail = 0;
+
+       /*
+        * we don't use the getter functions here, as we
+        * a) don't have an extent buffer and
+        * b) the page is already kmapped
+        */
+       h = (struct btrfs_header *)buffer;
+
+       if (logical != le64_to_cpu(h->bytenr))
+               ++fail;
+
+       if (spag->generation != le64_to_cpu(h->generation))
+               ++fail;
+
+       if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+               ++fail;
+
+       if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+                  BTRFS_UUID_SIZE))
+               ++fail;
+
+       crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+                             PAGE_SIZE - BTRFS_CSUM_SIZE);
+       btrfs_csum_final(crc, csum);
+       if (memcmp(csum, h->csum, sdev->csum_size))
+               ++crc_fail;
+
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.tree_extents_scrubbed;
+       sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
+       if (crc_fail)
+               ++sdev->stat.csum_errors;
+       if (fail)
+               ++sdev->stat.verify_errors;
+       spin_unlock(&sdev->stat_lock);
+
+       return fail || crc_fail;
+}
+
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+{
+       struct btrfs_super_block *s;
+       u64 logical;
+       struct scrub_dev *sdev = sbio->sdev;
+       struct btrfs_root *root = sdev->dev->dev_root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u8 csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       int fail = 0;
+
+       s = (struct btrfs_super_block *)buffer;
+       logical = sbio->logical;
+
+       if (logical != le64_to_cpu(s->bytenr))
+               ++fail;
+
+       if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+               ++fail;
+
+       if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+               ++fail;
+
+       crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+                             PAGE_SIZE - BTRFS_CSUM_SIZE);
+       btrfs_csum_final(crc, csum);
+       if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+               ++fail;
+
+       if (fail) {
+               /*
+                * if we find an error in a super block, we just report it.
+                * They will get written with the next transaction commit
+                * anyway
+                */
+               spin_lock(&sdev->stat_lock);
+               ++sdev->stat.super_errors;
+               spin_unlock(&sdev->stat_lock);
+       }
+
+       return fail;
+}
+
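+/*
+ * send the current (partially or fully filled) bio to disk. Completion
+ * ends up in scrub_bio_end_io, which queues the sbio for the checksum
+ * worker
+ */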
+static int scrub_submit(struct scrub_dev *sdev)
+{
+       struct scrub_bio *sbio;
+
+       if (sdev->curr == -1)
+               return 0;
+
+       sbio = sdev->bios[sdev->curr];
+
+       sbio->bio->bi_sector = sbio->physical >> 9;
+       sbio->bio->bi_size = sbio->count * PAGE_SIZE;
+       sbio->bio->bi_next = NULL;
+       sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+       sbio->bio->bi_comp_cpu = -1;
+       sbio->bio->bi_bdev = sdev->dev->bdev;
+       sbio->err = 0;
+       sdev->curr = -1;
+       atomic_inc(&sdev->in_flight);
+
+       submit_bio(0, sbio->bio);
+
+       return 0;
+}
+
+static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
+                     u64 physical, u64 flags, u64 gen, u64 mirror_num,
+                     u8 *csum, int force)
+{
+       struct scrub_bio *sbio;
+
+again:
+       /*
+        * grab a fresh bio or wait for one to become available
+        */
+       while (sdev->curr == -1) {
+               spin_lock(&sdev->list_lock);
+               sdev->curr = sdev->first_free;
+               if (sdev->curr != -1) {
+                       sdev->first_free = sdev->bios[sdev->curr]->next_free;
+                       sdev->bios[sdev->curr]->next_free = -1;
+                       sdev->bios[sdev->curr]->count = 0;
+                       spin_unlock(&sdev->list_lock);
+               } else {
+                       spin_unlock(&sdev->list_lock);
+                       wait_event(sdev->list_wait, sdev->first_free != -1);
+               }
+       }
+       sbio = sdev->bios[sdev->curr];
+       if (sbio->count == 0) {
+               sbio->physical = physical;
+               sbio->logical = logical;
+       } else if (sbio->physical + sbio->count * PAGE_SIZE != physical) {
+               scrub_submit(sdev);
+               goto again;
+       }
+       sbio->spag[sbio->count].flags = flags;
+       sbio->spag[sbio->count].generation = gen;
+       sbio->spag[sbio->count].have_csum = 0;
+       sbio->spag[sbio->count].mirror_num = mirror_num;
+       if (csum) {
+               sbio->spag[sbio->count].have_csum = 1;
+               memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+       }
+       ++sbio->count;
+       if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+               scrub_submit(sdev);
+
+       return 0;
+}
+
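+/*
+ * look up the checksum for a 4k data block in the presorted csum_list
+ * that was prefetched per stripe by btrfs_lookup_csums_range. Sums
+ * that lie entirely before 'logical' can never match again and are
+ * discarded
+ */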
+static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+                          u8 *csum)
+{
+       struct btrfs_ordered_sum *sum = NULL;
+       int ret = 0;
+       unsigned long i;
+       unsigned long num_sectors;
+       u32 sectorsize = sdev->dev->dev_root->sectorsize;
+
+       while (!list_empty(&sdev->csum_list)) {
+               sum = list_first_entry(&sdev->csum_list,
+                                      struct btrfs_ordered_sum, list);
+               if (sum->bytenr > logical)
+                       return 0;
+               if (sum->bytenr + sum->len > logical)
+                       break;
+
+               ++sdev->stat.csum_discards;
+               list_del(&sum->list);
+               kfree(sum);
+               sum = NULL;
+       }
+       if (!sum)
+               return 0;
+
+       num_sectors = sum->len / sectorsize;
+       for (i = 0; i < num_sectors; ++i) {
+               if (sum->sums[i].bytenr == logical) {
+                       memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+                       ret = 1;
+                       break;
+               }
+       }
+       if (ret && i == num_sectors - 1) {
+               list_del(&sum->list);
+               kfree(sum);
+       }
+       return ret;
+}
+
+/* scrub_extent tries to collect up to 64k for each bio */
+static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
+                       u64 physical, u64 flags, u64 gen, u64 mirror_num)
+{
+       int ret;
+       u8 csum[BTRFS_CSUM_SIZE];
+
+       while (len) {
+               u64 l = min_t(u64, len, PAGE_SIZE);
+               int have_csum = 0;
+
+               if (flags & BTRFS_EXTENT_FLAG_DATA) {
+                       /* push csums to sbio */
+                       have_csum = scrub_find_csum(sdev, logical, l, csum);
+                       if (have_csum == 0)
+                               ++sdev->stat.no_csum;
+               }
+               ret = scrub_page(sdev, logical, l, physical, flags, gen,
+                                mirror_num, have_csum ? csum : NULL, 0);
+               if (ret)
+                       return ret;
+               len -= l;
+               logical += l;
+               physical += l;
+       }
+       return 0;
+}
+
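+/*
+ * scrub all parts of the chunk that are located on one device: first
+ * prefetch the extent tree leaves and the data checksums for each
+ * stripe, then read and verify the contained extents. Pause and cancel
+ * requests are checked once per stripe
+ */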
+static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
+       struct map_lookup *map, int num, u64 base, u64 length)
+{
+       struct btrfs_path *path;
+       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       struct btrfs_root *root = fs_info->extent_root;
+       struct btrfs_root *csum_root = fs_info->csum_root;
+       struct btrfs_extent_item *extent;
+       u64 flags;
+       int ret;
+       int slot;
+       int i;
+       u64 nstripes;
+       int start_stripe;
+       struct extent_buffer *l;
+       struct btrfs_key key;
+       u64 physical;
+       u64 logical;
+       u64 generation;
+       u64 mirror_num;
+
+       u64 increment = map->stripe_len;
+       u64 offset;
+
+       nstripes = length;
+       offset = 0;
+       do_div(nstripes, map->stripe_len);
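+       /*
+        * offset is the logical address of this device's first stripe
+        * within the chunk, increment is the logical distance between
+        * two consecutive stripes on this device. For RAID1 and DUP
+        * every stripe is present on each device, so the increment is
+        * just stripe_len
+        */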
+       if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+               offset = map->stripe_len * num;
+               increment = map->stripe_len * map->num_stripes;
+               mirror_num = 0;
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+               int factor = map->num_stripes / map->sub_stripes;
+               offset = map->stripe_len * (num / map->sub_stripes);
+               increment = map->stripe_len * factor;
+               mirror_num = num % map->sub_stripes;
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+               increment = map->stripe_len;
+               mirror_num = num % map->num_stripes;
+       } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+               increment = map->stripe_len;
+               mirror_num = num % map->num_stripes;
+       } else {
+               increment = map->stripe_len;
+               mirror_num = 0;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       path->reada = 2;
+       path->search_commit_root = 1;
+       path->skip_locking = 1;
+
+       /*
+        * find all extents for each stripe and just read them to get
+        * them into the page cache
+        * FIXME: we can do better and build more intelligent prefetching
+        */
+       logical = base + offset;
+       physical = map->stripes[num].physical;
+       ret = 0;
+       for (i = 0; i < nstripes; ++i) {
+               key.objectid = logical;
+               key.type = BTRFS_EXTENT_ITEM_KEY;
+               key.offset = (u64)0;
+
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0)
+                       goto out;
+
+               l = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(l, &key, slot);
+               if (key.objectid != logical) {
+                       ret = btrfs_previous_item(root, path, 0,
+                                                 BTRFS_EXTENT_ITEM_KEY);
+                       if (ret < 0)
+                               goto out;
+               }
+
+               while (1) {
+                       l = path->nodes[0];
+                       slot = path->slots[0];
+                       if (slot >= btrfs_header_nritems(l)) {
+                               ret = btrfs_next_leaf(root, path);
+                               if (ret == 0)
+                                       continue;
+                               if (ret < 0)
+                                       goto out;
+
+                               break;
+                       }
+                       btrfs_item_key_to_cpu(l, &key, slot);
+
+                       if (key.objectid >= logical + map->stripe_len)
+                               break;
+
+                       path->slots[0]++;
+               }
+               btrfs_release_path(root, path);
+               logical += increment;
+               physical += map->stripe_len;
+               cond_resched();
+       }
+
+       /*
+        * collect all data csums for the stripe to avoid seeking during
+        * the scrub. This might currently (with crc32) amount to about 1MB
+        */
+       start_stripe = 0;
+again:
+       logical = base + offset + start_stripe * increment;
+       for (i = start_stripe; i < nstripes; ++i) {
+               ret = btrfs_lookup_csums_range(csum_root, logical,
+                                              logical + map->stripe_len - 1,
+                                              &sdev->csum_list, 1);
+               if (ret)
+                       goto out;
+
+               logical += increment;
+               cond_resched();
+       }
+       /*
+        * now find all extents for each stripe and scrub them
+        */
+       logical = base + offset + start_stripe * increment;
+       physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+       ret = 0;
+       for (i = start_stripe; i < nstripes; ++i) {
+               /*
+                * canceled?
+                */
+               if (atomic_read(&fs_info->scrub_cancel_req) ||
+                   atomic_read(&sdev->cancel_req)) {
+                       ret = -ECANCELED;
+                       goto out;
+               }
+               /*
+                * check to see if we have to pause
+                */
+               if (atomic_read(&fs_info->scrub_pause_req)) {
+                       /* push queued extents */
+                       scrub_submit(sdev);
+                       wait_event(sdev->list_wait,
+                                  atomic_read(&sdev->in_flight) == 0);
+                       atomic_inc(&fs_info->scrubs_paused);
+                       wake_up(&fs_info->scrub_pause_wait);
+                       mutex_lock(&fs_info->scrub_lock);
+                       while (atomic_read(&fs_info->scrub_pause_req)) {
+                               mutex_unlock(&fs_info->scrub_lock);
+                               wait_event(fs_info->scrub_pause_wait,
+                                  atomic_read(&fs_info->scrub_pause_req) == 0);
+                               mutex_lock(&fs_info->scrub_lock);
+                       }
+                       atomic_dec(&fs_info->scrubs_paused);
+                       mutex_unlock(&fs_info->scrub_lock);
+                       wake_up(&fs_info->scrub_pause_wait);
+                       scrub_free_csums(sdev);
+                       start_stripe = i;
+                       goto again;
+               }
+
+               key.objectid = logical;
+               key.type = BTRFS_EXTENT_ITEM_KEY;
+               key.offset = (u64)0;
+
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0)
+                       goto out;
+
+               l = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(l, &key, slot);
+               if (key.objectid != logical) {
+                       ret = btrfs_previous_item(root, path, 0,
+                                                 BTRFS_EXTENT_ITEM_KEY);
+                       if (ret < 0)
+                               goto out;
+               }
+
+               while (1) {
+                       l = path->nodes[0];
+                       slot = path->slots[0];
+                       if (slot >= btrfs_header_nritems(l)) {
+                               ret = btrfs_next_leaf(root, path);
+                               if (ret == 0)
+                                       continue;
+                               if (ret < 0)
+                                       goto out;
+
+                               break;
+                       }
+                       btrfs_item_key_to_cpu(l, &key, slot);
+
+                       if (key.objectid + key.offset <= logical)
+                               goto next;
+
+                       if (key.objectid >= logical + map->stripe_len)
+                               break;
+
+                       if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+                               goto next;
+
+                       extent = btrfs_item_ptr(l, slot,
+                                               struct btrfs_extent_item);
+                       flags = btrfs_extent_flags(l, extent);
+                       generation = btrfs_extent_generation(l, extent);
+
+                       if (key.objectid < logical &&
+                           (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+                               printk(KERN_ERR
+                                      "btrfs scrub: tree block %llu spanning "
+                                      "stripes, ignored. logical=%llu\n",
+                                      (unsigned long long)key.objectid,
+                                      (unsigned long long)logical);
+                               goto next;
+                       }
+
+                       /*
+                        * trim extent to this stripe
+                        */
+                       if (key.objectid < logical) {
+                               key.offset -= logical - key.objectid;
+                               key.objectid = logical;
+                       }
+                       if (key.objectid + key.offset >
+                           logical + map->stripe_len) {
+                               key.offset = logical + map->stripe_len -
+                                            key.objectid;
+                       }
+
+                       ret = scrub_extent(sdev, key.objectid, key.offset,
+                                          key.objectid - logical + physical,
+                                          flags, generation, mirror_num);
+                       if (ret)
+                               goto out;
+
+next:
+                       path->slots[0]++;
+               }
+               btrfs_release_path(root, path);
+               logical += increment;
+               physical += map->stripe_len;
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.last_physical = physical;
+               spin_unlock(&sdev->stat_lock);
+       }
+       /* push queued extents */
+       scrub_submit(sdev);
+
+out:
+       btrfs_free_path(path);
+       return ret < 0 ? ret : 0;
+}
+
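+/*
+ * map the chunk at chunk_offset and scrub each of its stripes that is
+ * located on sdev's device
+ */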
+static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
+       u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
+{
+       struct btrfs_mapping_tree *map_tree =
+               &sdev->dev->dev_root->fs_info->mapping_tree;
+       struct map_lookup *map;
+       struct extent_map *em;
+       int i;
+       int ret = -EINVAL;
+
+       read_lock(&map_tree->map_tree.lock);
+       em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+       read_unlock(&map_tree->map_tree.lock);
+
+       if (!em)
+               return -EINVAL;
+
+       map = (struct map_lookup *)em->bdev;
+       if (em->start != chunk_offset)
+               goto out;
+
+       if (em->len < length)
+               goto out;
+
+       for (i = 0; i < map->num_stripes; ++i) {
+               if (map->stripes[i].dev == sdev->dev) {
+                       ret = scrub_stripe(sdev, map, i, chunk_offset, length);
+                       if (ret)
+                               goto out;
+               }
+       }
+out:
+       free_extent_map(em);
+
+       return ret;
+}
+
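+/*
+ * walk all dev extents of the device in the commit root and scrub the
+ * chunk behind each dev extent that overlaps [start, end)
+ */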
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+{
+       struct btrfs_dev_extent *dev_extent = NULL;
+       struct btrfs_path *path;
+       struct btrfs_root *root = sdev->dev->dev_root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u64 length;
+       u64 chunk_tree;
+       u64 chunk_objectid;
+       u64 chunk_offset;
+       int ret;
+       int slot;
+       struct extent_buffer *l;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_block_group_cache *cache;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       path->reada = 2;
+       path->search_commit_root = 1;
+       path->skip_locking = 1;
+
+       key.objectid = sdev->dev->devid;
+       key.offset = 0ull;
+       key.type = BTRFS_DEV_EXTENT_KEY;
+
+
+       while (1) {
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0)
+                       goto out;
+               ret = 0;
+
+               l = path->nodes[0];
+               slot = path->slots[0];
+
+               btrfs_item_key_to_cpu(l, &found_key, slot);
+
+               if (found_key.objectid != sdev->dev->devid)
+                       break;
+
+               if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+                       break;
+
+               if (found_key.offset >= end)
+                       break;
+
+               if (found_key.offset < key.offset)
+                       break;
+
+               dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+               length = btrfs_dev_extent_length(l, dev_extent);
+
+               if (found_key.offset + length <= start) {
+                       key.offset = found_key.offset + length;
+                       btrfs_release_path(root, path);
+                       continue;
+               }
+
+               chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+               chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+               chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+               /*
+                * get a reference on the corresponding block group to prevent
+                * the chunk from going away while we scrub it
+                */
+               cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+               if (!cache) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
+                                 chunk_offset, length);
+               btrfs_put_block_group(cache);
+               if (ret)
+                       break;
+
+               key.offset = found_key.offset + length;
+               btrfs_release_path(root, path);
+       }
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+{
+       int     i;
+       u64     bytenr;
+       u64     gen;
+       int     ret;
+       struct btrfs_device *device = sdev->dev;
+       struct btrfs_root *root = device->dev_root;
+
+       gen = root->fs_info->last_trans_committed;
+
+       for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+               bytenr = btrfs_sb_offset(i);
+               if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+                       break;
+
+               ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
+                                BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+               if (ret)
+                       return ret;
+       }
+       wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+
+       return 0;
+}
+
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
+       mutex_lock(&fs_info->scrub_lock);
+       if (fs_info->scrub_workers_refcnt == 0)
+               btrfs_start_workers(&fs_info->scrub_workers, 1);
+       ++fs_info->scrub_workers_refcnt;
+       mutex_unlock(&fs_info->scrub_lock);
+
+       return 0;
+}
+
+static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
+       mutex_lock(&fs_info->scrub_lock);
+       if (--fs_info->scrub_workers_refcnt == 0)
+               btrfs_stop_workers(&fs_info->scrub_workers);
+       WARN_ON(fs_info->scrub_workers_refcnt < 0);
+       mutex_unlock(&fs_info->scrub_lock);
+}
+
+
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+                   struct btrfs_scrub_progress *progress)
+{
+       struct scrub_dev *sdev;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       int ret;
+       struct btrfs_device *dev;
+
+       if (root->fs_info->closing)
+               return -EINVAL;
+
+       /*
+        * check some assumptions
+        */
+       if (root->sectorsize != PAGE_SIZE ||
+           root->sectorsize != root->leafsize ||
+           root->sectorsize != root->nodesize) {
+               printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+               return -EINVAL;
+       }
+
+       ret = scrub_workers_get(root);
+       if (ret)
+               return ret;
+
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       dev = btrfs_find_device(root, devid, NULL, NULL);
+       if (!dev || dev->missing) {
+               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(root);
+               return -ENODEV;
+       }
+       mutex_lock(&fs_info->scrub_lock);
+
+       if (!dev->in_fs_metadata) {
+               mutex_unlock(&fs_info->scrub_lock);
+               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(root);
+               return -ENODEV;
+       }
+
+       if (dev->scrub_device) {
+               mutex_unlock(&fs_info->scrub_lock);
+               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(root);
+               return -EINPROGRESS;
+       }
+       sdev = scrub_setup_dev(dev);
+       if (IS_ERR(sdev)) {
+               mutex_unlock(&fs_info->scrub_lock);
+               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(root);
+               return PTR_ERR(sdev);
+       }
+       dev->scrub_device = sdev;
+
+       atomic_inc(&fs_info->scrubs_running);
+       mutex_unlock(&fs_info->scrub_lock);
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
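+       /*
+        * the superblocks are scrubbed under scrub_super_lock so that
+        * they cannot be rewritten while we verify them
+        */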
+       down_read(&fs_info->scrub_super_lock);
+       ret = scrub_supers(sdev);
+       up_read(&fs_info->scrub_super_lock);
+
+       if (!ret)
+               ret = scrub_enumerate_chunks(sdev, start, end);
+
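+       /* wait until all outstanding ios have completed */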
+       wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+
+       atomic_dec(&fs_info->scrubs_running);
+       wake_up(&fs_info->scrub_pause_wait);
+
+       if (progress)
+               memcpy(progress, &sdev->stat, sizeof(*progress));
+
+       mutex_lock(&fs_info->scrub_lock);
+       dev->scrub_device = NULL;
+       mutex_unlock(&fs_info->scrub_lock);
+
+       scrub_free_dev(sdev);
+       scrub_workers_put(root);
+
+       return ret;
+}
+
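+/*
+ * ask all running scrubs to pause and wait until every one of them
+ * has acknowledged by incrementing scrubs_paused. called during a
+ * transaction commit
+ */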
+int btrfs_scrub_pause(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
+       mutex_lock(&fs_info->scrub_lock);
+       atomic_inc(&fs_info->scrub_pause_req);
+       while (atomic_read(&fs_info->scrubs_paused) !=
+              atomic_read(&fs_info->scrubs_running)) {
+               mutex_unlock(&fs_info->scrub_lock);
+               wait_event(fs_info->scrub_pause_wait,
+                          atomic_read(&fs_info->scrubs_paused) ==
+                          atomic_read(&fs_info->scrubs_running));
+               mutex_lock(&fs_info->scrub_lock);
+       }
+       mutex_unlock(&fs_info->scrub_lock);
+
+       return 0;
+}
+
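+/*
+ * drop the pause request and wake up all paused scrubs so they can
+ * continue from the new commit roots
+ */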
+int btrfs_scrub_continue(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
+       atomic_dec(&fs_info->scrub_pause_req);
+       wake_up(&fs_info->scrub_pause_wait);
+       return 0;
+}
+
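+/*
+ * writers of the superblock take scrub_super_lock exclusively to
+ * keep scrub_supers() from reading half-written super copies
+ */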
+int btrfs_scrub_pause_super(struct btrfs_root *root)
+{
+       down_write(&root->fs_info->scrub_super_lock);
+       return 0;
+}
+
+int btrfs_scrub_continue_super(struct btrfs_root *root)
+{
+       up_write(&root->fs_info->scrub_super_lock);
+       return 0;
+}
+
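+/*
+ * cancel all running scrubs and wait until they have ended. returns
+ * -ENOTCONN if no scrub was running
+ */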
+int btrfs_scrub_cancel(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
+       mutex_lock(&fs_info->scrub_lock);
+       if (!atomic_read(&fs_info->scrubs_running)) {
+               mutex_unlock(&fs_info->scrub_lock);
+               return -ENOTCONN;
+       }
+
+       atomic_inc(&fs_info->scrub_cancel_req);
+       while (atomic_read(&fs_info->scrubs_running)) {
+               mutex_unlock(&fs_info->scrub_lock);
+               wait_event(fs_info->scrub_pause_wait,
+                          atomic_read(&fs_info->scrubs_running) == 0);
+               mutex_lock(&fs_info->scrub_lock);
+       }
+       atomic_dec(&fs_info->scrub_cancel_req);
+       mutex_unlock(&fs_info->scrub_lock);
+
+       return 0;
+}
+
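+/*
+ * cancel the scrub running on one device and wait until it has
+ * ended. returns -ENOTCONN if no scrub was running on that device
+ */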
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct scrub_dev *sdev;
+
+       mutex_lock(&fs_info->scrub_lock);
+       sdev = dev->scrub_device;
+       if (!sdev) {
+               mutex_unlock(&fs_info->scrub_lock);
+               return -ENOTCONN;
+       }
+       atomic_inc(&sdev->cancel_req);
+       while (dev->scrub_device) {
+               mutex_unlock(&fs_info->scrub_lock);
+               wait_event(fs_info->scrub_pause_wait,
+                          dev->scrub_device == NULL);
+               mutex_lock(&fs_info->scrub_lock);
+       }
+       mutex_unlock(&fs_info->scrub_lock);
+
+       return 0;
+}
+
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_device *dev;
+       int ret;
+
+       /*
+        * we have to hold the device_list_mutex here so the device
+        * does not go away while btrfs_scrub_cancel_dev() runs.
+        * FIXME: find a better solution
+        */
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       dev = btrfs_find_device(root, devid, NULL, NULL);
+       if (!dev) {
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+               return -ENODEV;
+       }
+       ret = btrfs_scrub_cancel_dev(root, dev);
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+       return ret;
+}
+
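+/*
+ * copy the current statistics of the scrub running on the given
+ * device to progress. returns -ENODEV if the device is unknown and
+ * -ENOTCONN if no scrub is running on it
+ */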
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+                        struct btrfs_scrub_progress *progress)
+{
+       struct btrfs_device *dev;
+       struct scrub_dev *sdev = NULL;
+
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       dev = btrfs_find_device(root, devid, NULL, NULL);
+       if (dev)
+               sdev = dev->scrub_device;
+       if (sdev)
+               memcpy(progress, &sdev->stat, sizeof(*progress));
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+       return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
+}
index c571734d5e5a802ea73a0c6458dd41c0b2e56659..37c2302a08d44bf8430930085d0b2fcbf5e1b1e9 100644 (file)
@@ -1321,6 +1321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        WARN_ON(cur_trans != trans->transaction);
 
+       btrfs_scrub_pause(root);
        /* btrfs_commit_tree_roots is responsible for getting the
         * various roots consistent with each other.  Every pointer
         * in the tree of tree roots has to point to the most up to date
@@ -1405,6 +1406,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        mutex_unlock(&root->fs_info->trans_mutex);
 
+       btrfs_scrub_continue(root);
+
        if (current->journal_info == trans)
                current->journal_info = NULL;
 
index f997ec0c1ba4b8efb6b46f88160de43cda3da4f2..f1a0726da5f51f922ab4a5dedfb9ab593805eed7 100644 (file)
@@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
                        ret = btrfs_lookup_csums_range(root->log_root,
                                                csum_start, csum_end - 1,
-                                               &ordered_sums);
+                                               &ordered_sums, 0);
                        BUG_ON(ret);
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
@@ -2093,7 +2093,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * the running transaction open, so a full commit can't hop
         * in and cause problems either.
         */
+       btrfs_scrub_pause_super(root);
        write_ctree_super(trans, root->fs_info->tree_root, 1);
+       btrfs_scrub_continue_super(root);
        ret = 0;
 
        mutex_lock(&root->log_mutex);
@@ -2689,7 +2691,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                                ret = btrfs_lookup_csums_range(
                                                log->fs_info->csum_root,
                                                ds + cs, ds + cs + cl - 1,
-                                               &ordered_sums);
+                                               &ordered_sums, 0);
                                BUG_ON(ret);
                        }
                }
index 8b9fb8c7683da9fc2652281cb21c99bdcbab2f2e..89ca8f110b6e8d5f9e125c499a69e96d119b9d6c 100644 (file)
@@ -38,9 +38,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
-                           (sizeof(struct btrfs_bio_stripe) * (n)))
-
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
@@ -1334,6 +1331,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                goto error_undo;
 
        device->in_fs_metadata = 0;
+       btrfs_scrub_cancel_dev(root, device);
 
        /*
         * the device list mutex makes sure that we don't change
index cc2eadaf7a27b996af24eaea258b14c5c5c1cfe0..f7c20123a1fe173187e3820a963602c3c82c30ac 100644 (file)
@@ -85,6 +85,9 @@ struct btrfs_device {
        /* physical drive uuid (or lvm uuid) */
        u8 uuid[BTRFS_UUID_SIZE];
 
+       /* per-device scrub information */
+       struct scrub_dev *scrub_device;
+
        struct btrfs_work work;
 };
 
@@ -157,6 +160,9 @@ struct map_lookup {
        struct btrfs_bio_stripe stripes[];
 };
 
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+                           (sizeof(struct btrfs_bio_stripe) * (n)))
+
 /* Used to sort the devices by max_avail(descending sort) */
 int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);