Btrfs: fix use-after-free in the finishing procedure of the device replace
authorMiao Xie <miaox@cn.fujitsu.com>
Thu, 30 Jan 2014 08:46:55 +0000 (16:46 +0800)
committerJosef Bacik <jbacik@fb.com>
Mon, 10 Mar 2014 19:15:39 +0000 (15:15 -0400)
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
  the mapping tree. And we forgot to replace the source device in those
  mapping of the new chunks.
- We might get the mapping information which including the source device
  before the mapping information update. And then submit the bio which was
  based on that mapping information after we freed the source device.

For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.

For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.

Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.

Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
fs/btrfs/ctree.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index fceddbdfdd3d0377cb6d5d4c92eadf381f978740..dac6653d4cce02100cdbb3532036e4db728ee241 100644 (file)
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FS_STATE_ERROR           0
 #define BTRFS_FS_STATE_REMOUNTING      1
 #define BTRFS_FS_STATE_TRANS_ABORTED   2
+#define BTRFS_FS_STATE_DEV_REPLACING   3
 
 /* Super block flags */
 /* Errors detected */
@@ -1674,6 +1675,9 @@ struct btrfs_fs_info {
 
        atomic_t mutually_exclusive_operation_running;
 
+       struct percpu_counter bio_counter;
+       wait_queue_head_t replace_wait;
+
        struct semaphore uuid_tree_rescan_sem;
        unsigned int update_uuid_tree_gen:1;
 };
@@ -4008,6 +4012,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
                         struct btrfs_scrub_progress *progress);
 
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
 /* reada.c */
 struct reada_control {
        struct btrfs_root       *root;          /* tree to prefetch */
index b20d59e5e5dd4ffa1db428e81320c46099f72ea9..ec1c3f3a775de28e5a06a54d73289f3d201ce533 100644 (file)
@@ -431,6 +431,35 @@ leave_no_lock:
        return ret;
 }
 
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+       s64 writers;
+       DEFINE_WAIT(wait);
+
+       set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+       do {
+               prepare_to_wait(&fs_info->replace_wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+               writers = percpu_counter_sum(&fs_info->bio_counter);
+               if (writers)
+                       schedule();
+               finish_wait(&fs_info->replace_wait, &wait);
+       } while (writers);
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+       clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+       if (waitqueue_active(&fs_info->replace_wait))
+               wake_up(&fs_info->replace_wait);
+}
+
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                       int scrub_ret)
 {
@@ -458,12 +487,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        src_device = dev_replace->srcdev;
        btrfs_dev_replace_unlock(dev_replace);
 
-       /* replace old device with new one in mapping tree */
-       if (!scrub_ret)
-               btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
-                                                               src_device,
-                                                               tgt_device);
-
        /*
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
@@ -495,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        dev_replace->time_stopped = get_seconds();
        dev_replace->item_needs_writeback = 1;
 
-       if (scrub_ret) {
+       /* replace old device with new one in mapping tree */
+       if (!scrub_ret) {
+               btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+                                                               src_device,
+                                                               tgt_device);
+       } else {
                printk_in_rcu(KERN_ERR
                              "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
                              src_device->missing ? "<missing disk>" :
@@ -534,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                fs_info->fs_devices->latest_bdev = tgt_device->bdev;
        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 
+       btrfs_rm_dev_replace_blocked(fs_info);
+
        btrfs_rm_dev_replace_srcdev(fs_info, src_device);
 
+       btrfs_rm_dev_replace_unblocked(fs_info);
+
        /*
         * this is again a consistent state where no dev_replace procedure
         * is running, the target device is part of the filesystem, the
@@ -865,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
                mutex_unlock(&dev_replace->lock_management_lock);
        }
 }
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+       percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+       percpu_counter_dec(&fs_info->bio_counter);
+
+       if (waitqueue_active(&fs_info->replace_wait))
+               wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+       DEFINE_WAIT(wait);
+again:
+       percpu_counter_inc(&fs_info->bio_counter);
+       if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+               btrfs_bio_counter_dec(fs_info);
+               wait_event(fs_info->replace_wait,
+                          !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+                                    &fs_info->fs_state));
+               goto again;
+       }
+
+}
index fcf367581073b0edec7947f24617e33d5e55de28..0cafacb07b43fd02dda1432d7d3ce958f6f59974 100644 (file)
@@ -2136,10 +2136,16 @@ int open_ctree(struct super_block *sb,
                goto fail_dirty_metadata_bytes;
        }
 
+       ret = percpu_counter_init(&fs_info->bio_counter, 0);
+       if (ret) {
+               err = ret;
+               goto fail_delalloc_bytes;
+       }
+
        fs_info->btree_inode = new_inode(sb);
        if (!fs_info->btree_inode) {
                err = -ENOMEM;
-               goto fail_delalloc_bytes;
+               goto fail_bio_counter;
        }
 
        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2214,6 +2220,7 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->scrub_pause_req, 0);
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);
+       init_waitqueue_head(&fs_info->replace_wait);
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2966,6 +2973,8 @@ fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
        iput(fs_info->btree_inode);
+fail_bio_counter:
+       percpu_counter_destroy(&fs_info->bio_counter);
 fail_delalloc_bytes:
        percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
@@ -3613,6 +3622,7 @@ int close_ctree(struct btrfs_root *root)
 
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        percpu_counter_destroy(&fs_info->delalloc_bytes);
+       percpu_counter_destroy(&fs_info->bio_counter);
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 
index b68afe32419f1a79b24712362fbe26bcd60e112e..07629e99809a3e747ff50bb62528c755a358a6af 100644 (file)
@@ -5263,6 +5263,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 static void btrfs_end_bio(struct bio *bio, int err)
 {
        struct btrfs_bio *bbio = bio->bi_private;
+       struct btrfs_device *dev = bbio->stripes[0].dev;
        int is_orig_bio = 0;
 
        if (err) {
@@ -5270,7 +5271,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
                if (err == -EIO || err == -EREMOTEIO) {
                        unsigned int stripe_index =
                                btrfs_io_bio(bio)->stripe_index;
-                       struct btrfs_device *dev;
 
                        BUG_ON(stripe_index >= bbio->num_stripes);
                        dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5292,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
        if (bio == bbio->orig_bio)
                is_orig_bio = 1;
 
+       btrfs_bio_counter_dec(bbio->fs_info);
+
        if (atomic_dec_and_test(&bbio->stripes_pending)) {
                if (!is_orig_bio) {
                        bio_put(bio);
@@ -5440,6 +5442,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
        }
 #endif
        bio->bi_bdev = dev->bdev;
+
+       btrfs_bio_counter_inc_noblocked(root->fs_info);
+
        if (async)
                btrfs_schedule_bio(root, dev, rw, bio);
        else
@@ -5508,28 +5513,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        length = bio->bi_size;
        map_length = length;
 
+       btrfs_bio_counter_inc_blocked(root->fs_info);
        ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
                              mirror_num, &raid_map);
-       if (ret) /* -ENOMEM */
+       if (ret) {
+               btrfs_bio_counter_dec(root->fs_info);
                return ret;
+       }
 
        total_devs = bbio->num_stripes;
        bbio->orig_bio = first_bio;
        bbio->private = first_bio->bi_private;
        bbio->end_io = first_bio->bi_end_io;
+       bbio->fs_info = root->fs_info;
        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
        if (raid_map) {
                /* In this case, map_length has been set to the length of
                   a single stripe; not the whole write */
                if (rw & WRITE) {
-                       return raid56_parity_write(root, bio, bbio,
-                                                  raid_map, map_length);
+                       ret = raid56_parity_write(root, bio, bbio,
+                                                 raid_map, map_length);
                } else {
-                       return raid56_parity_recover(root, bio, bbio,
-                                                    raid_map, map_length,
-                                                    mirror_num);
+                       ret = raid56_parity_recover(root, bio, bbio,
+                                                   raid_map, map_length,
+                                                   mirror_num);
                }
+               /*
+                * FIXME, replace dosen't support raid56 yet, please fix
+                * it in the future.
+                */
+               btrfs_bio_counter_dec(root->fs_info);
+               return ret;
        }
 
        if (map_length < length) {
@@ -5571,6 +5586,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                                  async_submit);
                dev_nr++;
        }
+       btrfs_bio_counter_dec(root->fs_info);
        return 0;
 }
 
index 8b3cd142b3734dc0161c479b9bfcc9eb144cdaab..80754f9dd3df9faa208a03a274bb7274a002b671 100644 (file)
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
 struct btrfs_bio {
        atomic_t stripes_pending;
+       struct btrfs_fs_info *fs_info;
        bio_end_io_t *end_io;
        struct bio *orig_bio;
        void *private;