Btrfs: track dirty block groups on their own list
author Josef Bacik <jbacik@fb.com>
Mon, 17 Nov 2014 20:45:48 +0000 (15:45 -0500)
committer Chris Mason <clm@fb.com>
Thu, 22 Jan 2015 01:36:52 +0000 (17:36 -0800)
Currently, any time we try to update the block groups on disk, we walk _all_
block groups and check whether the ->dirty flag is set.  This function can get
called several times during a commit.  So if you have several terabytes of data
you will be a very sad panda, as we will loop through _all_ of the block groups
several times, which makes the commit take a while and in turn slows down the
rest of the file system operations.

This patch introduces a per-transaction dirty list that a block group is added
to the first time it is dirtied.  Then we simply update any block groups that
have been dirtied since the last time we called
btrfs_write_dirty_block_groups.  This also lets us simplify how we write out
the free space cache, so that code is much cleaner.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/free-space-cache.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index 45ed4dc6a0cef90589d10222b6051f93dc3b2885..0b4683f560c878405266bebcda8ee3c184538b68 100644 (file)
@@ -1238,7 +1238,6 @@ enum btrfs_disk_cache_state {
        BTRFS_DC_ERROR          = 1,
        BTRFS_DC_CLEAR          = 2,
        BTRFS_DC_SETUP          = 3,
-       BTRFS_DC_NEED_WRITE     = 4,
 };
 
 struct btrfs_caching_control {
@@ -1276,7 +1275,6 @@ struct btrfs_block_group_cache {
        unsigned long full_stripe_len;
 
        unsigned int ro:1;
-       unsigned int dirty:1;
        unsigned int iref:1;
        unsigned int has_caching_ctl:1;
        unsigned int removed:1;
@@ -1314,6 +1312,9 @@ struct btrfs_block_group_cache {
        struct list_head ro_list;
 
        atomic_t trimming;
+
+       /* For dirty block groups */
+       struct list_head dirty_list;
 };
 
 /* delayed seq elem */
index 15116585e7142d3865d822828011ee1ac38f0519..21c373fe256c09b4515582e7365b7f9af4b44656 100644 (file)
@@ -74,8 +74,9 @@ enum {
        RESERVE_ALLOC_NO_ACCOUNT = 2,
 };
 
-static int update_block_group(struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
        struct btrfs_block_group_cache *cache;
-       int err = 0;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       int ret = 0;
        struct btrfs_path *path;
-       u64 last = 0;
+
+       if (list_empty(&cur_trans->dirty_bgs))
+               return 0;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-again:
-       while (1) {
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-               err = cache_save_setup(cache, trans, path);
-               last = cache->key.objectid + cache->key.offset;
-               btrfs_put_block_group(cache);
-       }
-
-       while (1) {
-               if (last == 0) {
-                       err = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long)-1);
-                       if (err) /* File system offline */
-                               goto out;
-               }
-
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
-                               btrfs_put_block_group(cache);
-                               goto again;
-                       }
-
-                       if (cache->dirty)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-
-               if (cache->disk_cache_state == BTRFS_DC_SETUP)
-                       cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
-               cache->dirty = 0;
-               last = cache->key.objectid + cache->key.offset;
-
-               err = write_one_cache_group(trans, root, path, cache);
-               btrfs_put_block_group(cache);
-               if (err) /* File system offline */
-                       goto out;
-       }
-
-       while (1) {
-               /*
-                * I don't think this is needed since we're just marking our
-                * preallocated extent as written, but just in case it can't
-                * hurt.
-                */
-               if (last == 0) {
-                       err = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long)-1);
-                       if (err) /* File system offline */
-                               goto out;
-               }
-
-               cache = btrfs_lookup_first_block_group(root->fs_info, last);
-               while (cache) {
-                       /*
-                        * Really this shouldn't happen, but it could if we
-                        * couldn't write the entire preallocated extent and
-                        * splitting the extent resulted in a new block.
-                        */
-                       if (cache->dirty) {
-                               btrfs_put_block_group(cache);
-                               goto again;
-                       }
-                       if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-                               break;
-                       cache = next_block_group(root, cache);
-               }
-               if (!cache) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
-
-               err = btrfs_write_out_cache(root, trans, cache, path);
-
-               /*
-                * If we didn't have an error then the cache state is still
-                * NEED_WRITE, so we can set it to WRITTEN.
-                */
-               if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
-                       cache->disk_cache_state = BTRFS_DC_WRITTEN;
-               last = cache->key.objectid + cache->key.offset;
+       /*
+        * We don't need the lock here since we are protected by the transaction
+        * commit.  We want to do the cache_save_setup first and then run the
+        * delayed refs to make sure we have the best chance at doing this all
+        * in one shot.
+        */
+       while (!list_empty(&cur_trans->dirty_bgs)) {
+               cache = list_first_entry(&cur_trans->dirty_bgs,
+                                        struct btrfs_block_group_cache,
+                                        dirty_list);
+               list_del_init(&cache->dirty_list);
+               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                       cache_save_setup(cache, trans, path);
+               if (!ret)
+                       ret = btrfs_run_delayed_refs(trans, root,
+                                                    (unsigned long) -1);
+               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+                       btrfs_write_out_cache(root, trans, cache, path);
+               if (!ret)
+                       ret = write_one_cache_group(trans, root, path, cache);
                btrfs_put_block_group(cache);
        }
-out:
 
        btrfs_free_path(path);
-       return err;
+       return ret;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
        btrfs_free_reserved_data_space(inode, num_bytes);
 }
 
-static int update_block_group(struct btrfs_root *root,
-                             u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, int alloc)
 {
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5338,14 @@ static int update_block_group(struct btrfs_root *root,
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
                        cache_block_group(cache, 1);
 
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+               if (list_empty(&cache->dirty_list)) {
+                       list_add_tail(&cache->dirty_list,
+                                     &trans->transaction->dirty_bgs);
+                       btrfs_get_block_group(cache);
+               }
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5424,7 +5356,6 @@ static int update_block_group(struct btrfs_root *root,
                    cache->disk_cache_state < BTRFS_DC_CLEAR)
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
 
-               cache->dirty = 1;
                old_val = btrfs_block_group_used(&cache->item);
                num_bytes = min(total, cache->key.offset - byte_in_group);
                if (alloc) {
@@ -6103,7 +6034,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
 
-               ret = update_block_group(root, bytenr, num_bytes, 0);
+               ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
                        goto out;
@@ -7063,7 +6994,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       ret = update_block_group(root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        ins->objectid, ins->offset);
@@ -7152,7 +7083,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                        return ret;
        }
 
-       ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+       ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+                                1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        ins->objectid, ins->offset);
@@ -9005,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
+       INIT_LIST_HEAD(&cache->dirty_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
 
@@ -9068,9 +9001,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                         * b) Setting 'dirty flag' makes sure that we flush
                         *    the new space cache info onto disk.
                         */
-                       cache->disk_cache_state = BTRFS_DC_CLEAR;
                        if (btrfs_test_opt(root, SPACE_CACHE))
-                               cache->dirty = 1;
+                               cache->disk_cache_state = BTRFS_DC_CLEAR;
                }
 
                read_extent_buffer(leaf, &cache->item,
@@ -9461,6 +9393,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                }
        }
 
+       spin_lock(&trans->transaction->dirty_bgs_lock);
+       if (!list_empty(&block_group->dirty_list)) {
+               list_del_init(&block_group->dirty_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&trans->transaction->dirty_bgs_lock);
+
        btrfs_remove_free_space_cache(block_group);
 
        spin_lock(&block_group->space_info->lock);
index d6c03f7f136b359c534668a38f9e9a72d299eb66..80a3141463e78374563e7829e7168990f372fa85 100644 (file)
@@ -1243,6 +1243,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct inode *inode;
        int ret = 0;
+       enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
 
        root = root->fs_info->tree_root;
 
@@ -1266,9 +1267,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
        ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
                                      path, block_group->key.objectid);
        if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->disk_cache_state = BTRFS_DC_ERROR;
-               spin_unlock(&block_group->lock);
+               dcs = BTRFS_DC_ERROR;
                ret = 0;
 #ifdef DEBUG
                btrfs_err(root->fs_info,
@@ -1277,6 +1276,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 #endif
        }
 
+       spin_lock(&block_group->lock);
+       block_group->disk_cache_state = dcs;
+       spin_unlock(&block_group->lock);
        iput(inode);
        return ret;
 }
index aa2219ebecc9b94ab9c66055ced26609806c0e39..e0faf803513a3b77cecce1fe9d83979db2660ed0 100644 (file)
@@ -248,6 +248,8 @@ loop:
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
        INIT_LIST_HEAD(&cur_trans->pending_ordered);
+       INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+       spin_lock_init(&cur_trans->dirty_bgs_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -1028,7 +1030,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start &&
-                   old_root_used == btrfs_root_used(&root->root_item))
+                   old_root_used == btrfs_root_used(&root->root_item) &&
+                   (!extent_root ||
+                    list_empty(&trans->transaction->dirty_bgs)))
                        break;
 
                btrfs_set_root_node(&root->root_item, root->node);
@@ -1047,6 +1051,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                if (ret)
                        return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
        }
 
        return 0;
@@ -1067,10 +1074,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        struct extent_buffer *eb;
        int ret;
 
-       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       if (ret)
-               return ret;
-
        eb = btrfs_lock_root_node(fs_info->tree_root);
        ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
                              0, &eb);
@@ -1990,6 +1993,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        switch_commit_roots(cur_trans, root->fs_info);
 
        assert_qgroups_uptodate(trans);
+       ASSERT(list_empty(&cur_trans->dirty_bgs));
        update_super_roots(root);
 
        btrfs_set_super_log_root(root->fs_info->super_copy, 0);
index 00ed29c4b3f9d0ee4bc10f3e2da2424a8642020b..3305451451ca79760de07a7da42cd5ade1002f02 100644 (file)
@@ -58,6 +58,8 @@ struct btrfs_transaction {
        struct list_head pending_chunks;
        struct list_head pending_ordered;
        struct list_head switch_commits;
+       struct list_head dirty_bgs;
+       spinlock_t dirty_bgs_lock;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
 };