Btrfs: allow us to overcommit our enospc reservations
authorJosef Bacik <josef@redhat.com>
Mon, 26 Sep 2011 21:12:22 +0000 (17:12 -0400)
committerJosef Bacik <josef@redhat.com>
Wed, 19 Oct 2011 19:12:50 +0000 (15:12 -0400)
One of the things that kills us is the fact that our ENOSPC reservations are
horribly over the top in most normal cases.  There isn't too much that can be
done about this because when we are completely full we really need them to work
like this so we don't under reserve.  However if there is plenty of unallocated
chunks on the disk we can use that to gauge how much we can overcommit.  So this
patch adds chunk free space accounting so we always know how much unallocated
space we have.  Then if we fail to make a reservation within our allocated
space, check to see if we can overcommit.  In the normal flushing case (like
with delalloc metadata reservations) we'll take the free space and divide it by
2 if our metadata profile is setup for DUP or any of those, and then divide it
by 8 to make sure we don't overcommit too much.  Then if we're in a non-flushing
case (we really need this reservation now!) we only limit ourselves to half of
the free space.  This makes this fio test

[torrent]
filename=torrent-test
rw=randwrite
size=4g
ioengine=sync
directory=/mnt/btrfs-test

go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB
file system.  This doesn't seem to break my other enospc tests, but could really
use some more testing as this is a super scary change.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/volumes.c

index 47dea7118e0ecaab2d55aeb72314abb50670de28..1eafccb162eeaa6b2cb50ae74e66153e1cf137ef 100644 (file)
@@ -893,6 +893,10 @@ struct btrfs_fs_info {
        spinlock_t block_group_cache_lock;
        struct rb_root block_group_cache_tree;
 
+       /* keep track of unallocated space */
+       spinlock_t free_chunk_lock;
+       u64 free_chunk_space;
+
        struct extent_io_tree freed_extents[2];
        struct extent_io_tree *pinned_extents;
 
index 4965a0179b31740a8089324f77f88a4b3b973487..51372a52116703a667bcc0a3646b848ea0b8f0b9 100644 (file)
@@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
+       spin_lock_init(&fs_info->free_chunk_lock);
        mutex_init(&fs_info->reloc_mutex);
 
        init_completion(&fs_info->kobj_unregister);
@@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        fs_info->trans_no_join = 0;
+       fs_info->free_chunk_space = 0;
 
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
index fd65f6bc676c9c5e0d2b0c900e7f95fc9b5c63e4..25b69d0f9135706edb95516663b8be598dedb812 100644 (file)
@@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
  * @flush - wether or not we can flush to make our reservation
+ * @check - wether this is just to check if we have enough space or not
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes, int flush, int check)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
        struct btrfs_trans_handle *trans;
-       u64 unused;
+       u64 used;
        u64 num_bytes = orig_bytes;
        int retries = 0;
        int ret = 0;
@@ -3459,9 +3460,9 @@ again:
        }
 
        ret = -ENOSPC;
-       unused = space_info->bytes_used + space_info->bytes_reserved +
-                space_info->bytes_pinned + space_info->bytes_readonly +
-                space_info->bytes_may_use;
+       used = space_info->bytes_used + space_info->bytes_reserved +
+               space_info->bytes_pinned + space_info->bytes_readonly +
+               space_info->bytes_may_use;
 
        /*
         * The idea here is that we've not already over-reserved the block group
@@ -3470,9 +3471,8 @@ again:
         * lets start flushing stuff first and then come back and try to make
         * our reservation.
         */
-       if (unused <= space_info->total_bytes) {
-               unused = space_info->total_bytes - unused;
-               if (unused >= orig_bytes) {
+       if (used <= space_info->total_bytes) {
+               if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
                        ret = 0;
                } else {
@@ -3489,10 +3489,43 @@ again:
                 * amount plus the amount of bytes that we need for this
                 * reservation.
                 */
-               num_bytes = unused - space_info->total_bytes +
+               num_bytes = used - space_info->total_bytes +
                        (orig_bytes * (retries + 1));
        }
 
+       if (ret && !check) {
+               u64 profile = btrfs_get_alloc_profile(root, 0);
+               u64 avail;
+
+               spin_lock(&root->fs_info->free_chunk_lock);
+               avail = root->fs_info->free_chunk_space;
+
+               /*
+                * If we have dup, raid1 or raid10 then only half of the free
+                * space is actually useable.
+                */
+               if (profile & (BTRFS_BLOCK_GROUP_DUP |
+                              BTRFS_BLOCK_GROUP_RAID1 |
+                              BTRFS_BLOCK_GROUP_RAID10))
+                       avail >>= 1;
+
+               /*
+                * If we aren't flushing don't let us overcommit too much, say
+                * 1/8th of the space.  If we can flush, let it overcommit up to
+                * 1/2 of the space.
+                */
+               if (flush)
+                       avail >>= 3;
+               else
+                       avail >>= 1;
+                spin_unlock(&root->fs_info->free_chunk_lock);
+
+               if (used + orig_bytes < space_info->total_bytes + avail) {
+                       space_info->bytes_may_use += orig_bytes;
+                       ret = 0;
+               }
+       }
+
        /*
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
@@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
        if (num_bytes == 0)
                return 0;
 
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 1);
                return 0;
@@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
        if (!ret)
                return 0;
 
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 0);
                return 0;
@@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
        spin_unlock(&BTRFS_I(inode)->lock);
 
-       ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+       ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
        if (ret) {
                u64 to_free = 0;
                unsigned dropped;
@@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        block_rsv = get_block_rsv(trans, root);
 
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
                return block_rsv;
        if (ret) {
                WARN_ON(1);
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
                if (!ret) {
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
index f2a4cc79da61da62740d1de33552037f27b072a2..e138af710de2d0bcb738eea7380c46940dde9f0a 100644 (file)
@@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
        }
        BUG_ON(ret);
 
-       if (device->bytes_used > 0)
-               device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+       if (device->bytes_used > 0) {
+               u64 len = btrfs_dev_extent_length(leaf, extent);
+               device->bytes_used -= len;
+               spin_lock(&root->fs_info->free_chunk_lock);
+               root->fs_info->free_chunk_space += len;
+               spin_unlock(&root->fs_info->free_chunk_lock);
+       }
        ret = btrfs_del_item(trans, root, path);
 
 out:
@@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        if (ret)
                goto error_undo;
 
+       spin_lock(&root->fs_info->free_chunk_lock);
+       root->fs_info->free_chunk_space = device->total_bytes -
+               device->bytes_used;
+       spin_unlock(&root->fs_info->free_chunk_lock);
+
        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);
 
@@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                root->fs_info->fs_devices->num_can_discard++;
        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
+       spin_lock(&root->fs_info->free_chunk_lock);
+       root->fs_info->free_chunk_space += device->total_bytes;
+       spin_unlock(&root->fs_info->free_chunk_lock);
+
        if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                root->fs_info->fs_devices->rotating = 1;
 
@@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        lock_chunks(root);
 
        device->total_bytes = new_size;
-       if (device->writeable)
+       if (device->writeable) {
                device->fs_devices->total_rw_bytes -= diff;
+               spin_lock(&root->fs_info->free_chunk_lock);
+               root->fs_info->free_chunk_space -= diff;
+               spin_unlock(&root->fs_info->free_chunk_lock);
+       }
        unlock_chunks(root);
 
 again:
@@ -2257,6 +2275,9 @@ again:
                device->total_bytes = old_size;
                if (device->writeable)
                        device->fs_devices->total_rw_bytes += diff;
+               spin_lock(&root->fs_info->free_chunk_lock);
+               root->fs_info->free_chunk_space += diff;
+               spin_unlock(&root->fs_info->free_chunk_lock);
                unlock_chunks(root);
                goto done;
        }
@@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
                index++;
        }
 
+       spin_lock(&extent_root->fs_info->free_chunk_lock);
+       extent_root->fs_info->free_chunk_space -= (stripe_size *
+                                                  map->num_stripes);
+       spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
        index = 0;
        stripe = &chunk->stripe;
        while (index < map->num_stripes) {
@@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root,
        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
-       if (device->writeable)
+       if (device->writeable) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
+               spin_lock(&root->fs_info->free_chunk_lock);
+               root->fs_info->free_chunk_space += device->total_bytes -
+                       device->bytes_used;
+               spin_unlock(&root->fs_info->free_chunk_lock);
+       }
        ret = 0;
        return ret;
 }