btrfs: qgroup: Introduce extent changeset for qgroup reserve functions
author Qu Wenruo <quwenruo@cn.fujitsu.com>
Mon, 27 Feb 2017 07:10:38 +0000 (15:10 +0800)
committer David Sterba <dsterba@suse.com>
Thu, 29 Jun 2017 18:17:02 +0000 (20:17 +0200)
Introduce a new parameter, struct extent_changeset, for
btrfs_qgroup_reserve_data() and its callers.

An extent_changeset was already used inside btrfs_qgroup_reserve_data()
to record which ranges were reserved by the current call, so that they
can be freed in error paths.

The reason we need to export it to callers is that, in the buffered write
error path, without knowing exactly which ranges we reserved in the current
allocation, we can free space which was not reserved by us.

This will lead to qgroup reserved space underflow.

Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/relocation.c

index 5e33e1d6d5c93b5700df62863ff2725a70a40339..1ee4489dc39879f71448b59cad08dc91763bd0a5 100644 (file)
@@ -2708,8 +2708,9 @@ enum btrfs_flush_state {
        COMMIT_TRANS            =       6,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode,
+                       struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
                                            u64 len);
@@ -2727,7 +2728,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
                                      struct btrfs_block_rsv *rsv);
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+                       struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
index b0b02c6c71aa2ca8cab78242bcab5b0e02fb85f5..70f85cfdbd463507d8ca907bf2b2f93677dad7b0 100644 (file)
@@ -3402,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_root *root = fs_info->tree_root;
        struct inode *inode = NULL;
+       struct extent_changeset *data_reserved = NULL;
        u64 alloc_hint = 0;
        int dcs = BTRFS_DC_ERROR;
        u64 num_pages = 0;
@@ -3521,7 +3522,7 @@ again:
        num_pages *= 16;
        num_pages *= PAGE_SIZE;
 
-       ret = btrfs_check_data_free_space(inode, 0, num_pages);
+       ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
        if (ret)
                goto out_put;
 
@@ -3552,6 +3553,7 @@ out:
        block_group->disk_cache_state = dcs;
        spin_unlock(&block_group->lock);
 
+       extent_changeset_free(data_reserved);
        return ret;
 }
 
@@ -4326,12 +4328,8 @@ commit_trans:
        return ret;
 }
 
-/*
- * New check_data_free_space() with ability for precious data reservation
- * Will replace old btrfs_check_data_free_space(), but for patch split,
- * add a new function first and then replace it.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode,
+                       struct extent_changeset **reserved, u64 start, u64 len)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int ret;
@@ -4346,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
                return ret;
 
        /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
-       ret = btrfs_qgroup_reserve_data(inode, start, len);
+       ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
        if (ret < 0)
                btrfs_free_reserved_data_space_noquota(inode, start, len);
+       else
+               ret = 0;
        return ret;
 }
 
@@ -6175,6 +6175,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  * @inode: inode we're writing to
  * @start: start range we are writing to
  * @len: how long the range we are writing to
+ * @reserved: mandatory parameter, record actually reserved qgroup ranges of
+ *           current reservation.
  *
  * This will do the following things
  *
@@ -6192,11 +6194,12 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  * Return 0 for success
  * Return <0 for error(-ENOSPC or -EQUOT)
  */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+int btrfs_delalloc_reserve_space(struct inode *inode,
+                       struct extent_changeset **reserved, u64 start, u64 len)
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, start, len);
+       ret = btrfs_check_data_free_space(inode, reserved, start, len);
        if (ret < 0)
                return ret;
        ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
index ce670d213913a6cd550c786b3d600717f2bd6e20..aeafdb35d90ba05d76829b105fa4d2cb9a252cd0 100644 (file)
@@ -215,6 +215,40 @@ struct extent_changeset {
        struct ulist range_changed;
 };
 
+static inline void extent_changeset_init(struct extent_changeset *changeset)
+{
+       changeset->bytes_changed = 0;
+       ulist_init(&changeset->range_changed);
+}
+
+static inline struct extent_changeset *extent_changeset_alloc(void)
+{
+       struct extent_changeset *ret;
+
+       ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+       if (!ret)
+               return NULL;
+
+       extent_changeset_init(ret);
+       return ret;
+}
+
+static inline void extent_changeset_release(struct extent_changeset *changeset)
+{
+       if (!changeset)
+               return;
+       changeset->bytes_changed = 0;
+       ulist_release(&changeset->range_changed);
+}
+
+static inline void extent_changeset_free(struct extent_changeset *changeset)
+{
+       if (!changeset)
+               return;
+       extent_changeset_release(changeset);
+       kfree(changeset);
+}
+
 static inline void extent_set_compress_type(unsigned long *bio_flags,
                                            int compress_type)
 {
index 5da85b080368bd6c564664b49920aa1df2b9c999..1b5cce51728b0829f6ece737adbd071794289739 100644 (file)
@@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
        struct extent_state *cached_state = NULL;
+       struct extent_changeset *data_reserved = NULL;
        u64 release_bytes = 0;
        u64 lockstart;
        u64 lockend;
@@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                reserve_bytes = round_up(write_bytes + sector_offset,
                                fs_info->sectorsize);
 
-               ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+               extent_changeset_release(data_reserved);
+               ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+                                                 write_bytes);
                if (ret < 0) {
                        if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
                                                      BTRFS_INODE_PREALLOC)) &&
@@ -1802,6 +1805,7 @@ again:
                }
        }
 
+       extent_changeset_free(data_reserved);
        return num_written ? num_written : ret;
 }
 
@@ -2772,6 +2776,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 {
        struct inode *inode = file_inode(file);
        struct extent_state *cached_state = NULL;
+       struct extent_changeset *data_reserved = NULL;
        struct falloc_range *range;
        struct falloc_range *tmp;
        struct list_head reserve_list;
@@ -2901,8 +2906,8 @@ static long btrfs_fallocate(struct file *file, int mode,
                                free_extent_map(em);
                                break;
                        }
-                       ret = btrfs_qgroup_reserve_data(inode, cur_offset,
-                                       last_byte - cur_offset);
+                       ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+                                       cur_offset, last_byte - cur_offset);
                        if (ret < 0) {
                                free_extent_map(em);
                                break;
@@ -2974,6 +2979,7 @@ out:
        if (ret != 0)
                btrfs_free_reserved_data_space(inode, alloc_start,
                                       alloc_end - cur_offset);
+       extent_changeset_free(data_reserved);
        return ret;
 }
 
index 5c6c20ec64d8a92845d0ae97eeaabb1c6d09112d..d02019747d001d7b118d54b19d6cc89b5d68e9ab 100644 (file)
@@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
        struct btrfs_path *path;
        struct inode *inode;
        struct btrfs_block_rsv *rsv;
+       struct extent_changeset *data_reserved = NULL;
        u64 num_bytes;
        u64 alloc_hint = 0;
        int ret;
@@ -492,7 +493,7 @@ again:
        /* Just to make sure we have enough space */
        prealloc += 8 * PAGE_SIZE;
 
-       ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
        if (ret)
                goto out_put;
 
@@ -516,6 +517,7 @@ out:
        trans->bytes_reserved = num_bytes;
 
        btrfs_free_path(path);
+       extent_changeset_free(data_reserved);
        return ret;
 }
 
index b66ea03a3a1c051dd30634b3c46b01e4d973dea0..d9cf6df40d5ec0a25b2f36eeecce6129002bccb1 100644 (file)
@@ -2037,6 +2037,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
        struct btrfs_writepage_fixup *fixup;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
+       struct extent_changeset *data_reserved = NULL;
        struct page *page;
        struct inode *inode;
        u64 page_start;
@@ -2074,7 +2075,7 @@ again:
                goto again;
        }
 
-       ret = btrfs_delalloc_reserve_space(inode, page_start,
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
                                           PAGE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
@@ -2094,6 +2095,7 @@ out_page:
        unlock_page(page);
        put_page(page);
        kfree(fixup);
+       extent_changeset_free(data_reserved);
 }
 
 /*
@@ -4769,6 +4771,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
+       struct extent_changeset *data_reserved = NULL;
        char *kaddr;
        u32 blocksize = fs_info->sectorsize;
        pgoff_t index = from >> PAGE_SHIFT;
@@ -4783,7 +4786,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
 
-       ret = btrfs_delalloc_reserve_space(inode,
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
                        round_down(from, blocksize), blocksize);
        if (ret)
                goto out;
@@ -4868,6 +4871,7 @@ out_unlock:
        unlock_page(page);
        put_page(page);
 out:
+       extent_changeset_free(data_reserved);
        return ret;
 }
 
@@ -8718,6 +8722,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        struct inode *inode = file->f_mapping->host;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_dio_data dio_data = { 0 };
+       struct extent_changeset *data_reserved = NULL;
        loff_t offset = iocb->ki_pos;
        size_t count = 0;
        int flags = 0;
@@ -8754,7 +8759,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                        inode_unlock(inode);
                        relock = true;
                }
-               ret = btrfs_delalloc_reserve_space(inode, offset, count);
+               ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+                                                  offset, count);
                if (ret)
                        goto out;
                dio_data.outstanding_extents = count_max_extents(count);
@@ -8811,6 +8817,7 @@ out:
        if (relock)
                inode_lock(inode);
 
+       extent_changeset_free(data_reserved);
        return ret;
 }
 
@@ -9043,6 +9050,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
+       struct extent_changeset *data_reserved = NULL;
        char *kaddr;
        unsigned long zero_start;
        loff_t size;
@@ -9068,7 +9076,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
         * end up waiting indefinitely to get a lock on the page currently
         * being processed by btrfs_page_mkwrite() function.
         */
-       ret = btrfs_delalloc_reserve_space(inode, page_start,
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
                                           reserved_space);
        if (!ret) {
                ret = file_update_time(vmf->vma->vm_file);
@@ -9174,6 +9182,7 @@ again:
 out_unlock:
        if (!ret) {
                sb_end_pagefault(inode->i_sb);
+               extent_changeset_free(data_reserved);
                return VM_FAULT_LOCKED;
        }
        unlock_page(page);
@@ -9181,6 +9190,7 @@ out:
        btrfs_delalloc_release_space(inode, page_start, reserved_space);
 out_noreserve:
        sb_end_pagefault(inode->i_sb);
+       extent_changeset_free(data_reserved);
        return ret;
 }
 
index e4116f9248c2c1ad9c936667befe386fb31a5bc1..ccee5417d3f6c428dcf0e4fcf0bb5aad5d21db7d 100644 (file)
@@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_io_tree *tree;
+       struct extent_changeset *data_reserved = NULL;
        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 
        file_end = (isize - 1) >> PAGE_SHIFT;
@@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
 
        page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
-       ret = btrfs_delalloc_reserve_space(inode,
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
                        start_index << PAGE_SHIFT,
                        page_cnt << PAGE_SHIFT);
        if (ret)
@@ -1247,6 +1248,7 @@ again:
                unlock_page(pages[i]);
                put_page(pages[i]);
        }
+       extent_changeset_free(data_reserved);
        return i_done;
 out:
        for (i = 0; i < i_done; i++) {
@@ -1256,6 +1258,7 @@ out:
        btrfs_delalloc_release_space(inode,
                        start_index << PAGE_SHIFT,
                        page_cnt << PAGE_SHIFT);
+       extent_changeset_free(data_reserved);
        return ret;
 
 }
index 475d53c492c88d77aeae5b693c6d50e9ce161b2d..0020889370216f5cfa1e187ef99c17c7ff3162ed 100644 (file)
@@ -2835,43 +2835,60 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
  * Return <0 for error (including -EQUOT)
  *
  * NOTE: this function may sleep for memory allocation.
+ *       if btrfs_qgroup_reserve_data() is called multiple times with
+ *       same @reserved, caller must ensure when error happens it's OK
+ *       to free *ALL* reserved space.
  */
-int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_reserve_data(struct inode *inode,
+                       struct extent_changeset **reserved_ret, u64 start,
+                       u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_changeset changeset;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
+       struct extent_changeset *reserved;
+       u64 orig_reserved;
+       u64 to_reserve;
        int ret;
 
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
            !is_fstree(root->objectid) || len == 0)
                return 0;
 
-       changeset.bytes_changed = 0;
-       ulist_init(&changeset.range_changed);
+       /* @reserved parameter is mandatory for qgroup */
+       if (WARN_ON(!reserved_ret))
+               return -EINVAL;
+       if (!*reserved_ret) {
+               *reserved_ret = extent_changeset_alloc();
+               if (!*reserved_ret)
+                       return -ENOMEM;
+       }
+       reserved = *reserved_ret;
+       /* Record already reserved space */
+       orig_reserved = reserved->bytes_changed;
        ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
-                       start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+                       start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+
+       /* Newly reserved space */
+       to_reserve = reserved->bytes_changed - orig_reserved;
        trace_btrfs_qgroup_reserve_data(inode, start, len,
-                                       changeset.bytes_changed,
-                                       QGROUP_RESERVE);
+                                       to_reserve, QGROUP_RESERVE);
        if (ret < 0)
                goto cleanup;
-       ret = qgroup_reserve(root, changeset.bytes_changed, true);
+       ret = qgroup_reserve(root, to_reserve, true);
        if (ret < 0)
                goto cleanup;
 
-       ulist_release(&changeset.range_changed);
        return ret;
 
 cleanup:
-       /* cleanup already reserved ranges */
+       /* cleanup *ALL* already reserved ranges */
        ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(&changeset.range_changed, &uiter)))
+       while ((unode = ulist_next(&reserved->range_changed, &uiter)))
                clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
                                 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
                                 GFP_NOFS);
-       ulist_release(&changeset.range_changed);
+       extent_changeset_release(reserved);
        return ret;
 }
 
@@ -2882,8 +2899,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
        int trace_op = QGROUP_RELEASE;
        int ret;
 
-       changeset.bytes_changed = 0;
-       ulist_init(&changeset.range_changed);
+       extent_changeset_init(&changeset);
        ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, 
                        start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
        if (ret < 0)
@@ -2899,7 +2915,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
                                changeset.bytes_changed);
        ret = changeset.bytes_changed;
 out:
-       ulist_release(&changeset.range_changed);
+       extent_changeset_release(&changeset);
        return ret;
 }
 
@@ -2999,8 +3015,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
        struct ulist_iterator iter;
        int ret;
 
-       changeset.bytes_changed = 0;
-       ulist_init(&changeset.range_changed);
+       extent_changeset_init(&changeset);
        ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                        EXTENT_QGROUP_RESERVED, &changeset);
 
@@ -3017,5 +3032,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
                                changeset.bytes_changed);
 
        }
-       ulist_release(&changeset.range_changed);
+       extent_changeset_release(&changeset);
 }
index d11125d6afb9bf8db8b3166bb5d347c65b1a349c..99408e93eb0dda5edb647e33c6078dd62bf4ac9f 100644 (file)
@@ -242,7 +242,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 #endif
 
 /* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_reserve_data(struct inode *inode,
+                       struct extent_changeset **reserved, u64 start, u64 len);
 int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
 
index b291d1bebb4c148fa51e8945bc9e7e3163c5b116..6407423151ab6c23223421b80a539a29b54ec6ef 100644 (file)
@@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
        u64 prealloc_start = cluster->start - offset;
        u64 prealloc_end = cluster->end - offset;
        u64 cur_offset;
+       struct extent_changeset *data_reserved = NULL;
 
        BUG_ON(cluster->start != cluster->boundary[0]);
        inode_lock(inode);
 
-       ret = btrfs_check_data_free_space(inode, prealloc_start,
+       ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
                                          prealloc_end + 1 - prealloc_start);
        if (ret)
                goto out;
@@ -3129,6 +3130,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
                                       prealloc_end + 1 - cur_offset);
 out:
        inode_unlock(inode);
+       extent_changeset_free(data_reserved);
        return ret;
 }