*/
u64 last_unlink_trans;
+ /*
+ * These two counters are for delalloc metadata reservations. We keep
+ * track of how many extents we've accounted for vs how many extents we
+ * have.
+ */
+ int delalloc_reserved_extents;
+ int delalloc_extents;
+
/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 bytes_super; /* total bytes reserved for the super blocks */
-
- /* delalloc accounting */
- u64 bytes_delalloc; /* number of bytes reserved for allocation,
- this space is not necessarily reserved yet
- by the allocator */
+ u64 bytes_root; /* the number of bytes needed to commit a
+ transaction */
u64 bytes_may_use; /* number of bytes that may be used for
- delalloc */
+ delalloc/allocations */
+ u64 bytes_delalloc; /* number of bytes currently reserved for
+ delayed allocation */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
+ int force_delalloc; /* make people start doing filemap_flush until
+ we're under a threshold */
struct list_head list;
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
+
+ int allocating_chunk;
+ wait_queue_head_t wait;
};
/*
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_reserved_data_space(struct btrfs_root *root,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
- fs_info->metadata_ratio = 8;
+ fs_info->metadata_ratio = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
alloc_target);
}
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+ u64 num_bytes;
+ int level;
+
+ level = BTRFS_MAX_LEVEL - 2;
+ /*
+ * NOTE: these calculations are absolutely the worst possible case.
+ * This assumes that _every_ item we insert will require a new leaf, and
+ * that the tree has grown to its maximum level size.
+ */
+
+ /*
+ * for every item we insert we could insert both an extent item and a
+ * extent ref item. Then for ever item we insert, we will need to cow
+ * both the original leaf, plus the leaf to the left and right of it.
+ *
+ * Unless we are talking about the extent root, then we just want the
+ * number of items * 2, since we just need the extent item plus its ref.
+ */
+ if (root == root->fs_info->extent_root)
+ num_bytes = num_items * 2;
+ else
+ num_bytes = (num_items + (2 * num_items)) * 3;
+
+ /*
+ * num_bytes is total number of leaves we could need times the leaf
+ * size, and then for every leaf we could end up cow'ing 2 nodes per
+ * level, down to the leaf level.
+ */
+ num_bytes = (num_bytes * root->leafsize) +
+ (num_bytes * (level * 2)) * root->nodesize;
+
+ return num_bytes;
+}
+
/*
- * for now this just makes sure we have at least 5% of our metadata space free
- * for use.
+ * Unreserve metadata space for delalloc. If we have less reserved credits than
+ * we have extents, this function does nothing.
*/
-int btrfs_check_metadata_free_space(struct btrfs_root *root)
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
- u64 alloc_target, thresh;
- int committed = 0, ret;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
- if (!meta_sinfo)
- goto alloc;
-again:
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+
spin_lock(&meta_sinfo->lock);
- if (!meta_sinfo->full)
- thresh = meta_sinfo->total_bytes * 80;
- else
- thresh = meta_sinfo->total_bytes * 95;
+ if (BTRFS_I(inode)->delalloc_reserved_extents <=
+ BTRFS_I(inode)->delalloc_extents) {
+ spin_unlock(&meta_sinfo->lock);
+ return 0;
+ }
+
+ BTRFS_I(inode)->delalloc_reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+
+ if (meta_sinfo->bytes_delalloc < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_delalloc = 0;
+ } else {
+ meta_sinfo->bytes_delalloc -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+ BUG_ON(bug);
+
+ return 0;
+}
+
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+ u64 thresh;
+
+ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use;
+
+ thresh = meta_sinfo->total_bytes - thresh;
+ thresh *= 80;
do_div(thresh, 100);
+ if (thresh <= meta_sinfo->bytes_delalloc)
+ meta_sinfo->force_delalloc = 1;
+ else
+ meta_sinfo->force_delalloc = 0;
+}
- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super > thresh) {
- struct btrfs_trans_handle *trans;
- if (!meta_sinfo->full) {
- meta_sinfo->force_alloc = 1;
+static int maybe_allocate_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ bool wait = false;
+ int ret = 0;
+ u64 min_metadata;
+ u64 free_space;
+
+ free_space = btrfs_super_total_bytes(disk_super);
+ /*
+ * we allow the metadata to grow to a max of either 5gb or 5% of the
+ * space in the volume.
+ */
+ min_metadata = min((u64)5 * 1024 * 1024 * 1024,
+ div64_u64(free_space * 5, 100));
+ if (info->total_bytes >= min_metadata) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (info->full) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (!info->allocating_chunk) {
+ info->force_alloc = 1;
+ info->allocating_chunk = 1;
+ init_waitqueue_head(&info->wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_event(info->wait,
+ !info->allocating_chunk);
+ return 1;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 4096 + 2 * 1024 * 1024,
+ info->flags, 0);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+out:
+ spin_lock(&info->lock);
+ info->allocating_chunk = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->wait);
+
+ if (ret)
+ return 0;
+ return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int flushed = 0;
+ int force_delalloc;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ force_delalloc = meta_sinfo->force_delalloc;
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!flushed)
+ meta_sinfo->bytes_delalloc += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ flushed++;
+
+ if (flushed == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ flushed++;
+ } else {
spin_unlock(&meta_sinfo->lock);
-alloc:
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ }
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, alloc_target, 0);
- btrfs_end_transaction(trans, root);
- if (!meta_sinfo) {
- meta_sinfo = __find_space_info(info,
- alloc_target);
- }
+ if (flushed == 2) {
+ filemap_flush(inode->i_mapping);
+ goto again;
+ } else if (flushed == 3) {
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
+ printk(KERN_ERR "enospc, has %d, reserved %d\n",
+ BTRFS_I(inode)->delalloc_extents,
+ BTRFS_I(inode)->delalloc_reserved_extents);
+ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
- if (!committed) {
- committed = 1;
- trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
+ BTRFS_I(inode)->delalloc_reserved_extents++;
+ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ if (!flushed && force_delalloc)
+ filemap_flush(inode->i_mapping);
+
+ return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space. This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+
+ spin_lock(&meta_sinfo->lock);
+ if (meta_sinfo->bytes_may_use < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_may_use = 0;
+ } else {
+ meta_sinfo->bytes_may_use -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+/*
+ * Reserve some metadata space for use. We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items. If we
+ * have space, fantastic, if not, you get -ENOSPC. Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space. THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int retries = 0;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!retries)
+ meta_sinfo->bytes_may_use += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ retries++;
+ if (retries == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ retries++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (retries == 2) {
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_may_use -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
+
+ dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
+
+ check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
return 0;
BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
- return btrfs_check_metadata_free_space(root);
+ return 0;
}
/*
BUG_ON(!space_info);
spin_lock(&space_info->lock);
- if (space_info->force_alloc) {
+ if (space_info->force_alloc)
force = 1;
- space_info->force_alloc = 0;
- }
if (space_info->full) {
spin_unlock(&space_info->lock);
goto out;
}
thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 6);
+ thresh = div_factor(thresh, 8);
if (!force &&
(space_info->bytes_used + space_info->bytes_pinned +
space_info->bytes_reserved + alloc_bytes) < thresh) {
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
- if (flags & BTRFS_BLOCK_GROUP_DATA) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ space_info->force_alloc = 0;
+ spin_unlock(&space_info->lock);
out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
return ret;
}
-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
- info->bytes_pinned - info->bytes_reserved),
+ info->bytes_pinned - info->bytes_reserved -
+ info->bytes_super),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu\n",
+ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+ "\n",
(unsigned long long)info->total_bytes,
(unsigned long long)info->bytes_pinned,
(unsigned long long)info->bytes_delalloc,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used);
+ (unsigned long long)info->bytes_used,
+ (unsigned long long)info->bytes_root,
+ (unsigned long long)info->bytes_super,
+ (unsigned long long)info->bytes_reserved);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes);
+ dump_space_info(sinfo, num_bytes, 1);
}
return ret;
return NULL;
}
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+ struct extent_state *other)
+{
+ if (tree->ops && tree->ops->merge_extent_hook)
+ tree->ops->merge_extent_hook(tree->mapping->host, new,
+ other);
+}
+
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
state->start = other->start;
other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
other->start = state->start;
state->tree = NULL;
rb_erase(&state->rb_node, &tree->state);
free_extent_state(state);
+ state = NULL;
}
}
+
return 0;
}
-static void set_state_cb(struct extent_io_tree *tree,
+static int set_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
- tree->ops->set_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
+ return tree->ops->set_bit_hook(tree->mapping->host,
+ state->start, state->end,
+ state->state, bits);
}
+
+ return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
- if (tree->ops && tree->ops->clear_bit_hook) {
- tree->ops->clear_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
- }
+ if (tree->ops && tree->ops->clear_bit_hook)
+ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
/*
int bits)
{
struct rb_node *node;
+ int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
(unsigned long long)start);
WARN_ON(1);
}
- if (bits & EXTENT_DIRTY)
- tree->dirty_bytes += end - start + 1;
state->start = start;
state->end = end;
- set_state_cb(tree, state, bits);
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
+ if (bits & EXTENT_DIRTY)
+ tree->dirty_bytes += end - start + 1;
state->state |= bits;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
return 0;
}
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+ u64 split)
+{
+ if (tree->ops && tree->ops->split_extent_hook)
+ return tree->ops->split_extent_hook(tree->mapping->host,
+ orig, split);
+ return 0;
+}
+
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
struct extent_state *prealloc, u64 split)
{
struct rb_node *node;
+
+ split_cb(tree, orig, split);
+
prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits,
- wake, delete);
+ set |= clear_state_bit(tree, state, bits, wake,
+ delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
prealloc = alloc_extent_state(GFP_ATOMIC);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
-
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits,
- wake, delete);
+ set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
prealloc = NULL;
goto out;
}
return 0;
}
-static void set_state_bits(struct extent_io_tree *tree,
+static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int bits)
{
+ int ret;
+
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- set_state_cb(tree, state, bits);
state->state |= bits;
+
+ return 0;
}
static void cache_state(struct extent_state *state,
goto out;
}
- set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
bits);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
BUG_ON(err == -EEXIST);
- if (err)
+ if (err) {
+ prealloc = NULL;
goto out;
+ }
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, bits);
+ if (err) {
+ prealloc = NULL;
+ goto out;
+ }
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
struct extent_state *state, int uptodate);
int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits);
- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long bits);
+ int (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ int (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
int (*write_cache_pages_lock_hook)(struct page *page);
};
u64 start;
u64 end; /* inclusive */
struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
+ u64 split_start;
+ u64 split_end;
/* for use by the FS */
u64 private;
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ if (err)
+ return err;
+
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
SetPageUptodate(p);
err = file_remove_suid(file);
if (err)
goto out_nolock;
+
+ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (err)
+ goto out_nolock;
+
file_update_time(file);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
out_nolock:
kfree(pages);
return ret;
}
+static int btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 size;
+
+ if (!(orig->state & EXTENT_DELALLOC))
+ return 0;
+
+ size = orig->end - orig->start + 1;
+ if (size > root->fs_info->max_extent) {
+ u64 num_extents;
+ u64 new_size;
+
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+
+ /*
+ * if we break a large extent up then leave delalloc_extents be,
+ * since we've already accounted for the large extent.
+ */
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) < num_extents)
+ return 0;
+ }
+
+ BTRFS_I(inode)->delalloc_extents++;
+
+ return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 new_size, old_size;
+ u64 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return 0;
+
+ old_size = other->end - other->start + 1;
+ if (new->start < other->start)
+ new_size = other->end - new->start + 1;
+ else
+ new_size = new->end - other->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= root->fs_info->max_extent) {
+ BTRFS_I(inode)->delalloc_extents--;
+ return 0;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) > num_extents)
+ return 0;
+
+ BTRFS_I(inode)->delalloc_extents--;
+
+ return 0;
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits)
{
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
*/
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ BTRFS_I(inode)->delalloc_extents++;
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned long bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ BTRFS_I(inode)->delalloc_extents--;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+
spin_lock(&root->fs_info->delalloc_lock);
- if (end - start + 1 > root->fs_info->delalloc_bytes) {
+ if (state->end - state->start + 1 >
+ root->fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs warning: delalloc account "
"%llu %llu\n",
- (unsigned long long)end - start + 1,
+ (unsigned long long)
+ state->end - state->start + 1,
(unsigned long long)
root->fs_info->delalloc_bytes);
btrfs_delalloc_free_space(root, inode, (u64)-1);
BTRFS_I(inode)->delalloc_bytes = 0;
} else {
btrfs_delalloc_free_space(root, inode,
- end - start + 1);
- root->fs_info->delalloc_bytes -= end - start + 1;
- BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+ state->end -
+ state->start + 1);
+ root->fs_info->delalloc_bytes -= state->end -
+ state->start + 1;
+ BTRFS_I(inode)->delalloc_bytes -= state->end -
+ state->start + 1;
}
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ goto out_unlock;
+ }
+
ret = 0;
if (offset != PAGE_CACHE_SIZE) {
kaddr = kmap(page);
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err;
+ int err = 0;
if (size <= hole_start)
return 0;
- err = btrfs_check_metadata_free_space(root);
- if (err)
- return err;
-
btrfs_truncate_page(inode->i_mapping, inode->i_size);
while (1) {
cur_offset, &hint_byte, 1);
if (err)
break;
+
+ err = btrfs_reserve_metadata_space(root, 1);
+ if (err)
+ break;
+
err = btrfs_insert_file_extent(trans, root,
inode->i_ino, cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
+ btrfs_unreserve_metadata_space(root, 1);
}
free_extent_map(em);
cur_offset = last_byte;
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
u64 objectid;
u64 index = 0;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
+
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
if (inode->i_nlink == 0)
return -ENOENT;
- btrfs_inc_nlink(inode);
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ err = btrfs_reserve_metadata_space(root, 3);
if (err)
- goto fail;
+ return err;
+
+ btrfs_inc_nlink(inode);
+
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
u64 index = 0;
unsigned long nr = 1;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode and ref
+ * 2 items for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_unlock;
+ return err;
trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, dir);
-
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (!trans) {
+ err = -ENOMEM;
goto out_unlock;
}
+ btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
if (err) {
btrfs_end_transaction_throttle(trans, root);
out_unlock:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
goto out;
}
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
ret = 0;
/* page is wholly or partially inside EOF */
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
return NULL;
ei->last_trans = 0;
ei->logged_trans = 0;
+ ei->delalloc_extents = 0;
+ ei->delalloc_reserved_extents = 0;
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for dir items
+ * 1 item for orphan entry
+ * 1 item for ref
+ */
+ ret = btrfs_reserve_metadata_space(root, 4);
if (ret)
return ret;
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
+
+ btrfs_unreserve_metadata_space(root, 4);
return ret;
}
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode item and ref
+ * 2 items for dir items
+ * 1 item for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto out_fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
out_fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
+
+ ret = btrfs_reserve_metadata_space(root, 1);
+ if (ret)
+ goto out;
+
ret = btrfs_reserve_extent(trans, root, alloc_size,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
+ btrfs_unreserve_metadata_space(root, 1);
}
out:
if (cur_offset > start) {
.readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
+ .merge_extent_hook = btrfs_merge_extent_hook,
+ .split_extent_hook = btrfs_split_extent_hook,
};
/*
u64 index = 0;
unsigned long nr = 1;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
return ret;
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
+
+ btrfs_unreserve_metadata_space(root, 6);
+ btrfs_btree_balance_dirty(root, nr);
return ret;
}
if (!root->ref_cows)
return -EINVAL;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
goto fail_unlock;
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot) {
ret = -ENOMEM;
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
if (!pending_snapshot->name) {
ret = -ENOMEM;
kfree(pending_snapshot);
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
memcpy(pending_snapshot->name, name, namelen);
h->alloc_exclude_start = 0;
h->delayed_ref_updates = 0;
+ if (!current->journal_info)
+ current->journal_info = h;
+
root->fs_info->running_transaction->use_count++;
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
mutex_unlock(&info->trans_mutex);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
memcpy(&pending->root_key, &key, sizeof(key));
fail:
kfree(new_root_item);
+ btrfs_unreserve_metadata_space(root, 6);
return ret;
}
mutex_unlock(&root->fs_info->trans_mutex);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return ret;
}