struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
+
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
* ref if it was able to find one, or NULL if nothing was in that spot
*/
static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
- u64 bytenr, u64 parent)
+ u64 bytenr, u64 parent,
+ struct btrfs_delayed_ref_node **last)
{
struct rb_node *n = root->rb_node;
struct btrfs_delayed_ref_node *entry;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
WARN_ON(!entry->in_tree);
+ if (last)
+ *last = entry;
cmp = comp_entry(entry, bytenr, parent);
if (cmp < 0)
return NULL;
}
-/*
- * Locking on delayed refs is done by taking a lock on the head node,
- * which has the (impossible) parent id of (u64)-1. Once a lock is held
- * on the head node, you're allowed (and required) to process all the
- * delayed refs for a given byte number in the tree.
- *
- * This will walk forward in the rbtree until it finds a head node it
- * is able to lock. It might not lock the delayed ref you asked for,
- * and so it will return the one it did lock in next_ret and return 0.
- *
- * If no locks are taken, next_ret is set to null and 1 is returned. This
- * means there are no more unlocked head nodes in the rbtree.
- */
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- struct btrfs_delayed_ref_head **next_ret)
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ assert_spin_locked(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (!head->node.in_tree) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 start)
+{
+ int count = 0;
+ struct btrfs_delayed_ref_root *delayed_refs;
struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *head;
- int ret = 0;
- while (1) {
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (start == 0) {
+ node = rb_first(&delayed_refs->root);
+ } else {
+ ref = NULL;
+ tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+ if (ref) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ tmp = rb_entry(node,
+ struct btrfs_delayed_ref_node,
+ rb_node);
+ if (tmp->bytenr < start)
+ break;
+ ref = tmp;
+ node = rb_prev(&ref->rb_node);
+ }
+ node = &ref->rb_node;
+ } else
+ node = rb_first(&delayed_refs->root);
+ }
+again:
+ while (node && count < 32) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
if (btrfs_delayed_ref_is_head(ref)) {
head = btrfs_delayed_node_to_head(ref);
- if (mutex_trylock(&head->mutex)) {
- *next_ret = head;
- ret = 0;
+ if (list_empty(&head->cluster)) {
+ list_add_tail(&head->cluster, cluster);
+ delayed_refs->run_delayed_start =
+ head->node.bytenr;
+ count++;
+
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ } else if (count) {
+ /* the goal of the clustering is to find extents
+ * that are likely to end up in the same extent
+ * leaf on disk. So, we don't want them spread
+ * all over the tree. Stop now if we've hit
+ * a head that was already in use
+ */
break;
}
}
- node = rb_next(&ref->rb_node);
- if (!node) {
- ret = 1;
- *next_ret = NULL;
- break;
- }
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
}
- return ret;
+ if (count) {
+ return 0;
+ } else if (start) {
+ /*
+ * we've gone to the end of the rbtree without finding any
+ * clusters. start from the beginning and try again
+ */
+ start = 0;
+ node = rb_first(&delayed_refs->root);
+ goto again;
+ }
+ return 1;
}
/*
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
if (ref) {
prev_node = rb_prev(&ref->rb_node);
if (!prev_node)
}
spin_lock(&delayed_refs->lock);
- ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
if (ref) {
head = btrfs_delayed_node_to_head(ref);
if (mutex_trylock(&head->mutex)) {
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_ref *full_ref;
- struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
int must_insert_reserved = 0;
if (btrfs_delayed_ref_is_head(ref)) {
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
+ INIT_LIST_HEAD(&head_ref->cluster);
mutex_init(&head_ref->mutex);
} else {
full_ref = btrfs_delayed_node_to_ref(ref);
*/
kfree(ref);
} else {
+ if (btrfs_delayed_ref_is_head(ref)) {
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ }
delayed_refs->num_entries++;
trans->delayed_ref_updates++;
}
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
if (ref)
return btrfs_delayed_node_to_head(ref);
return NULL;
*/
struct mutex mutex;
+ struct list_head cluster;
+
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
*/
unsigned long num_entries;
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
/*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
* to be run right away
*/
int flushing;
+
+ u64 run_delayed_start;
};
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- struct btrfs_delayed_ref_head **next_ret);
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u32 *refs);
u64 parent, u64 orig_ref_root, u64 ref_root,
u64 orig_ref_generation, u64 ref_generation,
u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 search_start);
/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
struct btrfs_root *root = arg;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
- struct btrfs_fs_info *info = root->fs_info;
unsigned long now;
unsigned long delay;
int ret;
now = get_seconds();
if (now < cur->start_time || now - cur->start_time < 30) {
- unsigned long num_delayed;
- num_delayed = cur->delayed_refs.num_entries;
mutex_unlock(&root->fs_info->trans_mutex);
delay = HZ * 5;
-
- /*
- * we may have been woken up early to start
- * processing the delayed extent ref updates
- * If so, run some of them and then loop around again
- * to see if we need to force a commit
- */
- if (num_delayed > 64) {
- mutex_unlock(&info->transaction_kthread_mutex);
- trans = btrfs_start_transaction(root, 1);
- btrfs_run_delayed_refs(trans, root, 256);
- btrfs_end_transaction(trans, root);
- continue;
- }
goto sleep;
}
mutex_unlock(&root->fs_info->trans_mutex);
return NULL;
}
-/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far. count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
- */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long count)
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct list_head *cluster)
{
- struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
+ int count = 0;
int must_insert_reserved = 0;
- int run_all = count == (unsigned long)-1;
-
- if (root == root->fs_info->extent_root)
- root = root->fs_info->tree_root;
delayed_refs = &trans->transaction->delayed_refs;
-again:
- spin_lock(&delayed_refs->lock);
- if (count == 0)
- count = delayed_refs->num_entries;
while (1) {
if (!locked_ref) {
- /*
- * no locked ref, go find something we can
- * process in the rbtree. We start at
- * the beginning of the tree, there may be less
- * lock contention if we do something smarter here.
- */
- node = rb_first(&delayed_refs->root);
- if (!node) {
- spin_unlock(&delayed_refs->lock);
+ /* pick a new head ref from the cluster list */
+ if (list_empty(cluster))
break;
- }
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
- if (ret) {
- spin_unlock(&delayed_refs->lock);
- break;
+ locked_ref = list_entry(cluster->next,
+ struct btrfs_delayed_ref_head, cluster);
+
+ /* grab the lock that says we are going to process
+ * all the refs for this head */
+ ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+ /*
+ * we may have dropped the spin lock to get the head
+ * mutex lock, and that might have given someone else
+ * time to free the head. If that's true, it has been
+ * removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN) {
+ locked_ref = NULL;
+ count++;
+ continue;
}
}
* locked_ref is the head node, so we have to go one
* node back for any delayed ref updates
*/
-
ref = select_delayed_ref(locked_ref);
if (!ref) {
/* All delayed refs have been processed, Go ahead
* so that any accounting fixes can happen
*/
ref = &locked_ref->node;
+ list_del_init(&locked_ref->cluster);
locked_ref = NULL;
}
BUG_ON(ret);
btrfs_put_delayed_ref(ref);
- /* once we lock the head ref, we have to process all the
- * entries for it. So, we might end up doing more entries
- * that count was asking us to do.
- */
- if (count > 0)
- count--;
+ count++;
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct list_head cluster;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+ int run_most = 0;
+
+ if (root == root->fs_info->extent_root)
+ root = root->fs_info->tree_root;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ INIT_LIST_HEAD(&cluster);
+again:
+ spin_lock(&delayed_refs->lock);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+ while (1) {
+ if (!(run_all || run_most) &&
+ delayed_refs->num_heads_ready < 64)
+ break;
/*
- * we set locked_ref to null above if we're all done
- * with this bytenr
+ * go find something we can process in the rbtree. We start at
+ * the beginning of the tree, and then build a cluster
+ * of refs to process starting at the first one we are able to
+ * lock
*/
- if (!locked_ref && count == 0)
+ ret = btrfs_find_ref_cluster(trans, &cluster,
+ delayed_refs->run_delayed_start);
+ if (ret)
break;
- cond_resched();
- spin_lock(&delayed_refs->lock);
+ ret = run_clustered_refs(trans, root, &cluster);
+ BUG_ON(ret < 0);
+
+ count -= min_t(unsigned long, ret, count);
+
+ if (count == 0)
+ break;
}
+
if (run_all) {
- spin_lock(&delayed_refs->lock);
node = rb_first(&delayed_refs->root);
- if (!node) {
- spin_unlock(&delayed_refs->lock);
+ if (!node)
goto out;
- }
+ count = (unsigned long)-1;
while (node) {
ref = rb_entry(node, struct btrfs_delayed_ref_node,
node = rb_next(node);
}
spin_unlock(&delayed_refs->lock);
- count = (unsigned long)-1;
schedule_timeout(1);
goto again;
}
out:
+ spin_unlock(&delayed_refs->lock);
return 0;
}
*/
head->node.in_tree = 0;
rb_erase(&head->node.rb_node, &delayed_refs->root);
+
delayed_refs->num_entries--;
/*
* we don't take a ref on the node because we're removing it from the
* tree, so we just steal the ref the tree was holding.
*/
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+
+ list_del_init(&head->cluster);
spin_unlock(&delayed_refs->lock);
ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
struct btrfs_path *path;
int i;
int orig_level;
+ int update_count;
struct btrfs_root_item *root_item = &root->root_item;
WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
}
}
while (1) {
+ unsigned long update;
wret = walk_down_tree(trans, root, path, &level);
if (wret > 0)
break;
}
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
+ for (update_count = 0; update_count < 16; update_count++) {
+ update = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (update)
+ btrfs_run_delayed_refs(trans, root, update);
+ else
+ break;
+ }
}
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
cur_trans->delayed_refs.root.rb_node = NULL;
cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *info = root->fs_info;
-
- if (trans->delayed_ref_updates &&
- (trans->transaction->delayed_refs.flushing ||
- trans->transaction->delayed_refs.num_entries > 16384)) {
- btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates);
- } else if (trans->transaction->delayed_refs.num_entries > 64) {
- wake_up_process(root->fs_info->transaction_kthread);
+ int count = 0;
+
+ while (count < 4) {
+ unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (cur &&
+ trans->transaction->delayed_refs.num_heads_ready > 64) {
+ trans->delayed_ref_updates = 0;
+ btrfs_run_delayed_refs(trans, root, cur);
+ } else {
+ break;
+ }
+ count++;
}
mutex_lock(&info->trans_mutex);
*/
trans->transaction->delayed_refs.flushing = 1;
- ret = btrfs_run_delayed_refs(trans, root, (u64)-1);
+ ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
INIT_LIST_HEAD(&dirty_fs_roots);