btrfs: fix refcount_t usage when deleting btrfs_delayed_nodes
author Chris Mason <clm@fb.com>
Fri, 15 Dec 2017 19:58:27 +0000 (11:58 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 10 Jan 2018 08:31:17 +0000 (09:31 +0100)
commit ec35e48b286959991cdbb886f1bdeda4575c80b4 upstream.

refcounts have a generic implementation and an asm optimized one.  The
generic version has extra debugging to make sure that once a refcount
goes to zero, refcount_inc won't increase it.

The btrfs delayed inode code wasn't expecting this, and we're tripping
over the warnings when the generic refcounts are used.  We ended up with
this race:

Process A                                         Process B
                                                  btrfs_get_delayed_node()
                                                  spin_lock(root->inode_lock)
                                                  radix_tree_lookup()
__btrfs_release_delayed_node()
refcount_dec_and_test(&delayed_node->refs)
our refcount is now zero
                                                  refcount_add(2) <---
                                                  warning here, refcount
                                                  unchanged

spin_lock(root->inode_lock)
radix_tree_delete()

With the generic refcounts, we actually warn a second time when process B
above tries to release its reference: the earlier refcount_add() was a
no-op, so the count is still zero when it is decremented.

We saw this in production on older kernels without the asm optimized
refcounts.

The fix used here is to use refcount_inc_not_zero() to detect when the
object is in the middle of being freed and return NULL.  This is almost
always the right answer anyway, since we usually end up pitching the
delayed_node if it didn't have fresh data in it.

This also changes __btrfs_release_delayed_node() to remove the extra
check for zero refcounts before radix tree deletion.
btrfs_get_delayed_node() was the only path that was allowing refcounts
to go from zero to one.

Fixes: 6de5f18e7b0da ("btrfs: fix refcount_t usage when deleting btrfs_delayed_node")
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
fs/btrfs/delayed-inode.c

index 19e4ad2f3f2e4a32050024f34b4e0ebc3c6511b9..0c4b690cf761b9cf5b766a075c8586fd63d46bc6 100644 (file)
@@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 
        spin_lock(&root->inode_lock);
        node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
        if (node) {
                if (btrfs_inode->delayed_node) {
                        refcount_inc(&node->refs);      /* can be accessed */
@@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
                        spin_unlock(&root->inode_lock);
                        return node;
                }
-               btrfs_inode->delayed_node = node;
-               /* can be accessed and cached in the inode */
-               refcount_add(2, &node->refs);
+
+               /*
+                * It's possible that we're racing into the middle of removing
+                * this node from the radix tree.  In this case, the refcount
+                * was zero and it should never go back to one.  Just return
+                * NULL like it was never in the radix at all; our release
+                * function is in the process of removing it.
+                *
+                * Some implementations of refcount_inc refuse to bump the
+                * refcount once it has hit zero.  If we don't do this dance
+                * here, refcount_inc() may decide to just WARN_ONCE() instead
+                * of actually bumping the refcount.
+                *
+                * If this node is properly in the radix, we want to bump the
+                * refcount twice, once for the inode and once for this get
+                * operation.
+                */
+               if (refcount_inc_not_zero(&node->refs)) {
+                       refcount_inc(&node->refs);
+                       btrfs_inode->delayed_node = node;
+               } else {
+                       node = NULL;
+               }
+
                spin_unlock(&root->inode_lock);
                return node;
        }
@@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node(
        mutex_unlock(&delayed_node->mutex);
 
        if (refcount_dec_and_test(&delayed_node->refs)) {
-               bool free = false;
                struct btrfs_root *root = delayed_node->root;
+
                spin_lock(&root->inode_lock);
-               if (refcount_read(&delayed_node->refs) == 0) {
-                       radix_tree_delete(&root->delayed_nodes_tree,
-                                         delayed_node->inode_id);
-                       free = true;
-               }
+               /*
+                * Once our refcount goes to zero, nobody is allowed to bump it
+                * back up.  We can delete it now.
+                */
+               ASSERT(refcount_read(&delayed_node->refs) == 0);
+               radix_tree_delete(&root->delayed_nodes_tree,
+                                 delayed_node->inode_id);
                spin_unlock(&root->inode_lock);
-               if (free)
-                       kmem_cache_free(delayed_node_cache, delayed_node);
+               kmem_cache_free(delayed_node_cache, delayed_node);
        }
 }