btrfs: fix readdir deadlock with pagefault
authorJosef Bacik <jbacik@fb.com>
Mon, 24 Jul 2017 19:14:25 +0000 (15:14 -0400)
committerDavid Sterba <dsterba@suse.com>
Wed, 16 Aug 2017 14:12:05 +0000 (16:12 +0200)
Readdir does dir_emit while under the btree lock.  dir_emit can trigger
the page fault which means we can deadlock.  Fix this by allocating a
buffer on opening a directory and copying the readdir into this buffer
and doing dir_emit from outside of the tree lock.

Thread A
readdir  <holding tree lock>
  dir_emit
    <page fault>
      down_read(mmap_sem)

Thread B
mmap write
  down_write(mmap_sem)
    page_mkwrite
      wait_ordered_extents

Process C
finish_ordered_extent
  insert_reserved_file_extent
   try to lock leaf <hang>

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ copy the deadlock scenario to changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c

index b9d5d95bc583b2156a79f8ac0081852547f97cd0..a3db2b1738aad69ee5358cdafbbf3c4af5a21b5e 100644 (file)
@@ -1264,6 +1264,11 @@ struct btrfs_root {
        atomic64_t qgroup_meta_rsv;
 };
 
+struct btrfs_file_private {
+       struct btrfs_trans_handle *trans;
+       void *filldir_buf;
+};
+
 static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
 {
        return btrfs_sb(inode->i_sb)->sectorsize;
index 58818cf7f82dc044e10d551df307abd3b2d70577..74fd7756cff3c963820753fbb78b76a093c5a5fd 100644 (file)
@@ -1990,8 +1990,15 @@ out:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
-       if (filp->private_data)
+       struct btrfs_file_private *private = filp->private_data;
+
+       if (private && private->trans)
                btrfs_ioctl_trans_end(filp);
+       if (private && private->filldir_buf)
+               kfree(private->filldir_buf);
+       kfree(private);
+       filp->private_data = NULL;
+
        /*
         * ordered_data_close is set by settattr when we are about to truncate
         * a file from a non-zero size to a zero size.  This tries to
index a17a61e2ff9da31f2dec69bb4fd96b39d6434c68..fa4b2563dfd7103127ffc7019b0993f5abf56b5f 100644 (file)
@@ -5876,25 +5876,74 @@ unsigned char btrfs_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
+/*
+ * All this infrastructure exists because dir_emit can fault, and we are holding
+ * the tree lock when doing readdir.  For now just allocate a buffer and copy
+ * our information into that, and then dir_emit from the buffer.  This is
+ * similar to what NFS does, only we don't keep the buffer around in pagecache
+ * because I'm afraid I'll mess that up.  Long term we need to make filldir do
+ * copy_to_user_inatomic so we don't have to worry about page faulting under the
+ * tree lock.
+ */
+static int btrfs_opendir(struct inode *inode, struct file *file)
+{
+       struct btrfs_file_private *private;
+
+       private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
+       if (!private)
+               return -ENOMEM;
+       private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!private->filldir_buf) {
+               kfree(private);
+               return -ENOMEM;
+       }
+       file->private_data = private;
+       return 0;
+}
+
+struct dir_entry {
+       u64 ino;
+       u64 offset;
+       unsigned type;
+       int name_len;
+};
+
+static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
+{
+       while (entries--) {
+               struct dir_entry *entry = addr;
+               char *name = (char *)(entry + 1);
+
+               ctx->pos = entry->offset;
+               if (!dir_emit(ctx, name, entry->name_len, entry->ino,
+                             entry->type))
+                       return 1;
+               addr += sizeof(struct dir_entry) + entry->name_len;
+               ctx->pos++;
+       }
+       return 0;
+}
+
 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_file_private *private = file->private_data;
        struct btrfs_dir_item *di;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;
+       void *addr;
        struct list_head ins_list;
        struct list_head del_list;
        int ret;
        struct extent_buffer *leaf;
        int slot;
-       unsigned char d_type;
-       int over = 0;
-       char tmp_name[32];
        char *name_ptr;
        int name_len;
+       int entries = 0;
+       int total_len = 0;
        bool put = false;
        struct btrfs_key location;
 
@@ -5905,12 +5954,14 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        if (!path)
                return -ENOMEM;
 
+       addr = private->filldir_buf;
        path->reada = READA_FORWARD;
 
        INIT_LIST_HEAD(&ins_list);
        INIT_LIST_HEAD(&del_list);
        put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
 
+again:
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = ctx->pos;
        key.objectid = btrfs_ino(BTRFS_I(inode));
@@ -5920,6 +5971,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                goto err;
 
        while (1) {
+               struct dir_entry *entry;
+
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
@@ -5941,41 +5994,43 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                        goto next;
                if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
                        goto next;
-
-               ctx->pos = found_key.offset;
-
                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
                if (verify_dir_item(fs_info, leaf, slot, di))
                        goto next;
 
                name_len = btrfs_dir_name_len(leaf, di);
-               if (name_len <= sizeof(tmp_name)) {
-                       name_ptr = tmp_name;
-               } else {
-                       name_ptr = kmalloc(name_len, GFP_KERNEL);
-                       if (!name_ptr) {
-                               ret = -ENOMEM;
-                               goto err;
-                       }
+               if ((total_len + sizeof(struct dir_entry) + name_len) >=
+                   PAGE_SIZE) {
+                       btrfs_release_path(path);
+                       ret = btrfs_filldir(private->filldir_buf, entries, ctx);
+                       if (ret)
+                               goto nopos;
+                       addr = private->filldir_buf;
+                       entries = 0;
+                       total_len = 0;
+                       goto again;
                }
+
+               entry = addr;
+               entry->name_len = name_len;
+               name_ptr = (char *)(entry + 1);
                read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
                                   name_len);
-
-               d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+               entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
                btrfs_dir_item_key_to_cpu(leaf, di, &location);
-
-               over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
-                                d_type);
-
-               if (name_ptr != tmp_name)
-                       kfree(name_ptr);
-
-               if (over)
-                       goto nopos;
-               ctx->pos++;
+               entry->ino = location.objectid;
+               entry->offset = found_key.offset;
+               entries++;
+               addr += sizeof(struct dir_entry) + name_len;
+               total_len += sizeof(struct dir_entry) + name_len;
 next:
                path->slots[0]++;
        }
+       btrfs_release_path(path);
+
+       ret = btrfs_filldir(private->filldir_buf, entries, ctx);
+       if (ret)
+               goto nopos;
 
        ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
        if (ret)
@@ -10779,6 +10834,7 @@ static const struct file_operations btrfs_dir_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate_shared = btrfs_real_readdir,
+       .open           = btrfs_opendir,
        .unlocked_ioctl = btrfs_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_compat_ioctl,
index 7a3c56c17e00e077b345b15e67d16aa4ed158113..b21558bb12943cce36c416da1f72bf1168c44644 100644 (file)
@@ -3966,6 +3966,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
+       struct btrfs_file_private *private;
        int ret;
        static bool warned = false;
 
@@ -3984,8 +3985,16 @@ static long btrfs_ioctl_trans_start(struct file *file)
        }
 
        ret = -EINPROGRESS;
-       if (file->private_data)
+       private = file->private_data;
+       if (private && private->trans)
                goto out;
+       if (!private) {
+               private = kzalloc(sizeof(struct btrfs_file_private),
+                                 GFP_KERNEL);
+               if (!private)
+                       return -ENOMEM;
+               file->private_data = private;
+       }
 
        ret = -EROFS;
        if (btrfs_root_readonly(root))
@@ -4002,7 +4011,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
        if (IS_ERR(trans))
                goto out_drop;
 
-       file->private_data = trans;
+       private->trans = trans;
        return 0;
 
 out_drop:
@@ -4257,14 +4266,13 @@ long btrfs_ioctl_trans_end(struct file *file)
 {
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
+       struct btrfs_file_private *private = file->private_data;
 
-       trans = file->private_data;
-       if (!trans)
+       if (!private || !private->trans)
                return -EINVAL;
-       file->private_data = NULL;
 
-       btrfs_end_transaction(trans);
+       btrfs_end_transaction(private->trans);
+       private->trans = NULL;
 
        atomic_dec(&root->fs_info->open_ioctl_trans);