xfs: stop holding ILOCK over filldir callbacks
authorDave Chinner <dchinner@redhat.com>
Wed, 19 Aug 2015 00:33:00 +0000 (10:33 +1000)
committerDave Chinner <david@fromorbit.com>
Wed, 19 Aug 2015 00:33:00 +0000 (10:33 +1000)
The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.

The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.

Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.

Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.

The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
fs/xfs/libxfs/xfs_dir2.c
fs/xfs/xfs_dir2_readdir.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_symlink.c

index a69fb3a1e16182ac9bd636fa9abb42150444cacb..42799d88ecc0adb475aae85cf2aa030e1c1c1ff3 100644 (file)
@@ -362,6 +362,7 @@ xfs_dir_lookup(
        struct xfs_da_args *args;
        int             rval;
        int             v;              /* type-checking value */
+       int             lock_mode;
 
        ASSERT(S_ISDIR(dp->i_d.di_mode));
        XFS_STATS_INC(xs_dir_lookup);
@@ -387,6 +388,7 @@ xfs_dir_lookup(
        if (ci_name)
                args->op_flags |= XFS_DA_OP_CILOOKUP;
 
+       lock_mode = xfs_ilock_data_map_shared(dp);
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
                rval = xfs_dir2_sf_lookup(args);
                goto out_check_rval;
@@ -419,6 +421,7 @@ out_check_rval:
                }
        }
 out_free:
+       xfs_iunlock(dp, lock_mode);
        kmem_free(args);
        return rval;
 }
index 098cd78fe708433fdf77306637626d6b69400582..a989a9c7edb7fe22e2a425c3efaef417bde89b72 100644 (file)
@@ -171,6 +171,7 @@ xfs_dir2_block_getdents(
        int                     wantoff;        /* starting block offset */
        xfs_off_t               cook;
        struct xfs_da_geometry  *geo = args->geo;
+       int                     lock_mode;
 
        /*
         * If the block number in the offset is out of range, we're done.
@@ -178,7 +179,9 @@ xfs_dir2_block_getdents(
        if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
                return 0;
 
+       lock_mode = xfs_ilock_data_map_shared(dp);
        error = xfs_dir3_block_read(NULL, dp, &bp);
+       xfs_iunlock(dp, lock_mode);
        if (error)
                return error;
 
@@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents(
                 * current buffer, need to get another one.
                 */
                if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+                       int     lock_mode;
 
+                       lock_mode = xfs_ilock_data_map_shared(dp);
                        error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
                                                      &curoff, &bp);
+                       xfs_iunlock(dp, lock_mode);
                        if (error || !map_info->map_valid)
                                break;
 
@@ -653,7 +659,6 @@ xfs_readdir(
        struct xfs_da_args      args = { NULL };
        int                     rval;
        int                     v;
-       uint                    lock_mode;
 
        trace_xfs_readdir(dp);
 
@@ -666,7 +671,7 @@ xfs_readdir(
        args.dp = dp;
        args.geo = dp->i_mount->m_dir_geo;
 
-       lock_mode = xfs_ilock_data_map_shared(dp);
+       xfs_ilock(dp, XFS_IOLOCK_SHARED);
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_getdents(&args, ctx);
        else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -675,7 +680,7 @@ xfs_readdir(
                rval = xfs_dir2_block_getdents(&args, ctx);
        else
                rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
-       xfs_iunlock(dp, lock_mode);
+       xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 
        return rval;
 }
index 657f6123b445cb9b67b80258607ef0404165ef46..65792660b0430761a4ca61bf031ce886988be22b 100644 (file)
@@ -663,30 +663,29 @@ xfs_lookup(
 {
        xfs_ino_t               inum;
        int                     error;
-       uint                    lock_mode;
 
        trace_xfs_lookup(dp, name);
 
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return -EIO;
 
-       lock_mode = xfs_ilock_data_map_shared(dp);
+       xfs_ilock(dp, XFS_IOLOCK_SHARED);
        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
-       xfs_iunlock(dp, lock_mode);
-
        if (error)
-               goto out;
+               goto out_unlock;
 
        error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
        if (error)
                goto out_free_name;
 
+       xfs_iunlock(dp, XFS_IOLOCK_SHARED);
        return 0;
 
 out_free_name:
        if (ci_name)
                kmem_free(ci_name->name);
-out:
+out_unlock:
+       xfs_iunlock(dp, XFS_IOLOCK_SHARED);
        *ipp = NULL;
        return error;
 }
@@ -1183,7 +1182,8 @@ xfs_create(
                goto out_trans_cancel;
 
 
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+                     XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
 
        xfs_bmap_init(&free_list, &first_block);
@@ -1222,7 +1222,7 @@ xfs_create(
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        unlock_dp_on_error = false;
 
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1295,7 +1295,7 @@ xfs_create(
        xfs_qm_dqrele(pdqp);
 
        if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+               xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        return error;
 }
 
@@ -1443,10 +1443,11 @@ xfs_link(
        if (error)
                goto error_return;
 
+       xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
        /*
         * If we are using project inheritance, we only allow hard link
@@ -2549,9 +2550,10 @@ xfs_remove(
                goto out_trans_cancel;
        }
 
+       xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
        /*
@@ -2932,6 +2934,12 @@ xfs_rename(
         * whether the target directory is the same as the source
         * directory, we can lock from 2 to 4 inodes.
         */
+       if (!new_parent)
+               xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+       else
+               xfs_lock_two_inodes(src_dp, target_dp,
+                                   XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+
        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 
        /*
@@ -2939,9 +2947,9 @@ xfs_rename(
         * we can rely on either trans_commit or trans_cancel to unlock
         * them.
         */
-       xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        if (new_parent)
-               xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
        if (target_ip)
                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
index 4be27b0210af863f3913f94b9b7134307a7c74ff..7a01486eff06396d3903f08862f500c37d1274ad 100644 (file)
@@ -240,7 +240,8 @@ xfs_symlink(
        if (error)
                goto out_trans_cancel;
 
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+                     XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
 
        /*
@@ -288,7 +289,7 @@ xfs_symlink(
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        unlock_dp_on_error = false;
 
        /*
@@ -421,7 +422,7 @@ out_release_inode:
        xfs_qm_dqrele(pdqp);
 
        if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+               xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        return error;
 }