[XFS] Don't block pdflush when writing back inodes
authorDavid Chinner <dgc@sgi.com>
Thu, 6 Mar 2008 02:43:42 +0000 (13:43 +1100)
committerLachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Fri, 18 Apr 2008 01:37:32 +0000 (11:37 +1000)
When pdflush is writing back inodes, it can get stuck on inode cluster
buffers that are currently under I/O. This occurs when we write data to
multiple inodes in the same inode cluster at the same time.

Effectively, delayed allocation marks the inode dirty during the data
writeback. Hence if the inode cluster was flushed during the writeback of
the first inode, the writeback of the second inode will block waiting for
the inode cluster write to complete before writing it again for the newly
dirtied inode.

Basically, we want to avoid this from happening so we don't block pdflush
and slow down all of writeback. Hence we introduce a non-blocking async
inode flush flag that pdflush uses. If this flag is set, we use
non-blocking operations (e.g. try locks) whereever we can to avoid
blocking or extra I/O being issued.

SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30501a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_vnode.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_itable.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_vnodeops.c

index 8831d95187904f0ecf4a1aa1ac5b77cc767aec77..cb9ce90d1deb7010d109c590999b1229b5139dcb 100644 (file)
@@ -896,7 +896,8 @@ xfs_fs_write_inode(
        struct inode            *inode,
        int                     sync)
 {
-       int                     error = 0, flags = FLUSH_INODE;
+       int                     error = 0;
+       int                     flags = 0;
 
        xfs_itrace_entry(XFS_I(inode));
        if (sync) {
index b5ea418693b1b4f1d76ef2b2dd8e6bf149733150..f200e02440827d8e4beb925b4dd18ee81560abb9 100644 (file)
@@ -73,12 +73,9 @@ typedef enum bhv_vrwlock {
 #define IO_INVIS       0x00020         /* don't update inode timestamps */
 
 /*
- * Flags for vop_iflush call
+ * Flags for xfs_inode_flush
  */
 #define FLUSH_SYNC             1       /* wait for flush to complete   */
-#define FLUSH_INODE            2       /* flush the inode itself       */
-#define FLUSH_LOG              4       /* force the last log entry for
-                                        * this inode out to disk       */
 
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
index 6f156faf9d46e3801e39b23bcab9d4b25e04dab2..3c3e9e3c1da81613e0b222650baab4797ddf10d9 100644 (file)
@@ -145,11 +145,16 @@ xfs_imap_to_bp(
        xfs_buf_t       *bp;
 
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-                                  (int)imap->im_len, XFS_BUF_LOCK, &bp);
+                                  (int)imap->im_len, buf_flags, &bp);
        if (error) {
-               cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+               if (error != EAGAIN) {
+                       cmn_err(CE_WARN,
+                               "xfs_imap_to_bp: xfs_trans_read_buf()returned "
                                "an error %d on %s.  Returning error.",
                                error, mp->m_fsname);
+               } else {
+                       ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+               }
                return error;
        }
 
@@ -274,7 +279,8 @@ xfs_itobp(
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
        xfs_daddr_t     bno,
-       uint            imap_flags)
+       uint            imap_flags,
+       uint            buf_flags)
 {
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
@@ -305,10 +311,17 @@ xfs_itobp(
        }
        ASSERT(bno == 0 || bno == imap.im_blkno);
 
-       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+       error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
        if (error)
                return error;
 
+       if (!bp) {
+               ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+               ASSERT(tp == NULL);
+               *bpp = NULL;
+               return EAGAIN;
+       }
+
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        return 0;
@@ -812,7 +825,7 @@ xfs_iread(
         * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
         * know that this is a new incore inode.
         */
-       error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
+       error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
        if (error) {
                kmem_zone_free(xfs_inode_zone, ip);
                return error;
@@ -1901,7 +1914,7 @@ xfs_iunlink(
                 * Here we put the head pointer into our next pointer,
                 * and then we fall through to point the head at us.
                 */
-               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
                if (error)
                        return error;
 
@@ -2009,7 +2022,7 @@ xfs_iunlink_remove(
                 * of dealing with the buffer when there is no need to
                 * change it.
                 */
-               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2071,7 +2084,7 @@ xfs_iunlink_remove(
                 * Now last_ibp points to the buffer previous to us on
                 * the unlinked list.  Pull us from the list.
                 */
-               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2334,7 +2347,7 @@ xfs_ifree(
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-       error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0);
+       error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
        if (error)
                return error;
 
@@ -2777,38 +2790,41 @@ xfs_iunpin(
 }
 
 /*
- * This is called to wait for the given inode to be unpinned.
- * It will sleep until this happens.  The caller must have the
- * inode locked in at least shared mode so that the buffer cannot
- * be subsequently pinned once someone is waiting for it to be
- * unpinned.
+ * This is called to unpin an inode. It can be directed to wait or to return
+ * immediately without waiting for the inode to be unpinned.  The caller must
+ * have the inode locked in at least shared mode so that the buffer cannot be
+ * subsequently pinned once someone is waiting for it to be unpinned.
  */
 STATIC void
-xfs_iunpin_wait(
-       xfs_inode_t     *ip)
+__xfs_iunpin_wait(
+       xfs_inode_t     *ip,
+       int             wait)
 {
-       xfs_inode_log_item_t    *iip;
-       xfs_lsn_t       lsn;
+       xfs_inode_log_item_t    *iip = ip->i_itemp;
 
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
-
-       if (atomic_read(&ip->i_pincount) == 0) {
+       if (atomic_read(&ip->i_pincount) == 0)
                return;
-       }
 
-       iip = ip->i_itemp;
-       if (iip && iip->ili_last_lsn) {
-               lsn = iip->ili_last_lsn;
-       } else {
-               lsn = (xfs_lsn_t)0;
-       }
+       /* Give the log a push to start the unpinning I/O */
+       xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
+                               iip->ili_last_lsn : 0, XFS_LOG_FORCE);
+       if (wait)
+               wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+}
 
-       /*
-        * Give the log a push so we don't wait here too long.
-        */
-       xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+static inline void
+xfs_iunpin_wait(
+       xfs_inode_t     *ip)
+{
+       __xfs_iunpin_wait(ip, 1);
+}
 
-       wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+static inline void
+xfs_iunpin_nowait(
+       xfs_inode_t     *ip)
+{
+       __xfs_iunpin_wait(ip, 0);
 }
 
 
@@ -3003,6 +3019,7 @@ xfs_iflush(
        int                     bufwasdelwri;
        struct hlist_node       *entry;
        enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
+       int                     noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
 
        XFS_STATS_INC(xs_iflush_count);
 
@@ -3027,11 +3044,21 @@ xfs_iflush(
        }
 
        /*
-        * We can't flush the inode until it is unpinned, so
-        * wait for it.  We know noone new can pin it, because
-        * we are holding the inode lock shared and you need
-        * to hold it exclusively to pin the inode.
+        * We can't flush the inode until it is unpinned, so wait for it if we
+        * are allowed to block.  We know noone new can pin it, because we are
+        * holding the inode lock shared and you need to hold it exclusively to
+        * pin the inode.
+        *
+        * If we are not allowed to block, force the log out asynchronously so
+        * that when we come back the inode will be unpinned. If other inodes
+        * in the same cluster are dirty, they will probably write the inode
+        * out for us if they occur after the log force completes.
         */
+       if (noblock && xfs_ipincount(ip)) {
+               xfs_iunpin_nowait(ip);
+               xfs_ifunlock(ip);
+               return EAGAIN;
+       }
        xfs_iunpin_wait(ip);
 
        /*
@@ -3047,15 +3074,6 @@ xfs_iflush(
                return XFS_ERROR(EIO);
        }
 
-       /*
-        * Get the buffer containing the on-disk inode.
-        */
-       error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
-       if (error) {
-               xfs_ifunlock(ip);
-               return error;
-       }
-
        /*
         * Decide how buffer will be flushed out.  This is done before
         * the call to xfs_iflush_int because this field is zeroed by it.
@@ -3072,6 +3090,7 @@ xfs_iflush(
                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
                        flags = 0;
                        break;
+               case XFS_IFLUSH_ASYNC_NOBLOCK:
                case XFS_IFLUSH_ASYNC:
                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
                        flags = INT_ASYNC;
@@ -3091,6 +3110,7 @@ xfs_iflush(
                case XFS_IFLUSH_DELWRI:
                        flags = INT_DELWRI;
                        break;
+               case XFS_IFLUSH_ASYNC_NOBLOCK:
                case XFS_IFLUSH_ASYNC:
                        flags = INT_ASYNC;
                        break;
@@ -3104,6 +3124,16 @@ xfs_iflush(
                }
        }
 
+       /*
+        * Get the buffer containing the on-disk inode.
+        */
+       error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
+                               noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+       if (error || !bp) {
+               xfs_ifunlock(ip);
+               return error;
+       }
+
        /*
         * First flush out the inode that xfs_iflush was called with.
         */
@@ -3112,6 +3142,13 @@ xfs_iflush(
                goto corrupt_out;
        }
 
+       /*
+        * If the buffer is pinned then push on the log now so we won't
+        * get stuck waiting in the write for too long.
+        */
+       if (XFS_BUF_ISPINNED(bp))
+               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+
        /*
         * inode clustering:
         * see if other inodes can be gathered into this write
@@ -3181,14 +3218,6 @@ xfs_iflush(
                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
        }
 
-       /*
-        * If the buffer is pinned then push on the log so we won't
-        * get stuck waiting in the write for too long.
-        */
-       if (XFS_BUF_ISPINNED(bp)){
-               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-       }
-
        if (flags & INT_DELWRI) {
                xfs_bdwrite(mp, bp);
        } else if (flags & INT_ASYNC) {
index eaa01895ff93ed42e59efa3ca0d238ba9511255f..c3bfffca9214e0fea9c86f9cafbe760fcccff399 100644 (file)
@@ -457,6 +457,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 #define        XFS_IFLUSH_SYNC                 3
 #define        XFS_IFLUSH_ASYNC                4
 #define        XFS_IFLUSH_DELWRI               5
+#define        XFS_IFLUSH_ASYNC_NOBLOCK        6
 
 /*
  * Flags for xfs_itruncate_start().
@@ -511,7 +512,7 @@ int         xfs_finish_reclaim_all(struct xfs_mount *, int);
  */
 int            xfs_itobp(struct xfs_mount *, struct xfs_trans *,
                          xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
-                         xfs_daddr_t, uint);
+                         xfs_daddr_t, uint, uint);
 int            xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                          xfs_inode_t **, xfs_daddr_t, uint);
 int            xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
index f615e04364f47238c09f22d67a7fa74fe3548ba0..45d8776408ef2a4f0829c25caf9130adf6e57169 100644 (file)
@@ -614,7 +614,8 @@ xfs_bulkstat(
                                                        xfs_buf_relse(bp);
                                                error = xfs_itobp(mp, NULL, ip,
                                                                &dip, &bp, bno,
-                                                               XFS_IMAP_BULKSTAT);
+                                                               XFS_IMAP_BULKSTAT,
+                                                               XFS_BUF_LOCK);
                                                if (!error)
                                                        clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
                                                kmem_zone_free(xfs_inode_zone, ip);
index b2b70eba282cbaa65e9c43354a0696364d4b8e13..cd24711ae276a1d4edf0b6c4b955302e7a496288 100644 (file)
@@ -3214,7 +3214,8 @@ xlog_recover_process_iunlinks(
                                         * next inode in the bucket.
                                         */
                                        error = xfs_itobp(mp, NULL, ip, &dip,
-                                                       &ibp, 0, 0);
+                                                       &ibp, 0, 0,
+                                                       XFS_BUF_LOCK);
                                        ASSERT(error || (dip != NULL));
                                }
 
index 60b6b898022bcb1be98f96f2a21615e1bff5e131..4e5c010f50406940a5b4758e300530a6922bae63 100644 (file)
@@ -304,7 +304,8 @@ xfs_trans_read_buf(
        if (tp == NULL) {
                bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
                if (!bp)
-                       return XFS_ERROR(ENOMEM);
+                       return (flags & XFS_BUF_TRYLOCK) ?
+                                       EAGAIN : XFS_ERROR(ENOMEM);
 
                if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
                        xfs_ioerror_alert("xfs_trans_read_buf", mp,
index 40b95e3a88ba143c0972231eb4c49f0d842c6b56..14140f6225ba513cc0c68a38def2ea37a47aaec5 100644 (file)
@@ -3468,29 +3468,6 @@ xfs_inode_flush(
            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
                return 0;
 
-       if (flags & FLUSH_LOG) {
-               if (iip && iip->ili_last_lsn) {
-                       xlog_t          *log = mp->m_log;
-                       xfs_lsn_t       sync_lsn;
-                       int             log_flags = XFS_LOG_FORCE;
-
-                       spin_lock(&log->l_grant_lock);
-                       sync_lsn = log->l_last_sync_lsn;
-                       spin_unlock(&log->l_grant_lock);
-
-                       if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
-                               if (flags & FLUSH_SYNC)
-                                       log_flags |= XFS_LOG_SYNC;
-                               error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
-                               if (error)
-                                       return error;
-                       }
-
-                       if (ip->i_update_core == 0)
-                               return 0;
-               }
-       }
-
        /*
         * We make this non-blocking if the inode is contended,
         * return EAGAIN to indicate to the caller that they
@@ -3498,30 +3475,22 @@ xfs_inode_flush(
         * blocking on inodes inside another operation right
         * now, they get caught later by xfs_sync.
         */
-       if (flags & FLUSH_INODE) {
-               int     flush_flags;
-
-               if (flags & FLUSH_SYNC) {
-                       xfs_ilock(ip, XFS_ILOCK_SHARED);
-                       xfs_iflock(ip);
-               } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-                       if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-                               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                               return EAGAIN;
-                       }
-               } else {
+       if (flags & FLUSH_SYNC) {
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+               xfs_iflock(ip);
+       } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
                        return EAGAIN;
                }
-
-               if (flags & FLUSH_SYNC)
-                       flush_flags = XFS_IFLUSH_SYNC;
-               else
-                       flush_flags = XFS_IFLUSH_ASYNC;
-
-               error = xfs_iflush(ip, flush_flags);
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       } else {
+               return EAGAIN;
        }
 
+       error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
+                                                   : XFS_IFLUSH_ASYNC_NOBLOCK);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
        return error;
 }