[XFS] write barrier support
author    Christoph Hellwig <hch@sgi.com>
          Tue, 1 Nov 2005 23:26:59 +0000 (10:26 +1100)
committer Nathan Scott <nathans@sgi.com>
          Tue, 1 Nov 2005 23:26:59 +0000 (10:26 +1100)
Issue all log sync operations as ordered writes.  In addition, flush the
disk cache on fsync if the sync operation didn't already flush the log to
disk (this requires some additional bookkeeping in the transaction and log
code).  If the device doesn't claim to support barriers, the filesystem has
an external log volume, or the trial superblock write with barriers enabled
failed, we disable barriers and print a warning.  We should probably fail
the mount completely, but that could lead to nasty boot failures for the
root filesystem.  Not enabled by default yet; needs more destructive
testing first.
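
For illustration, the fsync-side decision this change introduces reduces to:
force the log, let the log code report through a new out parameter whether it
actually issued an I/O, and only flush the device write cache when it did not.
A minimal user-space sketch of that logic follows; force_log() and
issue_device_flush() are hypothetical stubs standing in for the real
_xfs_log_force()/_xfs_trans_commit() and xfs_blkdev_issue_flush() paths, not
the kernel code itself.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for _xfs_log_force(): reports through *log_flushed
 * whether a log write (and thus an ordered/barrier I/O) was actually issued. */
static int force_log(bool *log_flushed)
{
	*log_flushed = false;	/* pretend the log was already on disk */
	return 0;
}

/* Hypothetical stand-in for xfs_blkdev_issue_flush(). */
static void issue_device_flush(const char *dev)
{
	printf("flushing write cache on %s\n", dev);
}

/* Model of the fsync tail: flush the data device cache only when barriers
 * are enabled, the inode changed, and the log force did not already reach
 * stable storage via an ordered write. */
static int fsync_tail(bool barriers_enabled, bool inode_changed)
{
	bool log_flushed = false;
	int error = force_log(&log_flushed);

	if (error)
		return error;
	if (barriers_enabled && inode_changed && !log_flushed)
		issue_device_flush("datadev");
	return 0;
}

int main(void)
{
	return fsync_tail(true, true);
}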

SGI-PV: 912426
SGI-Modid: xfs-linux:xfs-kern:198723a

Signed-off-by: Christoph Hellwig <hch@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
12 files changed:
fs/xfs/linux-2.6/xfs_buf.c
fs/xfs/linux-2.6/xfs_buf.h
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_super.h
fs/xfs/xfs_clnt.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_mount.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_vfsops.c
fs/xfs/xfs_vnodeops.c

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 188cbbd5b74a437ac23f957983b0d766c5369de0..4663f7dbff1cbc9230eb5818a0a8c2fbf539992e 100644
@@ -1295,6 +1295,11 @@ _pagebuf_ioapply(
                rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
        }
 
+       if (pb->pb_flags & PBF_ORDERED) {
+               ASSERT(!(pb->pb_flags & PBF_READ));
+               rw = WRITE_BARRIER;
+       }
+
        /* Special code path for reading a sub page size pagebuf in --
         * we populate up the whole page, and hence the other metadata
         * in the same page.  This optimization is only valid when the
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 39c8ca122534ea9b38a3698d32726e42fcfe771b..fa21d1f9cb0b7823d3353f5edc18076dc3abdbf6 100644
@@ -74,7 +74,7 @@ typedef enum page_buf_flags_e {               /* pb_flags values */
        PBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
        PBF_STALE = (1 << 7),   /* buffer has been staled, do not find it  */
        PBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
-       PBF_FLUSH = (1 << 11),      /* flush disk write cache              */
+       PBF_ORDERED = (1 << 11),    /* use ordered writes                  */
        PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead             */
 
        /* flags used only as arguments to access routines */
@@ -383,9 +383,9 @@ extern void pagebuf_trace(
 #define XFS_BUF_UNASYNC(x)      ((x)->pb_flags &= ~PBF_ASYNC)
 #define XFS_BUF_ISASYNC(x)      ((x)->pb_flags & PBF_ASYNC)
 
-#define XFS_BUF_FLUSH(x)        ((x)->pb_flags |= PBF_FLUSH)
-#define XFS_BUF_UNFLUSH(x)      ((x)->pb_flags &= ~PBF_FLUSH)
-#define XFS_BUF_ISFLUSH(x)      ((x)->pb_flags & PBF_FLUSH)
+#define XFS_BUF_ORDERED(x)      ((x)->pb_flags |= PBF_ORDERED)
+#define XFS_BUF_UNORDERED(x)    ((x)->pb_flags &= ~PBF_ORDERED)
+#define XFS_BUF_ISORDERED(x)    ((x)->pb_flags & PBF_ORDERED)
 
 #define XFS_BUF_SHUT(x)                 printk("XFS_BUF_SHUT not implemented yet\n")
 #define XFS_BUF_UNSHUT(x)       printk("XFS_BUF_UNSHUT not implemented yet\n")
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2302454d8d479d72eeeda5bb472633511ff1d4f7..d2701cc624b918a592ea9f10b6b2d2b2633465d1 100644
@@ -278,6 +278,72 @@ xfs_blkdev_put(
                close_bdev_excl(bdev);
 }
 
+/*
+ * Try to write out the superblock using barriers.
+ */
+STATIC int
+xfs_barrier_test(
+       xfs_mount_t     *mp)
+{
+       xfs_buf_t       *sbp = xfs_getsb(mp, 0);
+       int             error;
+
+       XFS_BUF_UNDONE(sbp);
+       XFS_BUF_UNREAD(sbp);
+       XFS_BUF_UNDELAYWRITE(sbp);
+       XFS_BUF_WRITE(sbp);
+       XFS_BUF_UNASYNC(sbp);
+       XFS_BUF_ORDERED(sbp);
+
+       xfsbdstrat(mp, sbp);
+       error = xfs_iowait(sbp);
+
+       /*
+        * Clear all the flags we set and possible error state in the
+        * buffer.  We only did the write to try out whether barriers
+        * worked and shouldn't leave any traces in the superblock
+        * buffer.
+        */
+       XFS_BUF_DONE(sbp);
+       XFS_BUF_ERROR(sbp, 0);
+       XFS_BUF_UNORDERED(sbp);
+
+       xfs_buf_relse(sbp);
+       return error;
+}
+
+void
+xfs_mountfs_check_barriers(xfs_mount_t *mp)
+{
+       int error;
+
+       if (mp->m_logdev_targp != mp->m_ddev_targp) {
+               xfs_fs_cmn_err(CE_NOTE, mp,
+                 "Disabling barriers, not supported with external log device");
+               mp->m_flags &= ~XFS_MOUNT_BARRIER;
+       }
+
+       if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered ==
+                                       QUEUE_ORDERED_NONE) {
+               xfs_fs_cmn_err(CE_NOTE, mp,
+                 "Disabling barriers, not supported by the underlying device");
+               mp->m_flags &= ~XFS_MOUNT_BARRIER;
+       }
+
+       error = xfs_barrier_test(mp);
+       if (error) {
+               xfs_fs_cmn_err(CE_NOTE, mp,
+                 "Disabling barriers, trial barrier write failed");
+               mp->m_flags &= ~XFS_MOUNT_BARRIER;
+       }
+}
+
+void
+xfs_blkdev_issue_flush(
+       xfs_buftarg_t           *buftarg)
+{
+       blkdev_issue_flush(buftarg->pbr_bdev, NULL);
+}
 
 STATIC struct inode *
 linvfs_alloc_inode(
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index ec7e0035c7310fe7fcdaefcce811aa9e3cbfd8cb..ad77e3743e04b59928e3a501d08fcbecf2d44d88 100644
@@ -132,6 +132,7 @@ extern void xfs_flush_device(struct xfs_inode *);
 extern int  xfs_blkdev_get(struct xfs_mount *, const char *,
                                struct block_device **);
 extern void xfs_blkdev_put(struct block_device *);
+extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern struct export_operations linvfs_export_ops;
 
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index b3215ffe0be86f739eb260e894d30cafb994e1b9..c93cb282f3d80af2cdc87c3fbcb303056f946727 100644
@@ -99,7 +99,7 @@ struct xfs_mount_args {
                                                 * enforcement */
 #define XFSMNT_NOUUID          0x01000000      /* Ignore fs uuid */
 #define XFSMNT_DMAPI           0x02000000      /* enable dmapi/xdsm */
-#define XFSMNT_NOLOGFLUSH      0x04000000      /* Don't flush for log blocks */
+#define XFSMNT_BARRIER         0x04000000      /* use write barriers */
 #define XFSMNT_IDELETE         0x08000000      /* inode cluster delete */
 #define XFSMNT_SWALLOC         0x10000000      /* turn on stripe width
                                                 * allocation */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 51814c32eddf91148cc1585d6802affeb2eb64ef..b9d3ad35240ee9960d47a293c32c8e8e6af4a274 100644
@@ -93,8 +93,11 @@ STATIC int  xlog_state_release_iclog(xlog_t          *log,
 STATIC void xlog_state_switch_iclogs(xlog_t            *log,
                                     xlog_in_core_t *iclog,
                                     int                eventual_size);
-STATIC int  xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
-STATIC int  xlog_state_sync_all(xlog_t *log, uint flags);
+STATIC int  xlog_state_sync(xlog_t                     *log,
+                           xfs_lsn_t                   lsn,
+                           uint                        flags,
+                           int                         *log_flushed);
+STATIC int  xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
 STATIC void xlog_state_want_sync(xlog_t        *log, xlog_in_core_t *iclog);
 
 /* local functions to manipulate grant head */
@@ -312,12 +315,17 @@ xfs_log_done(xfs_mount_t  *mp,
  * semaphore.
  */
 int
-xfs_log_force(xfs_mount_t *mp,
-             xfs_lsn_t   lsn,
-             uint        flags)
+_xfs_log_force(
+       xfs_mount_t     *mp,
+       xfs_lsn_t       lsn,
+       uint            flags,
+       int             *log_flushed)
 {
-       int     rval;
-       xlog_t *log = mp->m_log;
+       xlog_t          *log = mp->m_log;
+       int             dummy;
+
+       if (!log_flushed)
+               log_flushed = &dummy;
 
 #if defined(DEBUG) || defined(XLOG_NOLOG)
        if (!xlog_debug && xlog_target == log->l_targ)
@@ -328,17 +336,12 @@ xfs_log_force(xfs_mount_t *mp,
 
        XFS_STATS_INC(xs_log_force);
 
-       if ((log->l_flags & XLOG_IO_ERROR) == 0) {
-               if (lsn == 0)
-                       rval = xlog_state_sync_all(log, flags);
-               else
-                       rval = xlog_state_sync(log, lsn, flags);
-       } else {
-               rval = XFS_ERROR(EIO);
-       }
-
-       return rval;
-
+       if (log->l_flags & XLOG_IO_ERROR)
+               return XFS_ERROR(EIO);
+       if (lsn == 0)
+               return xlog_state_sync_all(log, flags, log_flushed);
+       else
+               return xlog_state_sync(log, lsn, flags, log_flushed);
 }      /* xfs_log_force */
 
 /*
@@ -1467,14 +1470,13 @@ xlog_sync(xlog_t                *log,
        XFS_BUF_BUSY(bp);
        XFS_BUF_ASYNC(bp);
        /*
-        * Do a disk write cache flush for the log block.
-        * This is a bit of a sledgehammer, it would be better
-        * to use a tag barrier here that just prevents reordering.
+        * Do an ordered write for the log block.
+        *
         * It may not be needed to flush the first split block in the log wrap
         * case, but do it anyways to be safe -AK
         */
-       if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
-               XFS_BUF_FLUSH(bp);
+       if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+               XFS_BUF_ORDERED(bp);
 
        ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
        ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1505,8 +1507,8 @@ xlog_sync(xlog_t          *log,
                XFS_BUF_SET_FSPRIVATE(bp, iclog);
                XFS_BUF_BUSY(bp);
                XFS_BUF_ASYNC(bp);
-               if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
-                       XFS_BUF_FLUSH(bp);
+               if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+                       XFS_BUF_ORDERED(bp);
                dptr = XFS_BUF_PTR(bp);
                /*
                 * Bump the cycle numbers at the start of each block
@@ -2951,7 +2953,7 @@ xlog_state_switch_iclogs(xlog_t           *log,
  *             not in the active nor dirty state.
  */
 STATIC int
-xlog_state_sync_all(xlog_t *log, uint flags)
+xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 {
        xlog_in_core_t  *iclog;
        xfs_lsn_t       lsn;
@@ -3000,6 +3002,7 @@ xlog_state_sync_all(xlog_t *log, uint flags)
 
                                if (xlog_state_release_iclog(log, iclog))
                                        return XFS_ERROR(EIO);
+                               *log_flushed = 1;
                                s = LOG_LOCK(log);
                                if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
                                    iclog->ic_state != XLOG_STATE_DIRTY)
@@ -3043,6 +3046,7 @@ maybe_sleep:
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR)
                        return XFS_ERROR(EIO);
+               *log_flushed = 1;
 
        } else {
 
@@ -3068,7 +3072,8 @@ no_sleep:
 int
 xlog_state_sync(xlog_t   *log,
                xfs_lsn_t lsn,
-               uint      flags)
+               uint      flags,
+               int       *log_flushed)
 {
     xlog_in_core_t     *iclog;
     int                        already_slept = 0;
@@ -3120,6 +3125,7 @@ try_again:
                        XFS_STATS_INC(xs_log_force_sleep);
                        sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
                                &log->l_icloglock, s);
+                       *log_flushed = 1;
                        already_slept = 1;
                        goto try_again;
                } else {
@@ -3128,6 +3134,7 @@ try_again:
                        LOG_UNLOCK(log, s);
                        if (xlog_state_release_iclog(log, iclog))
                                return XFS_ERROR(EIO);
+                       *log_flushed = 1;
                        s = LOG_LOCK(log);
                }
        }
@@ -3152,6 +3159,7 @@ try_again:
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR)
                        return XFS_ERROR(EIO);
+               *log_flushed = 1;
        } else {                /* just return */
                LOG_UNLOCK(log, s);
        }
@@ -3606,6 +3614,7 @@ xfs_log_force_umount(
        xlog_ticket_t   *tic;
        xlog_t          *log;
        int             retval;
+       int             dummy;
        SPLDECL(s);
        SPLDECL(s2);
 
@@ -3684,7 +3693,7 @@ xfs_log_force_umount(
                 * Force the incore logs to disk before shutting the
                 * log down completely.
                 */
-               xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
+               xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy);
                s2 = LOG_LOCK(log);
                retval = xlog_state_ioerror(log);
                LOG_UNLOCK(log, s2);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 18961119fc65c34e89cafde638a882950c4a5732..dc920f83412d24da4f074193048a872be26d0dd1 100644
@@ -174,9 +174,12 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
                       xfs_log_ticket_t ticket,
                       void             **iclog,
                       uint             flags);
-int      xfs_log_force(struct xfs_mount *mp,
-                       xfs_lsn_t        lsn,
-                       uint             flags);
+int      _xfs_log_force(struct xfs_mount *mp,
+                        xfs_lsn_t      lsn,
+                        uint           flags,
+                        int            *log_forced);
+#define xfs_log_force(mp, lsn, flags) \
+	_xfs_log_force(mp, lsn, flags, NULL)
 int      xfs_log_mount(struct xfs_mount        *mp,
                        struct xfs_buftarg      *log_target,
                        xfs_daddr_t             start_block,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5affba38a57753510952570e6dd0f71969ea350a..bc55931ac74e93e3751e8f68fadf41791f1be70c 100644
@@ -415,7 +415,7 @@ typedef struct xfs_mount {
                                                 * 32 bits in size */
 #define XFS_MOUNT_32BITINOOPT  0x00008000      /* saved mount option state */
 #define XFS_MOUNT_NOUUID       0x00010000      /* ignore uuid during mount */
-#define XFS_MOUNT_NOLOGFLUSH   0x00020000
+#define XFS_MOUNT_BARRIER      0x00020000
 #define XFS_MOUNT_IDELETE      0x00040000      /* delete empty inode clusters*/
 #define XFS_MOUNT_SWALLOC      0x00080000      /* turn on stripe width
                                                 * allocation */
@@ -542,6 +542,7 @@ extern xfs_mount_t *xfs_mount_init(void);
 extern void    xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern void    xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
 extern int     xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
+extern void    xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern int     xfs_unmountfs(xfs_mount_t *, struct cred *);
 extern void    xfs_unmountfs_close(xfs_mount_t *, struct cred *);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 92efe272b83d6d0ea79d1ad184a93d76da690055..5e33891b80496c8fbcaf9d4fac0f8a4b341d0271 100644
@@ -661,10 +661,11 @@ xfs_trans_unreserve_and_mod_sb(
  */
  /*ARGSUSED*/
 int
-xfs_trans_commit(
+_xfs_trans_commit(
        xfs_trans_t     *tp,
        uint            flags,
-       xfs_lsn_t       *commit_lsn_p)
+       xfs_lsn_t       *commit_lsn_p,
+       int             *log_flushed)
 {
        xfs_log_iovec_t         *log_vector;
        int                     nvec;
@@ -893,9 +894,11 @@ shut_us_down:
         * log out now and wait for it.
         */
        if (sync) {
-               if (!error)
-                       error = xfs_log_force(mp, commit_lsn,
-                                     XFS_LOG_FORCE | XFS_LOG_SYNC);
+               if (!error) {
+                       error = _xfs_log_force(mp, commit_lsn,
+                                     XFS_LOG_FORCE | XFS_LOG_SYNC,
+                                     log_flushed);
+               }
                XFS_STATS_INC(xs_trans_sync);
        } else {
                XFS_STATS_INC(xs_trans_async);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a263aec8b3a6532b8dd8b887cc6f073bf4348f42..0cc7af5c1f001f582e6d1b179757fc0c1fc237ee 100644
@@ -1025,7 +1025,12 @@ void             xfs_trans_log_efd_extent(xfs_trans_t *,
                                         struct xfs_efd_log_item *,
                                         xfs_fsblock_t,
                                         xfs_extlen_t);
-int            xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
+int            _xfs_trans_commit(xfs_trans_t *,
+                                 uint flags,
+                                 xfs_lsn_t *,
+                                 int *);
+#define xfs_trans_commit(tp, flags, lsn) \
+       _xfs_trans_commit(tp, flags, lsn, NULL)
 void           xfs_trans_cancel(xfs_trans_t *, int);
 void           xfs_trans_ail_init(struct xfs_mount *);
 xfs_lsn_t      xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index f1a904e23adeff9d1c838f958940a0d42e13a14a..8238c7517822f2da54642d4d7b9c36c08b733da4 100644
@@ -321,8 +321,8 @@ xfs_start_flags(
 
        if (ap->flags & XFSMNT_NOUUID)
                mp->m_flags |= XFS_MOUNT_NOUUID;
-       if (ap->flags & XFSMNT_NOLOGFLUSH)
-               mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
+       if (ap->flags & XFSMNT_BARRIER)
+               mp->m_flags |= XFS_MOUNT_BARRIER;
 
        return 0;
 }
@@ -512,8 +512,14 @@ xfs_mount(
                goto error2;
 
        error = XFS_IOINIT(vfsp, args, flags);
-       if (!error)
-               return 0;
+       if (error)
+               goto error2;
+
+       if ((args->flags & XFSMNT_BARRIER) &&
+           !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY))
+               xfs_mountfs_check_barriers(mp);
+       return 0;
+
 error2:
        if (mp->m_sb_bp)
                xfs_freesb(mp);
@@ -656,19 +662,24 @@ xfs_mntupdate(
        else
                mp->m_flags &= ~XFS_MOUNT_NOATIME;
 
-       if (!(vfsp->vfs_flag & VFS_RDONLY)) {
-               VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+       if ((vfsp->vfs_flag & VFS_RDONLY) &&
+           !(*flags & MS_RDONLY)) {
+               vfsp->vfs_flag &= ~VFS_RDONLY;
+
+               if (args->flags & XFSMNT_BARRIER)
+                       xfs_mountfs_check_barriers(mp);
        }
 
-       if (*flags & MS_RDONLY) {
+       if (!(vfsp->vfs_flag & VFS_RDONLY) &&
+           (*flags & MS_RDONLY)) {
+               VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+
                xfs_quiesce_fs(mp);
 
                /* Ok now write out an unmount record */
                xfs_log_unmount_write(mp);
                xfs_unmountfs_writesb(mp);
                vfsp->vfs_flag |= VFS_RDONLY;
-       } else {
-               vfsp->vfs_flag &= ~VFS_RDONLY;
        }
 
        return 0;
@@ -1628,7 +1639,8 @@ xfs_vget(
 #define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
 #define MNTOPT_IHASHSIZE    "ihashsize"    /* size of inode hash table */
 #define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
-#define MNTOPT_NOLOGFLUSH   "nologflush"   /* don't hard flush on log writes */
+#define MNTOPT_BARRIER "barrier"       /* use write barriers for log writes and
+                                          unwritten extent conversion */
 #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
 #define MNTOPT_64BITINODE   "inode64"  /* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP   "ikeep"         /* do not free empty inode clusters */
@@ -1791,8 +1803,8 @@ xfs_parseargs(
 #endif
                } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
                        args->flags |= XFSMNT_NOUUID;
-               } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
-                       args->flags |= XFSMNT_NOLOGFLUSH;
+               } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+                       args->flags |= XFSMNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
                        args->flags &= ~XFSMNT_IDELETE;
                } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
@@ -1866,7 +1878,7 @@ xfs_showargs(
                { XFS_MOUNT_NOUUID,             "," MNTOPT_NOUUID },
                { XFS_MOUNT_NORECOVERY,         "," MNTOPT_NORECOVERY },
                { XFS_MOUNT_OSYNCISOSYNC,       "," MNTOPT_OSYNCISOSYNC },
-               { XFS_MOUNT_NOLOGFLUSH,         "," MNTOPT_NOLOGFLUSH },
+               { XFS_MOUNT_BARRIER,            "," MNTOPT_BARRIER },
                { XFS_MOUNT_IDELETE,            "," MNTOPT_NOIKEEP },
                { 0, NULL }
        };
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 58bfe629b93316168b5c2ded720b6ea9cf0b6a7a..e2bf2ef58b66910962f0f6f4762b5ba46b53af34 100644
@@ -1118,6 +1118,7 @@ xfs_fsync(
        xfs_inode_t     *ip;
        xfs_trans_t     *tp;
        int             error;
+       int             log_flushed = 0, changed = 1;
 
        vn_trace_entry(BHV_TO_VNODE(bdp),
                        __FUNCTION__, (inst_t *)__return_address);
@@ -1171,10 +1172,18 @@ xfs_fsync(
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
                if (xfs_ipincount(ip)) {
-                       xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+                       _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
                                      XFS_LOG_FORCE |
                                      ((flag & FSYNC_WAIT)
-                                      ? XFS_LOG_SYNC : 0));
+                                      ? XFS_LOG_SYNC : 0),
+                                     &log_flushed);
+               } else {
+                       /*
+                        * If the inode is not pinned and nothing
+                        * has changed we don't need to flush the
+                        * cache.
+                        */
+                       changed = 0;
                }
                error = 0;
        } else  {
@@ -1210,10 +1219,27 @@ xfs_fsync(
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                if (flag & FSYNC_WAIT)
                        xfs_trans_set_sync(tp);
-               error = xfs_trans_commit(tp, 0, NULL);
+               error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);
 
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
+
+       if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+               /*
+                * If the log write didn't issue an ordered tag we need
+                * to flush the disk cache for the data device now.
+                */
+               if (!log_flushed)
+                       xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+               /*
+                * If this inode is on the RT dev we need to flush that
+                * cache aswell.
+                */
+                * cache as well.
+                       xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+       }
+
        return error;
 }