ext3: Wait for proper transaction commit on fsync
authorJan Kara <jack@suse.cz>
Fri, 16 Oct 2009 17:26:15 +0000 (19:26 +0200)
committerJan Kara <jack@suse.cz>
Wed, 11 Nov 2009 14:22:49 +0000 (15:22 +0100)
We cannot rely on buffer dirty bits during fsync because pdflush can come
before fsync is called and clear dirty bits without forcing a transaction
commit. What we do is that we track which transaction has last changed
the inode and which transaction last changed allocation and force it to
disk on fsync.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
fs/ext3/fsync.c
fs/ext3/inode.c
fs/ext3/super.c
include/linux/ext3_fs_i.h

index 451d166bbe93ac499eae45b6c90550eaed1b5f3f..8209f266e9ade19dbcb244634cc39a02623f2fa5 100644 (file)
 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;
+       struct ext3_inode_info *ei = EXT3_I(inode);
+       journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
        int ret = 0;
+       tid_t commit_tid;
+
+       if (inode->i_sb->s_flags & MS_RDONLY)
+               return 0;
 
        J_ASSERT(ext3_journal_current_handle() == NULL);
 
        /*
-        * data=writeback:
+        * data=writeback,ordered:
         *  The caller's filemap_fdatawrite()/wait will sync the data.
-        *  sync_inode() will sync the metadata
-        *
-        * data=ordered:
-        *  The caller's filemap_fdatawrite() will write the data and
-        *  sync_inode() will write the inode if it is dirty.  Then the caller's
-        *  filemap_fdatawait() will wait on the pages.
+        *  Metadata is in the journal, we wait for a proper transaction
+        *  to commit here.
         *
         * data=journal:
         *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
                goto out;
        }
 
-       if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-               goto flush;
+       if (datasync)
+               commit_tid = atomic_read(&ei->i_datasync_tid);
+       else
+               commit_tid = atomic_read(&ei->i_sync_tid);
 
-       /*
-        * The VFS has written the file data.  If the inode is unaltered
-        * then we need not start a commit.
-        */
-       if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-               struct writeback_control wbc = {
-                       .sync_mode = WB_SYNC_ALL,
-                       .nr_to_write = 0, /* sys_fsync did this */
-               };
-               ret = sync_inode(inode, &wbc);
+       if (log_start_commit(journal, commit_tid)) {
+               log_wait_commit(journal, commit_tid);
                goto out;
        }
-flush:
+
        /*
         * In case we didn't commit a transaction, we have to flush
         * disk caches manually so that data really is on persistent
index 069a163393b464c5ad5c5a4000734ea176b265e1..354ed3b47b301dc34fb13c0ace2e5123311a9e89 100644 (file)
@@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
        int err = 0;
        struct ext3_block_alloc_info *block_i;
        ext3_fsblk_t current_block;
+       struct ext3_inode_info *ei = EXT3_I(inode);
 
-       block_i = EXT3_I(inode)->i_block_alloc_info;
+       block_i = ei->i_block_alloc_info;
        /*
         * If we're splicing into a [td]indirect block (as opposed to the
         * inode) then we need to get write access to the [td]indirect block
@@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 
        inode->i_ctime = CURRENT_TIME_SEC;
        ext3_mark_inode_dirty(handle, inode);
+       /* ext3_mark_inode_dirty already updated i_sync_tid */
+       atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 
        /* had we spliced it onto indirect block? */
        if (where->bh) {
@@ -2754,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
        struct ext3_inode_info *ei;
        struct buffer_head *bh;
        struct inode *inode;
+       journal_t *journal = EXT3_SB(sb)->s_journal;
+       transaction_t *transaction;
        long ret;
        int block;
 
@@ -2831,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
 
+       /*
+        * Set transaction id's of transactions that have to be committed
+        * to finish f[data]sync. We set them to currently running transaction
+        * as we cannot be sure that the inode or some of its metadata isn't
+        * part of the transaction - the inode could have been reclaimed and
+        * now it is reread from disk.
+        */
+       if (journal) {
+               tid_t tid;
+
+               spin_lock(&journal->j_state_lock);
+               if (journal->j_running_transaction)
+                       transaction = journal->j_running_transaction;
+               else
+                       transaction = journal->j_committing_transaction;
+               if (transaction)
+                       tid = transaction->t_tid;
+               else
+                       tid = journal->j_commit_sequence;
+               spin_unlock(&journal->j_state_lock);
+               atomic_set(&ei->i_sync_tid, tid);
+               atomic_set(&ei->i_datasync_tid, tid);
+       }
+
        if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
            EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
                /*
@@ -3015,6 +3044,7 @@ again:
                err = rc;
        ei->i_state &= ~EXT3_STATE_NEW;
 
+       atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
 out_brelse:
        brelse (bh);
        ext3_std_error(inode->i_sb, err);
index 7a520a862f49fc300967177d3e7ca911b295bd49..427496c4767cda4c542f43cb97b8a3089e77bcce 100644 (file)
@@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
                return NULL;
        ei->i_block_alloc_info = NULL;
        ei->vfs_inode.i_version = 1;
+       atomic_set(&ei->i_datasync_tid, 0);
+       atomic_set(&ei->i_sync_tid, 0);
        return &ei->vfs_inode;
 }
 
index ca1bfe90004fa5473f35b884ad0e33abbde6ef25..93e7428156baf7e1c60ae0499b52d89f71119676 100644 (file)
@@ -137,6 +137,14 @@ struct ext3_inode_info {
         * by other means, so we have truncate_mutex.
         */
        struct mutex truncate_mutex;
+
+       /*
+        * Transactions that contain inode's metadata needed to complete
+        * fsync and fdatasync, respectively.
+        */
+       atomic_t i_sync_tid;
+       atomic_t i_datasync_tid;
+
        struct inode vfs_inode;
 };