ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT...
author Tristan Ye <tristan.ye@oracle.com>
Mon, 11 Oct 2010 08:46:39 +0000 (16:46 +0800)
committer Joel Becker <joel.becker@oracle.com>
Mon, 11 Oct 2010 21:14:55 +0000 (14:14 -0700)
Currently, the default behavior of O_DIRECT writes is to allow
concurrent writing among nodes to the same file, with no cluster
coherency guaranteed (no EX lock held).  This can leave stale data in
the cache for buffered reads on other nodes.

The new mount option introduces a choice between two different
behaviors for O_DIRECT writes:

    * coherency=full, as the default value, will disallow
                      concurrent O_DIRECT writes by taking
                      EX locks.

    * coherency=buffered, allows concurrent O_DIRECT writes
                          without EX lock among nodes, which
                          gains high performance at the risk of
                          getting stale data on other nodes.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Documentation/filesystems/ocfs2.txt
fs/ocfs2/file.c
fs/ocfs2/ocfs2.h
fs/ocfs2/super.c

index 1f7ae144f6d89bc351c8aa34076262bf366f4a4d..5393e6611691617db6c933a945df25baa0bd1090 100644 (file)
@@ -87,3 +87,10 @@ dir_resv_level=      (*)     By default, directory reservations will scale with file
                        reservations - users should rarely need to change this
                        value. If allocation reservations are turned off, this
                        option will have no effect.
+coherency=full  (*)    Disallow concurrent O_DIRECT writes: the cluster inode
+                       lock is taken to force other nodes to drop their
+                       caches, so full cluster coherency is guaranteed even
+                       for O_DIRECT writes.
+coherency=buffered     Allow concurrent O_DIRECT writes without an EX lock among
+                       nodes, which gains high performance at the risk of
+                       getting stale data on other nodes.
index 13af9937bdda15ede9c9e9a42c8eaa3e59964806..9e8cc4346b761e6246494416787b5b78c193f01e 100644 (file)
@@ -2225,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+                              OCFS2_MOUNT_COHERENCY_BUFFERED);
 
        mlog_entry("(0x%p, %u, '%.*s')\n", file,
                   (unsigned int)nr_segs,
@@ -2248,14 +2250,37 @@ relock:
                have_alloc_sem = 1;
        }
 
-       /* concurrent O_DIRECT writes are allowed */
-       rw_level = !direct_io;
+       /*
+        * Concurrent O_DIRECT writes are allowed with
+        * mount_option "coherency=buffered".
+        */
+       rw_level = (!direct_io || full_coherency);
+
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_sems;
        }
 
+       /*
+        * O_DIRECT writes with "coherency=full" need to take EX cluster
+        * inode_lock to guarantee coherency.
+        */
+       if (direct_io && full_coherency) {
+               /*
+                * We need to take and drop the inode lock to force
+                * other nodes to drop their caches.  Buffered I/O
+                * already does this in write_begin().
+                */
+               ret = ocfs2_inode_lock(inode, NULL, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_sems;
+               }
+
+               ocfs2_inode_unlock(inode, 1);
+       }
+
        can_do_direct = direct_io;
        ret = ocfs2_prepare_inode_for_write(file, ppos,
                                            iocb->ki_left, appending,
index 687e291d73f29ea2a56ea9e874955fa9b68bca94..3064feef143039f216d3b77235c2925fe6d81109 100644 (file)
@@ -263,6 +263,9 @@ enum ocfs2_mount_options
                                                   control lists */
        OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
        OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+
+       OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT
+                                                   writes */
 };
 
 #define OCFS2_OSB_SOFT_RO                      0x0001
index b578644b6637823819aae19cd8d142ad20d52fa1..9122d59f8127c6768d14ffe7c1ab48fa2a9e056a 100644 (file)
@@ -177,6 +177,8 @@ enum {
        Opt_noacl,
        Opt_usrquota,
        Opt_grpquota,
+       Opt_coherency_buffered,
+       Opt_coherency_full,
        Opt_resv_level,
        Opt_dir_resv_level,
        Opt_err,
@@ -205,6 +207,8 @@ static const match_table_t tokens = {
        {Opt_noacl, "noacl"},
        {Opt_usrquota, "usrquota"},
        {Opt_grpquota, "grpquota"},
+       {Opt_coherency_buffered, "coherency=buffered"},
+       {Opt_coherency_full, "coherency=full"},
        {Opt_resv_level, "resv_level=%u"},
        {Opt_dir_resv_level, "dir_resv_level=%u"},
        {Opt_err, NULL}
@@ -1452,6 +1456,12 @@ static int ocfs2_parse_options(struct super_block *sb,
                case Opt_grpquota:
                        mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
                        break;
+               case Opt_coherency_buffered:
+                       mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+                       break;
+               case Opt_coherency_full:
+                       mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+                       break;
                case Opt_acl:
                        mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
                        mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1550,6 +1560,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_GRPQUOTA)
                seq_printf(s, ",grpquota");
 
+       if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+               seq_printf(s, ",coherency=buffered");
+       else
+               seq_printf(s, ",coherency=full");
+
        if (opts & OCFS2_MOUNT_NOUSERXATTR)
                seq_printf(s, ",nouser_xattr");
        else