ocfs2: fix a tiny race that leads to a read-only file system
author Jiufei Xue <xuejiufei@huawei.com>
Tue, 15 Mar 2016 21:53:20 +0000 (14:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Mar 2016 23:55:16 +0000 (16:55 -0700)
When o2hb detects that a node is down, it first sets the dead node in the
recovery map and creates ocfs2rec, which will replay the journal for the
dead node.  The o2hb thread then calls dlm_do_local_recovery_cleanup() to
delete the locks of the dead node.  After the locks of the dead node are
gone, locks for other nodes can be granted, and those nodes may modify the
metadata before the journal of the dead node has been replayed.  The
details are described as follows.

     N1                         N2                   N3(master)
modify the extent tree of
inode, and commit
dirty metadata to journal,
then goes down.
                                                 o2hb thread detects
                                                 N1 goes down, set
                                                 recovery map and
                                                 delete the lock of N1.

                                                 dlm_thread flush ast
                                                 for the lock of N2.
                        do not detect the death
                        of N1, so recovery map is
                        empty.

                        read inode from disk
                        without replaying
                        the journal of N1 and
                        modify the extent tree
                        of the inode that N1
                        had modified.
                                                 ocfs2rec recover the
                                                 journal of N1.
                                                 The modification of N2
                                                 is lost.

The modifications of N1 and N2 are not serialized, which will lead to a
read-only file system.  We can set the recovery_waiting flag on the lock
resource after deleting the lock of the dead node, to prevent other nodes
from getting the lock before dlm recovery.  After dlm recovery, the recovery
map on N2 is not empty, so ocfs2_inode_lock_full_nested() will wait for
ocfs2 recovery.

Signed-off-by: Jiufei Xue <xuejiufei@huawei.com>
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlm/dlmthread.c

index 3b77862fc85d39eac5e03babde712e177caf3d6a..004f2cbe8f71e7c4ffb8833f8cc1e249680f99c4 100644 (file)
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
 #define DLM_LOCK_RES_DROPPING_REF         0x00000040
 #define DLM_LOCK_RES_BLOCK_DIRTY          0x00001000
 #define DLM_LOCK_RES_SETREF_INPROG        0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING     0x00004000
 
 /* max milliseconds to wait to sync up a network failure with a node death */
 #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -804,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
 
        assert_spin_locked(&res->spinlock);
 
-       if (res->state & DLM_LOCK_RES_RECOVERING)
+       if (res->state & (DLM_LOCK_RES_RECOVERING|
+                       DLM_LOCK_RES_RECOVERY_WAITING))
                status = DLM_RECOVERING;
        else if (res->state & DLM_LOCK_RES_MIGRATING)
                status = DLM_MIGRATING;
@@ -1026,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 {
        __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
                                          DLM_LOCK_RES_RECOVERING|
+                                         DLM_LOCK_RES_RECOVERY_WAITING|
                                          DLM_LOCK_RES_MIGRATING));
 }
 
index 87e22541850e3138b506cb4032dc94671c12a157..9aed6e2022014afb71988de025e583b38aea3892 100644 (file)
@@ -2550,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
                return 0;
 
        /* delay migration when the lockres is in RECOCERING state */
-       if (res->state & DLM_LOCK_RES_RECOVERING)
+       if (res->state & (DLM_LOCK_RES_RECOVERING|
+                       DLM_LOCK_RES_RECOVERY_WAITING))
                return 0;
 
        if (res->owner != dlm->node_num)
index 213279db3f282021e3f4a3647e1aea460eecf56d..cd38488a10fc104fba6339cc2d6c9501a4251dfe 100644 (file)
@@ -2175,6 +2175,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
                bucket = dlm_lockres_hash(dlm, i);
                hlist_for_each_entry(res, bucket, hash_node) {
+                       if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+                               spin_lock(&res->spinlock);
+                               res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+                               spin_unlock(&res->spinlock);
+                               wake_up(&res->wq);
+                       }
+
                        if (!(res->state & DLM_LOCK_RES_RECOVERING))
                                continue;
 
@@ -2312,6 +2319,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                             res->lockname.len, res->lockname.name, freed, dead_node);
                        __dlm_print_one_lock_resource(res);
                }
+               res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
                dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
        } else if (test_bit(dead_node, res->refmap)) {
                mlog(0, "%s:%.*s: dead node %u had a ref, but had "
index 22e6eb8b8d22f9f9f15aa886467cabca6649ce48..68d239ba0c63c8c925644755d16a1f020298bfba 100644 (file)
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
        if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
                return 0;
 
-       if (res->state & DLM_LOCK_RES_RECOVERING)
+       if (res->state & (DLM_LOCK_RES_RECOVERING|
+                       DLM_LOCK_RES_RECOVERY_WAITING))
                return 0;
 
        /* Another node has this resource with this node as the master */
@@ -707,7 +708,8 @@ static int dlm_thread(void *data)
                         * dirty for a short while. */
                        BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
                        if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
-                                         DLM_LOCK_RES_RECOVERING)) {
+                                         DLM_LOCK_RES_RECOVERING |
+                                         DLM_LOCK_RES_RECOVERY_WAITING)) {
                                /* move it to the tail and keep going */
                                res->state &= ~DLM_LOCK_RES_DIRTY;
                                spin_unlock(&res->spinlock);