[PATCH] ocfs2: recheck recovery state after getting lock
author: Kurt Hackel <kurt.hackel@oracle.com>
Thu, 19 Jan 2006 01:01:25 +0000 (17:01 -0800)
committer: Mark Fasheh <mark.fasheh@oracle.com>
Thu, 16 Feb 2006 20:00:16 +0000 (12:00 -0800)
* after successfully taking the $RECOVERY lock in EX mode, recheck to make
  sure that recovery has not already begun or completed on another node

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/dlm/dlmrecovery.c

index 186e9a76aa5807565ad09e31e0bd0fa7584dcab5..f9ce864966ec60a90ce8c25badca9028ebccce74 100644 (file)
@@ -2032,6 +2032,30 @@ again:
                             dlm->reco.new_master);
                        status = -EEXIST;
                } else {
+                       status = 0;
+
+                       /* see if recovery was already finished elsewhere */
+                       spin_lock(&dlm->spinlock);
+                       if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+                               status = -EINVAL;       
+                               mlog(0, "%s: got reco EX lock, but "
+                                    "node got recovered already\n", dlm->name);
+                               if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+                                       mlog(ML_ERROR, "%s: new master is %u "
+                                            "but no dead node!\n", 
+                                            dlm->name, dlm->reco.new_master);
+                                       BUG();
+                               }
+                       }
+                       spin_unlock(&dlm->spinlock);
+               }
+
+               /* if this node has actually become the recovery master,
+                * set the master and send the messages to begin recovery */
+               if (!status) {
+                       mlog(0, "%s: dead=%u, this=%u, sending "
+                            "begin_reco now\n", dlm->name, 
+                            dlm->reco.dead_node, dlm->node_num);
                        status = dlm_send_begin_reco_message(dlm,
                                      dlm->reco.dead_node);
                        /* this always succeeds */