#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
#define DLM_LOCK_RES_MIGRATING 0x00000020
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
#define DLM_PURGE_INTERVAL_MS (8 * 1000)
struct dlm_lock_resource
void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
+ /* instead of logging the same network error over
+ * and over, sleep here and wait for the heartbeat
+ * to notice the node is dead. times out after 5s. */
+ dlm_wait_for_node_death(dlm, res->owner,
+ DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
"from convert message!\n", res->owner);
mlog(0, "retrying lock with migration/"
"recovery/in progress\n");
msleep(100);
- dlm_wait_for_recovery(dlm);
+ /* no waiting for dlm_reco_thread */
+ if (recovery) {
+ if (status == DLM_RECOVERING) {
+ mlog(0, "%s: got RECOVERING "
+ "for $REOCVERY lock, master "
+ "was %u\n", dlm->name,
+ res->owner);
+ dlm_wait_for_node_death(dlm, res->owner,
+ DLM_NODE_DEATH_WAIT_MAX);
+ }
+ } else {
+ dlm_wait_for_recovery(dlm);
+ }
goto retry_lock;
}
return dead;
}
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+ if (timeout) {
+ mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+ "death of node %u\n", dlm->name, timeout, node);
+ wait_event_timeout(dlm->dlm_reco_thread_wq,
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ } else {
+ mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+ "of death of node %u\n", dlm->name, node);
+ wait_event(dlm->dlm_reco_thread_wq,
+ dlm_is_node_dead(dlm, node));
+ }
+ /* for now, return 0 */
+ return 0;
+}
+
/* callers of the top-level api calls (dlmlock/dlmunlock) should
* block on the dlm->reco.event when recovery is in progress.
* the dlm recovery thread will set this state when it begins