[DLM] fix requestqueue race
author David Teigland <teigland@redhat.com>
Tue, 31 Oct 2006 17:55:56 +0000 (11:55 -0600)
committer Steven Whitehouse <swhiteho@redhat.com>
Thu, 30 Nov 2006 15:35:10 +0000 (10:35 -0500)
Red Hat BZ 211914

There's a race between dlm_recoverd (1) enabling locking and (2) clearing
out the requestqueue, and dlm_recvd (1) checking if locking is enabled and
(2) adding a message to the requestqueue.  An order of recvd(1),
recoverd(1), recoverd(2), recvd(2) will result in a message being left on
the requestqueue.  The fix is to have dlm_recvd check whether dlm_recoverd
has enabled locking after taking the mutex for the requestqueue and, if it
has, process the message instead of queueing it.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
fs/dlm/lock.c
fs/dlm/requestqueue.c
fs/dlm/requestqueue.h

index 3f2befa4797b9a983914dfc16b84c2f3bf852481..6088a16926bfb0de99baef833ba7dac1e7d88ac2 100644 (file)
@@ -3028,10 +3028,17 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 
        while (1) {
                if (dlm_locking_stopped(ls)) {
-                       if (!recovery)
-                               dlm_add_requestqueue(ls, nodeid, hd);
-                       error = -EINTR;
-                       goto out;
+                       if (recovery) {
+                               error = -EINTR;
+                               goto out;
+                       }
+                       error = dlm_add_requestqueue(ls, nodeid, hd);
+                       if (error == -EAGAIN)
+                               continue;
+                       else {
+                               error = -EINTR;
+                               goto out;
+                       }
                }
 
                if (lock_recovery_try(ls))
index 7b2b089634a2df67d7253a0b316a97294a6f86c6..0226d2a0a0faf8107fa5dcbc912c971545bd1048 100644 (file)
@@ -30,26 +30,39 @@ struct rq_entry {
  * lockspace is enabled on some while still suspended on others.
  */
 
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
 {
        struct rq_entry *e;
        int length = hd->h_length;
+       int rv = 0;
 
        if (dlm_is_removed(ls, nodeid))
-               return;
+               return 0;
 
        e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
        if (!e) {
                log_print("dlm_add_requestqueue: out of memory\n");
-               return;
+               return 0;
        }
 
        e->nodeid = nodeid;
        memcpy(e->request, hd, length);
 
+       /* We need to check dlm_locking_stopped() after taking the mutex to
+          avoid a race where dlm_recoverd enables locking and runs
+          process_requestqueue between our earlier dlm_locking_stopped check
+          and this addition to the requestqueue. */
+
        mutex_lock(&ls->ls_requestqueue_mutex);
-       list_add_tail(&e->list, &ls->ls_requestqueue);
+       if (dlm_locking_stopped(ls))
+               list_add_tail(&e->list, &ls->ls_requestqueue);
+       else {
+               log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
+               kfree(e);
+               rv = -EAGAIN;
+       }
        mutex_unlock(&ls->ls_requestqueue_mutex);
+       return rv;
 }
 
 int dlm_process_requestqueue(struct dlm_ls *ls)
index 349f0d292d95ec22688d525cfa3d9301dc903148..6a53ea03335dae50b9d22f1c0c156ff01dfd581d 100644 (file)
@@ -13,7 +13,7 @@
 #ifndef __REQUESTQUEUE_DOT_H__
 #define __REQUESTQUEUE_DOT_H__
 
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
 int dlm_process_requestqueue(struct dlm_ls *ls);
 void dlm_wait_requestqueue(struct dlm_ls *ls);
 void dlm_purge_requestqueue(struct dlm_ls *ls);