ocfs2: calculate lockid hash values outside of the spinlock
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / fs / ocfs2 / dlm / dlmmaster.c
index 2e2e95e6949924f99c81ff9d368c5852437e5ece..953aa8421be4acf28bc9f8816b838ac24ca4cf0c 100644 (file)
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
                                       struct dlm_lock_resource *res,
                                       u8 target);
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+                                      struct dlm_lock_resource *res);
 
 
 int dlm_is_host_down(int errno)
@@ -564,7 +566,7 @@ static void dlm_lockres_release(struct kref *kref)
 
        /* By the time we're ready to blow this guy away, we shouldn't
         * be on any lists. */
-       BUG_ON(!list_empty(&res->list));
+       BUG_ON(!hlist_unhashed(&res->hash_node));
        BUG_ON(!list_empty(&res->granted));
        BUG_ON(!list_empty(&res->converting));
        BUG_ON(!list_empty(&res->blocked));
@@ -601,11 +603,11 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        memcpy(qname, name, namelen);
 
        res->lockname.len = namelen;
-       res->lockname.hash = full_name_hash(name, namelen);
+       res->lockname.hash = dlm_lockid_hash(name, namelen);
 
        init_waitqueue_head(&res->wq);
        spin_lock_init(&res->spinlock);
-       INIT_LIST_HEAD(&res->list);
+       INIT_HLIST_NODE(&res->hash_node);
        INIT_LIST_HEAD(&res->granted);
        INIT_LIST_HEAD(&res->converting);
        INIT_LIST_HEAD(&res->blocked);
@@ -675,18 +677,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
        int blocked = 0;
        int ret, nodenum;
        struct dlm_node_iter iter;
-       unsigned int namelen;
+       unsigned int namelen, hash;
        int tries = 0;
+       int bit, wait_on_recovery = 0;
 
        BUG_ON(!lockid);
 
        namelen = strlen(lockid);
+       hash = dlm_lockid_hash(lockid, namelen);
 
        mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
 
 lookup:
        spin_lock(&dlm->spinlock);
-       tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+       tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
        if (tmpres) {
                spin_unlock(&dlm->spinlock);
                mlog(0, "found in hash!\n");
@@ -762,6 +766,18 @@ lookup:
                dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
                set_bit(dlm->node_num, mle->maybe_map);
                list_add(&mle->list, &dlm->master_list);
+
+               /* still holding the dlm spinlock, check the recovery map
+                * to see if there are any nodes that still need to be 
+                * considered.  these will not appear in the mle nodemap
+                * but they might own this lockres.  wait on them. */
+               bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+               if (bit < O2NM_MAX_NODES) {
+                       mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                            "recover before lock mastery can begin\n",
+                            dlm->name, namelen, (char *)lockid, bit);
+                       wait_on_recovery = 1;
+               }
        }
 
        /* at this point there is either a DLM_MLE_BLOCK or a
@@ -779,6 +795,39 @@ lookup:
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
 
+       while (wait_on_recovery) {
+               /* any cluster changes that occurred after dropping the
+                * dlm spinlock would be detectable by a change on the mle,
+                * so we only need to clear out the recovery map once. */
+               if (dlm_is_recovery_lock(lockid, namelen)) {
+                       mlog(ML_NOTICE, "%s: recovery map is not empty, but "
+                            "must master $RECOVERY lock now\n", dlm->name);
+                       if (!dlm_pre_master_reco_lockres(dlm, res))
+                               wait_on_recovery = 0;
+                       else {
+                               mlog(0, "%s: waiting 500ms for heartbeat state "
+                                   "change\n", dlm->name);
+                               msleep(500);
+                       }
+                       continue;
+               } 
+
+               dlm_kick_recovery_thread(dlm);
+               msleep(100);
+               dlm_wait_for_recovery(dlm);
+
+               spin_lock(&dlm->spinlock);
+               bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+               if (bit < O2NM_MAX_NODES) {
+                       mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                            "recover before lock mastery can begin\n",
+                            dlm->name, namelen, (char *)lockid, bit);
+                       wait_on_recovery = 1;
+               } else
+                       wait_on_recovery = 0;
+               spin_unlock(&dlm->spinlock);
+       }
+
        /* must wait for lock to be mastered elsewhere */
        if (blocked)
                goto wait;
@@ -792,7 +841,15 @@ redo_request:
                        mlog_errno(ret);
                if (mle->master != O2NM_MAX_NODES) {
                        /* found a master ! */
-                       break;
+                       if (mle->master <= nodenum)
+                               break;
+                       /* if our master request has not reached the master
+                        * yet, keep going until it does.  this is how the
+                        * master will know that asserts are needed back to
+                        * the lower nodes. */
+                       mlog(0, "%s:%.*s: requests only up to %u but master "
+                            "is %u, keep going\n", dlm->name, namelen,
+                            lockid, nodenum, mle->master);
                }
        }
 
@@ -860,7 +917,19 @@ recheck:
        /* check if another node has already become the owner */
        spin_lock(&res->spinlock);
        if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+               mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
+                    res->lockname.len, res->lockname.name, res->owner);
                spin_unlock(&res->spinlock);
+               /* this will cause the master to re-assert across
+                * the whole cluster, freeing up mles */
+               ret = dlm_do_master_request(mle, res->owner);
+               if (ret < 0) {
+                       /* give recovery a chance to run */
+                       mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+                       msleep(500);
+                       goto recheck;
+               }
+               ret = 0;
                goto leave;
        }
        spin_unlock(&res->spinlock);
@@ -1244,13 +1313,14 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 {
        u8 response = DLM_MASTER_RESP_MAYBE;
        struct dlm_ctxt *dlm = data;
-       struct dlm_lock_resource *res;
+       struct dlm_lock_resource *res = NULL;
        struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
        struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
        char *name;
-       unsigned int namelen;
+       unsigned int namelen, hash;
        int found, ret;
        int set_maybe;
+       int dispatch_assert = 0;
 
        if (!dlm_grab(dlm))
                return DLM_MASTER_RESP_NO;
@@ -1262,6 +1332,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
        name = request->name;
        namelen = request->namelen;
+       hash = dlm_lockid_hash(name, namelen);
 
        if (namelen > DLM_LOCKID_NAME_MAX) {
                response = DLM_IVBUFLEN;
@@ -1270,7 +1341,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
 way_up_top:
        spin_lock(&dlm->spinlock);
-       res = __dlm_lookup_lockres(dlm, name, namelen);
+       res = __dlm_lookup_lockres(dlm, name, namelen, hash);
        if (res) {
                spin_unlock(&dlm->spinlock);
 
@@ -1287,7 +1358,6 @@ way_up_top:
                }
 
                if (res->owner == dlm->node_num) {
-                       u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
                        spin_unlock(&res->spinlock);
                        // mlog(0, "this node is the master\n");
                        response = DLM_MASTER_RESP_YES;
@@ -1300,16 +1370,7 @@ way_up_top:
                         * caused all nodes up to this one to
                         * create mles.  this node now needs to
                         * go back and clean those up. */
-                       mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
-                            dlm->node_num, res->lockname.len, res->lockname.name);
-                       ret = dlm_dispatch_assert_master(dlm, res, 1,
-                                                        request->node_idx,
-                                                        flags);
-                       if (ret < 0) {
-                               mlog(ML_ERROR, "failed to dispatch assert "
-                                    "master work\n");
-                               response = DLM_MASTER_RESP_ERROR;
-                       }
+                       dispatch_assert = 1;
                        goto send_response;
                } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
                        spin_unlock(&res->spinlock);
@@ -1357,9 +1418,13 @@ way_up_top:
                        }
                } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
                        set_maybe = 0;
-                       if (tmpmle->master == dlm->node_num)
+                       if (tmpmle->master == dlm->node_num) {
                                response = DLM_MASTER_RESP_YES;
-                       else
+                               /* this node will be the owner.
+                                * go back and clean the mles on any
+                                * other nodes */
+                               dispatch_assert = 1;
+                       } else
                                response = DLM_MASTER_RESP_NO;
                } else {
                        // mlog(0, "this node is attempting to "
@@ -1398,8 +1463,8 @@ way_up_top:
                        mle = (struct dlm_master_list_entry *)
                                kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
                        if (!mle) {
-                               // bad bad bad... this sucks.
                                response = DLM_MASTER_RESP_ERROR;
+                               mlog_errno(-ENOMEM);
                                goto send_response;
                        }
                        spin_lock(&dlm->spinlock);
@@ -1418,25 +1483,19 @@ way_up_top:
                // mlog(0, "mle was found\n");
                set_maybe = 1;
                spin_lock(&tmpmle->spinlock);
+               if (tmpmle->master == dlm->node_num) {
+                       mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
+                       BUG();
+               }
                if (tmpmle->type == DLM_MLE_BLOCK)
                        response = DLM_MASTER_RESP_NO;
                else if (tmpmle->type == DLM_MLE_MIGRATION) {
                        mlog(0, "migration mle was found (%u->%u)\n",
                             tmpmle->master, tmpmle->new_master);
-                       if (tmpmle->master == dlm->node_num) {
-                               mlog(ML_ERROR, "no lockres, but migration mle "
-                                    "says that this node is master!\n");
-                               BUG();
-                       }
                        /* real master can respond on its own */
                        response = DLM_MASTER_RESP_NO;
-               } else {
-                       if (tmpmle->master == dlm->node_num) {
-                               response = DLM_MASTER_RESP_YES;
-                               set_maybe = 0;
-                       } else
-                               response = DLM_MASTER_RESP_MAYBE;
-               }
+               } else
+                       response = DLM_MASTER_RESP_MAYBE;
                if (set_maybe)
                        set_bit(request->node_idx, tmpmle->maybe_map);
                spin_unlock(&tmpmle->spinlock);
@@ -1449,6 +1508,24 @@ way_up_top:
                dlm_put_mle(tmpmle);
        }
 send_response:
+
+       if (dispatch_assert) {
+               if (response != DLM_MASTER_RESP_YES)
+                       mlog(ML_ERROR, "invalid response %d\n", response);
+               if (!res) {
+                       mlog(ML_ERROR, "bad lockres while trying to assert!\n");
+                       BUG();
+               }
+               mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
+                            dlm->node_num, res->lockname.len, res->lockname.name);
+               ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 
+                                                DLM_ASSERT_MASTER_MLE_CLEANUP);
+               if (ret < 0) {
+                       mlog(ML_ERROR, "failed to dispatch assert master work\n");
+                       response = DLM_MASTER_RESP_ERROR;
+               }
+       }
+
        dlm_put(dlm);
        return response;
 }
@@ -1471,8 +1548,11 @@ static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
        int to, tmpret;
        struct dlm_node_iter iter;
        int ret = 0;
+       int reassert;
 
        BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+again:
+       reassert = 0;
 
        /* note that if this nodemap is empty, it returns 0 */
        dlm_node_iter_init(nodemap, &iter);
@@ -1504,9 +1584,17 @@ static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
                             "got %d.\n", namelen, lockname, to, r);
                        dlm_dump_lock_resources(dlm);
                        BUG();
+               } else if (r == EAGAIN) {
+                       mlog(0, "%.*s: node %u create mles on other "
+                            "nodes and requests a re-assert\n", 
+                            namelen, lockname, to);
+                       reassert = 1;
                }
        }
 
+       if (reassert)
+               goto again;
+
        return ret;
 }
 
@@ -1526,14 +1614,17 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
        struct dlm_lock_resource *res = NULL;
        char *name;
-       unsigned int namelen;
+       unsigned int namelen, hash;
        u32 flags;
+       int master_request = 0;
+       int ret = 0;
 
        if (!dlm_grab(dlm))
                return 0;
 
        name = assert->name;
        namelen = assert->namelen;
+       hash = dlm_lockid_hash(name, namelen);
        flags = be32_to_cpu(assert->flags);
 
        if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -1582,7 +1673,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 
        /* ok everything checks out with the MLE
         * now check to see if there is a lockres */
-       res = __dlm_lookup_lockres(dlm, name, namelen);
+       res = __dlm_lookup_lockres(dlm, name, namelen, hash);
        if (res) {
                spin_lock(&res->spinlock);
                if (res->state & DLM_LOCK_RES_RECOVERING)  {
@@ -1642,11 +1733,22 @@ ok:
        // mlog(0, "woo!  got an assert_master from node %u!\n",
        //           assert->node_idx);
        if (mle) {
-               int extra_ref;
+               int extra_ref = 0;
+               int nn = -1;
                
                spin_lock(&mle->spinlock);
-               extra_ref = !!(mle->type == DLM_MLE_BLOCK
-                              || mle->type == DLM_MLE_MIGRATION);
+               if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
+                       extra_ref = 1;
+               else {
+                       /* MASTER mle: if any bits set in the response map
+                        * then the calling node needs to re-assert to clear
+                        * up nodes that this node contacted */
+                       while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 
+                                                   nn+1)) < O2NM_MAX_NODES) {
+                               if (nn != dlm->node_num && nn != assert->node_idx)
+                                       master_request = 1;
+                       }
+               }
                mle->master = assert->node_idx;
                atomic_set(&mle->woken, 1);
                wake_up(&mle->wq);
@@ -1677,10 +1779,15 @@ ok:
        }
 
 done:
+       ret = 0;
        if (res)
                dlm_lockres_put(res);
        dlm_put(dlm);
-       return 0;
+       if (master_request) {
+               mlog(0, "need to tell master to reassert\n");
+               ret = EAGAIN;  // positive. negative would shoot down the node.
+       }
+       return ret;
 
 kill:
        /* kill the caller! */
@@ -1713,6 +1820,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
        item->u.am.request_from = request_from;
        item->u.am.flags = flags;
 
+       if (ignore_higher) 
+               mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 
+                    res->lockname.name);
+               
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
@@ -1775,6 +1886,61 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
        mlog(0, "finished with dlm_assert_master_worker\n");
 }
 
+/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
+ * We cannot wait for node recovery to complete to begin mastering this
+ * lockres because this lockres is used to kick off recovery! ;-)
+ * So, do a pre-check on all living nodes to see if any of those nodes
+ * think that $RECOVERY is currently mastered by a dead node.  If so,
+ * we wait a short time to allow that node to get notified by its own
+ * heartbeat stack, then check again.  All $RECOVERY lock resources
+ * mastered by dead nodes are purged when the heartbeat callback is
+ * fired, so we can know for sure that it is safe to continue once
+ * the node returns a live node or no node.  */
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+                                      struct dlm_lock_resource *res)
+{
+       struct dlm_node_iter iter;
+       int nodenum;
+       int ret = 0;
+       u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+       spin_lock(&dlm->spinlock);
+       dlm_node_iter_init(dlm->domain_map, &iter);
+       spin_unlock(&dlm->spinlock);
+
+       while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+               /* do not send to self */
+               if (nodenum == dlm->node_num)
+                       continue;
+               ret = dlm_do_master_requery(dlm, res, nodenum, &master);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       if (!dlm_is_host_down(ret))
+                               BUG();
+                       /* host is down, so answer for that node would be
+                        * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+               }
+
+               if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+                       /* check to see if this master is in the recovery map */
+                       spin_lock(&dlm->spinlock);
+                       if (test_bit(master, dlm->recovery_map)) {
+                               mlog(ML_NOTICE, "%s: node %u has not seen "
+                                    "node %u go down yet, and thinks the "
+                                    "dead node is mastering the recovery "
+                                    "lock.  must wait.\n", dlm->name,
+                                    nodenum, master);
+                               ret = -EAGAIN;
+                       }
+                       spin_unlock(&dlm->spinlock);
+                       mlog(0, "%s: reco lock master is %u\n", dlm->name, 
+                            master);
+                       break;
+               }
+       }
+       return ret;
+}
+
 
 /*
  * DLM_MIGRATE_LOCKRES
@@ -2299,7 +2465,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
        struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
        const char *name;
-       unsigned int namelen;
+       unsigned int namelen, hash;
        int ret = 0;
 
        if (!dlm_grab(dlm))
@@ -2307,6 +2473,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
        name = migrate->name;
        namelen = migrate->namelen;
+       hash = dlm_lockid_hash(name, namelen);
 
        /* preallocate.. if this fails, abort */
        mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
@@ -2319,7 +2486,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
        /* check for pre-existing lock */
        spin_lock(&dlm->spinlock);
-       res = __dlm_lookup_lockres(dlm, name, namelen);
+       res = __dlm_lookup_lockres(dlm, name, namelen, hash);
        spin_lock(&dlm->master_lock);
 
        if (res) {
@@ -2438,6 +2605,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
        struct list_head *iter, *iter2;
        struct dlm_master_list_entry *mle;
        struct dlm_lock_resource *res;
+       unsigned int hash;
 
        mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
 top:
@@ -2521,8 +2689,9 @@ top:
                     mle->master, mle->new_master);
                /* if there is a lockres associated with this
                 * mle, find it and set its owner to UNKNOWN */
+               hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
                res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-                                       mle->u.name.len);
+                                          mle->u.name.len, hash);
                if (res) {
                        /* unfortunately if we hit this rare case, our
                         * lock ordering is messed.  we need to drop