staging/lustre/ptlrpc: track unreplied requests
authorNiu Yawei <yawei.niu@intel.com>
Thu, 17 Nov 2016 01:05:29 +0000 (20:05 -0500)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 17 Nov 2016 15:32:32 +0000 (16:32 +0100)
The request xid was used to make sure the ost object timestamps
being updated by the out of order setattr/punch/write requests
properly. However, this mechanism is broken by the multiple rcvd
slot feature, where we deferred the xid assignment from request
packing to request sending.

This patch moved back the xid assignment to request packing, and
the manner of finding lowest unreplied xid is changed from scan
sending & delay list to scan a unreplied requests list.

This patch also skipped packing the known replied XID in connect
and disconnect request, so that we can make sure the known replied
XID is increased only on both server & client side.

Signed-off-by: Niu Yawei <yawei.niu@intel.com>
Reviewed-on: http://review.whamcloud.com/16759
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-5951
Reviewed-by: Gregoire Pichon <gregoire.pichon@bull.net>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/staging/lustre/lustre/include/lustre_import.h
drivers/staging/lustre/lustre/include/lustre_net.h
drivers/staging/lustre/lustre/obdclass/genops.c
drivers/staging/lustre/lustre/ptlrpc/client.c
drivers/staging/lustre/lustre/ptlrpc/import.c
drivers/staging/lustre/lustre/ptlrpc/niobuf.c
drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
drivers/staging/lustre/lustre/ptlrpc/recover.c

index 5461ba33d90c0a1e581787902d7b86ab0681ef5c..4499c697d7faed65edbd87833243acaf427ea538 100644 (file)
@@ -185,6 +185,11 @@ struct obd_import {
        struct list_head               *imp_replay_cursor;
        /** @} */
 
+       /** List of not replied requests */
+       struct list_head        imp_unreplied_list;
+       /** Known maximal replied XID */
+       __u64                   imp_known_replied_xid;
+
        /** obd device for this import */
        struct obd_device       *imp_obd;
 
index d2cbec343c550b623ef68d8fea72a3cabced8fdd..2be135d1fc9398bc2643f92b0ee71758f44ff3c2 100644 (file)
@@ -596,6 +596,8 @@ struct ptlrpc_cli_req {
        union ptlrpc_async_args          cr_async_args;
        /** Opaq data for replay and commit callbacks. */
        void                            *cr_cb_data;
+       /** Link to the imp->imp_unreplied_list */
+       struct list_head                 cr_unreplied_list;
        /**
         * Commit callback, called when request is committed and about to be
         * freed.
@@ -635,6 +637,7 @@ struct ptlrpc_cli_req {
 #define rq_interpret_reply     rq_cli.cr_reply_interp
 #define rq_async_args          rq_cli.cr_async_args
 #define rq_cb_data             rq_cli.cr_cb_data
+#define rq_unreplied_list      rq_cli.cr_unreplied_list
 #define rq_commit_cb           rq_cli.cr_commit_cb
 #define rq_replay_cb           rq_cli.cr_replay_cb
 
index 438d619059a9a0f794e30ffcb0d1fb77e567e08d..fa0d38ddccb21aa1d0f9cc8b26ad694e4f88a549 100644 (file)
@@ -907,6 +907,8 @@ struct obd_import *class_new_import(struct obd_device *obd)
        INIT_LIST_HEAD(&imp->imp_sending_list);
        INIT_LIST_HEAD(&imp->imp_delayed_list);
        INIT_LIST_HEAD(&imp->imp_committed_list);
+       INIT_LIST_HEAD(&imp->imp_unreplied_list);
+       imp->imp_known_replied_xid = 0;
        imp->imp_replay_cursor = &imp->imp_committed_list;
        spin_lock_init(&imp->imp_lock);
        imp->imp_last_success_conn = 0;
index d2f4cd5a9a9d1949a0c29b0ef9e57ff02fa6fc62..ac959ef1fab090d9584355a5aa9cf51aa53eac0c 100644 (file)
@@ -652,6 +652,42 @@ static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
        spin_unlock(&pool->prp_lock);
 }
 
+void ptlrpc_add_unreplied(struct ptlrpc_request *req)
+{
+       struct obd_import       *imp = req->rq_import;
+       struct list_head        *tmp;
+       struct ptlrpc_request   *iter;
+
+       assert_spin_locked(&imp->imp_lock);
+       LASSERT(list_empty(&req->rq_unreplied_list));
+
+       /* unreplied list is sorted by xid in ascending order */
+       list_for_each_prev(tmp, &imp->imp_unreplied_list) {
+               iter = list_entry(tmp, struct ptlrpc_request,
+                                 rq_unreplied_list);
+
+               LASSERT(req->rq_xid != iter->rq_xid);
+               if (req->rq_xid < iter->rq_xid)
+                       continue;
+               list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list);
+               return;
+       }
+       list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list);
+}
+
+void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req)
+{
+       req->rq_xid = ptlrpc_next_xid();
+       ptlrpc_add_unreplied(req);
+}
+
+static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_import->imp_lock);
+       ptlrpc_assign_next_xid_nolock(req);
+       spin_unlock(&req->rq_import->imp_lock);
+}
+
 int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                             __u32 version, int opcode, char **bufs,
                             struct ptlrpc_cli_ctx *ctx)
@@ -701,6 +737,7 @@ int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
        ptlrpc_at_set_req_timeout(request);
 
        lustre_msg_set_opc(request->rq_reqmsg, opcode);
+       ptlrpc_assign_next_xid(request);
 
        /* Let's setup deadline for req/reply/bulk unlink for opcode. */
        if (cfs_fail_val == opcode) {
@@ -1230,6 +1267,24 @@ static void ptlrpc_save_versions(struct ptlrpc_request *req)
               versions[0], versions[1]);
 }
 
+__u64 ptlrpc_known_replied_xid(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+
+       assert_spin_locked(&imp->imp_lock);
+       if (list_empty(&imp->imp_unreplied_list))
+               return 0;
+
+       req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request,
+                        rq_unreplied_list);
+       LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid);
+
+       if (imp->imp_known_replied_xid < req->rq_xid - 1)
+               imp->imp_known_replied_xid = req->rq_xid - 1;
+
+       return req->rq_xid - 1;
+}
+
 /**
  * Callback function called when client receives RPC reply for \a req.
  * Returns 0 on success or error code.
@@ -1317,6 +1372,11 @@ static int after_reply(struct ptlrpc_request *req)
                else
                        req->rq_sent = now + req->rq_nr_resend;
 
+               /* Resend for EINPROGRESS will use a new XID */
+               spin_lock(&imp->imp_lock);
+               list_del_init(&req->rq_unreplied_list);
+               spin_unlock(&imp->imp_lock);
+
                return 0;
        }
 
@@ -1430,8 +1490,7 @@ static int after_reply(struct ptlrpc_request *req)
 static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 {
        struct obd_import *imp = req->rq_import;
-       struct list_head *tmp;
-       u64 min_xid = ~0ULL;
+       u64 min_xid = 0;
        int rc;
 
        LASSERT(req->rq_phase == RQ_PHASE_NEW);
@@ -1451,17 +1510,8 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 
        spin_lock(&imp->imp_lock);
 
-       /*
-        * the very first time we assign XID. it's important to assign XID
-        * and put it on the list atomically, so that the lowest assigned
-        * XID is always known. this is vital for multislot last_rcvd
-        */
-       if (req->rq_send_state == LUSTRE_IMP_REPLAY) {
-               LASSERT(req->rq_xid);
-       } else {
-               LASSERT(!req->rq_xid);
-               req->rq_xid = ptlrpc_next_xid();
-       }
+       LASSERT(req->rq_xid);
+       LASSERT(!list_empty(&req->rq_unreplied_list));
 
        if (!req->rq_generation_set)
                req->rq_import_generation = imp->imp_generation;
@@ -1493,25 +1543,23 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
        list_add_tail(&req->rq_list, &imp->imp_sending_list);
        atomic_inc(&req->rq_import->imp_inflight);
 
-       /* find the lowest unreplied XID */
-       list_for_each(tmp, &imp->imp_delayed_list) {
-               struct ptlrpc_request *r;
-
-               r = list_entry(tmp, struct ptlrpc_request, rq_list);
-               if (r->rq_xid < min_xid)
-                       min_xid = r->rq_xid;
-       }
-       list_for_each(tmp, &imp->imp_sending_list) {
-               struct ptlrpc_request *r;
-
-               r = list_entry(tmp, struct ptlrpc_request, rq_list);
-               if (r->rq_xid < min_xid)
-                       min_xid = r->rq_xid;
-       }
+       /* find the known replied XID from the unreplied list, CONNECT
+        * and DISCONNECT requests are skipped to make the sanity check
+        * on server side happy. see process_req_last_xid().
+        *
+        * For CONNECT: Because replay requests have lower XID, it'll
+        * break the sanity check if CONNECT bump the exp_last_xid on
+        * server.
+        *
+        * For DISCONNECT: Since client will abort inflight RPC before
+        * sending DISCONNECT, DISCONNECT may carry an XID which higher
+        * than the inflight RPC.
+        */
+       if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req))
+               min_xid = ptlrpc_known_replied_xid(imp);
        spin_unlock(&imp->imp_lock);
 
-       if (likely(min_xid != ~0ULL))
-               lustre_msg_set_last_xid(req->rq_reqmsg, min_xid - 1);
+       lustre_msg_set_last_xid(req->rq_reqmsg, min_xid);
 
        lustre_msg_set_status(req->rq_reqmsg, current_pid());
 
@@ -1956,6 +2004,7 @@ interpret:
                        list_del_init(&req->rq_list);
                        atomic_dec(&imp->imp_inflight);
                }
+               list_del_init(&req->rq_unreplied_list);
                spin_unlock(&imp->imp_lock);
 
                atomic_dec(&set->set_remaining);
@@ -2353,6 +2402,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                if (!locked)
                        spin_lock(&request->rq_import->imp_lock);
                list_del_init(&request->rq_replay_list);
+               list_del_init(&request->rq_unreplied_list);
                if (!locked)
                        spin_unlock(&request->rq_import->imp_lock);
        }
@@ -3060,7 +3110,7 @@ void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req)
 
        LASSERT(bd);
 
-       if (!req->rq_resend || req->rq_nr_resend) {
+       if (!req->rq_resend) {
                /* this request has a new xid, just use it as bulk matchbits */
                req->rq_mbits = req->rq_xid;
 
index babb80de249af9289fe683738a037d79df5e0d96..66f5b4945e1bbe9a91a7ae40d1f91ff389dc6f8f 100644 (file)
@@ -903,6 +903,39 @@ static int ptlrpc_connect_set_flags(struct obd_import *imp,
        return 0;
 }
 
+/**
+ * Add all replay requests back to unreplied list before start replay,
+ * so that we can make sure the known replied XID is always increased
+ * only even if when replaying requests.
+ */
+static void ptlrpc_prepare_replay(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+
+       if (imp->imp_state != LUSTRE_IMP_REPLAY ||
+           imp->imp_resend_replay)
+               return;
+
+       /*
+        * If the server was restart during repaly, the requests may
+        * have been added to the unreplied list in former replay.
+        */
+       spin_lock(&imp->imp_lock);
+
+       list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) {
+               if (list_empty(&req->rq_unreplied_list))
+                       ptlrpc_add_unreplied(req);
+       }
+
+       list_for_each_entry(req, &imp->imp_replay_list, rq_replay_list) {
+               if (list_empty(&req->rq_unreplied_list))
+                       ptlrpc_add_unreplied(req);
+       }
+
+       imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp);
+       spin_unlock(&imp->imp_lock);
+}
+
 /**
  * interpret_reply callback for connect RPCs.
  * Looks into returned status of connect operation and decides
@@ -1154,6 +1187,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
        }
 
 finish:
+       ptlrpc_prepare_replay(imp);
        rc = ptlrpc_import_recovery_state_machine(imp);
        if (rc == -ENOTCONN) {
                CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n",
index 581056563db2537eed2c53f8481abb4b7f2bd18a..da1209e40f03636763589d76c57baade1dd43cb8 100644 (file)
@@ -509,20 +509,39 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        lustre_msg_set_conn_cnt(request->rq_reqmsg, imp->imp_conn_cnt);
        lustre_msghdr_set_flags(request->rq_reqmsg, imp->imp_msghdr_flags);
 
-       if (request->rq_nr_resend) {
+       /*
+        * If it's the first time to resend the request for EINPROGRESS,
+        * we need to allocate a new XID (see after_reply()), it's different
+        * from the resend for reply timeout.
+        */
+       if (request->rq_nr_resend && list_empty(&request->rq_unreplied_list)) {
+               __u64 min_xid = 0;
                /*
                 * resend for EINPROGRESS, allocate new xid to avoid reply
                 * reconstruction
                 */
-               request->rq_xid = ptlrpc_next_xid();
-               DEBUG_REQ(D_RPCTRACE, request, "Allocating new xid for resend on EINPROGRESS");
-       }
+               spin_lock(&imp->imp_lock);
+               ptlrpc_assign_next_xid_nolock(request);
+               request->rq_mbits = request->rq_xid;
+               min_xid = ptlrpc_known_replied_xid(imp);
+               spin_unlock(&imp->imp_lock);
 
-       if (request->rq_bulk) {
+               lustre_msg_set_last_xid(request->rq_reqmsg, min_xid);
+               DEBUG_REQ(D_RPCTRACE, request, "Allocating new xid for resend on EINPROGRESS");
+       } else if (request->rq_bulk) {
                ptlrpc_set_bulk_mbits(request);
                lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits);
        }
 
+       if (list_empty(&request->rq_unreplied_list) ||
+           request->rq_xid <= imp->imp_known_replied_xid) {
+               DEBUG_REQ(D_ERROR, request,
+                         "xid: %llu, replied: %llu, list_empty:%d\n",
+                         request->rq_xid, imp->imp_known_replied_xid,
+                         list_empty(&request->rq_unreplied_list));
+               LBUG();
+       }
+
        /**
         * For enabled AT all request should have AT_SUPPORT in the
         * FULL import state when OBD_CONNECT_AT is set
index f056c8299a5c66209c2c4898ecb705b673019290..e0f859ca62230969b1862845e52410e15b332862 100644 (file)
@@ -71,6 +71,9 @@ int ptlrpc_expired_set(void *data);
 int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
 void ptlrpc_resend_req(struct ptlrpc_request *request);
 void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req);
+void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req);
+__u64 ptlrpc_known_replied_xid(struct obd_import *imp);
+void ptlrpc_add_unreplied(struct ptlrpc_request *req);
 
 /* events.c */
 int ptlrpc_init_portals(void);
@@ -324,6 +327,7 @@ static inline void ptlrpc_cli_req_init(struct ptlrpc_request *req)
 
        INIT_LIST_HEAD(&cr->cr_set_chain);
        INIT_LIST_HEAD(&cr->cr_ctx_chain);
+       INIT_LIST_HEAD(&cr->cr_unreplied_list);
        init_waitqueue_head(&cr->cr_reply_waitq);
        init_waitqueue_head(&cr->cr_set_waitq);
 }
@@ -340,4 +344,24 @@ static inline void ptlrpc_srv_req_init(struct ptlrpc_request *req)
        INIT_LIST_HEAD(&sr->sr_hist_list);
 }
 
+static inline bool ptlrpc_req_is_connect(struct ptlrpc_request *req)
+{
+       if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CONNECT ||
+           lustre_msg_get_opc(req->rq_reqmsg) == OST_CONNECT ||
+           lustre_msg_get_opc(req->rq_reqmsg) == MGS_CONNECT)
+               return true;
+       else
+               return false;
+}
+
+static inline bool ptlrpc_req_is_disconnect(struct ptlrpc_request *req)
+{
+       if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_DISCONNECT ||
+           lustre_msg_get_opc(req->rq_reqmsg) == OST_DISCONNECT ||
+           lustre_msg_get_opc(req->rq_reqmsg) == MGS_DISCONNECT)
+               return true;
+       else
+               return false;
+}
+
 #endif /* PTLRPC_INTERNAL_H */
index 9144cd8deeaa12368011cb1621d8cd4d1f8806d7..344aedd5bd59de75a1015e36ce5eb5efc1910e0c 100644 (file)
@@ -157,10 +157,22 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
                lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
 
        spin_lock(&imp->imp_lock);
+       /* The resend replay request may have been removed from the
+        * unreplied list.
+        */
+       if (req && imp->imp_resend_replay &&
+           list_empty(&req->rq_unreplied_list))
+               ptlrpc_add_unreplied(req);
+
        imp->imp_resend_replay = 0;
        spin_unlock(&imp->imp_lock);
 
        if (req) {
+               /* The request should have been added back in unreplied list
+                * by ptlrpc_prepare_replay().
+                */
+               LASSERT(!list_empty(&req->rq_unreplied_list));
+
                rc = ptlrpc_replay_req(req);
                if (rc) {
                        CERROR("recovery replay error %d for req %llu\n",