lustre/ptlrpc: re-enqueue ptlrpcd worker
authorLiang Zhen <liang.zhen@intel.com>
Sat, 1 Mar 2014 02:16:43 +0000 (21:16 -0500)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 1 Mar 2014 03:13:21 +0000 (19:13 -0800)
osc_extent_wait can be stuck in scenario like this:

1) thread-1 held an active extent
2) thread-2 called flush cache, and marked this extent as "urgent"
   and "sync_wait"
3) thread-3 wants to write to the same extent, osc_extent_find will
   get "conflict" because this extent is "sync_wait", so it starts
   to wait...
4) cl_writeback_work has been scheduled by thread-4 to write some
   other extents, it has sent RPCs but not returned yet.
6) thread-1 finished its work, and called osc_extent_release()->
   osc_io_unplug_async()->ptlrpcd_queue_work(), but found
   cl_writeback_work was still running, so the request was ignored (-EBUSY)
7) thread-3 is stuck because nobody will ever wake it up.

This patch allows ptlrpcd_work to be re-enqueued while it is still
running, so it will not miss requests anymore.

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Reviewed-on: http://review.whamcloud.com/8922
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4509
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Reviewed-by: Bobi Jam <bobijam@gmail.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/staging/lustre/lustre/ptlrpc/client.c

index 87ff5ecf8847c0ba6ad2e8a5c4ecd76df1e65d47..ed8e538c302e445de23a17b2f0b21bd1bb065ebc 100644 (file)
@@ -48,6 +48,7 @@
 #include "ptlrpc_internal.h"
 
 static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+static int ptlrpcd_check_work(struct ptlrpc_request *req);
 
 /**
  * Initialize passed in client structure \a cl.
@@ -1779,6 +1780,10 @@ interpret:
 
                ptlrpc_req_interpret(env, req, req->rq_status);
 
+               if (ptlrpcd_check_work(req)) {
+                       atomic_dec(&set->set_remaining);
+                       continue;
+               }
                ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
 
                CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
@@ -2952,22 +2957,50 @@ EXPORT_SYMBOL(ptlrpc_sample_next_xid);
  *    have delay before it really runs by ptlrpcd thread.
  */
 struct ptlrpc_work_async_args {
-       __u64   magic;
        int   (*cb)(const struct lu_env *, void *);
        void   *cbdata;
 };
 
-#define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */
+static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
+{
+       /* re-initialize the req */
+       req->rq_timeout         = obd_timeout;
+       req->rq_sent            = cfs_time_current_sec();
+       req->rq_deadline        = req->rq_sent + req->rq_timeout;
+       req->rq_reply_deadline  = req->rq_deadline;
+       req->rq_phase           = RQ_PHASE_INTERPRET;
+       req->rq_next_phase      = RQ_PHASE_COMPLETE;
+       req->rq_xid             = ptlrpc_next_xid();
+       req->rq_import_generation = req->rq_import->imp_generation;
+
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+}
 
 static int work_interpreter(const struct lu_env *env,
                            struct ptlrpc_request *req, void *data, int rc)
 {
        struct ptlrpc_work_async_args *arg = data;
 
-       LASSERT(arg->magic == PTLRPC_WORK_MAGIC);
+       LASSERT(ptlrpcd_check_work(req));
        LASSERT(arg->cb != NULL);
 
-       return arg->cb(env, arg->cbdata);
+       rc = arg->cb(env, arg->cbdata);
+
+       list_del_init(&req->rq_set_chain);
+       req->rq_set = NULL;
+
+       if (atomic_dec_return(&req->rq_refcount) > 1) {
+               atomic_set(&req->rq_refcount, 2);
+               ptlrpcd_add_work_req(req);
+       }
+       return rc;
+}
+
+static int worker_format;
+
+static int ptlrpcd_check_work(struct ptlrpc_request *req)
+{
+       return req->rq_pill.rc_fmt == (void *)&worker_format;
 }
 
 /**
@@ -3000,6 +3033,7 @@ void *ptlrpcd_alloc_work(struct obd_import *imp,
        req->rq_receiving_reply = 0;
        req->rq_must_unlink = 0;
        req->rq_no_delay = req->rq_no_resend = 1;
+       req->rq_pill.rc_fmt = (void *)&worker_format;
 
        spin_lock_init(&req->rq_lock);
        INIT_LIST_HEAD(&req->rq_list);
@@ -3013,7 +3047,6 @@ void *ptlrpcd_alloc_work(struct obd_import *imp,
 
        CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
        args = ptlrpc_req_async_args(req);
-       args->magic  = PTLRPC_WORK_MAGIC;
        args->cb     = cb;
        args->cbdata = cbdata;
 
@@ -3043,25 +3076,8 @@ int ptlrpcd_queue_work(void *handler)
         * req as opaque data. - Jinshan
         */
        LASSERT(atomic_read(&req->rq_refcount) > 0);
-       if (atomic_read(&req->rq_refcount) > 1)
-               return -EBUSY;
-
-       if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */
-               atomic_dec(&req->rq_refcount);
-               return -EBUSY;
-       }
-
-       /* re-initialize the req */
-       req->rq_timeout = obd_timeout;
-       req->rq_sent       = cfs_time_current_sec();
-       req->rq_deadline       = req->rq_sent + req->rq_timeout;
-       req->rq_reply_deadline = req->rq_deadline;
-       req->rq_phase     = RQ_PHASE_INTERPRET;
-       req->rq_next_phase     = RQ_PHASE_COMPLETE;
-       req->rq_xid         = ptlrpc_next_xid();
-       req->rq_import_generation = req->rq_import->imp_generation;
-
-       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       if (atomic_inc_return(&req->rq_refcount) == 2)
+               ptlrpcd_add_work_req(req);
        return 0;
 }
 EXPORT_SYMBOL(ptlrpcd_queue_work);