IB/cm: Cancel pending LAP message when exiting IB_CM_ESTABLISHED state
author: Sean Hefty <sean.hefty@intel.com>
Thu, 3 Mar 2011 23:31:06 +0000 (23:31 +0000)
committer: Roland Dreier <roland@purestorage.com>
Tue, 15 Mar 2011 17:56:12 +0000 (10:56 -0700)
This problem was reported by Moni Shoua <monis@mellanox.com> and Amir
Vadai <amirv@mellanox.com>:

When destroying a cm_id from the context of a work queue, if
the lap_state of this cm_id is IB_CM_LAP_SENT, we need to
release the reference on this id that was taken when the LAP
message was sent.  Otherwise, if the expected APR message
gets lost, the reference is released only after a long time,
and in the meantime the work handler thread is not available
to process other work.

It turns out that we need to cancel any pending LAP messages whenever
we transition out of the IB_CM_ESTABLISHED state.  This occurs when
disconnecting - either sending or receiving a DREQ.  It can also
happen in a corner case where we receive a REJ message after sending
an RTU, followed by a LAP.  Add checks and cancel any outstanding LAP
messages in these three cases.

Canceling the LAP when sending a DREQ fixes the destroy problem
reported by Moni.  When a cm_id is destroyed in the IB_CM_ESTABLISHED
state, it sends a DREQ to the remote side to notify the peer that the
connection is going away.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
drivers/infiniband/core/cm.c

index 1d9616be419222d16a03a02943ba3f49d6f5c398..f804e28e1ebb5b9dc8ebdfdf8f4c53a81e9b8a72 100644 (file)
@@ -1988,6 +1988,10 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id,
                goto out;
        }
 
+       if (cm_id->lap_state == IB_CM_LAP_SENT ||
+           cm_id->lap_state == IB_CM_MRA_LAP_RCVD)
+               ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
        ret = cm_alloc_msg(cm_id_priv, &msg);
        if (ret) {
                cm_enter_timewait(cm_id_priv);
@@ -2129,6 +2133,10 @@ static int cm_dreq_handler(struct cm_work *work)
                ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
                break;
        case IB_CM_ESTABLISHED:
+               if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+                   cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+                       ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+               break;
        case IB_CM_MRA_REP_RCVD:
                break;
        case IB_CM_TIMEWAIT:
@@ -2349,9 +2357,18 @@ static int cm_rej_handler(struct cm_work *work)
                /* fall through */
        case IB_CM_REP_RCVD:
        case IB_CM_MRA_REP_SENT:
-       case IB_CM_ESTABLISHED:
                cm_enter_timewait(cm_id_priv);
                break;
+       case IB_CM_ESTABLISHED:
+               if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+                   cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+                       if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+                               ib_cancel_mad(cm_id_priv->av.port->mad_agent,
+                                             cm_id_priv->msg);
+                       cm_enter_timewait(cm_id_priv);
+                       break;
+               }
+               /* fall through */
        default:
                spin_unlock_irq(&cm_id_priv->lock);
                ret = -EINVAL;