IB/qib, staging/rdma/hfi1: add s_hlock for use in post send
author Mike Marciniszyn <mike.marciniszyn@intel.com>
Sun, 14 Feb 2016 20:10:04 +0000 (12:10 -0800)
committer Doug Ledford <dledford@redhat.com>
Fri, 11 Mar 2016 01:38:07 +0000 (20:38 -0500)
This patch adds an additional lock to reduce contention on the s_lock.

This lock is used in post_send() so that the post_send is not
serialized with the send engine and other send related processing.

To do this the s_next_psn is now maintained on post_send() while
post_send() related fields are moved to a new cache line.  There is
an s_avail maintained for the post_send() to mitigate trading cache
lines with the send engine.  The lock is released/acquired around
releasing the just built packet to the egress mechanism.

Reviewed-by: Jubin John <jubin.john@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Harish Chegondi <harish.chegondi@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
18 files changed:
drivers/infiniband/hw/qib/qib_qp.c
drivers/infiniband/hw/qib/qib_rc.c
drivers/infiniband/hw/qib/qib_ruc.c
drivers/infiniband/hw/qib/qib_uc.c
drivers/infiniband/hw/qib/qib_ud.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/sw/rdmavt/qp.c
drivers/staging/rdma/hfi1/qp.c
drivers/staging/rdma/hfi1/qp.h
drivers/staging/rdma/hfi1/rc.c
drivers/staging/rdma/hfi1/ruc.c
drivers/staging/rdma/hfi1/uc.c
drivers/staging/rdma/hfi1/ud.c
drivers/staging/rdma/hfi1/verbs.c
drivers/staging/rdma/hfi1/verbs.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_qp.h

index 01d49dc91de22d2742a5b92b55f61d525743669e..6ffa0221da9f5dd523002d605d99f1ad34c7be60 100644 (file)
@@ -474,6 +474,42 @@ void qib_get_credit(struct rvt_qp *qp, u32 aeth)
        }
 }
 
+/**
+ * qib_check_send_wqe - validate wr/wqe
+ * @qp: The qp
+ * @wqe: The built wqe
+ *
+ * validate wr/wqe.  This is called
+ * prior to inserting the wqe into
+ * the ring but after the wqe has been
+ * setup.
+ *
+ * Returns 0 on success, -EINVAL on failure
+ */
+int qib_check_send_wqe(struct rvt_qp *qp,
+                      struct rvt_swqe *wqe)
+{
+       struct rvt_ah *ah;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               if (wqe->length > 0x80000000U)
+                       return -EINVAL;
+               break;
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
 #ifdef CONFIG_DEBUG_FS
 
 struct qib_qp_iter {
index ce886b2ade74e58a4b4e08d1c8861ab687405b81..9088e26d3ac8b0aef86424d244c0465c4f33b87e 100644 (file)
@@ -226,6 +226,8 @@ bail:
  * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
  * @qp: a pointer to the QP
  *
+ * Assumes the s_lock is held.
+ *
  * Return 1 if constructed; otherwise, return 0.
  */
 int qib_make_rc_req(struct rvt_qp *qp)
@@ -241,7 +243,6 @@ int qib_make_rc_req(struct rvt_qp *qp)
        u32 bth2;
        u32 pmtu = qp->pmtu;
        char newreq;
-       unsigned long flags;
        int ret = 0;
        int delta;
 
@@ -249,12 +250,6 @@ int qib_make_rc_req(struct rvt_qp *qp)
        if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
                ohdr = &priv->s_hdr->u.l.oth;
 
-       /*
-        * The lock is needed to synchronize between the sending tasklet,
-        * the receive interrupt handler, and timeout resends.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-
        /* Sending responses has higher priority over sending requests. */
        if ((qp->s_flags & RVT_S_RESP_PENDING) &&
            qib_make_rc_ack(dev, qp, ohdr, pmtu))
@@ -264,7 +259,8 @@ int qib_make_rc_req(struct rvt_qp *qp)
                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&priv->s_dma_busy)) {
@@ -321,8 +317,8 @@ int qib_make_rc_req(struct rvt_qp *qp)
                                qp->s_flags |= RVT_S_WAIT_FENCE;
                                goto bail;
                        }
-                       wqe->psn = qp->s_next_psn;
                        newreq = 1;
+                       qp->s_psn = wqe->psn;
                }
                /*
                 * Note that we have to be careful not to modify the
@@ -341,9 +337,7 @@ int qib_make_rc_req(struct rvt_qp *qp)
                                qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
                                goto bail;
                        }
-                       wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(SEND_FIRST);
                                len = pmtu;
                                break;
@@ -381,9 +375,7 @@ int qib_make_rc_req(struct rvt_qp *qp)
                                cpu_to_be32(wqe->rdma_wr.rkey);
                        ohdr->u.rc.reth.length = cpu_to_be32(len);
                        hwords += sizeof(struct ib_reth) / sizeof(u32);
-                       wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(RDMA_WRITE_FIRST);
                                len = pmtu;
                                break;
@@ -418,13 +410,6 @@ int qib_make_rc_req(struct rvt_qp *qp)
                                qp->s_num_rd_atomic++;
                                if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
                                        qp->s_lsn++;
-                               /*
-                                * Adjust s_next_psn to count the
-                                * expected number of responses.
-                                */
-                               if (len > pmtu)
-                                       qp->s_next_psn += (len - 1) / pmtu;
-                               wqe->lpsn = qp->s_next_psn++;
                        }
 
                        ohdr->u.rc.reth.vaddr =
@@ -456,7 +441,6 @@ int qib_make_rc_req(struct rvt_qp *qp)
                                qp->s_num_rd_atomic++;
                                if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
                                        qp->s_lsn++;
-                               wqe->lpsn = wqe->psn;
                        }
                        if (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
                                qp->s_state = OP(COMPARE_SWAP);
@@ -499,11 +483,8 @@ int qib_make_rc_req(struct rvt_qp *qp)
                }
                if (wqe->wr.opcode == IB_WR_RDMA_READ)
                        qp->s_psn = wqe->lpsn + 1;
-               else {
+               else
                        qp->s_psn++;
-                       if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                               qp->s_next_psn = qp->s_psn;
-               }
                break;
 
        case OP(RDMA_READ_RESPONSE_FIRST):
@@ -523,8 +504,6 @@ int qib_make_rc_req(struct rvt_qp *qp)
                /* FALLTHROUGH */
        case OP(SEND_MIDDLE):
                bth2 = qp->s_psn++ & QIB_PSN_MASK;
-               if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
@@ -564,8 +543,6 @@ int qib_make_rc_req(struct rvt_qp *qp)
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_MIDDLE):
                bth2 = qp->s_psn++ & QIB_PSN_MASK;
-               if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
@@ -630,13 +607,9 @@ int qib_make_rc_req(struct rvt_qp *qp)
        qp->s_cur_size = len;
        qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
 done:
-       ret = 1;
-       goto unlock;
-
+       return 1;
 bail:
        qp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
@@ -1454,7 +1427,8 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp,
                goto ack_done;
 
        /* Ignore invalid responses. */
-       if (qib_cmp24(psn, qp->s_next_psn) >= 0)
+       smp_read_barrier_depends(); /* see post_one_send */
+       if (qib_cmp24(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
                goto ack_done;
 
        /* Ignore duplicate responses. */
index 2623684745f074d52b65c6fc8a9fe619ada0ab71..a5f07a64b228d2e8f5e1796f8892ddeda6aa37fd 100644 (file)
@@ -391,7 +391,8 @@ static void qib_ruc_loopback(struct rvt_qp *sqp)
        sqp->s_flags |= RVT_S_BUSY;
 
 again:
-       if (sqp->s_last == sqp->s_head)
+       smp_read_barrier_depends(); /* see post_one_send() */
+       if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
                goto clr_busy;
        wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
 
@@ -765,22 +766,24 @@ void qib_do_send(struct rvt_qp *qp)
 
        qp->s_flags |= RVT_S_BUSY;
 
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
        do {
                /* Check for a constructed packet to be sent. */
                if (qp->s_hdrwords != 0) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
                        /*
                         * If the packet cannot be sent now, return and
                         * the send tasklet will be woken up later.
                         */
                        if (qib_verbs_send(qp, priv->s_hdr, qp->s_hdrwords,
                                           qp->s_cur_sge, qp->s_cur_size))
-                               break;
+                               return;
                        /* Record that s_hdr is empty. */
                        qp->s_hdrwords = 0;
+                       spin_lock_irqsave(&qp->s_lock, flags);
                }
        } while (make_req(qp));
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
 /*
index 1b2fc69855b2ff15f5691b65d9f4499c22ad1902..7bdbc79ceaa3bbd7a25a87ea107b06f1460dd7ad 100644 (file)
@@ -41,6 +41,8 @@
  * qib_make_uc_req - construct a request packet (SEND, RDMA write)
  * @qp: a pointer to the QP
  *
+ * Assumes the s_lock is held.
+ *
  * Return 1 if constructed; otherwise, return 0.
  */
 int qib_make_uc_req(struct rvt_qp *qp)
@@ -48,20 +50,18 @@ int qib_make_uc_req(struct rvt_qp *qp)
        struct qib_qp_priv *priv = qp->priv;
        struct qib_other_headers *ohdr;
        struct rvt_swqe *wqe;
-       unsigned long flags;
        u32 hwords;
        u32 bth0;
        u32 len;
        u32 pmtu = qp->pmtu;
        int ret = 0;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&priv->s_dma_busy)) {
@@ -90,13 +90,13 @@ int qib_make_uc_req(struct rvt_qp *qp)
                    RVT_PROCESS_NEXT_SEND_OK))
                        goto bail;
                /* Check if send work queue is empty. */
-               if (qp->s_cur == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_cur == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /*
                 * Start a new request.
                 */
-               wqe->psn = qp->s_next_psn;
-               qp->s_psn = qp->s_next_psn;
+               qp->s_psn = wqe->psn;
                qp->s_sge.sge = wqe->sg_list[0];
                qp->s_sge.sg_list = wqe->sg_list + 1;
                qp->s_sge.num_sge = wqe->wr.num_sge;
@@ -215,15 +215,11 @@ int qib_make_uc_req(struct rvt_qp *qp)
        qp->s_cur_sge = &qp->s_sge;
        qp->s_cur_size = len;
        qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
-                           qp->s_next_psn++ & QIB_PSN_MASK);
+                           qp->s_psn++ & QIB_PSN_MASK);
 done:
-       ret = 1;
-       goto unlock;
-
+       return 1;
 bail:
        qp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
index fe4917272b891b3069b1f0e304ce5a8e86b9e349..d9502137de626b80966cfd200e3806c65c36b21a 100644 (file)
@@ -234,6 +234,8 @@ drop:
  * qib_make_ud_req - construct a UD request packet
  * @qp: the QP
  *
+ * Assumes the s_lock is held.
+ *
  * Return 1 if constructed; otherwise, return 0.
  */
 int qib_make_ud_req(struct rvt_qp *qp)
@@ -244,7 +246,6 @@ int qib_make_ud_req(struct rvt_qp *qp)
        struct qib_pportdata *ppd;
        struct qib_ibport *ibp;
        struct rvt_swqe *wqe;
-       unsigned long flags;
        u32 nwords;
        u32 extra_bytes;
        u32 bth0;
@@ -253,13 +254,12 @@ int qib_make_ud_req(struct rvt_qp *qp)
        int ret = 0;
        int next_cur;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&priv->s_dma_busy)) {
@@ -271,7 +271,9 @@ int qib_make_ud_req(struct rvt_qp *qp)
                goto done;
        }
 
-       if (qp->s_cur == qp->s_head)
+       /* see post_one_send() */
+       smp_read_barrier_depends();
+       if (qp->s_cur == ACCESS_ONCE(qp->s_head))
                goto bail;
 
        wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
@@ -292,6 +294,7 @@ int qib_make_ud_req(struct rvt_qp *qp)
                this_cpu_inc(ibp->pmastats->n_unicast_xmit);
                lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
                if (unlikely(lid == ppd->lid)) {
+                       unsigned long flags;
                        /*
                         * If DMAs are in progress, we can't generate
                         * a completion for the loopback packet since
@@ -304,6 +307,7 @@ int qib_make_ud_req(struct rvt_qp *qp)
                                goto bail;
                        }
                        qp->s_cur = next_cur;
+                       local_irq_save(flags);
                        spin_unlock_irqrestore(&qp->s_lock, flags);
                        qib_ud_loopback(qp, wqe);
                        spin_lock_irqsave(&qp->s_lock, flags);
@@ -378,7 +382,7 @@ int qib_make_ud_req(struct rvt_qp *qp)
                ah_attr->dlid != be16_to_cpu(IB_LID_PERMISSIVE) ?
                cpu_to_be32(QIB_MULTICAST_QPN) :
                cpu_to_be32(wqe->ud_wr.remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK);
+       ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK);
        /*
         * Qkeys with the high order bit set mean use the
         * qkey from the QP context instead of the WR (see 10.2.5).
@@ -388,13 +392,9 @@ int qib_make_ud_req(struct rvt_qp *qp)
        ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done:
-       ret = 1;
-       goto unlock;
-
+       return 1;
 bail:
        qp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
index fa94f78073cf77c3da33bcea53c34ea76082369b..5cf019fb50d996f380da573934cdb3b33027cc97 100644 (file)
@@ -1662,6 +1662,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
        dd->verbs_dev.rdi.driver_f.get_card_name = qib_get_card_name;
        dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev;
        dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah;
+       dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe;
        dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah;
        dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn;
        dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc;
@@ -1677,6 +1678,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = qib_mtu_to_path_mtu;
        dd->verbs_dev.rdi.driver_f.mtu_from_qp = qib_mtu_from_qp;
        dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = qib_get_pmtu_from_attr;
+       dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _qib_schedule_send;
        dd->verbs_dev.rdi.driver_f.query_port_state = qib_query_port;
        dd->verbs_dev.rdi.driver_f.shut_down_port = qib_shut_down_port;
        dd->verbs_dev.rdi.driver_f.cap_mask_chg = qib_cap_mask_chg;
@@ -1778,17 +1780,34 @@ void qib_unregister_ib_device(struct qib_devdata *dd)
                                  dev->pio_hdrs, dev->pio_hdrs_phys);
 }
 
-/*
- * This must be called with s_lock held.
+/**
+ * _qib_schedule_send - schedule progress
+ * @qp: the qp
+ *
+ * This schedules progress w/o regard to the s_flags.
+ *
+ * It is only used in post send, which doesn't hold
+ * the s_lock.
  */
-void qib_schedule_send(struct rvt_qp *qp)
+void _qib_schedule_send(struct rvt_qp *qp)
 {
+       struct qib_ibport *ibp =
+               to_iport(qp->ibqp.device, qp->port_num);
+       struct qib_pportdata *ppd = ppd_from_ibp(ibp);
        struct qib_qp_priv *priv = qp->priv;
-       if (qib_send_ok(qp)) {
-               struct qib_ibport *ibp =
-                       to_iport(qp->ibqp.device, qp->port_num);
-               struct qib_pportdata *ppd = ppd_from_ibp(ibp);
 
-               queue_work(ppd->qib_wq, &priv->s_work);
-       }
+       queue_work(ppd->qib_wq, &priv->s_work);
+}
+
+/**
+ * qib_schedule_send - schedule progress
+ * @qp: the qp
+ *
+ * This schedules qp progress.  The s_lock
+ * should be held.
+ */
+void qib_schedule_send(struct rvt_qp *qp)
+{
+       if (qib_send_ok(qp))
+               _qib_schedule_send(qp);
 }
index b88e027b6cb0899eb924eec67b79dc294733d210..d137d714935d65f2b906ad7f1490a2dd2cc7e271 100644 (file)
@@ -298,9 +298,7 @@ static inline int qib_send_ok(struct rvt_qp *qp)
                 !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
 }
 
-/*
- * This must be called with s_lock held.
- */
+void _qib_schedule_send(struct rvt_qp *qp);
 void qib_schedule_send(struct rvt_qp *qp);
 
 static inline int qib_pkey_ok(u16 pkey1, u16 pkey2)
@@ -392,6 +390,8 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
 
 int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
 
+int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
 struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid);
 
 void qib_rc_rnr_retry(unsigned long arg);
index 7dc837c6554b8c7e465f86d4ea945f7b9bc96817..522404ac7c3856843a9d5820b8e17ee8fdf9d9b9 100644 (file)
@@ -401,6 +401,7 @@ void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
                rdi->driver_f.flush_qp_waiters(qp);
                qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
                spin_unlock(&qp->s_lock);
+               spin_unlock(&qp->s_hlock);
                spin_unlock_irq(&qp->r_lock);
 
                /* Stop the send queue and the retry timer */
@@ -415,6 +416,7 @@ void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 
                /* grab the lock b/c it was locked at call time */
                spin_lock_irq(&qp->r_lock);
+               spin_lock(&qp->s_hlock);
                spin_lock(&qp->s_lock);
 
                rvt_clear_mr_refs(qp, 1);
@@ -610,6 +612,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                 * except for qp->ibqp.qp_num.
                 */
                spin_lock_init(&qp->r_lock);
+               spin_lock_init(&qp->s_hlock);
                spin_lock_init(&qp->s_lock);
                spin_lock_init(&qp->r_rq.lock);
                atomic_set(&qp->refcount, 0);
@@ -620,6 +623,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                qp->state = IB_QPS_RESET;
                qp->s_wq = swq;
                qp->s_size = init_attr->cap.max_send_wr + 1;
+               qp->s_avail = init_attr->cap.max_send_wr;
                qp->s_max_sge = init_attr->cap.max_send_sge;
                if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
                        qp->s_flags = RVT_S_SIGNAL_REQ_WR;
@@ -779,6 +783,7 @@ void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
                                                wqe->ud_wr.ah)->refcount);
                        if (++qp->s_last >= qp->s_size)
                                qp->s_last = 0;
+                       smp_wmb(); /* see qp_get_savail */
                }
                if (qp->s_rdma_mr) {
                        rvt_put_mr(qp->s_rdma_mr);
@@ -833,7 +838,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
        rdi->driver_f.notify_error_qp(qp);
 
        /* Schedule the sending tasklet to drain the send work queue. */
-       if (qp->s_last != qp->s_head)
+       if (ACCESS_ONCE(qp->s_last) != qp->s_head)
                rdi->driver_f.schedule_send(qp);
 
        rvt_clear_mr_refs(qp, 0);
@@ -979,6 +984,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
 
        spin_lock_irq(&qp->r_lock);
+       spin_lock(&qp->s_hlock);
        spin_lock(&qp->s_lock);
 
        cur_state = attr_mask & IB_QP_CUR_STATE ?
@@ -1151,6 +1157,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        if (attr_mask & IB_QP_PATH_MTU) {
                qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
                qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
+               qp->log_pmtu = ilog2(qp->pmtu);
        }
 
        if (attr_mask & IB_QP_RETRY_CNT) {
@@ -1186,6 +1193,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
 
        spin_unlock(&qp->s_lock);
+       spin_unlock(&qp->s_hlock);
        spin_unlock_irq(&qp->r_lock);
 
        if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
@@ -1207,6 +1215,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
 inval:
        spin_unlock(&qp->s_lock);
+       spin_unlock(&qp->s_hlock);
        spin_unlock_irq(&qp->r_lock);
        return -EINVAL;
 }
@@ -1226,9 +1235,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
 
        spin_lock_irq(&qp->r_lock);
+       spin_lock(&qp->s_hlock);
        spin_lock(&qp->s_lock);
        rvt_reset_qp(rdi, qp, ibqp->qp_type);
        spin_unlock(&qp->s_lock);
+       spin_unlock(&qp->s_hlock);
        spin_unlock_irq(&qp->r_lock);
 
        /* qpn is now available for use again */
@@ -1357,6 +1368,28 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
        return 0;
 }
 
+/**
+ * qp_get_savail - return number of avail send entries
+ *
+ * @qp: the qp
+ *
+ * This assumes the s_hlock is held but the s_last
+ * qp variable is uncontrolled.
+ */
+static inline u32 qp_get_savail(struct rvt_qp *qp)
+{
+       u32 slast;
+       u32 ret;
+
+       smp_read_barrier_depends(); /* see rc.c */
+       slast = ACCESS_ONCE(qp->s_last);
+       if (qp->s_head >= slast)
+               ret = qp->s_size - (qp->s_head - slast);
+       else
+               ret = slast - qp->s_head;
+       return ret - 1;
+}
+
 /**
  * rvt_post_one_wr - post one RC, UC, or UD send work request
  * @qp: the QP to post on
@@ -1372,6 +1405,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
        struct rvt_lkey_table *rkt;
        struct rvt_pd *pd;
        struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+       u8 log_pmtu;
+       int ret;
 
        /* IB spec says that num_sge == 0 is OK. */
        if (unlikely(wr->num_sge > qp->s_max_sge))
@@ -1403,16 +1438,16 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
        } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
                return -EINVAL;
        }
-
+       /* check for avail */
+       if (unlikely(!qp->s_avail)) {
+               qp->s_avail = qp_get_savail(qp);
+               WARN_ON(qp->s_avail > (qp->s_size - 1));
+               if (!qp->s_avail)
+                       return -ENOMEM;
+       }
        next = qp->s_head + 1;
        if (next >= qp->s_size)
                next = 0;
-       if (next == qp->s_last)
-               return -ENOMEM;
-
-       if (rdi->driver_f.check_send_wr &&
-           rdi->driver_f.check_send_wr(qp, wr))
-               return -EINVAL;
 
        rkt = &rdi->lkey_table;
        pd = ibpd_to_rvtpd(qp->ibqp.pd);
@@ -1444,21 +1479,39 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
                                continue;
                        ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
                                         &wr->sg_list[i], acc);
-                       if (!ok)
+                       if (!ok) {
+                               ret = -EINVAL;
                                goto bail_inval_free;
+                       }
                        wqe->length += length;
                        j++;
                }
                wqe->wr.num_sge = j;
        }
-       if (qp->ibqp.qp_type == IB_QPT_UC ||
-           qp->ibqp.qp_type == IB_QPT_RC) {
-               if (wqe->length > 0x80000000U)
+
+       /* general part of wqe valid - allow for driver checks */
+       if (rdi->driver_f.check_send_wqe) {
+               ret = rdi->driver_f.check_send_wqe(qp, wqe);
+               if (ret)
                        goto bail_inval_free;
-       } else {
+       }
+
+       log_pmtu = qp->log_pmtu;
+       if (qp->ibqp.qp_type != IB_QPT_UC &&
+           qp->ibqp.qp_type != IB_QPT_RC) {
+               struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
+
+               log_pmtu = ah->log_pmtu;
                atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
        }
+
        wqe->ssn = qp->s_ssn++;
+       wqe->psn = qp->s_next_psn;
+       wqe->lpsn = wqe->psn +
+                       (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0);
+       qp->s_next_psn = wqe->lpsn + 1;
+       smp_wmb(); /* see request builders */
+       qp->s_avail--;
        qp->s_head = next;
 
        return 0;
@@ -1470,7 +1523,7 @@ bail_inval_free:
 
                rvt_put_mr(sge->mr);
        }
-       return -EINVAL;
+       return ret;
 }
 
 /**
@@ -1491,14 +1544,14 @@ int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        unsigned nreq = 0;
        int err = 0;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
+       spin_lock_irqsave(&qp->s_hlock, flags);
 
        /*
         * Ensure QP state is such that we can send. If not bail out early,
         * there is no need to do this every time we post a send.
         */
        if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
-               spin_unlock_irqrestore(&qp->s_lock, flags);
+               spin_unlock_irqrestore(&qp->s_hlock, flags);
                return -EINVAL;
        }
 
@@ -1518,11 +1571,13 @@ int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                nreq++;
        }
 bail:
-       if (nreq && !call_send)
-               rdi->driver_f.schedule_send(qp);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (nreq && call_send)
-               rdi->driver_f.do_send(qp);
+       spin_unlock_irqrestore(&qp->s_hlock, flags);
+       if (nreq) {
+               if (call_send)
+                       rdi->driver_f.do_send(qp);
+               else
+                       rdi->driver_f.schedule_send_no_lock(qp);
+       }
        return err;
 }
 
index ec9ee726267b9582d3887823adcc6d89aabb06b3..00866c07fddcf888279833b3fb28cbde3725ed8d 100644 (file)
@@ -226,16 +226,45 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
        }
 }
 
-int hfi1_check_send_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
+/**
+ * hfi1_check_send_wqe - validate wqe
+ * @qp: The qp
+ * @wqe: The built wqe
+ *
+ * validate wqe.  This is called
+ * prior to inserting the wqe into
+ * the ring but after the wqe has been
+ * setup.
+ *
+ * Returns 0 on success, -EINVAL on failure
+ *
+ */
+int hfi1_check_send_wqe(struct rvt_qp *qp,
+                       struct rvt_swqe *wqe)
 {
        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct rvt_ah *ah = ibah_to_rvtah(ud_wr(wr)->ah);
+       struct rvt_ah *ah;
 
-       if (qp->ibqp.qp_type != IB_QPT_RC &&
-           qp->ibqp.qp_type != IB_QPT_UC &&
-           qp->ibqp.qp_type != IB_QPT_SMI &&
-           ibp->sl_to_sc[ah->attr.sl] == 0xf) {
-               return -EINVAL;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               if (wqe->length > 0x80000000U)
+                       return -EINVAL;
+               break;
+       case IB_QPT_SMI:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               break;
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               if (ibp->sl_to_sc[ah->attr.sl] == 0xf)
+                       return -EINVAL;
+       default:
+               break;
        }
        return 0;
 }
@@ -301,6 +330,42 @@ __be32 hfi1_compute_aeth(struct rvt_qp *qp)
        return cpu_to_be32(aeth);
 }
 
+/**
+ * _hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress w/o regard to the s_flags.
+ *
+ * It is only used in the post send, which doesn't hold
+ * the s_lock.
+ */
+void _hfi1_schedule_send(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibport *ibp =
+               to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+       iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
+                       priv->s_sde ?
+                       priv->s_sde->cpu :
+                       cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress and caller should hold
+ * the s_lock.
+ */
+void hfi1_schedule_send(struct rvt_qp *qp)
+{
+       if (hfi1_send_ok(qp))
+               _hfi1_schedule_send(qp);
+}
+
 /**
  * hfi1_get_credit - flush the send work queue of a QP
  * @qp: the qp who's send work queue to flush
index 973c14b5268a573b4fc5215aaf978311c3081612..98827b5dd2a1e2bd181ae9d0027d9dff3609fa4a 100644 (file)
@@ -137,41 +137,8 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
  */
 void qp_comm_est(struct rvt_qp *qp);
 
-/**
- * _hfi1_schedule_send - schedule progress
- * @qp: the QP
- *
- * This schedules qp progress w/o regard to the s_flags.
- *
- * It is only used in the post send, which doesn't hold
- * the s_lock.
- */
-static inline void _hfi1_schedule_send(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibport *ibp =
-               to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-
-       iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
-                       priv->s_sde ?
-                       priv->s_sde->cpu :
-                       cpumask_first(cpumask_of_node(dd->node)));
-}
-
-/**
- * hfi1_schedule_send - schedule progress
- * @qp: the QP
- *
- * This schedules qp progress and caller should hold
- * the s_lock.
- */
-static inline void hfi1_schedule_send(struct rvt_qp *qp)
-{
-       if (hfi1_send_ok(qp))
-               _hfi1_schedule_send(qp);
-}
+void _hfi1_schedule_send(struct rvt_qp *qp);
+void hfi1_schedule_send(struct rvt_qp *qp);
 
 void hfi1_migrate_qp(struct rvt_qp *qp);
 
index a4a44d33d8578506be52d66c8f7ac77b7ce0049d..a62c9424fa86353c87a9a296b8af3bc77aec012f 100644 (file)
@@ -367,6 +367,8 @@ bail:
  * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
  * @qp: a pointer to the QP
  *
+ * Assumes s_lock is held.
+ *
  * Return 1 if constructed; otherwise, return 0.
  */
 int hfi1_make_rc_req(struct rvt_qp *qp)
@@ -383,7 +385,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
        u32 bth2;
        u32 pmtu = qp->pmtu;
        char newreq;
-       unsigned long flags;
        int ret = 0;
        int middle = 0;
        int delta;
@@ -392,12 +393,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
        if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
                ohdr = &priv->s_hdr->ibh.u.l.oth;
 
-       /*
-        * The lock is needed to synchronize between the sending tasklet,
-        * the receive interrupt handler, and timeout re-sends.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-
        /* Sending responses has higher priority over sending requests. */
        if ((qp->s_flags & RVT_S_RESP_PENDING) &&
            make_rc_ack(dev, qp, ohdr, pmtu))
@@ -407,7 +402,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&priv->s_iowait.sdma_busy)) {
@@ -463,8 +459,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                                qp->s_flags |= RVT_S_WAIT_FENCE;
                                goto bail;
                        }
-                       wqe->psn = qp->s_next_psn;
                        newreq = 1;
+                       qp->s_psn = wqe->psn;
                }
                /*
                 * Note that we have to be careful not to modify the
@@ -483,9 +479,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                                qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
                                goto bail;
                        }
-                       wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(SEND_FIRST);
                                len = pmtu;
                                break;
@@ -522,9 +516,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                                cpu_to_be32(wqe->rdma_wr.rkey);
                        ohdr->u.rc.reth.length = cpu_to_be32(len);
                        hwords += sizeof(struct ib_reth) / sizeof(u32);
-                       wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(RDMA_WRITE_FIRST);
                                len = pmtu;
                                break;
@@ -559,13 +551,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                                qp->s_num_rd_atomic++;
                                if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
                                        qp->s_lsn++;
-                               /*
-                                * Adjust s_next_psn to count the
-                                * expected number of responses.
-                                */
-                               if (len > pmtu)
-                                       qp->s_next_psn += (len - 1) / pmtu;
-                               wqe->lpsn = qp->s_next_psn++;
                        }
                        ohdr->u.rc.reth.vaddr =
                                cpu_to_be64(wqe->rdma_wr.remote_addr);
@@ -596,7 +581,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                                qp->s_num_rd_atomic++;
                                if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
                                        qp->s_lsn++;
-                               wqe->lpsn = wqe->psn;
                        }
                        if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
                                qp->s_state = OP(COMPARE_SWAP);
@@ -639,11 +623,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                }
                if (wqe->wr.opcode == IB_WR_RDMA_READ)
                        qp->s_psn = wqe->lpsn + 1;
-               else {
+               else
                        qp->s_psn++;
-                       if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
-                               qp->s_next_psn = qp->s_psn;
-               }
                break;
 
        case OP(RDMA_READ_RESPONSE_FIRST):
@@ -663,8 +644,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                /* FALLTHROUGH */
        case OP(SEND_MIDDLE):
                bth2 = mask_psn(qp->s_psn++);
-               if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
@@ -705,8 +684,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_MIDDLE):
                bth2 = mask_psn(qp->s_psn++);
-               if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
@@ -777,13 +754,9 @@ int hfi1_make_rc_req(struct rvt_qp *qp)
                bth2,
                middle);
 done:
-       ret = 1;
-       goto unlock;
-
+       return 1;
 bail:
        qp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
@@ -1563,7 +1536,8 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp,
        trace_hfi1_rc_ack(qp, psn);
 
        /* Ignore invalid responses. */
-       if (cmp_psn(psn, qp->s_next_psn) >= 0)
+       smp_read_barrier_depends(); /* see post_one_send() */
+       if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
                goto ack_done;
 
        /* Ignore duplicate responses. */
index a7add3c5d0f2b0a1ccb9b09f460b5d61582459b5..6114550bb73f3f92201b55bdb57437dccbc33ff5 100644 (file)
@@ -392,7 +392,8 @@ static void ruc_loopback(struct rvt_qp *sqp)
        sqp->s_flags |= RVT_S_BUSY;
 
 again:
-       if (sqp->s_last == sqp->s_head)
+       smp_read_barrier_depends(); /* see post_one_send() */
+       if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
                goto clr_busy;
        wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
 
@@ -871,40 +872,43 @@ void hfi1_do_send(struct rvt_qp *qp)
 
        qp->s_flags |= RVT_S_BUSY;
 
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
        timeout = jiffies + (timeout_int) / 8;
        cpu = priv->s_sde ? priv->s_sde->cpu :
                        cpumask_first(cpumask_of_node(ps.ppd->dd->node));
        do {
                /* Check for a constructed packet to be sent. */
                if (qp->s_hdrwords != 0) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
                        /*
                         * If the packet cannot be sent now, return and
                         * the send tasklet will be woken up later.
                         */
                        if (hfi1_verbs_send(qp, &ps))
-                               break;
+                               return;
                        /* Record that s_hdr is empty. */
                        qp->s_hdrwords = 0;
-               }
-
-               /* allow other tasks to run */
-               if (unlikely(time_after(jiffies, timeout))) {
-                       if (workqueue_congested(cpu, ps.ppd->hfi1_wq)) {
-                               spin_lock_irqsave(&qp->s_lock, flags);
-                               qp->s_flags &= ~RVT_S_BUSY;
-                               hfi1_schedule_send(qp);
-                               spin_unlock_irqrestore(&qp->s_lock,
-                                                      flags);
+                       /* allow other tasks to run */
+                       if (unlikely(time_after(jiffies, timeout))) {
+                               if (workqueue_congested(cpu,
+                                                       ps.ppd->hfi1_wq)) {
+                                       spin_lock_irqsave(&qp->s_lock, flags);
+                                       qp->s_flags &= ~RVT_S_BUSY;
+                                       hfi1_schedule_send(qp);
+                                       spin_unlock_irqrestore(&qp->s_lock,
+                                                              flags);
+                                       this_cpu_inc(
+                                               *ps.ppd->dd->send_schedule);
+                                       return;
+                               }
+                               cond_resched();
                                this_cpu_inc(*ps.ppd->dd->send_schedule);
-                               return;
+                               timeout = jiffies + (timeout_int) / 8;
                        }
-                       cond_resched();
-                       this_cpu_inc(*ps.ppd->dd->send_schedule);
-                       timeout = jiffies + (timeout_int) / 8;
+                       spin_lock_irqsave(&qp->s_lock, flags);
                }
        } while (make_req(qp));
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
 /*
index 0aa604b7557b58b89bfec83334f89046f7547978..f884b5c8051bd45b75d3ceb3b2d79c9d75eb0aef 100644 (file)
@@ -59,6 +59,8 @@
  * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
  * @qp: a pointer to the QP
  *
+ * Assumes s_lock is held.
+ *
  * Return 1 if constructed; otherwise, return 0.
  */
 int hfi1_make_uc_req(struct rvt_qp *qp)
@@ -66,7 +68,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp)
        struct hfi1_qp_priv *priv = qp->priv;
        struct hfi1_other_headers *ohdr;
        struct rvt_swqe *wqe;
-       unsigned long flags;
        u32 hwords = 5;
        u32 bth0 = 0;
        u32 len;
@@ -74,13 +75,12 @@ int hfi1_make_uc_req(struct rvt_qp *qp)
        int ret = 0;
        int middle = 0;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&priv->s_iowait.sdma_busy)) {
@@ -106,15 +106,15 @@ int hfi1_make_uc_req(struct rvt_qp *qp)
                    RVT_PROCESS_NEXT_SEND_OK))
                        goto bail;
                /* Check if send work queue is empty. */
-               if (qp->s_cur == qp->s_head) {
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
                        clear_ahg(qp);
                        goto bail;
                }
                /*
                 * Start a new request.
                 */
-               wqe->psn = qp->s_next_psn;
-               qp->s_psn = qp->s_next_psn;
+               qp->s_psn = wqe->psn;
                qp->s_sge.sge = wqe->sg_list[0];
                qp->s_sge.sg_list = wqe->sg_list + 1;
                qp->s_sge.num_sge = wqe->wr.num_sge;
@@ -235,15 +235,12 @@ int hfi1_make_uc_req(struct rvt_qp *qp)
        qp->s_cur_sge = &qp->s_sge;
        qp->s_cur_size = len;
        hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
-                            mask_psn(qp->s_next_psn++), middle);
+                            mask_psn(qp->s_psn++), middle);
 done:
-       ret = 1;
-       goto unlock;
+       return 1;
 
 bail:
        qp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
index fdf6e3bee8f15353207d7b26a8917cb729b9b4b0..ba78e2e3e0bb0fdcfb7a3d4c18943203718d49f3 100644 (file)
@@ -261,6 +261,8 @@ drop:
  * hfi1_make_ud_req - construct a UD request packet
  * @qp: the QP
  *
+ * Assumes s_lock is held.
+ *
  * Return 1 if constructed; otherwise, return 0.
  */
 int hfi1_make_ud_req(struct rvt_qp *qp)
@@ -271,7 +273,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
        struct hfi1_pportdata *ppd;
        struct hfi1_ibport *ibp;
        struct rvt_swqe *wqe;
-       unsigned long flags;
        u32 nwords;
        u32 extra_bytes;
        u32 bth0;
@@ -281,13 +282,12 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
        int next_cur;
        u8 sc5;
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&priv->s_iowait.sdma_busy)) {
@@ -299,7 +299,9 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
                goto done;
        }
 
-       if (qp->s_cur == qp->s_head)
+       /* see post_one_send() */
+       smp_read_barrier_depends();
+       if (qp->s_cur == ACCESS_ONCE(qp->s_head))
                goto bail;
 
        wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
@@ -317,6 +319,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
                if (unlikely(!loopback && (lid == ppd->lid ||
                    (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
                     qp->ibqp.qp_type == IB_QPT_GSI)))) {
+                       unsigned long flags;
                        /*
                         * If DMAs are in progress, we can't generate
                         * a completion for the loopback packet since
@@ -329,6 +332,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
                                goto bail;
                        }
                        qp->s_cur = next_cur;
+                       local_irq_save(flags);
                        spin_unlock_irqrestore(&qp->s_lock, flags);
                        ud_loopback(qp, wqe);
                        spin_lock_irqsave(&qp->s_lock, flags);
@@ -408,7 +412,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
                bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
        ohdr->bth[0] = cpu_to_be32(bth0);
        ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++));
+       ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
        /*
         * Qkeys with the high order bit set mean use the
         * qkey from the QP context instead of the WR (see 10.2.5).
@@ -423,13 +427,9 @@ int hfi1_make_ud_req(struct rvt_qp *qp)
        priv->s_hdr->sde = NULL;
 
 done:
-       ret = 1;
-       goto unlock;
-
+       return 1;
 bail:
        qp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
 }
 
index 35f6d92a6249a1ee9a0f9959947e74364c011e92..1df464815247d7c3a26f4909abb85c4dff015088 100644 (file)
@@ -1533,6 +1533,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
        dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
        dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
        dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
+       dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
        dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
        dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
        dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
@@ -1543,7 +1544,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
        dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
        dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
-       dd->verbs_dev.rdi.driver_f.check_send_wr = hfi1_check_send_wr;
+       dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
 
        /* completeion queue */
        snprintf(dd->verbs_dev.rdi.dparms.cq_name,
index adb63bb6fae2aa4b9b4ef7abea4916482cbc0fa6..d00c55d06c8cc468081d86033c67c2f487098ee3 100644 (file)
@@ -427,7 +427,7 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
                    int attr_mask, struct ib_udata *udata);
 
-int hfi1_check_send_wr(struct rvt_qp *qp, struct ib_send_wr *wr);
+int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
 
 int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
                       int has_grh, struct rvt_qp *qp, u32 bth0);
index 5ccf683b28f14723c7791e04fb53570783ab6f96..aabd2e5bc5d72bfa1de8d5458774533d08d3a5ef 100644 (file)
@@ -220,6 +220,7 @@ struct rvt_ah {
 };
 
 struct rvt_dev_info;
+struct rvt_swqe;
 struct rvt_driver_provided {
        /*
         * The work to create port files in /sys/class Infiniband is different
@@ -240,6 +241,7 @@ struct rvt_driver_provided {
        void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp);
        void (*notify_qp_reset)(struct rvt_qp *qp);
        void (*schedule_send)(struct rvt_qp *qp);
+       void (*schedule_send_no_lock)(struct rvt_qp *qp);
        void (*do_send)(struct rvt_qp *qp);
        int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
                                  struct ib_qp_attr *attr);
@@ -273,7 +275,7 @@ struct rvt_driver_provided {
        void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
                          int attr_mask, struct ib_udata *udata);
 
-       int (*check_send_wr)(struct rvt_qp *qp, struct ib_send_wr *wr);
+       int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe);
 
        void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
        void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
index b3ea745793166f13ed34860d09706e67bb8ced0f..1066b5d1b4d252850d4e41731c61daa7906b5e46 100644 (file)
@@ -250,11 +250,12 @@ struct rvt_qp {
        enum ib_mtu path_mtu;
        int srate_mbps;         /* s_srate (below) converted to Mbit/s */
        u32 remote_qpn;
-       u32 pmtu;               /* decoded from path_mtu */
        u32 qkey;               /* QKEY for this QP (for UD or RD) */
        u32 s_size;             /* send work queue size */
        u32 s_ahgpsn;           /* set to the psn in the copy of the header */
 
+       u16 pmtu;               /* decoded from path_mtu */
+       u8 log_pmtu;            /* shift for pmtu */
        u8 state;               /* QP state */
        u8 allowed_ops;         /* high order bits of allowed opcodes */
        u8 qp_access_flags;
@@ -299,6 +300,13 @@ struct rvt_qp {
        struct rvt_sge_state r_sge;     /* current receive data */
        struct rvt_rq r_rq;             /* receive work queue */
 
+       /* post send cache line (fields used by post_send()) */
+       spinlock_t s_hlock ____cacheline_aligned_in_smp;
+       u32 s_head;             /* new entries added here */
+       u32 s_next_psn;         /* PSN for next request */
+       u32 s_avail;            /* number of entries avail */
+       u32 s_ssn;              /* SSN of tail entry */
+
        spinlock_t s_lock ____cacheline_aligned_in_smp;
        struct rvt_sge_state *s_cur_sge;
        u32 s_flags;
@@ -308,19 +316,16 @@ struct rvt_qp {
        u32 s_cur_size;         /* size of send packet in bytes */
        u32 s_len;              /* total length of s_sge */
        u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
-       u32 s_next_psn;         /* PSN for next request */
        u32 s_last_psn;         /* last response PSN processed */
        u32 s_sending_psn;      /* lowest PSN that is being sent */
        u32 s_sending_hpsn;     /* highest PSN that is being sent */
        u32 s_psn;              /* current packet sequence number */
        u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
        u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
-       u32 s_head;             /* new entries added here */
        u32 s_tail;             /* next entry to process */
        u32 s_cur;              /* current work queue entry */
        u32 s_acked;            /* last un-ACK'ed entry */
        u32 s_last;             /* last completed entry */
-       u32 s_ssn;              /* SSN of tail entry */
        u32 s_lsn;              /* limit sequence number (credit) */
        u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
        u16 s_rdma_ack_cnt;