staging/rdma/hfi1: Adaptive PIO for short messages
authorMike Marciniszyn <mike.marciniszyn@intel.com>
Sun, 14 Feb 2016 20:45:36 +0000 (12:45 -0800)
committerDoug Ledford <dledford@redhat.com>
Fri, 11 Mar 2016 01:38:14 +0000 (20:38 -0500)
The change requires a new pio_busy field in the iowait structure to
track the number of outstanding pios.  The new counter, together
with the sdma counter, serves as the basis for a packet-by-packet decision
as to which egress mechanism to use.  Since packets given to different
egress mechanisms are not ordered with respect to each other, the scheme
is designed to preserve ordering.

The iowait drain/wait mechanisms are extended for a pio case.  An
additional qp wait flag is added for the PIO drain wait case.

Currently the only pio wait is for buffers, so the no_bufs_available()
routine name is changed to pio_wait() and a third argument is passed
with one of the two pio wait flags to generalize the routine.  A module
parameter is added to hold a configurable threshold. For now, the
module parameter defaults to zero, which disables the heuristic.

A heuristic routine is added to return the func pointer of the proper
egress routine to use.

The heuristic is as follows:
- SMI always uses pio
- GSI,UD qps <= threshold use pio
- UD qps > threshold use sdma
  o No coordination with sdma is required because order is not required
    and this qp pio count is not maintained for UD
- RC/UC ONLY packets <= threshold choose as follows:
  o If sdmas pending, use SDMA
  o Otherwise use pio and enable the pio tracking count at
    the time the pio buffer is allocated
- RC/UC ONLY packets > threshold use SDMA
  o If pios are pending, pio_wait() is called with the new wait flag
    to delay until the pios drain

The threshold is potentially reduced by the QP's mtu.

The sc_buffer_alloc() has two additional args (a callback, a void *)
which are exploited by the RC/UC cases to pass a new complete routine
and a qp *.

When the shadow ring completes the credit associated with a packet,
the new complete routine is called.  The verbs_pio_complete() will then
decrement the busy count and trigger any drain waiters in qp destroy
or reset.

Reviewed-by: Jubin John <jubin.john@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
14 files changed:
drivers/staging/rdma/hfi1/chip.c
drivers/staging/rdma/hfi1/chip.h
drivers/staging/rdma/hfi1/hfi.h
drivers/staging/rdma/hfi1/iowait.h
drivers/staging/rdma/hfi1/pio.c
drivers/staging/rdma/hfi1/qp.c
drivers/staging/rdma/hfi1/rc.c
drivers/staging/rdma/hfi1/sdma.c
drivers/staging/rdma/hfi1/uc.c
drivers/staging/rdma/hfi1/ud.c
drivers/staging/rdma/hfi1/verbs.c
drivers/staging/rdma/hfi1/verbs.h
drivers/staging/rdma/hfi1/verbs_txreq.h
include/rdma/rdmavt_qp.h

index 1294617701860a429c8d5a4eb25ba003155df251..36e8e3e9b01276ca8c6ea3f1a317737ec1021db9 100644 (file)
@@ -1588,6 +1588,14 @@ static u64 access_sw_pio_wait(const struct cntr_entry *entry,
        return dd->verbs_dev.n_piowait;
 }
 
+static u64 access_sw_pio_drain(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->verbs_dev.n_piodrain;
+}
+
 static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
                              void *context, int vl, int mode, u64 data)
 {
@@ -4129,6 +4137,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
                            access_sw_vtx_wait),
 [C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
                            access_sw_pio_wait),
+[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_drain),
 [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
                            access_sw_kmem_wait),
 [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
index b86c220161e59a7d0de3dc1834e4efc6d3cb0060..6c581e0bd65f45cc1562c511cd90008e872fc993 100644 (file)
@@ -800,6 +800,7 @@ enum {
        C_SW_CPU_RCV_LIM,
        C_SW_VTX_WAIT,
        C_SW_PIO_WAIT,
+       C_SW_PIO_DRAIN,
        C_SW_KMEM_WAIT,
        C_SW_SEND_SCHED,
        C_SDMA_DESC_FETCHED_CNT,
index 702723b3ff900827be2017091938a27cbe0d2e8e..43d48613d48edb0a7c0c48c6e3cfb78ca57bd89f 100644 (file)
@@ -811,6 +811,7 @@ struct sdma_vl_map;
 #define BOARD_VERS_MAX 96 /* how long the version string can be */
 #define SERIAL_MAX 16 /* length of the serial number */
 
+typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
 struct hfi1_devdata {
        struct hfi1_ibdev verbs_dev;     /* must be first */
        struct list_head list;
@@ -1121,10 +1122,8 @@ struct hfi1_devdata {
         * Handlers for outgoing data so that snoop/capture does not
         * have to have its hooks in the send path
         */
-       int (*process_pio_send)(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                               u64 pbc);
-       int (*process_dma_send)(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                               u64 pbc);
+       send_routine process_pio_send;
+       send_routine process_dma_send;
        void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
                                u64 pbc, const void *from, size_t count);
 
index e007eb82cbc8626aeed18666ba08893bcddb992b..b5eb1e0a5aa2591c5e0d45b60f6717e3df5fa1a6 100644 (file)
@@ -55,6 +55,7 @@
 #include <linux/sched.h>
 
 #include "sdma_txreq.h"
+
 /*
  * typedef (*restart_t)() - restart callback
  * @work: pointer to work structure
@@ -71,6 +72,7 @@ struct sdma_engine;
  * @wakeup: space callback
  * @iowork: workqueue overhead
  * @wait_dma: wait for sdma_busy == 0
+ * @wait_pio: wait for pio_busy == 0
  * @sdma_busy: # of packets in flight
  * @count: total number of descriptors in tx_head'ed list
  * @tx_limit: limit for overflow queuing
@@ -104,7 +106,9 @@ struct iowait {
        void (*wakeup)(struct iowait *wait, int reason);
        struct work_struct iowork;
        wait_queue_head_t wait_dma;
+       wait_queue_head_t wait_pio;
        atomic_t sdma_busy;
+       atomic_t pio_busy;
        u32 count;
        u32 tx_limit;
        u32 tx_count;
@@ -141,7 +145,9 @@ static inline void iowait_init(
        INIT_LIST_HEAD(&wait->tx_head);
        INIT_WORK(&wait->iowork, func);
        init_waitqueue_head(&wait->wait_dma);
+       init_waitqueue_head(&wait->wait_pio);
        atomic_set(&wait->sdma_busy, 0);
+       atomic_set(&wait->pio_busy, 0);
        wait->tx_limit = tx_limit;
        wait->sleep = sleep;
        wait->wakeup = wakeup;
@@ -174,6 +180,88 @@ static inline void iowait_sdma_drain(struct iowait *wait)
        wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
 }
 
+/**
+ * iowait_sdma_pending() - return sdma pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_sdma_pending(struct iowait *wait)
+{
+       return atomic_read(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_inc - note sdma io pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_inc(struct iowait *wait)
+{
+       atomic_inc(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_add - add count to pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_add(struct iowait *wait, int count)
+{
+       atomic_add(count, &wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_dec - note sdma complete
+ * @wait: iowait structure
+ */
+static inline int iowait_sdma_dec(struct iowait *wait)
+{
+       return atomic_dec_and_test(&wait->sdma_busy);
+}
+
+/**
+ * iowait_pio_drain() - wait for pios to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait pios have
+ * completed.
+ */
+static inline void iowait_pio_drain(struct iowait *wait)
+{
+       wait_event_timeout(wait->wait_pio,
+                          !atomic_read(&wait->pio_busy),
+                          HZ);
+}
+
+/**
+ * iowait_pio_pending() - return pio pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_pio_pending(struct iowait *wait)
+{
+       return atomic_read(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_inc - note pio pending
+ * @wait: iowait structure
+ */
+static inline void iowait_pio_inc(struct iowait *wait)
+{
+       atomic_inc(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_dec - note pio complete
+ * @wait: iowait structure
+ */
+static inline int iowait_pio_dec(struct iowait *wait)
+{
+       return atomic_dec_and_test(&wait->pio_busy);
+}
+
 /**
  * iowait_drain_wakeup() - trigger iowait_drain() waiter
  *
@@ -184,6 +272,7 @@ static inline void iowait_sdma_drain(struct iowait *wait)
 static inline void iowait_drain_wakeup(struct iowait *wait)
 {
        wake_up(&wait->wait_dma);
+       wake_up(&wait->wait_pio);
 }
 
 /**
index be0dcc345f4b56525ce2af753aea7c8db1bd442a..f5aab0ed39d73448b1f000fadacea7cd921e7fa0 100644 (file)
@@ -1564,7 +1564,8 @@ full:
        write_sequnlock_irqrestore(&dev->iowait_lock, flags);
 
        for (i = 0; i < n; i++)
-               hfi1_qp_wakeup(qps[i], RVT_S_WAIT_PIO);
+               hfi1_qp_wakeup(qps[i],
+                              RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
 }
 
 /* translate a send credit update to a bit code of reasons */
index 571e78fa2633e4a5a320f70370c3b892a8525abb..c7b83d66b59bd5add3184c59ff27eac43d53c83f 100644 (file)
@@ -359,6 +359,25 @@ void _hfi1_schedule_send(struct rvt_qp *qp)
                        cpumask_first(cpumask_of_node(dd->node)));
 }
 
+static void qp_pio_drain(struct rvt_qp *qp)
+{
+       struct hfi1_ibdev *dev;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (!priv->s_sendcontext)
+               return;
+       dev = to_idev(qp->ibqp.device);
+       while (iowait_pio_pending(&priv->s_iowait)) {
+               write_seqlock_irq(&dev->iowait_lock);
+               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
+               write_sequnlock_irq(&dev->iowait_lock);
+               iowait_pio_drain(&priv->s_iowait);
+               write_seqlock_irq(&dev->iowait_lock);
+               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
+               write_sequnlock_irq(&dev->iowait_lock);
+       }
+}
+
 /**
  * hfi1_schedule_send - schedule progress
  * @qp: the QP
@@ -620,7 +639,7 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
        wqe = rvt_get_swqe_ptr(qp, qp->s_last);
        send_context = qp_to_send_context(qp, priv->s_sc);
        seq_printf(s,
-                  "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) QP%u LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p\n",
+                  "N %d %s QP%x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) QP%x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p\n",
                   iter->n,
                   qp_idle(qp) ? "I" : "B",
                   qp->ibqp.qp_num,
@@ -630,7 +649,8 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
                   wqe ? wqe->wr.opcode : 0,
                   qp->s_hdrwords,
                   qp->s_flags,
-                  atomic_read(&priv->s_iowait.sdma_busy),
+                  iowait_sdma_pending(&priv->s_iowait),
+                  iowait_pio_pending(&priv->s_iowait),
                   !list_empty(&priv->s_iowait.list),
                   qp->timeout,
                   wqe ? wqe->ssn : 0,
@@ -739,6 +759,7 @@ void quiesce_qp(struct rvt_qp *qp)
        struct hfi1_qp_priv *priv = qp->priv;
 
        iowait_sdma_drain(&priv->s_iowait);
+       qp_pio_drain(qp);
        flush_tx_list(qp);
 }
 
index 27042876ca62987ae0abd87ba065a4ac65ac8719..443fda8df380ffcf4ab69667e588dcf0f2a7385d 100644 (file)
@@ -181,6 +181,18 @@ void hfi1_del_timers_sync(struct rvt_qp *qp)
        del_timer_sync(&priv->s_rnr_timer);
 }
 
+/* only opcode mask for adaptive pio */
+const u32 rc_only_opcode =
+       BIT(OP(SEND_ONLY) & 0x1f) |
+       BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+       BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)) |
+       BIT(OP(RDMA_READ_REQUEST & 0x1f)) |
+       BIT(OP(ACKNOWLEDGE & 0x1f)) |
+       BIT(OP(ATOMIC_ACKNOWLEDGE & 0x1f)) |
+       BIT(OP(COMPARE_SWAP & 0x1f)) |
+       BIT(OP(FETCH_ADD & 0x1f));
+
 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
                       u32 psn, u32 pmtu)
 {
@@ -217,6 +229,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
        u32 bth2;
        int middle = 0;
        u32 pmtu = qp->pmtu;
+       struct hfi1_qp_priv *priv = qp->priv;
 
        /* Don't send an ACK if we aren't supposed to. */
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
@@ -350,6 +363,7 @@ normal:
        qp->s_hdrwords = hwords;
        /* pbc */
        ps->s_txreq->hdr_dwords = hwords + 2;
+       ps->s_txreq->sde = priv->s_sde;
        qp->s_cur_size = len;
        hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
        return 1;
@@ -413,7 +427,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&priv->s_iowait.sdma_busy)) {
+               if (iowait_sdma_pending(&priv->s_iowait)) {
                        qp->s_flags |= RVT_S_WAIT_DMA;
                        goto bail;
                }
@@ -754,6 +768,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
        qp->s_hdrwords = hwords;
        /* pbc */
        ps->s_txreq->hdr_dwords = hwords + 2;
+       ps->s_txreq->sde = priv->s_sde;
        qp->s_cur_sge = ss;
        qp->s_cur_size = len;
        hfi1_make_ruc_header(
index 579d82109932583e6711676617673b1f6e1de234..ff38fa3b7ca5d3854abd4066abd75b7db5509475 100644 (file)
@@ -410,7 +410,7 @@ static void sdma_flush(struct sdma_engine *sde)
 #endif
                sdma_txclean(sde->dd, txp);
                if (wait)
-                       drained = atomic_dec_and_test(&wait->sdma_busy);
+                       drained = iowait_sdma_dec(wait);
                if (txp->complete)
                        (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
                if (wait && drained)
@@ -584,7 +584,7 @@ static void sdma_flush_descq(struct sdma_engine *sde)
                        /* remove from list */
                        sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
                        if (wait)
-                               drained = atomic_dec_and_test(&wait->sdma_busy);
+                               drained = iowait_sdma_dec(wait);
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
                        trace_hfi1_sdma_out_sn(sde, txp->sn);
                        if (WARN_ON_ONCE(sde->head_sn != txp->sn))
@@ -1498,7 +1498,7 @@ retry:
                        /* remove from list */
                        sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
                        if (wait)
-                               drained = atomic_dec_and_test(&wait->sdma_busy);
+                               drained = iowait_sdma_dec(wait);
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
                        trace_hfi1_sdma_out_sn(sde, txp->sn);
                        if (WARN_ON_ONCE(sde->head_sn != txp->sn))
@@ -2092,14 +2092,14 @@ retry:
                goto nodesc;
        tail = submit_tx(sde, tx);
        if (wait)
-               atomic_inc(&wait->sdma_busy);
+               iowait_sdma_inc(wait);
        sdma_update_tail(sde, tail);
 unlock:
        spin_unlock_irqrestore(&sde->tail_lock, flags);
        return ret;
 unlock_noconn:
        if (wait)
-               atomic_inc(&wait->sdma_busy);
+               iowait_sdma_inc(wait);
        tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
        tx->sn = sde->tail_sn++;
@@ -2181,7 +2181,7 @@ retry:
        }
 update_tail:
        if (wait)
-               atomic_add(count, &wait->sdma_busy);
+               iowait_sdma_add(wait, count);
        if (tail != INVALID_TAIL)
                sdma_update_tail(sde, tail);
        spin_unlock_irqrestore(&sde->tail_lock, flags);
@@ -2192,7 +2192,7 @@ unlock_noconn:
                tx->wait = wait;
                list_del_init(&tx->list);
                if (wait)
-                       atomic_inc(&wait->sdma_busy);
+                       iowait_sdma_inc(wait);
                tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
                tx->sn = sde->tail_sn++;
index 32705618900df2843a4a7b0c92444c32c607e3dd..e58ec15dd89243e4938573aeed5c9035be5e5685 100644 (file)
 /* cut down ridiculously long IB macro names */
 #define OP(x) IB_OPCODE_UC_##x
 
+/* only opcode mask for adaptive pio */
+const u32 uc_only_opcode =
+       BIT(OP(SEND_ONLY) & 0x1f) |
+       BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+       BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f));
+
 /**
  * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
  * @qp: a pointer to the QP
@@ -86,7 +93,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&priv->s_iowait.sdma_busy)) {
+               if (iowait_sdma_pending(&priv->s_iowait)) {
                        qp->s_flags |= RVT_S_WAIT_DMA;
                        goto bail;
                }
@@ -237,6 +244,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
        qp->s_hdrwords = hwords;
        /* pbc */
        ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       ps->s_txreq->sde = priv->s_sde;
        qp->s_cur_sge = &qp->s_sge;
        qp->s_cur_size = len;
        hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
index bae5ccdfa7f4f07d988fa600a60f044cab1aa922..da4e465ae8461442d1bc5520aeb924752f8d446a 100644 (file)
@@ -294,7 +294,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                if (qp->s_last == ACCESS_ONCE(qp->s_head))
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&priv->s_iowait.sdma_busy)) {
+               if (iowait_sdma_pending(&priv->s_iowait)) {
                        qp->s_flags |= RVT_S_WAIT_DMA;
                        goto bail;
                }
@@ -331,7 +331,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                         * Instead of waiting, we could queue a
                         * zero length descriptor so we get a callback.
                         */
-                       if (atomic_read(&priv->s_iowait.sdma_busy)) {
+                       if (iowait_sdma_pending(&priv->s_iowait)) {
                                qp->s_flags |= RVT_S_WAIT_DMA;
                                goto bail;
                        }
index a4f8b26f76fb41d552e11480e7181a2134fb1c2a..d900374abe70459d107243ea0ef980e8f36325aa 100644 (file)
@@ -124,11 +124,20 @@ unsigned int hfi1_max_srq_wrs = 0x1FFFF;
 module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
 
+unsigned short piothreshold;
+module_param(piothreshold, ushort, S_IRUGO);
+MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
+
 static void verbs_sdma_complete(
        struct sdma_txreq *cookie,
        int status,
        int drained);
 
+static int pio_wait(struct rvt_qp *qp,
+                   struct send_context *sc,
+                   struct hfi1_pkt_state *ps,
+                   u32 flag);
+
 /* Length of buffer to create verbs txreq cache name */
 #define TXREQ_NAME_LEN 24
 
@@ -742,9 +751,10 @@ bail_build:
  * If we are now in the error state, return zero to flush the
  * send work request.
  */
-static int no_bufs_available(struct rvt_qp *qp,
-                            struct send_context *sc,
-                            struct hfi1_pkt_state *ps)
+static int pio_wait(struct rvt_qp *qp,
+                   struct send_context *sc,
+                   struct hfi1_pkt_state *ps,
+                   u32 flag)
 {
        struct hfi1_qp_priv *priv = qp->priv;
        struct hfi1_devdata *dd = sc->dd;
@@ -767,8 +777,10 @@ static int no_bufs_available(struct rvt_qp *qp,
                        struct hfi1_ibdev *dev = &dd->verbs_dev;
                        int was_empty;
 
+                       dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
+                       dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
                        dev->n_piowait++;
-                       qp->s_flags |= RVT_S_WAIT_PIO;
+                       qp->s_flags |= flag;
                        was_empty = list_empty(&sc->piowait);
                        list_add_tail(&priv->s_iowait.list, &sc->piowait);
                        trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
@@ -797,6 +809,15 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
        return dd->vld[vl].sc;
 }
 
+static void verbs_pio_complete(void *arg, int code)
+{
+       struct rvt_qp *qp = (struct rvt_qp *)arg;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (iowait_pio_dec(&priv->s_iowait))
+               iowait_drain_wakeup(&priv->s_iowait);
+}
+
 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                        u64 pbc)
 {
@@ -815,6 +836,17 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
        struct pio_buf *pbuf;
        int wc_status = IB_WC_SUCCESS;
        int ret = 0;
+       pio_release_cb cb = NULL;
+
+       /* only RC/UC use complete */
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               cb = verbs_pio_complete;
+               break;
+       default:
+               break;
+       }
 
        /* vl15 special case taken care of in ud.c */
        sc5 = priv->s_sc;
@@ -830,8 +862,12 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
                pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
        }
-       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+       if (cb)
+               iowait_pio_inc(&priv->s_iowait);
+       pbuf = sc_buffer_alloc(sc, plen, cb, qp);
        if (unlikely(pbuf == NULL)) {
+               if (cb)
+                       verbs_pio_complete(qp, 0);
                if (ppd->host_link_state != HLS_UP_ACTIVE) {
                        /*
                         * If we have filled the PIO buffers to capacity and are
@@ -851,8 +887,9 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                         * so lets continue to queue the request.
                         */
                        hfi1_cdbg(PIO, "alloc failed. state active, queuing");
-                       ret = no_bufs_available(qp, sc, ps);
+                       ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
                        if (!ret)
+                               /* txreq not queued - free */
                                goto bail;
                        /* tx consumed in wait */
                        return ret;
@@ -984,6 +1021,48 @@ bad:
        return 1;
 }
 
+/**
+ * get_send_routine - choose an egress routine
+ *
+ * Choose an egress routine based on QP type
+ * and size
+ */
+static inline send_routine get_send_routine(struct rvt_qp *qp,
+                                           struct hfi1_ib_header *h)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
+               return dd->process_pio_send;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+               return dd->process_pio_send;
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               if (piothreshold && qp->s_cur_size <= piothreshold)
+                       return dd->process_pio_send;
+               break;
+       case IB_QPT_RC:
+               if (piothreshold &&
+                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
+                   iowait_sdma_pending(&priv->s_iowait) == 0)
+                       return dd->process_pio_send;
+               break;
+       case IB_QPT_UC:
+               if (piothreshold &&
+                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
+                   iowait_sdma_pending(&priv->s_iowait) == 0)
+                       return dd->process_pio_send;
+               break;
+       default:
+               break;
+       }
+       return dd->process_dma_send;
+}
+
 /**
  * hfi1_verbs_send - send a packet
  * @qp: the QP to send on
@@ -995,19 +1074,10 @@ bad:
 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       send_routine sr;
        int ret;
-       int pio = 0;
-       unsigned long flags = 0;
-
-       /*
-        * VL15 packets (IB_QPT_SMI) will always use PIO, so we
-        * can defer SDMA restart until link goes ACTIVE without
-        * worrying about just how we got there.
-        */
-       if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
-           !(dd->flags & HFI1_HAS_SEND_DMA))
-               pio = 1;
 
+       sr = get_send_routine(qp, &ps->s_txreq->phdr.hdr);
        ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
        if (unlikely(ret)) {
                /*
@@ -1018,7 +1088,9 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                 * mechanism for handling the errors. So for SDMA we can just
                 * return.
                 */
-               if (pio) {
+               if (sr == dd->process_pio_send) {
+                       unsigned long flags;
+
                        hfi1_cdbg(PIO, "%s() Failed. Completing with err",
                                  __func__);
                        spin_lock_irqsave(&qp->s_lock, flags);
@@ -1027,20 +1099,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                }
                return -EINVAL;
        }
-
-       if (pio) {
-               ret = dd->process_pio_send(qp, ps, 0);
-       } else {
-#ifdef CONFIG_SDMA_VERBOSITY
-               dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
-                          slashstrip(__FILE__), __LINE__, __func__);
-               dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", qp->s_hdrwords,
-                          qp->s_cur_size);
-#endif
-               ret = dd->process_dma_send(qp, ps, 0);
-       }
-
-       return ret;
+       return sr(qp, ps, 0);
 }
 
 /**
index 3d25ad406af7c880acd98e728e4bb13d030196bd..8f1fde847c146a9d8782103173644647467262ba 100644 (file)
@@ -265,6 +265,7 @@ struct hfi1_ibdev {
        struct timer_list mem_timer;
 
        u64 n_piowait;
+       u64 n_piodrain;
        u64 n_txwait;
        u64 n_kmem_wait;
 
@@ -425,6 +426,19 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 
 int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
 
+extern const u32 rc_only_opcode;
+extern const u32 uc_only_opcode;
+
+static inline u8 get_opcode(struct hfi1_ib_header *h)
+{
+       u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
+
+       if (lnh == IB_LNH_IBA_LOCAL)
+               return be32_to_cpu(h->u.oth.bth[0]) >> 24;
+       else
+               return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
+}
+
 int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
                       int has_grh, struct rvt_qp *qp, u32 bth0);
 
@@ -494,6 +508,8 @@ extern unsigned int hfi1_max_srq_sges;
 
 extern unsigned int hfi1_max_srq_wrs;
 
+extern unsigned short piothreshold;
+
 extern const u32 ib_hfi1_rnr_table[];
 
 #endif                          /* HFI1_VERBS_H */
index f56149eb51ca9025f46920dbea084693a665f886..1cf69b2fe4a52ef68f1e0dd1b9e954fe74db94dc 100644 (file)
@@ -93,6 +93,11 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
        return tx;
 }
 
+static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
+{
+       return &tx->txreq;
+}
+
 static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
 {
        struct sdma_txreq *stx;
index 5c307ed4d195b2ecd42db89c3e1c81e730a47a41..f2f4df023aaa935354e02bebd2e7ffb8d7253c62 100644 (file)
@@ -82,6 +82,7 @@
  * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating
  *                  next send completion entry not via send DMA
  * RVT_S_WAIT_PIO - waiting for a send buffer to be available
+ * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets
  * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available
  * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
  * RVT_S_WAIT_KMEM - waiting for kernel memory to be available
 #define RVT_S_WAIT_SSN_CREDIT  0x0100
 #define RVT_S_WAIT_DMA         0x0200
 #define RVT_S_WAIT_PIO         0x0400
-#define RVT_S_WAIT_TX          0x0800
-#define RVT_S_WAIT_DMA_DESC    0x1000
-#define RVT_S_WAIT_KMEM                0x2000
-#define RVT_S_WAIT_PSN         0x4000
-#define RVT_S_WAIT_ACK         0x8000
-#define RVT_S_SEND_ONE         0x10000
-#define RVT_S_UNLIMITED_CREDIT 0x20000
-#define RVT_S_AHG_VALID                0x40000
-#define RVT_S_AHG_CLEAR                0x80000
-#define RVT_S_ECN              0x100000
+#define RVT_S_WAIT_PIO_DRAIN    0x0800
+#define RVT_S_WAIT_TX          0x1000
+#define RVT_S_WAIT_DMA_DESC    0x2000
+#define RVT_S_WAIT_KMEM                0x4000
+#define RVT_S_WAIT_PSN         0x8000
+#define RVT_S_WAIT_ACK         0x10000
+#define RVT_S_SEND_ONE         0x20000
+#define RVT_S_UNLIMITED_CREDIT 0x40000
+#define RVT_S_AHG_VALID                0x80000
+#define RVT_S_AHG_CLEAR                0x100000
+#define RVT_S_ECN              0x200000
 
 /*
  * Wait flags that would prevent any packet type from being sent.