IB: add a proper completion queue abstraction
author	Christoph Hellwig <hch@lst.de>
Fri, 11 Dec 2015 19:53:03 +0000 (11:53 -0800)
committer	Christoph Hellwig <hch@lst.de>
Fri, 11 Dec 2015 22:10:43 +0000 (14:10 -0800)
This adds an abstraction that allows ULPs to simply pass a completion
object and completion callback with each submitted WR and let the RDMA
core handle the nitty-gritty details of completion interrupt handling
and CQ polling.

In detail, there is a new ib_cqe structure which just contains the
completion callback and which can be used to get at the containing
object using container_of.  It is pointed to by the WR and the WC as an
alternative to the wr_id field, similar to how many ULPs already use
that field to store a pointer via casts.
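
For example, a ULP can embed the ib_cqe in its own per-request structure
and recover that structure from the completion handler via container_of
(the my_request/my_send_done names below are purely illustrative, not
part of this patch):

    struct my_request {
            struct ib_cqe   cqe;
            /* ... ULP-specific state ... */
    };

    static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
    {
            struct my_request *req =
                    container_of(wc->wr_cqe, struct my_request, cqe);

            if (wc->status != IB_WC_SUCCESS) {
                    /* flush or error completion */
                    return;
            }
            /* normal completion handling for req */
    }

    /* when posting a WR: */
    req->cqe.done = my_send_done;
    wr.wr_cqe = &req->cqe;          /* instead of wr.wr_id */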

A driver using the new completion callbacks allocates its CQs using the
new ib_alloc_cq API, which in addition to the number of CQEs and the
completion vector also takes a mode that controls how the CQ is polled.
Three modes are available: direct for drivers that never take CQ
interrupts and poll for completions themselves, softirq to poll from
softirq context using the irq_poll (formerly blk-iopoll) infrastructure,
which takes care of rearming and budgeting, and workqueue for consumers
that want to be called from process context.
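
A minimal usage sketch (error handling elided; apart from the new
ib_alloc_cq/ib_process_cq_direct/ib_free_cq calls and the IB_POLL_*
constants, the names are illustrative):

    /* completions are handled from the ib_comp_wq workqueue */
    cq = ib_alloc_cq(ibdev, my_ctx, 128, 0, IB_POLL_WORKQUEUE);
    if (IS_ERR(cq))
            return PTR_ERR(cq);

    /* post WRs with wr->wr_cqe pointing at an embedded ib_cqe ... */

    ib_free_cq(cq);

An IB_POLL_DIRECT CQ is not driven by interrupts at all; the driver
polls it explicitly with ib_process_cq_direct(cq, budget).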

Thanks a lot to Sagi Grimberg, who helped review the API, wrote the
current version of the workqueue code after my two previous attempts
weren't good enough, and converted the iSER initiator to the new API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/cq.c [new file with mode: 0644]
drivers/infiniband/core/device.c
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/srp/ib_srp.c
include/rdma/ib_verbs.h

index aa26f3c3416bbbcd04dac1ec5381fcb081105d5e..282ec0b664fe94578dcbbb0fba4410733016e644 100644 (file)
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
        depends on NET
        depends on INET
        depends on m || IPV6 != m
+       select IRQ_POLL
        ---help---
          Core support for InfiniBand (IB).  Make sure to also select
          any protocols you wish to use as well as drivers for your
index d43a8994ac5c129d201b0f164442c618192eea9d..ae48d874012f1ca34da75975f3bc839bb0ee2ad8 100644 (file)
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) +=    ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=        ib_uverbs.o ib_ucm.o \
                                        $(user_access-y)
 
-ib_core-y :=                   packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y :=                   packer.o ud_header.o verbs.o cq.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o \
                                roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644 (file)
index 0000000..a754fc7
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH                  16
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ             256
+#define IB_POLL_BUDGET_WORKQUEUE       65536
+
+#define IB_POLL_FLAGS \
+       (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget)
+{
+       int i, n, completed = 0;
+
+       while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
+               for (i = 0; i < n; i++) {
+                       struct ib_wc *wc = &cq->wc[i];
+
+                       if (wc->wr_cqe)
+                               wc->wr_cqe->done(cq, wc);
+                       else
+                               WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+               }
+
+               completed += n;
+
+               if (n != IB_POLL_BATCH ||
+                   (budget != -1 && completed >= budget))
+                       break;
+       }
+
+       return completed;
+}
+
+/**
+ * ib_process_cq_direct - process a CQ in caller context
+ * @cq:                CQ to process
+ * @budget:    number of CQEs to poll for
+ *
+ * This function is used to process all outstanding CQ entries on a
+ * %IB_POLL_DIRECT CQ.  It does not offload CQ processing to a different
+ * context and does not ask for completion interrupts from the HCA.
+ *
+ * Note: for compatibility reasons -1 can be passed in %budget for unlimited
+ * polling.  Do not use this feature in new code; it will be removed soon.
+ */
+int ib_process_cq_direct(struct ib_cq *cq, int budget)
+{
+       WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
+
+       return __ib_process_cq(cq, budget);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+       WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+       struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+       int completed;
+
+       completed = __ib_process_cq(cq, budget);
+       if (completed < budget) {
+               irq_poll_complete(&cq->iop);
+               if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+                       irq_poll_sched(&cq->iop);
+       }
+
+       return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+       irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+       struct ib_cq *cq = container_of(work, struct ib_cq, work);
+       int completed;
+
+       completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
+       if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+           ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+               queue_work(ib_comp_wq, &cq->work);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+       queue_work(ib_comp_wq, &cq->work);
+}
+
+/**
+ * ib_alloc_cq - allocate a completion queue
+ * @dev:               device to allocate the CQ for
+ * @private:           driver private data, accessible from cq->cq_context
+ * @nr_cqe:            number of CQEs to allocate
+ * @comp_vector:       HCA completion vectors for this CQ
+ * @poll_ctx:          context to poll the CQ from.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context.  The ULP must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+               int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
+{
+       struct ib_cq_init_attr cq_attr = {
+               .cqe            = nr_cqe,
+               .comp_vector    = comp_vector,
+       };
+       struct ib_cq *cq;
+       int ret = -ENOMEM;
+
+       cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+       if (IS_ERR(cq))
+               return cq;
+
+       cq->device = dev;
+       cq->uobject = NULL;
+       cq->event_handler = NULL;
+       cq->cq_context = private;
+       cq->poll_ctx = poll_ctx;
+       atomic_set(&cq->usecnt, 0);
+
+       cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
+       if (!cq->wc)
+               goto out_destroy_cq;
+
+       switch (cq->poll_ctx) {
+       case IB_POLL_DIRECT:
+               cq->comp_handler = ib_cq_completion_direct;
+               break;
+       case IB_POLL_SOFTIRQ:
+               cq->comp_handler = ib_cq_completion_softirq;
+
+               irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+               ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+               break;
+       case IB_POLL_WORKQUEUE:
+               cq->comp_handler = ib_cq_completion_workqueue;
+               INIT_WORK(&cq->work, ib_cq_poll_work);
+               ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+               break;
+       default:
+               ret = -EINVAL;
+               goto out_free_wc;
+       }
+
+       return cq;
+
+out_free_wc:
+       kfree(cq->wc);
+out_destroy_cq:
+       cq->device->destroy_cq(cq);
+       return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_cq);
+
+/**
+ * ib_free_cq - free a completion queue
+ * @cq:                completion queue to free.
+ */
+void ib_free_cq(struct ib_cq *cq)
+{
+       int ret;
+
+       if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
+               return;
+
+       switch (cq->poll_ctx) {
+       case IB_POLL_DIRECT:
+               break;
+       case IB_POLL_SOFTIRQ:
+               irq_poll_disable(&cq->iop);
+               break;
+       case IB_POLL_WORKQUEUE:
+               flush_work(&cq->work);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+       }
+
+       kfree(cq->wc);
+       ret = cq->device->destroy_cq(cq);
+       WARN_ON_ONCE(ret);
+}
+EXPORT_SYMBOL(ib_free_cq);
index 179e8134d57fc13b425254d5162006f9fc201879..6421d2317b6f138d759da053b495f58a37b81684 100644 (file)
@@ -58,6 +58,7 @@ struct ib_client_data {
        bool              going_down;
 };
 
+struct workqueue_struct *ib_comp_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
@@ -954,10 +955,18 @@ static int __init ib_core_init(void)
        if (!ib_wq)
                return -ENOMEM;
 
+       ib_comp_wq = alloc_workqueue("ib-comp-wq",
+                       WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+                       WQ_UNBOUND_MAX_ACTIVE);
+       if (!ib_comp_wq) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
        ret = class_register(&ib_class);
        if (ret) {
                printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
-               goto err;
+               goto err_comp;
        }
 
        ret = ibnl_init();
@@ -972,7 +981,8 @@ static int __init ib_core_init(void)
 
 err_sysfs:
        class_unregister(&ib_class);
-
+err_comp:
+       destroy_workqueue(ib_comp_wq);
 err:
        destroy_workqueue(ib_wq);
        return ret;
@@ -983,6 +993,7 @@ static void __exit ib_core_cleanup(void)
        ib_cache_cleanup();
        ibnl_cleanup();
        class_unregister(&ib_class);
+       destroy_workqueue(ib_comp_wq);
        /* Make sure that any pending umem accounting work is done. */
        destroy_workqueue(ib_wq);
 }
index 3ae9726efb9837512a62214705bf5e8e9561a02c..9b014f1534420cca2e73bd28c2d9064f03d324a4 100644 (file)
@@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
 
 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
-       .wr_id = IPOIB_CM_RX_DRAIN_WRID,
        .opcode = IB_WR_SEND,
 };
 
@@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
         * error" WC will be immediately generated for each WR we post.
         */
        p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+       ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
        if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
                ipoib_warn(priv, "failed to post drain wr\n");
 
index 9909022dc6c3eebd4c52ef3e7dca5221dc8b6038..9f1cb3c5e3c35d6bd150e6813ab1bbb3dffabeb3 100644 (file)
@@ -457,10 +457,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
 static void srp_destroy_qp(struct srp_rdma_ch *ch)
 {
        static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
-       static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
+       static struct ib_recv_wr wr = { 0 };
        struct ib_recv_wr *bad_wr;
        int ret;
 
+       wr.wr_id = SRP_LAST_WR_ID;
        /* Destroying a QP and reusing ch->done is only safe if not connected */
        WARN_ON_ONCE(ch->connected);
 
@@ -1042,13 +1043,14 @@ static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
        struct ib_send_wr *bad_wr;
        struct ib_send_wr wr = {
                .opcode             = IB_WR_LOCAL_INV,
-               .wr_id              = LOCAL_INV_WR_ID_MASK,
                .next               = NULL,
                .num_sge            = 0,
                .send_flags         = 0,
                .ex.invalidate_rkey = rkey,
        };
 
+       wr.wr_id = LOCAL_INV_WR_ID_MASK;
+
        return ib_post_send(ch->qp, &wr, &bad_wr);
 }
 
index 9a68a19532ba57c27042cec499843f3cebfa273c..131dd4b5176cb0706af92e09e48a096255f1f2db 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <linux/socket.h>
+#include <linux/irq_poll.h>
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
@@ -56,6 +57,7 @@
 #include <asm/uaccess.h>
 
 extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
 
 union ib_gid {
        u8      raw[16];
@@ -758,7 +760,10 @@ enum ib_wc_flags {
 };
 
 struct ib_wc {
-       u64                     wr_id;
+       union {
+               u64             wr_id;
+               struct ib_cqe   *wr_cqe;
+       };
        enum ib_wc_status       status;
        enum ib_wc_opcode       opcode;
        u32                     vendor_err;
@@ -1079,9 +1084,16 @@ struct ib_mw_bind_info {
        int             mw_access_flags;
 };
 
+struct ib_cqe {
+       void (*done)(struct ib_cq *cq, struct ib_wc *wc);
+};
+
 struct ib_send_wr {
        struct ib_send_wr      *next;
-       u64                     wr_id;
+       union {
+               u64             wr_id;
+               struct ib_cqe   *wr_cqe;
+       };
        struct ib_sge          *sg_list;
        int                     num_sge;
        enum ib_wr_opcode       opcode;
@@ -1175,7 +1187,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr)
 
 struct ib_recv_wr {
        struct ib_recv_wr      *next;
-       u64                     wr_id;
+       union {
+               u64             wr_id;
+               struct ib_cqe   *wr_cqe;
+       };
        struct ib_sge          *sg_list;
        int                     num_sge;
 };
@@ -1306,6 +1321,12 @@ struct ib_ah {
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
+enum ib_poll_context {
+       IB_POLL_DIRECT,         /* caller context, no hw completions */
+       IB_POLL_SOFTIRQ,        /* poll from softirq context */
+       IB_POLL_WORKQUEUE,      /* poll from workqueue */
+};
+
 struct ib_cq {
        struct ib_device       *device;
        struct ib_uobject      *uobject;
@@ -1314,6 +1335,12 @@ struct ib_cq {
        void                   *cq_context;
        int                     cqe;
        atomic_t                usecnt; /* count number of work queues */
+       enum ib_poll_context    poll_ctx;
+       struct ib_wc            *wc;
+       union {
+               struct irq_poll         iop;
+               struct work_struct      work;
+       };
 };
 
 struct ib_srq {
@@ -2453,6 +2480,11 @@ static inline int ib_post_recv(struct ib_qp *qp,
        return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
 }
 
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+               int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
+void ib_free_cq(struct ib_cq *cq);
+int ib_process_cq_direct(struct ib_cq *cq, int budget);
+
 /**
  * ib_create_cq - Creates a CQ on the specified device.
  * @device: The device on which to create the CQ.