* Refactor MR registration and cleanup, and fix reg_pages accounting.
* Create a work queue to handle page fault events in a kthread context.
* Register a fault handler to get events from the core for each QP.
The registered fault handler is empty in this patch, and only a later
patch implements it.
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
struct mlx5_ib_dev *dev =
container_of(device, struct mlx5_ib_dev, ib_dev.dev);
- return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
+ return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}
static ssize_t show_hca(struct device *device, struct device_attribute *attr,
goto err_eqs;
mutex_init(&dev->cap_mask_mutex);
- spin_lock_init(&dev->mr_lock);
err = create_dev_resources(&dev->devr);
if (err)
goto err_eqs;
- err = ib_register_device(&dev->ib_dev, NULL);
+ err = mlx5_ib_odp_init_one(dev);
if (err)
goto err_rsrc;
+ err = ib_register_device(&dev->ib_dev, NULL);
+ if (err)
+ goto err_odp;
+
err = create_umr_res(dev);
if (err)
goto err_dev;
err_dev:
ib_unregister_device(&dev->ib_dev);
+err_odp:
+ mlx5_ib_odp_remove_one(dev);
+
err_rsrc:
destroy_dev_resources(&dev->devr);
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
{
struct mlx5_ib_dev *dev = context;
+
ib_unregister_device(&dev->ib_dev);
destroy_umrc_res(dev);
+ mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
free_comp_eqs(dev);
ib_dealloc_device(&dev->ib_dev);
static int __init mlx5_ib_init(void)
{
+ int err;
+
if (deprecated_prof_sel != 2)
pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
- return mlx5_register_interface(&mlx5_ib_interface);
+ err = mlx5_ib_odp_init();
+ if (err)
+ return err;
+
+ err = mlx5_register_interface(&mlx5_ib_interface);
+ if (err)
+ goto clean_odp;
+
+ return err;
+
+clean_odp:
+ mlx5_ib_odp_cleanup();
+ return err;
}
static void __exit mlx5_ib_cleanup(void)
{
mlx5_unregister_interface(&mlx5_ib_interface);
+ mlx5_ib_odp_cleanup();
}
module_init(mlx5_ib_init);
MLX5_QP_EMPTY
};
+/*
+ * Connect-IB can trigger up to four concurrent pagefaults
+ * per-QP.
+ */
+enum mlx5_ib_pagefault_context {
+ MLX5_IB_PAGEFAULT_RESPONDER_READ,
+ MLX5_IB_PAGEFAULT_REQUESTOR_READ,
+ MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
+ MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
+ MLX5_IB_PAGEFAULT_CONTEXTS
+};
+
+static inline enum mlx5_ib_pagefault_context
+ mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
+{
+ return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
+}
+
+struct mlx5_ib_pfault {
+ struct work_struct work;
+ struct mlx5_pagefault mpfault;
+};
+
struct mlx5_ib_qp {
struct ib_qp ibqp;
struct mlx5_core_qp mqp;
/* Store signature errors */
bool signature_en;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ /*
+ * A flag that is true for QP's that are in a state that doesn't
+ * allow page faults, and shouldn't schedule any more faults.
+ */
+ int disable_page_faults;
+ /*
+ * The disable_page_faults_lock protects a QP's disable_page_faults
+ * field, allowing for a thread to atomically check whether the QP
+ * allows page faults, and if so schedule a page fault.
+ */
+ spinlock_t disable_page_faults_lock;
+ struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
+#endif
};
struct mlx5_ib_cq_buf {
struct umr_common umrc;
/* sync used page count stats
*/
- spinlock_t mr_lock;
struct mlx5_ib_resources devr;
struct mlx5_mr_cache cache;
struct timer_list delay_timer;
int fill_delay;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_odp_caps odp_caps;
+ /*
+ * Sleepable RCU that prevents destruction of MRs while they are still
+ * being used by a page fault handler.
+ */
+ struct srcu_struct mr_srcu;
#endif
};
struct ib_mr_status *mr_status);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+extern struct workqueue_struct *mlx5_ib_page_fault_wq;
+
int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
-#else
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault);
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
+int __init mlx5_ib_odp_init(void);
+void mlx5_ib_odp_cleanup(void);
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
{
return 0;
}
+
+static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
+static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
+static inline int mlx5_ib_odp_init(void) { return 0; }
+static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
+static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
+
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void init_query_mad(struct ib_smp *mad)
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif
+static int clean_mr(struct mlx5_ib_mr *mr);
+
static int order2idx(struct mlx5_ib_dev *dev, int order)
{
struct mlx5_mr_cache *cache = &dev->cache;
mlx5_ib_dbg(dev, "cache empty for order %d", order);
mr = NULL;
}
+ } else if (access_flags & IB_ACCESS_ON_DEMAND) {
+ err = -EINVAL;
+ pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
+ goto error;
}
if (!mr)
mr->umem = umem;
mr->npages = npages;
- spin_lock(&dev->mr_lock);
- dev->mdev->priv.reg_pages += npages;
- spin_unlock(&dev->mr_lock);
+ atomic_add(npages, &dev->mdev->priv.reg_pages);
mr->ibmr.lkey = mr->mmr.key;
mr->ibmr.rkey = mr->mmr.key;
return err;
}
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+static int clean_mr(struct mlx5_ib_mr *mr)
{
- struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
- struct mlx5_ib_mr *mr = to_mmr(ibmr);
- struct ib_umem *umem = mr->umem;
- int npages = mr->npages;
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
int umred = mr->umred;
int err;
free_cached_mr(dev, mr);
}
+ if (!umred)
+ kfree(mr);
+
+ return 0;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ int npages = mr->npages;
+ struct ib_umem *umem = mr->umem;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ if (umem)
+ /* Wait for all running page-fault handlers to finish. */
+ synchronize_srcu(&dev->mr_srcu);
+#endif
+
+ clean_mr(mr);
+
if (umem) {
ib_umem_release(umem);
- spin_lock(&dev->mr_lock);
- dev->mdev->priv.reg_pages -= npages;
- spin_unlock(&dev->mr_lock);
+ atomic_sub(npages, &dev->mdev->priv.reg_pages);
}
- if (!umred)
- kfree(mr);
-
return 0;
}
#include "mlx5_ib.h"
+struct workqueue_struct *mlx5_ib_page_fault_wq;
+
#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \
if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \
ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \
out:
return err;
}
+
+static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
+ u32 key)
+{
+ u32 base_key = mlx5_base_mkey(key);
+ struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+
+ if (!mmr || mmr->key != key)
+ return NULL;
+
+ return container_of(mmr, struct mlx5_ib_mr, mmr);
+}
+
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault,
+ int error) {
+ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+ int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+ pfault->mpfault.flags,
+ error);
+ if (ret)
+ pr_err("Failed to resolve the page fault on QP 0x%x\n",
+ qp->mqp.qpn);
+}
+
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault)
+{
+ u8 event_subtype = pfault->mpfault.event_subtype;
+
+ switch (event_subtype) {
+ default:
+ pr_warn("Invalid page fault event subtype: 0x%x\n",
+ event_subtype);
+ mlx5_ib_page_fault_resume(qp, pfault, 1);
+ break;
+ }
+}
+
+static void mlx5_ib_qp_pfault_action(struct work_struct *work)
+{
+ struct mlx5_ib_pfault *pfault = container_of(work,
+ struct mlx5_ib_pfault,
+ work);
+ enum mlx5_ib_pagefault_context context =
+ mlx5_ib_get_pagefault_context(&pfault->mpfault);
+ struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
+ pagefaults[context]);
+ mlx5_ib_mr_pfault_handler(qp, pfault);
+}
+
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+ qp->disable_page_faults = 1;
+ spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+
+ /*
+ * Note that at this point, we are guarenteed that no more
+ * work queue elements will be posted to the work queue with
+ * the QP we are closing.
+ */
+ flush_workqueue(mlx5_ib_page_fault_wq);
+}
+
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+ qp->disable_page_faults = 0;
+ spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+}
+
+static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
+ struct mlx5_pagefault *pfault)
+{
+ /*
+ * Note that we will only get one fault event per QP per context
+ * (responder/initiator, read/write), until we resolve the page fault
+ * with the mlx5_ib_page_fault_resume command. Since this function is
+ * called from within the work element, there is no risk of missing
+ * events.
+ */
+ struct mlx5_ib_qp *mibqp = to_mibqp(qp);
+ enum mlx5_ib_pagefault_context context =
+ mlx5_ib_get_pagefault_context(pfault);
+ struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
+
+ qp_pfault->mpfault = *pfault;
+
+ /* No need to stop interrupts here since we are in an interrupt */
+ spin_lock(&mibqp->disable_page_faults_lock);
+ if (!mibqp->disable_page_faults)
+ queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
+ spin_unlock(&mibqp->disable_page_faults_lock);
+}
+
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
+{
+ int i;
+
+ qp->disable_page_faults = 1;
+ spin_lock_init(&qp->disable_page_faults_lock);
+
+ qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
+
+ for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
+ INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+{
+ int ret;
+
+ ret = init_srcu_struct(&ibdev->mr_srcu);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+{
+ cleanup_srcu_struct(&ibdev->mr_srcu);
+}
+
+int __init mlx5_ib_odp_init(void)
+{
+ mlx5_ib_page_fault_wq =
+ create_singlethread_workqueue("mlx5_ib_page_faults");
+ if (!mlx5_ib_page_fault_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx5_ib_odp_cleanup(void)
+{
+ destroy_workqueue(mlx5_ib_page_fault_wq);
+}
int inlen = sizeof(*in);
int err;
+ mlx5_ib_odp_create_qp(qp);
+
gen = &dev->mdev->caps.gen;
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
in = kzalloc(sizeof(*in), GFP_KERNEL);
if (!in)
return;
- if (qp->state != IB_QPS_RESET)
+ if (qp->state != IB_QPS_RESET) {
+ mlx5_ib_qp_disable_pagefaults(qp);
if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
qp->mqp.qpn);
+ }
get_cqs(qp, &send_cq, &recv_cq);
if (mlx5_st < 0)
goto out;
+ /* If moving to a reset or error state, we must disable page faults on
+ * this QP and flush all current page faults. Otherwise a stale page
+ * fault may attempt to work on this QP after it is reset and moved
+ * again to RTS, and may cause the driver and the device to get out of
+ * sync. */
+ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ mlx5_ib_qp_disable_pagefaults(qp);
+
optpar = ib_mask_to_mlx5_opt(attr_mask);
optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
in->optparam = cpu_to_be32(optpar);
if (err)
goto out;
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ mlx5_ib_qp_enable_pagefaults(qp);
+
qp->state = new_state;
if (attr_mask & IB_QP_ACCESS_FLAGS)
int mlx5_state;
int err = 0;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ /*
+ * Wait for any outstanding page faults, in case the user frees memory
+ * based upon this query's result.
+ */
+ flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
+
mutex_lock(&qp->mutex);
outb = kzalloc(sizeof(*outb), GFP_KERNEL);
if (!outb) {
struct workqueue_struct *pg_wq;
struct rb_root page_root;
int fw_pages;
- int reg_pages;
+ atomic_t reg_pages;
struct list_head free_list;
struct mlx5_core_health health;