* If detach fails when hw is down, we don't care.
*/
WARN_ON(cxl_ops->detach_process(ctx) &&
- cxl_ops->link_ok(ctx->afu->adapter));
+ cxl_ops->link_ok(ctx->afu->adapter, ctx->afu));
flush_work(&ctx->fault_work); /* Only needed for dedicated process */
/* release the reference to the group leader and mm handling pid */
phys_addr_t p2n_phys;
u64 p2n_size;
int max_ints;
+ struct mutex recovery_lock;
+ int previous_state;
};
struct cxl_afu {
__be32 software_state;
} __packed;
-static inline bool cxl_adapter_link_ok(struct cxl *cxl)
+static inline bool cxl_adapter_link_ok(struct cxl *cxl, struct cxl_afu *afu)
{
struct pci_dev *pdev;
static inline void cxl_p1_write(struct cxl *cxl, cxl_p1_reg_t reg, u64 val)
{
- if (likely(cxl_adapter_link_ok(cxl)))
+ if (likely(cxl_adapter_link_ok(cxl, NULL))) /* only the adapter is known here, so NULL is passed as the AFU; the write is silently skipped when the check fails */
out_be64(_cxl_p1_addr(cxl, reg), val);
}
static inline u64 cxl_p1_read(struct cxl *cxl, cxl_p1_reg_t reg)
{
- if (likely(cxl_adapter_link_ok(cxl)))
+ if (likely(cxl_adapter_link_ok(cxl, NULL)))
return in_be64(_cxl_p1_addr(cxl, reg));
else
return ~0ULL;
static inline void cxl_p1n_write(struct cxl_afu *afu, cxl_p1n_reg_t reg, u64 val)
{
- if (likely(cxl_adapter_link_ok(afu->adapter)))
+ if (likely(cxl_adapter_link_ok(afu->adapter, afu))) /* check is given both the adapter and this AFU; the write is silently skipped when it fails */
out_be64(_cxl_p1n_addr(afu, reg), val);
}
static inline u64 cxl_p1n_read(struct cxl_afu *afu, cxl_p1n_reg_t reg)
{
- if (likely(cxl_adapter_link_ok(afu->adapter)))
+ if (likely(cxl_adapter_link_ok(afu->adapter, afu)))
return in_be64(_cxl_p1n_addr(afu, reg));
else
return ~0ULL;
static inline void cxl_p2n_write(struct cxl_afu *afu, cxl_p2n_reg_t reg, u64 val)
{
- if (likely(cxl_adapter_link_ok(afu->adapter)))
+ if (likely(cxl_adapter_link_ok(afu->adapter, afu))) /* check is given both the adapter and this AFU; the write is silently skipped when it fails */
out_be64(_cxl_p2n_addr(afu, reg), val);
}
static inline u64 cxl_p2n_read(struct cxl_afu *afu, cxl_p2n_reg_t reg)
{
- if (likely(cxl_adapter_link_ok(afu->adapter)))
+ if (likely(cxl_adapter_link_ok(afu->adapter, afu)))
return in_be64(_cxl_p2n_addr(afu, reg));
else
return ~0ULL;
u64 wed, u64 amr);
int (*detach_process)(struct cxl_context *ctx);
bool (*support_attributes)(const char *attr_name, enum cxl_attrs type);
- bool (*link_ok)(struct cxl *cxl);
+ bool (*link_ok)(struct cxl *cxl, struct cxl_afu *afu);
void (*release_afu)(struct device *dev);
ssize_t (*afu_read_err_buffer)(struct cxl_afu *afu, char *buf,
loff_t off, size_t count);
if (!afu->current_mode)
goto err_put_afu;
- if (!cxl_ops->link_ok(adapter)) {
+ if (!cxl_ops->link_ok(adapter, afu)) {
rc = -EIO;
goto err_put_afu;
}
if (ctx->status == CLOSED)
return -EIO;
- if (!cxl_ops->link_ok(ctx->afu->adapter))
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
return -EIO;
pr_devel("afu_ioctl\n");
if (ctx->status != STARTED)
return -EIO;
- if (!cxl_ops->link_ok(ctx->afu->adapter))
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
return -EIO;
return cxl_context_iomap(ctx, vm);
int rc;
DEFINE_WAIT(wait);
- if (!cxl_ops->link_ok(ctx->afu->adapter))
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
return -EIO;
if (count < CXL_READ_MIN_SIZE)
if (ctx_event_pending(ctx))
break;
- if (!cxl_ops->link_ok(ctx->afu->adapter)) {
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) {
rc = -EIO;
goto out;
}
#include "hcalls.h"
#include "trace.h"
+#define CXL_ERROR_DETECTED_EVENT 1 /* selects err_handler->error_detected() */
+#define CXL_SLOT_RESET_EVENT 2 /* selects err_handler->slot_reset() */
+#define CXL_RESUME_EVENT 3 /* selects err_handler->resume() */
+/*
+ * Relay a bus error event to the PCI drivers bound to the devices listed on
+ * afu->phb->bus. Devices without a driver are skipped, and a driver is only
+ * called if it provides an err_handler with the callback matching the event.
+ *
+ * @afu: AFU whose phb bus is walked; nothing is done if afu->phb is NULL
+ * @bus_error_event: one of the CXL_*_EVENT values defined just above; any
+ *                   other value results in no action for any device
+ * @state: for the error-detected and slot-reset events, stored in each
+ *         device's error_state (and, for error-detected, also passed to
+ *         the callback); not used for the resume event
+ *
+ * The values returned by the driver callbacks are discarded.
+ */
+static void pci_error_handlers(struct cxl_afu *afu,
+ int bus_error_event,
+ pci_channel_state_t state)
+{
+ struct pci_dev *afu_dev;
+
+ if (afu->phb == NULL)
+ return;
+
+ list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
+ if (!afu_dev->driver)
+ continue; /* no driver bound to this device: nothing to call */
+
+ switch (bus_error_event) {
+ case CXL_ERROR_DETECTED_EVENT:
+ afu_dev->error_state = state; /* updated even if the driver has no callback */
+
+ if (afu_dev->driver->err_handler &&
+ afu_dev->driver->err_handler->error_detected)
+ afu_dev->driver->err_handler->error_detected(afu_dev, state);
+ break;
+ case CXL_SLOT_RESET_EVENT:
+ afu_dev->error_state = state; /* updated even if the driver has no callback */
+
+ if (afu_dev->driver->err_handler &&
+ afu_dev->driver->err_handler->slot_reset)
+ afu_dev->driver->err_handler->slot_reset(afu_dev);
+ break;
+ case CXL_RESUME_EVENT: /* error_state is left untouched for this event */
+ if (afu_dev->driver->err_handler &&
+ afu_dev->driver->err_handler->resume)
+ afu_dev->driver->err_handler->resume(afu_dev);
+ break;
+ }
+ }
+}
static irqreturn_t guest_handle_psl_slice_error(struct cxl_context *ctx, u64 dsisr,
u64 errstat)
return rc;
}
+static int afu_read_error_state(struct cxl_afu *afu, int *state_out)
+{
+ u64 state;
+ int rc = 0;
+ /*
+ * Query the error state of @afu with cxl_h_read_error_state(), using
+ * afu->guest->handle.
+ *
+ * Returns the code returned by that call. When it is 0, the low 32
+ * bits of the reported state are stored in *state_out. Otherwise
+ * *state_out is left untouched, so callers must check the return
+ * value before reading it.
+ */
+ rc = cxl_h_read_error_state(afu->guest->handle, &state);
+ if (!rc) { /* a state outside the four known values triggers a warning but is still reported */
+ WARN_ON(state != H_STATE_NORMAL &&
+ state != H_STATE_DISABLE &&
+ state != H_STATE_TEMP_UNAVAILABLE &&
+ state != H_STATE_PERM_UNAVAILABLE);
+ *state_out = state & 0xffffffff; /* keep only the low 32 bits of the u64 */
+ }
+ return rc;
+}
+
static irqreturn_t guest_slice_irq_err(int irq, void *data)
{
struct cxl_afu *afu = data;
static int guest_reset(struct cxl *adapter)
{
- int rc;
+ struct cxl_afu *afu = NULL;
+ int i, rc;
pr_devel("Adapter reset request\n");
+ for (i = 0; i < adapter->slices; i++) { /* before the reset: for every non-NULL AFU, notify drivers (io_frozen) and detach all contexts */
+ if ((afu = adapter->afu[i])) {
+ pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
+ pci_channel_io_frozen);
+ cxl_context_detach_all(afu);
+ }
+ }
+ /*
+ * Issue the adapter reset. The loop that follows only notifies the
+ * drivers (slot reset, then resume) when the reset call returned 0;
+ * on failure it does nothing and rc is returned to the caller as is.
+ */
rc = cxl_h_reset_adapter(adapter->guest->handle);
+ for (i = 0; i < adapter->slices; i++) {
+ if (!rc && (afu = adapter->afu[i])) {
+ pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
+ pci_channel_io_normal);
+ pci_error_handlers(afu, CXL_RESUME_EVENT, 0); /* the state argument is not used for the resume event */
+ }
+ }
return rc;
}
pr_devel("in %s\n", __func__);
trace_cxl_detach(ctx);
- if (!cxl_ops->link_ok(ctx->afu->adapter))
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
return -EIO;
if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
iounmap(afu->p2n_mmio);
}
-static bool guest_link_ok(struct cxl *cxl)
+static int afu_update_state(struct cxl_afu *afu)
{
+ int rc, cur_state;
+ /*
+ * Read the current error state of @afu and react if it differs from
+ * the one recorded in afu->guest->previous_state.
+ *
+ * Returns:
+ *   0        the state equals the recorded one (nothing is done), or
+ *            the AFU is not in H_STATE_NORMAL once the change has been
+ *            handled
+ *   1        the state changed to H_STATE_NORMAL, or the AFU was in
+ *            H_STATE_DISABLE and reads H_STATE_NORMAL after the reset
+ *   -EINVAL  the state is none of the four handled values
+ *   other    non-zero code from afu_read_error_state() when a read fails
+ */
+ rc = afu_read_error_state(afu, &cur_state);
+ if (rc)
+ return rc;
+
+ if (afu->guest->previous_state == cur_state)
+ return 0;
+
+ pr_devel("AFU(%d) update state to %#x\n", afu->slice, cur_state);
+
+ switch (cur_state) {
+ case H_STATE_NORMAL:
+ afu->guest->previous_state = cur_state;
+ rc = 1; /* report the transition to the normal state */
+ break;
+
+ case H_STATE_DISABLE: /* notify drivers, detach all contexts, reset the AFU, then re-check */
+ pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
+ pci_channel_io_frozen);
+
+ cxl_context_detach_all(afu);
+ if ((rc = cxl_ops->afu_reset(afu)))
+ pr_devel("reset hcall failed %d\n", rc); /* only logged: rc is overwritten by the re-read below */
+
+ rc = afu_read_error_state(afu, &cur_state);
+ if (!rc && cur_state == H_STATE_NORMAL) {
+ pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
+ pci_channel_io_normal);
+ pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
+ rc = 1;
+ }
+ afu->guest->previous_state = 0; /* NOTE(review): cleared rather than set to cur_state, whatever the outcome; presumably so the next call re-evaluates the state - confirm intent */
+ break;
+
+ case H_STATE_TEMP_UNAVAILABLE:
+ afu->guest->previous_state = cur_state; /* recorded only; rc is still 0 from the successful read */
+ break;
+
+ case H_STATE_PERM_UNAVAILABLE:
+ dev_err(&afu->dev, "AFU is in permanent error state\n");
+ pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
+ pci_channel_io_perm_failure);
+ afu->guest->previous_state = cur_state; /* rc is still 0 from the successful read */
+ break;
+
+ default:
+ pr_err("Unexpected AFU(%d) error state: %#x\n",
+ afu->slice, cur_state);
+ return -EINVAL; /* previous_state is not updated on this path */
+ }
+
+ return rc;
+}
+
+static int afu_do_recovery(struct cxl_afu *afu)
+{
+ int rc;
+
+ /*
+ * Many threads can arrive here, in case of detach_all for example.
+ * Only one needs to drive the recovery: the thread that obtains
+ * afu->guest->recovery_lock with mutex_trylock() runs
+ * afu_update_state() under the lock and returns its result. Any
+ * thread that fails to get the lock does not wait and returns 0.
+ */
+ if (mutex_trylock(&afu->guest->recovery_lock)) {
+ rc = afu_update_state(afu);
+ mutex_unlock(&afu->guest->recovery_lock);
+ return rc;
+ }
+ return 0;
+}
+
+static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu)
+{
+ int state;
+ /*
+ * Tell whether @afu is currently in H_STATE_NORMAL, attempting a
+ * recovery first if it is not.
+ *
+ * With a NULL @afu nothing is checked and true is returned; @cxl is
+ * not examined by this function. Otherwise the AFU error state is
+ * read: H_STATE_NORMAL yields true. A failed read or any other state
+ * leads to afu_do_recovery(), and true is returned only if that call
+ * returns a positive value and a second read reports H_STATE_NORMAL.
+ */
+ if (afu) {
+ if (afu_read_error_state(afu, &state) ||
+ state != H_STATE_NORMAL) { /* state is only compared when the read succeeded */
+ if (afu_do_recovery(afu) > 0) {
+ /* check again in case we've just fixed it */
+ if (!afu_read_error_state(afu, &state) &&
+ state == H_STATE_NORMAL)
+ return true;
+ }
+ return false;
+ }
+ }
+
return true;
}
return -ENOMEM;
}
+ mutex_init(&afu->guest->recovery_lock);
+
if ((rc = dev_set_name(&afu->dev, "afu%i.%i",
adapter->adapter_num,
slice)))
/* If the adapter has gone down, we can assume that we
* will PERST it and that will invalidate everything.
*/
- if (!cxl_ops->link_ok(afu->adapter))
+ if (!cxl_ops->link_ok(afu->adapter, afu))
return -EIO;
cpu_relax();
}
goto out;
}
- if (!cxl_ops->link_ok(afu->adapter)) {
+ if (!cxl_ops->link_ok(afu->adapter, afu)) {
afu->enabled = enabled;
rc = -EIO;
goto out;
static int native_afu_check_and_enable(struct cxl_afu *afu)
{
- if (!cxl_ops->link_ok(afu->adapter)) {
+ if (!cxl_ops->link_ok(afu->adapter, afu)) {
WARN(1, "Refusing to enable afu while link down!\n");
return -EIO;
}
pr_devel("PSL purge request\n");
- if (!cxl_ops->link_ok(afu->adapter)) {
+ if (!cxl_ops->link_ok(afu->adapter, afu)) {
dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
rc = -EIO;
goto out;
rc = -EBUSY;
goto out;
}
- if (!cxl_ops->link_ok(afu->adapter)) {
+ if (!cxl_ops->link_ok(afu->adapter, afu)) {
rc = -EIO;
goto out;
}
dev_warn(&adapter->dev, "WARNING: CXL adapter wide TLBIA timed out!\n");
return -EBUSY;
}
- if (!cxl_ops->link_ok(adapter))
+ if (!cxl_ops->link_ok(adapter, NULL))
return -EIO;
cpu_relax();
}
dev_warn(&adapter->dev, "WARNING: CXL adapter wide SLBIA timed out!\n");
return -EBUSY;
}
- if (!cxl_ops->link_ok(adapter))
+ if (!cxl_ops->link_ok(adapter, NULL))
return -EIO;
cpu_relax();
}
cxl_p1_write(adapter, CXL_PSL_SLBIA, CXL_TLB_SLB_IQ_LPIDPID);
while (1) {
- if (!cxl_ops->link_ok(adapter))
+ if (!cxl_ops->link_ok(adapter, NULL))
break;
slbia = cxl_p1_read(adapter, CXL_PSL_SLBIA);
if (!(slbia & CXL_TLB_SLB_P))
rc = -EBUSY;
goto out;
}
- if (!cxl_ops->link_ok(ctx->afu->adapter)) {
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) {
dev_warn(&ctx->afu->dev, "WARNING: Device link down, aborting Process Element Command!\n");
rc = -EIO;
goto out;
* should always succeed: it's not running if the hw has gone
* away and is being reset.
*/
- if (cxl_ops->link_ok(ctx->afu->adapter))
+ if (cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_TERMINATE,
CXL_PE_SOFTWARE_STATE_V | CXL_PE_SOFTWARE_STATE_T);
ctx->elem->software_state = 0; /* Remove Valid bit */
/* We could be asked to remove when the hw is down. Again, if
* the hw is down, the PE is gone, so we succeed.
*/
- if (cxl_ops->link_ok(ctx->afu->adapter))
+ if (cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_REMOVE, 0);
if (!rc)
if (!(mode & afu->modes_supported))
return -EINVAL;
- if (!cxl_ops->link_ok(afu->adapter)) {
+ if (!cxl_ops->link_ok(afu->adapter, afu)) {
WARN(1, "Device link is down, refusing to activate!\n");
return -EIO;
}
static int native_attach_process(struct cxl_context *ctx, bool kernel,
u64 wed, u64 amr)
{
- if (!cxl_ops->link_ok(ctx->afu->adapter)) {
+ if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) {
WARN(1, "Device link is down, refusing to attach process!\n");
return -EIO;
}
/* If the adapter has gone away, we can't get any meaningful
* information.
*/
- if (!cxl_ops->link_ok(afu->adapter))
+ if (!cxl_ops->link_ok(afu->adapter, afu))
return -EIO;
info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
static int native_afu_cr_read64(struct cxl_afu *afu, int cr, u64 off, u64 *out)
{
- if (unlikely(!cxl_ops->link_ok(afu->adapter)))
+ if (unlikely(!cxl_ops->link_ok(afu->adapter, afu)))
return -EIO;
if (unlikely(off >= afu->crs_len))
return -ERANGE;
static int native_afu_cr_read32(struct cxl_afu *afu, int cr, u64 off, u32 *out)
{
- if (unlikely(!cxl_ops->link_ok(afu->adapter)))
+ if (unlikely(!cxl_ops->link_ok(afu->adapter, afu)))
return -EIO;
if (unlikely(off >= afu->crs_len))
return -ERANGE;
static int native_afu_cr_write32(struct cxl_afu *afu, int cr, u64 off, u32 in)
{
- if (unlikely(!cxl_ops->link_ok(afu->adapter)))
+ if (unlikely(!cxl_ops->link_ok(afu->adapter, afu)))
return -EIO;
if (unlikely(off >= afu->crs_len))
return -ERANGE;
phb = pci_bus_to_host(dev->bus);
afu = (struct cxl_afu *)phb->private_data;
- if (!cxl_ops->link_ok(afu->adapter)) {
+ if (!cxl_ops->link_ok(afu->adapter, afu)) {
dev_warn(&dev->dev, "%s: Device link is down, refusing to enable AFU\n", __func__);
return false;
}