cd->class_genwqe = class_genwqe;
cd->debugfs_genwqe = debugfs_genwqe;
+ /*
+ * This comes from kernel config option and can be overritten via
+ * debugfs.
+ */
+ cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY;
+
init_waitqueue_head(&cd->queue_waitq);
spin_lock_init(&cd->file_lock);
return 0;
fatal_error:
+ if (cd->use_platform_recovery) {
+ /*
+ * Since we use raw accessors, EEH errors won't be detected
+ * by the platform until we do a non-raw MMIO or config space
+ * read
+ */
+ readq(cd->mmio + IO_SLC_CFGREG_GFIR);
+
+ /* We do nothing if the card is going over PCI recovery */
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+ }
+
dev_err(&pci_dev->dev,
"[%s] card unusable. Please trigger unbind!\n", __func__);
pci_set_master(pci_dev);
pci_enable_pcie_error_reporting(pci_dev);
+ /* EEH recovery requires PCIe fundamental reset */
+ pci_dev->needs_freset = 1;
+
/* request complete BAR-0 space (length = 0) */
cd->mmio_len = pci_resource_len(pci_dev, 0);
cd->mmio = pci_iomap(pci_dev, 0, 0);
dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state);
- if (pci_dev == NULL)
- return PCI_ERS_RESULT_NEED_RESET;
-
cd = dev_get_drvdata(&pci_dev->dev);
if (cd == NULL)
- return PCI_ERS_RESULT_NEED_RESET;
+ return PCI_ERS_RESULT_DISCONNECT;
- switch (state) {
- case pci_channel_io_normal:
- return PCI_ERS_RESULT_CAN_RECOVER;
- case pci_channel_io_frozen:
- return PCI_ERS_RESULT_NEED_RESET;
- case pci_channel_io_perm_failure:
+ /* Stop the card */
+ genwqe_health_check_stop(cd);
+ genwqe_stop(cd);
+
+ /*
+ * On permanent failure, the PCI code will call device remove
+ * after the return of this function.
+ * genwqe_stop() can be called twice.
+ */
+ if (state == pci_channel_io_perm_failure) {
return PCI_ERS_RESULT_DISCONNECT;
+ } else {
+ genwqe_pci_remove(cd);
+ return PCI_ERS_RESULT_NEED_RESET;
}
+}
+
+static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev)
+{
+ int rc;
+ struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
- return PCI_ERS_RESULT_NEED_RESET;
+ rc = genwqe_pci_setup(cd);
+ if (!rc) {
+ return PCI_ERS_RESULT_RECOVERED;
+ } else {
+ dev_err(&pci_dev->dev,
+ "err: problems with PCI setup (err=%d)\n", rc);
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
}
static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
return PCI_ERS_RESULT_NONE;
}
-static void genwqe_err_resume(struct pci_dev *dev)
+static void genwqe_err_resume(struct pci_dev *pci_dev)
{
+ int rc;
+ struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
+
+ rc = genwqe_start(cd);
+ if (!rc) {
+ rc = genwqe_health_check_start(cd);
+ if (rc)
+ dev_err(&pci_dev->dev,
+ "err: cannot start health checking! (err=%d)\n",
+ rc);
+ } else {
+ dev_err(&pci_dev->dev,
+ "err: cannot start card services! (err=%d)\n", rc);
+ }
}
static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
.error_detected = genwqe_err_error_detected,
.mmio_enabled = genwqe_err_result_none,
.link_reset = genwqe_err_result_none,
- .slot_reset = genwqe_err_result_none,
+ .slot_reset = genwqe_err_slot_reset,
.resume = genwqe_err_resume,
};
* safer, but slower for the good-case ... See above.
*/
gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
- if ((gfir & GFIR_ERR_TRIGGER) != 0x0) {
+ if (((gfir & GFIR_ERR_TRIGGER) != 0x0) &&
+ !pci_channel_offline(pci_dev)) {
+
+ if (cd->use_platform_recovery) {
+ /*
+ * Since we use raw accessors, EEH errors won't be
+ * detected by the platform until we do a non-raw
+ * MMIO or config space read
+ */
+ readq(cd->mmio + IO_SLC_CFGREG_GFIR);
+
+ /* Don't do anything if the PCI channel is frozen */
+ if (pci_channel_offline(pci_dev))
+ goto exit;
+ }
wake_up_interruptible(&cd->health_waitq);
* By default GFIRs causes recovery actions. This
* count is just for debug when recovery is masked.
*/
- printk_ratelimited(KERN_ERR
- "%s %s: [%s] GFIR=%016llx\n",
- GENWQE_DEVNAME, dev_name(&pci_dev->dev),
- __func__, gfir);
+ dev_err_ratelimited(&pci_dev->dev,
+ "[%s] GFIR=%016llx\n",
+ __func__, gfir);
}
+ exit:
return IRQ_HANDLED;
}
*/
int __genwqe_writeq(struct genwqe_dev *cd, u64 byte_offs, u64 val)
{
+ struct pci_dev *pci_dev = cd->pci_dev;
+
if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE)
return -EIO;
if (cd->mmio == NULL)
return -EIO;
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+
__raw_writeq((__force u64)cpu_to_be64(val), cd->mmio + byte_offs);
return 0;
}
*/
int __genwqe_writel(struct genwqe_dev *cd, u64 byte_offs, u32 val)
{
+ struct pci_dev *pci_dev = cd->pci_dev;
+
if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE)
return -EIO;
if (cd->mmio == NULL)
return -EIO;
+ if (pci_channel_offline(pci_dev))
+ return -EIO;
+
__raw_writel((__force u32)cpu_to_be32(val), cd->mmio + byte_offs);
return 0;
}