[SCSI] mpt2sas: driver fails to recover from injected PCIe bus errors
author: Eric Moore <eric.moore@lsi.com>
Thu, 8 Jul 2010 20:44:34 +0000 (14:44 -0600)
committer: James Bottomley <James.Bottomley@suse.de>
Tue, 27 Jul 2010 17:04:03 +0000 (12:04 -0500)
fixes surrounding PCIe enhanced error handling:

(1) We need to reject all requests generated internally inside the driver as well
as requests arriving from the scsi mid layer when PCIe EEH is active. The fix is
to add a per adapter flag called pci_error_recovery which is checked throughout
the driver when requests are generated.

(2) We don't need to call the pci_driver->remove directly from the PCIe
callbacks because it's already called from the PCIe EEH code. In its place we are
shutting down the watchdog timer, and flushing back all pending IO.

(3) We need to save and restore the pci state across PCIe EEH handling.

Signed-off-by: Eric Moore <eric.moore@lsi.com>
Signed-off-by: James Bottomley <James.Bottomley@suse.de>
drivers/scsi/mpt2sas/mpt2sas_base.c
drivers/scsi/mpt2sas/mpt2sas_base.h
drivers/scsi/mpt2sas/mpt2sas_config.c
drivers/scsi/mpt2sas/mpt2sas_ctl.c
drivers/scsi/mpt2sas/mpt2sas_scsih.c
drivers/scsi/mpt2sas/mpt2sas_transport.c

index 1f22a764927acacd3a352558e93ba94a5cf48901..57bcd5c9dcfffacc3df6ada1c7566ccb6d825b77 100644 (file)
@@ -1311,6 +1311,9 @@ mpt2sas_base_map_resources(struct MPT2SAS_ADAPTER *ioc)
        printk(MPT2SAS_INFO_FMT "ioport(0x%016llx), size(%d)\n",
            ioc->name, (unsigned long long)pio_chip, pio_sz);
 
+       /* Save PCI configuration state for recovery from PCI AER/EEH errors */
+       pci_save_state(pdev);
+
        return 0;
 
  out_fail:
@@ -3407,6 +3410,9 @@ _base_make_ioc_ready(struct MPT2SAS_ADAPTER *ioc, int sleep_flag,
        dinitprintk(ioc, printk(MPT2SAS_INFO_FMT "%s\n", ioc->name,
            __func__));
 
+       if (ioc->pci_error_recovery)
+               return 0;
+
        ioc_state = mpt2sas_base_get_iocstate(ioc, 0);
        dhsprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: ioc_state(0x%08x)\n",
            ioc->name, __func__, ioc_state));
@@ -3869,6 +3875,13 @@ mpt2sas_base_hard_reset_handler(struct MPT2SAS_ADAPTER *ioc, int sleep_flag,
        dtmprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: enter\n", ioc->name,
            __func__));
 
+       if (ioc->pci_error_recovery) {
+               printk(MPT2SAS_ERR_FMT "%s: pci error recovery reset\n",
+                   ioc->name, __func__);
+               r = 0;
+               goto out;
+       }
+
        if (mpt2sas_fwfault_debug)
                mpt2sas_halt_firmware(ioc);
 
index 0b0ef5e7899a7752d5e5a988a8a374cba11284ee..0ebef0c0d949e2c05e854af6acb8b939bccd99c0 100644 (file)
@@ -477,6 +477,7 @@ typedef void (*MPT_ADD_SGE)(void *paddr, u32 flags_length, dma_addr_t dma_addr);
  * @ioc_link_reset_in_progress: phy/hard reset in progress
  * @ignore_loginfos: ignore loginfos during task managment
  * @remove_host: flag for when driver unloads, to avoid sending dev resets
+ * @pci_error_recovery: flag to prevent ioc access until slot reset completes
  * @wait_for_port_enable_to_complete:
  * @msix_enable: flag indicating msix is enabled
  * @msix_vector_count: number msix vectors
@@ -617,6 +618,7 @@ struct MPT2SAS_ADAPTER {
 
        u8              ignore_loginfos;
        u8              remove_host;
+       u8              pci_error_recovery;
        u8              wait_for_port_enable_to_complete;
 
        u8              msix_enable;
index e26f9206a521f93441553ce895b8296a03dfd539..6afd67b324fe44ee602dd3ec585db2e0cce2d790 100644 (file)
@@ -401,7 +401,7 @@ _config_request(struct MPT2SAS_ADAPTER *ioc, Mpi2ConfigRequest_t
                if (ioc->config_cmds.smid == smid)
                        mpt2sas_base_free_smid(ioc, smid);
                if ((ioc->shost_recovery) || (ioc->config_cmds.status &
-                   MPT2_CMD_RESET))
+                   MPT2_CMD_RESET) || ioc->pci_error_recovery)
                        goto retry_config;
                issue_host_reset = 1;
                r = -EFAULT;
index 55ac1cb34778e8c671a49b979a17b6d995b587cb..b774973f07658a3346fa10239846ee74fc3783cf 100644 (file)
@@ -2156,7 +2156,7 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg)
                    !ioc)
                        return -ENODEV;
 
-               if (ioc->shost_recovery)
+               if (ioc->shost_recovery || ioc->pci_error_recovery)
                        return -EAGAIN;
 
                if (_IOC_SIZE(cmd) == sizeof(struct mpt2_ioctl_command)) {
@@ -2275,7 +2275,7 @@ _ctl_compat_mpt_command(struct file *file, unsigned cmd, unsigned long arg)
        if (_ctl_verify_adapter(karg32.hdr.ioc_number, &ioc) == -1 || !ioc)
                return -ENODEV;
 
-       if (ioc->shost_recovery)
+       if (ioc->shost_recovery || ioc->pci_error_recovery)
                return -EAGAIN;
 
        memset(&karg, 0, sizeof(struct mpt2_ioctl_command));
index 854cc91e7aace142dda446cd7d8cffb70247df1d..6273abd0535e4c4a386a5cd195afc85a1e866782 100644 (file)
@@ -1997,7 +1997,8 @@ mpt2sas_scsih_issue_tm(struct MPT2SAS_ADAPTER *ioc, u16 handle, uint channel,
                goto err_out;
        }
 
-       if (ioc->shost_recovery || ioc->remove_host) {
+       if (ioc->shost_recovery || ioc->remove_host ||
+           ioc->pci_error_recovery) {
                printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
                    __func__, ioc->name);
                rc = FAILED;
@@ -2644,7 +2645,8 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        unsigned long flags;
        struct _tr_list *delayed_tr;
 
-       if (ioc->shost_recovery || ioc->remove_host) {
+       if (ioc->shost_recovery || ioc->remove_host ||
+           ioc->pci_error_recovery) {
                dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
                   "progress!\n", __func__, ioc->name));
                return;
@@ -2742,7 +2744,8 @@ _scsih_tm_tr_volume_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        u16 smid;
        struct _tr_list *delayed_tr;
 
-       if (ioc->shost_recovery || ioc->remove_host) {
+       if (ioc->shost_recovery || ioc->remove_host ||
+           ioc->pci_error_recovery) {
                dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
                   "progress!\n", __func__, ioc->name));
                return;
@@ -2793,7 +2796,8 @@ _scsih_tm_volume_tr_complete(struct MPT2SAS_ADAPTER *ioc, u16 smid,
        Mpi2SCSITaskManagementReply_t *mpi_reply =
            mpt2sas_base_get_reply_virt_addr(ioc, reply);
 
-       if (ioc->shost_recovery || ioc->remove_host) {
+       if (ioc->shost_recovery || ioc->remove_host ||
+           ioc->pci_error_recovery) {
                dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
                   "progress!\n", __func__, ioc->name));
                return 1;
@@ -2845,7 +2849,8 @@ _scsih_tm_tr_complete(struct MPT2SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
        Mpi2SasIoUnitControlRequest_t *mpi_request;
        u16 smid_sas_ctrl;
 
-       if (ioc->shost_recovery || ioc->remove_host) {
+       if (ioc->shost_recovery || ioc->remove_host ||
+           ioc->pci_error_recovery) {
                dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
                   "progress!\n", __func__, ioc->name));
                return 1;
@@ -3187,7 +3192,10 @@ _scsih_flush_running_cmds(struct MPT2SAS_ADAPTER *ioc)
                count++;
                mpt2sas_base_free_smid(ioc, smid);
                scsi_dma_unmap(scmd);
-               scmd->result = DID_RESET << 16;
+               if (ioc->pci_error_recovery)
+                       scmd->result = DID_NO_CONNECT << 16;
+               else
+                       scmd->result = DID_RESET << 16;
                scmd->scsi_done(scmd);
        }
        dtmprintk(ioc, printk(MPT2SAS_INFO_FMT "completing %d cmds\n",
@@ -3324,6 +3332,12 @@ _scsih_qcmd(struct scsi_cmnd *scmd, void (*done)(struct scsi_cmnd *))
                return 0;
        }
 
+       if (ioc->pci_error_recovery) {
+               scmd->result = DID_NO_CONNECT << 16;
+               scmd->scsi_done(scmd);
+               return 0;
+       }
+
        sas_target_priv_data = sas_device_priv_data->sas_target;
        /* invalid device handle */
        if (sas_target_priv_data->handle == MPT2SAS_INVALID_DEVICE_HANDLE) {
@@ -4156,7 +4170,7 @@ _scsih_expander_add(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        if (!handle)
                return -1;
 
-       if (ioc->shost_recovery)
+       if (ioc->shost_recovery || ioc->pci_error_recovery)
                return -1;
 
        if ((mpt2sas_config_get_expander_pg0(ioc, &mpi_reply, &expander_pg0,
@@ -4734,7 +4748,7 @@ _scsih_sas_topology_change_event(struct MPT2SAS_ADAPTER *ioc,
                _scsih_sas_topology_change_event_debug(ioc, event_data);
 #endif
 
-       if (ioc->shost_recovery || ioc->remove_host)
+       if (ioc->shost_recovery || ioc->remove_host || ioc->pci_error_recovery)
                return;
 
        if (!ioc->sas_hba.num_phys)
@@ -4773,7 +4787,8 @@ _scsih_sas_topology_change_event(struct MPT2SAS_ADAPTER *ioc,
                            "expander event\n", ioc->name));
                        return;
                }
-               if (ioc->shost_recovery || ioc->remove_host)
+               if (ioc->shost_recovery || ioc->remove_host ||
+                   ioc->pci_error_recovery)
                        return;
                phy_number = event_data->StartPhyNum + i;
                reason_code = event_data->PHY[i].PhyStatus &
@@ -6273,7 +6288,8 @@ _firmware_event_work(struct work_struct *work)
        struct MPT2SAS_ADAPTER *ioc = fw_event->ioc;
 
        /* the queue is being flushed so ignore this event */
-       if (ioc->remove_host || fw_event->cancel_pending_work) {
+       if (ioc->remove_host || fw_event->cancel_pending_work ||
+           ioc->pci_error_recovery) {
                _scsih_fw_event_free(ioc, fw_event);
                return;
        }
@@ -6355,7 +6371,7 @@ mpt2sas_scsih_event_callback(struct MPT2SAS_ADAPTER *ioc, u8 msix_index,
        u16 sz;
 
        /* events turned off due to host reset or driver unloading */
-       if (ioc->remove_host)
+       if (ioc->remove_host || ioc->pci_error_recovery)
                return 1;
 
        mpi_reply = mpt2sas_base_get_reply_virt_addr(ioc, reply);
@@ -7058,12 +7074,17 @@ _scsih_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
        case pci_channel_io_normal:
                return PCI_ERS_RESULT_CAN_RECOVER;
        case pci_channel_io_frozen:
+               /* Fatal error, prepare for slot reset */
+               ioc->pci_error_recovery = 1;
                scsi_block_requests(ioc->shost);
                mpt2sas_base_stop_watchdog(ioc);
                mpt2sas_base_free_resources(ioc);
                return PCI_ERS_RESULT_NEED_RESET;
        case pci_channel_io_perm_failure:
-               _scsih_remove(pdev);
+               /* Permanent error, prepare for device removal */
+               ioc->pci_error_recovery = 1;
+               mpt2sas_base_stop_watchdog(ioc);
+               _scsih_flush_running_cmds(ioc);
                return PCI_ERS_RESULT_DISCONNECT;
        }
        return PCI_ERS_RESULT_NEED_RESET;
@@ -7087,7 +7108,9 @@ _scsih_pci_slot_reset(struct pci_dev *pdev)
        printk(MPT2SAS_INFO_FMT "PCI error: slot reset callback!!\n",
                ioc->name);
 
+       ioc->pci_error_recovery = 0;
        ioc->pdev = pdev;
+       pci_restore_state(pdev);
        rc = mpt2sas_base_map_resources(ioc);
        if (rc)
                return PCI_ERS_RESULT_DISCONNECT;
index f29ea5e78bb34f72b4379a76d1a3853a4ad30b7d..b55c6dc07470cc65aed1a30979f0e61e86a16aec 100644 (file)
@@ -140,7 +140,7 @@ _transport_set_identify(struct MPT2SAS_ADAPTER *ioc, u16 handle,
        u32 device_info;
        u32 ioc_status;
 
-       if (ioc->shost_recovery) {
+       if (ioc->shost_recovery || ioc->pci_error_recovery) {
                printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
                    __func__, ioc->name);
                return -EFAULT;
@@ -302,7 +302,7 @@ _transport_expander_report_manufacture(struct MPT2SAS_ADAPTER *ioc,
        u64 *sas_address_le;
        u16 wait_state_count;
 
-       if (ioc->shost_recovery) {
+       if (ioc->shost_recovery || ioc->pci_error_recovery) {
                printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
                    __func__, ioc->name);
                return -EFAULT;
@@ -894,7 +894,7 @@ mpt2sas_transport_update_links(struct MPT2SAS_ADAPTER *ioc,
        struct _sas_node *sas_node;
        struct _sas_phy *mpt2sas_phy;
 
-       if (ioc->shost_recovery)
+       if (ioc->shost_recovery || ioc->pci_error_recovery)
                return;
 
        spin_lock_irqsave(&ioc->sas_node_lock, flags);
@@ -997,7 +997,7 @@ _transport_get_expander_phy_error_log(struct MPT2SAS_ADAPTER *ioc,
        u64 *sas_address_le;
        u16 wait_state_count;
 
-       if (ioc->shost_recovery) {
+       if (ioc->shost_recovery || ioc->pci_error_recovery) {
                printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
                    __func__, ioc->name);
                return -EFAULT;