scsi: ipr: Fix SATA EH hang
authorBrian King <brking@linux.vnet.ibm.com>
Wed, 15 Mar 2017 21:58:41 +0000 (16:58 -0500)
committerMartin K. Petersen <martin.petersen@oracle.com>
Thu, 23 Mar 2017 16:04:05 +0000 (12:04 -0400)
This patch fixes a hang that can occur in ATA EH with ipr. With ipr's
usage of libata, commands should never end up on ap->eh_done_q. The
timeout function we use for ipr, even for SATA devices, is
scsi_times_out, so ATA_QCFLAG_EH_SCHEDULED never gets set for ipr and EH
is driven completely by ipr and SCSI. The SCSI EH thread ends up calling
ipr's eh_device_reset_handler, which then calls
ata_std_error_handler. This ends up calling ipr_sata_reset, which issues
a reset to the device. This should result in all pending commands
getting failed back and having ata_qc_complete called for them, which
should end up clearing ATA_QCFLAG_FAILED as qc->flags gets zeroed in
ata_qc_free.  This ensures that when we end up in ata_eh_finish, we
don't do anything more with the command.

On adapters that only support a single interrupt and when running with
two MSI-X vectors or less, the adapter firmware guarantees that
responses to all outstanding commands are sent back prior to sending the
response to the SATA reset command.  On newer adapters supporting
multiple HRRQs, however, this can no longer be guaranteed, since the
command responses and reset response may be processed on different
HRRQs.

If ipr returns from ipr_sata_reset before the outstanding command was
returned, this sends us down the path of __ata_eh_qc_complete which then
moves the associated scsi_cmd from the work_q in
scsi_eh_bus_device_reset to ap->eh_done_q, which then will sit there
forever and we will be wedged.

This patch fixes this up by ensuring that any outstanding commands are
flushed before returning from eh_device_reset_handler for a SATA device.

Reported-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Reviewed-by: Wendy Xiong <wenxiong@linux.vnet.ibm.com>
Tested-by: Wendy Xiong <wenxiong@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/ipr.c

index 95f1e6457664929b7edb680eea970c748712250a..b3d1e4426ecc6b27b8d0b5c08c69301f2b7f0eda 100644 (file)
@@ -5066,6 +5066,23 @@ static bool ipr_cmnd_is_free(struct ipr_cmnd *ipr_cmd)
        return false;
 }
 
+/**
+ * ipr_match_res - Match function for specified resource entry
+ * @ipr_cmd:   ipr command struct
+ * @resource:  resource entry to match
+ *
+ * Returns:
+ *     1 if command matches sdev / 0 if command does not match sdev
+ **/
+static int ipr_match_res(struct ipr_cmnd *ipr_cmd, void *resource)
+{
+       struct ipr_resource_entry *res = resource;
+
+       if (res && ipr_cmd->ioarcb.res_handle == res->res_handle)
+               return 1;
+       return 0;
+}
+
 /**
  * ipr_wait_for_ops - Wait for matching commands to complete
  * @ipr_cmd:   ipr command struct
@@ -5246,7 +5263,7 @@ static int ipr_sata_reset(struct ata_link *link, unsigned int *classes,
        struct ipr_ioa_cfg *ioa_cfg = sata_port->ioa_cfg;
        struct ipr_resource_entry *res;
        unsigned long lock_flags = 0;
-       int rc = -ENXIO;
+       int rc = -ENXIO, ret;
 
        ENTER;
        spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags);
@@ -5260,9 +5277,19 @@ static int ipr_sata_reset(struct ata_link *link, unsigned int *classes,
        if (res) {
                rc = ipr_device_reset(ioa_cfg, res);
                *classes = res->ata_class;
-       }
+               spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
+
+               ret = ipr_wait_for_ops(ioa_cfg, res, ipr_match_res);
+               if (ret != SUCCESS) {
+                       spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags);
+                       ipr_initiate_ioa_reset(ioa_cfg, IPR_SHUTDOWN_ABBREV);
+                       spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
+
+                       wait_event(ioa_cfg->reset_wait_q, !ioa_cfg->in_reset_reload);
+               }
+       } else
+               spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
 
-       spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
        LEAVE;
        return rc;
 }
@@ -5291,9 +5318,6 @@ static int __ipr_eh_dev_reset(struct scsi_cmnd *scsi_cmd)
        ioa_cfg = (struct ipr_ioa_cfg *) scsi_cmd->device->host->hostdata;
        res = scsi_cmd->device->hostdata;
 
-       if (!res)
-               return FAILED;
-
        /*
         * If we are currently going through reset/reload, return failed. This will force the
         * mid-layer to call ipr_eh_host_reset, which will then go to sleep and wait for the
@@ -5332,19 +5356,6 @@ static int __ipr_eh_dev_reset(struct scsi_cmnd *scsi_cmd)
                spin_unlock_irq(scsi_cmd->device->host->host_lock);
                ata_std_error_handler(ap);
                spin_lock_irq(scsi_cmd->device->host->host_lock);
-
-               for_each_hrrq(hrrq, ioa_cfg) {
-                       spin_lock(&hrrq->_lock);
-                       list_for_each_entry(ipr_cmd,
-                                           &hrrq->hrrq_pending_q, queue) {
-                               if (ipr_cmd->ioarcb.res_handle ==
-                                   res->res_handle) {
-                                       rc = -EIO;
-                                       break;
-                               }
-                       }
-                       spin_unlock(&hrrq->_lock);
-               }
        } else
                rc = ipr_device_reset(ioa_cfg, res);
        res->resetting_device = 0;
@@ -5358,15 +5369,24 @@ static int ipr_eh_dev_reset(struct scsi_cmnd *cmd)
 {
        int rc;
        struct ipr_ioa_cfg *ioa_cfg;
+       struct ipr_resource_entry *res;
 
        ioa_cfg = (struct ipr_ioa_cfg *) cmd->device->host->hostdata;
+       res = cmd->device->hostdata;
+
+       if (!res)
+               return FAILED;
 
        spin_lock_irq(cmd->device->host->host_lock);
        rc = __ipr_eh_dev_reset(cmd);
        spin_unlock_irq(cmd->device->host->host_lock);
 
-       if (rc == SUCCESS)
-               rc = ipr_wait_for_ops(ioa_cfg, cmd->device, ipr_match_lun);
+       if (rc == SUCCESS) {
+               if (ipr_is_gata(res) && res->sata_port)
+                       rc = ipr_wait_for_ops(ioa_cfg, res, ipr_match_res);
+               else
+                       rc = ipr_wait_for_ops(ioa_cfg, cmd->device, ipr_match_lun);
+       }
 
        return rc;
 }