scsi: scsi_error: count medium access timeout only once per EH run

author Hannes Reinecke <hare@suse.de>

Thu, 6 Apr 2017 13:36:29 +0000 (15:36 +0200)

committer Martin K. Petersen <martin.petersen@oracle.com>

Thu, 6 Apr 2017 17:07:32 +0000 (13:07 -0400)
author Hannes Reinecke <hare@suse.de>
Thu, 6 Apr 2017 13:36:29 +0000 (15:36 +0200)
committer Martin K. Petersen <martin.petersen@oracle.com>
Thu, 6 Apr 2017 17:07:32 +0000 (13:07 -0400)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c

index f2cafae150bcdb85292d86945cfb9d6e9fd7c340..370f6c045b60e32f6c20dae425dadd9b669789f8 100644 (file)
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -220,6 +220,23 @@ scsi_abort_command(struct scsi_cmnd *scmd)
         return SUCCESS;
  }
  
+/**
+ * scsi_eh_reset - call into ->eh_reset to reset internal counters
+ * @scmd:      scmd to run eh on.
+ *
+ * The scsi driver might be carrying internal state about the
+ * devices, so we need to call into the driver to reset the
+ * internal state once the error handler is started.
+ */
+static void scsi_eh_reset(struct scsi_cmnd *scmd)
+{
+       if (!blk_rq_is_passthrough(scmd->request)) {
+               struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd);
+               if (sdrv->eh_reset)
+                       sdrv->eh_reset(scmd);
+       }
+}
+
  /**
   * scsi_eh_scmd_add - add scsi cmd to error handling.
   * @scmd:      scmd to run eh on.
@@ -249,6 +266,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
         if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED)
                 eh_flag &= ~SCSI_EH_CANCEL_CMD;
         scmd->eh_eflags |= eh_flag;
+       scsi_eh_reset(scmd);
         list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
         shost->host_failed++;
         scsi_eh_wakeup(shost);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c

index d277e8620e3e39794584ac29dc55cc3ce476a03a..bd2a38ef46f51211ca237e7517652a74cf2ad686 100644 (file)
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -115,6 +115,7 @@ static void sd_rescan(struct device *);
  static int sd_init_command(struct scsi_cmnd *SCpnt);
  static void sd_uninit_command(struct scsi_cmnd *SCpnt);
  static int sd_done(struct scsi_cmnd *);
+static void sd_eh_reset(struct scsi_cmnd *);
  static int sd_eh_action(struct scsi_cmnd *, int);
  static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
  static void scsi_disk_release(struct device *cdev);
@@ -532,6 +533,7 @@ static struct scsi_driver sd_template = {
         .uninit_command         = sd_uninit_command,
         .done                   = sd_done,
         .eh_action              = sd_eh_action,
+       .eh_reset               = sd_eh_reset,
  };
  
  /*
@@ -1685,6 +1687,26 @@ static const struct block_device_operations sd_fops = {
         .pr_ops                 = &sd_pr_ops,
  };
  
+/**
+ *     sd_eh_reset - reset error handling callback
+ *     @scmd:          sd-issued command that has failed
+ *
+ *     This function is called by the SCSI midlayer before starting
+ *     SCSI EH. When counting medium access failures we have to be
+ *     careful to register it only once per device and SCSI EH run;
+ *     there might be several timed out commands which will cause the
+ *     'max_medium_access_timeouts' counter to trigger after the first
+ *     SCSI EH run already and set the device to offline.
+ *     So this function clears the gate flag before starting SCSI EH.
+ **/
+static void sd_eh_reset(struct scsi_cmnd *scmd)
+{
+       struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk);
+
+       /* New SCSI EH run, reset gate variable */
+       sdkp->ignore_medium_access_errors = false;
+}
+
  /**
   *     sd_eh_action - error handling callback
   *     @scmd:          sd-issued command that has failed
@@ -1714,7 +1736,10 @@ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
          * process of recovering or has it suffered an internal failure
          * that prevents access to the storage medium.
          */
-       sdkp->medium_access_timed_out++;
+       if (!sdkp->ignore_medium_access_errors) {
+               sdkp->medium_access_timed_out++;
+               sdkp->ignore_medium_access_errors = true;
+       }
  
         /*
          * If the device keeps failing read/write commands but TEST UNIT
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h

index 4dac35e96a75baf833acbfe017c3edef8f8f23b8..0cf9680cb4698f3541adc39446d8a091d576e525 100644 (file)
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -106,6 +106,7 @@ struct scsi_disk {
         unsigned        rc_basis: 2;
         unsigned        zoned: 2;
         unsigned        urswrz : 1;
+       unsigned        ignore_medium_access_errors : 1;
  };
  #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)
  
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h

index 891a658aa8673d03b5ad8ac848f3318662d39707..a5534ccad859d29069bff2b240ccfca4dd0c9567 100644 (file)
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -16,6 +16,7 @@ struct scsi_driver {
         void (*uninit_command)(struct scsi_cmnd *);
         int (*done)(struct scsi_cmnd *);
         int (*eh_action)(struct scsi_cmnd *, int);
+       void (*eh_reset)(struct scsi_cmnd *);
  };
  #define to_scsi_driver(drv) \
         container_of((drv), struct scsi_driver, gendrv)
author	Hannes Reinecke <hare@suse.de>
	Thu, 6 Apr 2017 13:36:29 +0000 (15:36 +0200)
committer	Martin K. Petersen <martin.petersen@oracle.com>
	Thu, 6 Apr 2017 17:07:32 +0000 (13:07 -0400)
drivers/scsi/scsi_error.c		patch \| blob \| blame \| history
drivers/scsi/sd.c		patch \| blob \| blame \| history
drivers/scsi/sd.h		patch \| blob \| blame \| history
include/scsi/scsi_driver.h		patch \| blob \| blame \| history