[SCSI] Reduce error recovery time by reducing use of TURs
author David Jeffery <dhjeffery@gmail.com>
Thu, 19 May 2011 18:41:12 +0000 (14:41 -0400)
committer James Bottomley <jbottomley@parallels.com>
Tue, 24 May 2011 16:51:53 +0000 (12:51 -0400)
In error recovery, most scsi error recovery stages will send a TUR command
for every bad command when a driver's error handler reports success.  When
several bad commands were sent to the same device, this results in that
device being probed multiple times.

This becomes very problematic if the device or connection is in a state
where the device still doesn't respond to commands even after a recovery
function returns success.  The error handler must wait for the test
commands to time out.  The time waiting for the redundant commands can
drastically lengthen error recovery.

This patch alters the scsi mid-layer's error routines to send test commands
once per device instead of once per bad command.  This can drastically
lower error recovery time.

[jejb: fixed up whitespace and formatting]
Signed-off-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: James Bottomley <jbottomley@parallels.com>
drivers/scsi/scsi_error.c

index abea2cf05c2e41cc1b7a7df581ae74cf0db56768..a4b9cdbaaa0b99d321a37e2fa1db8bf4063e2c86 100644 (file)
@@ -50,6 +50,8 @@
 #define BUS_RESET_SETTLE_TIME   (10)
 #define HOST_RESET_SETTLE_TIME  (10)
 
+static int scsi_eh_try_stu(struct scsi_cmnd *scmd);
+
 /* called with shost->host_lock held */
 void scsi_eh_wakeup(struct Scsi_Host *shost)
 {
@@ -946,6 +948,48 @@ retry_tur:
        }
 }
 
+/**
+ * scsi_eh_test_devices - check if devices are responding from error recovery.
+ * @cmd_list:  scsi commands in error recovery.
+ * @work_q:     queue for commands which still need more error recovery
+ * @done_q:     queue for commands which are finished
+ * @try_stu:    boolean on if a STU command should be tried in addition to TUR.
+ *
+ * Description:
+ *    Probes each device through its first queued command instead of once
+ *    per command.  Commands to devices that are offline or now responding
+ *    go to the done_q; commands to devices still failing to respond are
+ *    returned to the work_q.  Returns nonzero when work_q ends up empty.
+ **/
+static int scsi_eh_test_devices(struct list_head *cmd_list,
+                               struct list_head *work_q,
+                               struct list_head *done_q, int try_stu)
+{
+       struct scsi_cmnd *scmd, *next;
+       struct scsi_device *sdev;
+       int finish_cmds;
+
+       while (!list_empty(cmd_list)) {
+               scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry);
+               sdev = scmd->device;
+
+               finish_cmds = !scsi_device_online(scmd->device) ||
+                       (try_stu && !scsi_eh_try_stu(scmd) &&
+                        !scsi_eh_tur(scmd)) ||
+                       !scsi_eh_tur(scmd);
+
+               list_for_each_entry_safe(scmd, next, cmd_list, eh_entry)
+                       if (scmd->device == sdev) {
+                               if (finish_cmds)
+                                       scsi_eh_finish_cmd(scmd, done_q);
+                               else
+                                       list_move_tail(&scmd->eh_entry, work_q);
+                       }
+       }
+       return list_empty(work_q);
+}
+
+
 /**
  * scsi_eh_abort_cmds - abort pending commands.
  * @work_q:    &list_head for pending commands.
@@ -962,6 +1006,7 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
                              struct list_head *done_q)
 {
        struct scsi_cmnd *scmd, *next;
+       LIST_HEAD(check_list);
        int rtn;
 
        list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
@@ -973,11 +1018,10 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
                rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd);
                if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
                        scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
-                       if (!scsi_device_online(scmd->device) ||
-                           rtn == FAST_IO_FAIL ||
-                           !scsi_eh_tur(scmd)) {
+                       if (rtn == FAST_IO_FAIL)
                                scsi_eh_finish_cmd(scmd, done_q);
-                       }
+                       else
+                               list_move_tail(&scmd->eh_entry, &check_list);
                } else
                        SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
                                                          " cmd failed:"
@@ -986,7 +1030,7 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
                                                          scmd));
        }
 
-       return list_empty(work_q);
+       return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
 }
 
 /**
@@ -1137,6 +1181,7 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
                                struct list_head *done_q)
 {
        LIST_HEAD(tmp_list);
+       LIST_HEAD(check_list);
 
        list_splice_init(work_q, &tmp_list);
 
@@ -1161,9 +1206,9 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
                        if (scmd_id(scmd) != id)
                                continue;
 
-                       if ((rtn == SUCCESS || rtn == FAST_IO_FAIL)
-                           && (!scsi_device_online(scmd->device) ||
-                                rtn == FAST_IO_FAIL || !scsi_eh_tur(scmd)))
+                       if (rtn == SUCCESS)
+                               list_move_tail(&scmd->eh_entry, &check_list);
+                       else if (rtn == FAST_IO_FAIL)
                                scsi_eh_finish_cmd(scmd, done_q);
                        else
                                /* push back on work queue for further processing */
@@ -1171,7 +1216,7 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
                }
        }
 
-       return list_empty(work_q);
+       return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
 }
 
 /**
@@ -1185,6 +1230,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
                             struct list_head *done_q)
 {
        struct scsi_cmnd *scmd, *chan_scmd, *next;
+       LIST_HEAD(check_list);
        unsigned int channel;
        int rtn;
 
@@ -1216,12 +1262,14 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
                rtn = scsi_try_bus_reset(chan_scmd);
                if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
                        list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
-                               if (channel == scmd_channel(scmd))
-                                       if (!scsi_device_online(scmd->device) ||
-                                           rtn == FAST_IO_FAIL ||
-                                           !scsi_eh_tur(scmd))
+                               if (channel == scmd_channel(scmd)) {
+                                       if (rtn == FAST_IO_FAIL)
                                                scsi_eh_finish_cmd(scmd,
                                                                   done_q);
+                                       else
+                                               list_move_tail(&scmd->eh_entry,
+                                                              &check_list);
+                               }
                        }
                } else {
                        SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
@@ -1230,7 +1278,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
                                                          channel));
                }
        }
-       return list_empty(work_q);
+       return scsi_eh_test_devices(&check_list, work_q, done_q, 0);
 }
 
 /**
@@ -1242,6 +1290,7 @@ static int scsi_eh_host_reset(struct list_head *work_q,
                              struct list_head *done_q)
 {
        struct scsi_cmnd *scmd, *next;
+       LIST_HEAD(check_list);
        int rtn;
 
        if (!list_empty(work_q)) {
@@ -1252,12 +1301,10 @@ static int scsi_eh_host_reset(struct list_head *work_q,
                                                  , current->comm));
 
                rtn = scsi_try_host_reset(scmd);
-               if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
+               if (rtn == SUCCESS) {
+                       list_splice_init(work_q, &check_list);
+               } else if (rtn == FAST_IO_FAIL) {
                        list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
-                               if (!scsi_device_online(scmd->device) ||
-                                   rtn == FAST_IO_FAIL ||
-                                   (!scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) ||
-                                   !scsi_eh_tur(scmd))
                                        scsi_eh_finish_cmd(scmd, done_q);
                        }
                } else {
@@ -1266,7 +1313,7 @@ static int scsi_eh_host_reset(struct list_head *work_q,
                                                          current->comm));
                }
        }
-       return list_empty(work_q);
+       return scsi_eh_test_devices(&check_list, work_q, done_q, 1);
 }
 
 /**