From a7d656063e55f8227a54fad99270b26472719ff1 Mon Sep 17 00:00:00 2001 From: Tim Sell Date: Thu, 12 May 2016 09:14:41 -0400 Subject: [PATCH] staging: unisys: visorhba: correct scsi task mgmt completion handling This patch is necessary to enable ANY task mgmt command to complete successfully via visorhba. When issuing a task mgmt command (CMD_SCSITASKMGMT_TYPE) to the IO partition (back-end), forward_taskmgmt_command() includes pointers within the command area that will be used to wake up the issuing process and provide the result when the command completes: cmdrsp->scsitaskmgmt.notify_handle = (u64)¬ifyevent; cmdrsp->scsitaskmgmt.notifyresult_handle = (u64)¬ifyresult; 'notify_handle' is a pointer to a 'wait_queue_head_t' variable, and 'notifyresult' is a pointer to an int. Both of these are just local stack variables in the issuing process. The way it's supposed to happen is that when the IO partition completes the command, in our completion handling we get copies of those pointers back from the IO partition, where we stash the result of the command at '*notifyresult' (which should not be 0xffff, because that is the initial value that the caller is looking to see a change in), and wake up the wait queue at '*notify_handle'. There are several places we do that dance, but prior to this patch, we always do it WRONG, like: cmdrsp->scsitaskmgmt.notifyresult_handle = TASK_MGMT_FAILED; wake_up_all((wait_queue_head_t *)cmdrsp->scsitaskmgmt.notify_handle); The wake_up_all() part is correct (albeit with the help of the sloppy pointer casting, but that's irrelevant to the bug), but the assignment of 'notifyresult_handle' is WRONG, and SHOULD read: *(int *)(cmdrsp->scsitaskmgmt.notifyresult_handle) = TASK_MGMT_FAILED; Without this change, the caller is NEVER going to notice a change in his local value of 'notifyresult' when he does the: if (!wait_event_timeout(notifyevent, notifyresult != 0xffff, msecs_to_jiffies(45000))) and hence will be timing out EVERY taskmgmt command. This patch also eliminates the need for sloppy casting of pointers back-and-forth between u64 values, with the help of idr_alloc() to provide handles for us. It is the generated int handles we pass to the IO partition to denote our completion context, and these are validated and converted back to the required pointers when the task mgmt commands are returned back to us by the IO partition. == Testing == You must enable dynamic debugging in visorhba (build kernel with 'CONFIG_DYNAMIC_DEBUG=y', provide kernel parameter 'visorhba.dyndbg=+p') to see kernel messages involved with visorhba scsi task mgmt commands, which were added in this patch in the form of a few dev_dbg() / pr_debug() messages. In order to inject faults necessary to get visorhba to actully issue scsi task mgmt commands, you will need to compile a kernel with CONFIG_FAIL_IO_TIMEOUT and friends, in the "Kernel hacking" section: * Enable "Fault-injection framework" * Enable "Fault-injection capability for disk IO" * Enable "Fault-injection capability for faking disk interrupts" * Enable "Debugfs entries for fault-injection capabilities" When running a kernel with those options, you can manually inject a fault that will force a scsi task mgmt command to be issued like this: # mount -t debugfs nodev /sys/kernel/debug # cd /sys/kernel/debug/fail_io_timeout # cat interval 1 # cat probability 0 # cat times 1 # echo 100 >probability # cd /sys/block/sda # l | grep fail -rw-r--r-- 1 root root 4096 May 5 10:53 io-timeout-fail -rw-r--r-- 1 root root 4096 May 5 10:54 make-it-fail # echo 1 >io-timeout-fail # echo 1 >make-it-fail To test this patch, after performing the above steps, I did something to force a block device i/o, then shortly afterwards examined the kernel log. There I found evidence that visorhba had successfully issued a task mgmt command, and that it completed successfully: [ 333.352612] FAULT_INJECTION: forcing a failure. name fail_io_timeout, interval 1, probability 100, space 0, times 1 [ 333.352617] CPU: 0 PID: 295 Comm: vhba_incoming Tainted: G C 4.6.0-rc3-ARCH+ #2 [ 333.352619] Hardware name: Dell Inc. PowerEdge T110/ , BIOS 1.23 12/15/2009 [ 333.352620] 0000000000000000 ffff88001d1a7dd0 ffffffff8125beeb ffffffff818507c0 [ 333.352623] 0000000000000064 ffff88001d1a7df0 ffffffff8128047a ffff8800113462b0 [ 333.352625] ffff88000e523000 ffff88001d1a7e00 ffffffff81241c79 ffff88001d1a7e18 [ 333.352627] Call Trace: [ 333.352634] [] dump_stack+0x4d/0x72 [ 333.352637] [] should_fail+0x11a/0x120 [ 333.352641] [] blk_should_fake_timeout+0x29/0x30 [ 333.352643] [] blk_complete_request+0x16/0x30 [ 333.352654] [] scsi_done+0x26/0x80 [scsi_mod] [ 333.352657] [] process_incoming_rsps+0x2bc/0x770 [visorhba] [ 333.352661] [] ? wait_woken+0x80/0x80 [ 333.352663] [] ? add_scsipending_entry+0x100/0x100 [visorhba] [ 333.352666] [] kthread+0xc9/0xe0 [ 333.352669] [] ret_from_fork+0x22/0x40 [ 333.352671] [] ? kthread_create_on_node+0x180/0x180 [ 364.025672] sd 0:0:1:1: visorhba: initiating type=1 taskmgmt command [ 364.029721] visorhba: notifying initiator with result=0x1 [ 364.029726] sd 0:0:1:1: visorhba: taskmgmt type=1 success; result=0x1 Signed-off-by: Tim Sell Signed-off-by: David Kershner Signed-off-by: Greg Kroah-Hartman --- .../staging/unisys/visorhba/visorhba_main.c | 138 ++++++++++++++---- 1 file changed, 111 insertions(+), 27 deletions(-) diff --git a/drivers/staging/unisys/visorhba/visorhba_main.c b/drivers/staging/unisys/visorhba/visorhba_main.c index 085d78212a80..c4a9f325c34a 100644 --- a/drivers/staging/unisys/visorhba/visorhba_main.c +++ b/drivers/staging/unisys/visorhba/visorhba_main.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -133,6 +134,12 @@ struct visorhba_devdata { int devnum; struct task_struct *thread; int thread_wait_ms; + + /* + * allows us to pass int handles back-and-forth between us and + * iovm, instead of raw pointers + */ + struct idr idr; }; struct visorhba_devices_open { @@ -268,6 +275,62 @@ static struct uiscmdrsp *get_scsipending_cmdrsp(struct visorhba_devdata *ddata, return NULL; } +/** + * simple_idr_get - associate a provided pointer with an int value + * 1 <= value <= INT_MAX, and return this int value; + * the pointer value can be obtained later by passing + * this int value to idr_find() + * @idrtable: the data object maintaining the pointer<-->int mappings + * @p: the pointer value to be remembered + * @lock: a spinlock used when exclusive access to idrtable is needed + */ +static unsigned int simple_idr_get(struct idr *idrtable, void *p, + spinlock_t *lock) +{ + int id; + unsigned long flags; + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(lock, flags); + id = idr_alloc(idrtable, p, 1, INT_MAX, GFP_NOWAIT); + spin_unlock_irqrestore(lock, flags); + idr_preload_end(); + if (id < 0) + return 0; /* failure */ + return (unsigned int)(id); /* idr_alloc() guarantees > 0 */ +} + +/** + * setup_scsitaskmgmt_handles - stash the necessary handles so that the + * completion processing logic for a taskmgmt + * cmd will be able to find who to wake up + * and where to stash the result + */ +static void setup_scsitaskmgmt_handles(struct idr *idrtable, spinlock_t *lock, + struct uiscmdrsp *cmdrsp, + wait_queue_head_t *event, int *result) +{ + /* specify the event that has to be triggered when this */ + /* cmd is complete */ + cmdrsp->scsitaskmgmt.notify_handle = + simple_idr_get(idrtable, event, lock); + cmdrsp->scsitaskmgmt.notifyresult_handle = + simple_idr_get(idrtable, result, lock); +} + +/** + * cleanup_scsitaskmgmt_handles - forget handles created by + * setup_scsitaskmgmt_handles() + */ +static void cleanup_scsitaskmgmt_handles(struct idr *idrtable, + struct uiscmdrsp *cmdrsp) +{ + if (cmdrsp->scsitaskmgmt.notify_handle) + idr_remove(idrtable, cmdrsp->scsitaskmgmt.notify_handle); + if (cmdrsp->scsitaskmgmt.notifyresult_handle) + idr_remove(idrtable, cmdrsp->scsitaskmgmt.notifyresult_handle); +} + /** * forward_taskmgmt_command - send taskmegmt command to the Service * Partition @@ -303,10 +366,8 @@ static int forward_taskmgmt_command(enum task_mgmt_types tasktype, /* issue TASK_MGMT_ABORT_TASK */ cmdrsp->cmdtype = CMD_SCSITASKMGMT_TYPE; - /* specify the event that has to be triggered when this */ - /* cmd is complete */ - cmdrsp->scsitaskmgmt.notify_handle = (u64)¬ifyevent; - cmdrsp->scsitaskmgmt.notifyresult_handle = (u64)¬ifyresult; + setup_scsitaskmgmt_handles(&devdata->idr, &devdata->privlock, cmdrsp, + ¬ifyevent, ¬ifyresult); /* save destination */ cmdrsp->scsitaskmgmt.tasktype = tasktype; @@ -315,6 +376,8 @@ static int forward_taskmgmt_command(enum task_mgmt_types tasktype, cmdrsp->scsitaskmgmt.vdest.lun = scsidev->lun; cmdrsp->scsitaskmgmt.handle = scsicmd_id; + dev_dbg(&scsidev->sdev_gendev, + "visorhba: initiating type=%d taskmgmt command\n", tasktype); if (!visorchannel_signalinsert(devdata->dev->visorchannel, IOCHAN_TO_IOPART, cmdrsp)) @@ -327,17 +390,23 @@ static int forward_taskmgmt_command(enum task_mgmt_types tasktype, msecs_to_jiffies(45000))) goto err_del_scsipending_ent; + dev_dbg(&scsidev->sdev_gendev, + "visorhba: taskmgmt type=%d success; result=0x%x\n", + tasktype, notifyresult); if (tasktype == TASK_MGMT_ABORT_TASK) scsicmd->result = DID_ABORT << 16; else scsicmd->result = DID_RESET << 16; scsicmd->scsi_done(scsicmd); - + cleanup_scsitaskmgmt_handles(&devdata->idr, cmdrsp); return SUCCESS; err_del_scsipending_ent: + dev_dbg(&scsidev->sdev_gendev, + "visorhba: taskmgmt type=%d not executed\n", tasktype); del_scsipending_ent(devdata, scsicmd_id); + cleanup_scsitaskmgmt_handles(&devdata->idr, cmdrsp); return FAILED; } @@ -666,6 +735,35 @@ static ssize_t info_debugfs_read(struct file *file, char __user *buf, return bytes_read; } +/** + * complete_taskmgmt_command - complete task management + * @cmdrsp: Response from the IOVM + * + * Service Partition returned the result of the task management + * command. Wake up anyone waiting for it. + * Returns void + */ +static inline void complete_taskmgmt_command +(struct idr *idrtable, struct uiscmdrsp *cmdrsp, int result) +{ + wait_queue_head_t *wq = + idr_find(idrtable, cmdrsp->scsitaskmgmt.notify_handle); + int *scsi_result_ptr = + idr_find(idrtable, cmdrsp->scsitaskmgmt.notifyresult_handle); + + if (unlikely(!(wq && scsi_result_ptr))) { + pr_err("visorhba: no completion context; cmd will time out\n"); + return; + } + + /* copy the result of the taskmgmt and + * wake up the error handler that is waiting for this + */ + pr_debug("visorhba: notifying initiator with result=0x%x\n", result); + *scsi_result_ptr = result; + wake_up_all(wq); +} + /** * visorhba_serverdown_complete - Called when we are done cleaning up * from serverdown @@ -701,10 +799,8 @@ static void visorhba_serverdown_complete(struct visorhba_devdata *devdata) break; case CMD_SCSITASKMGMT_TYPE: cmdrsp = pendingdel->sent; - cmdrsp->scsitaskmgmt.notifyresult_handle - = TASK_MGMT_FAILED; - wake_up_all((wait_queue_head_t *) - cmdrsp->scsitaskmgmt.notify_handle); + complete_taskmgmt_command(&devdata->idr, cmdrsp, + TASK_MGMT_FAILED); break; default: break; @@ -871,23 +967,6 @@ complete_scsi_command(struct uiscmdrsp *cmdrsp, struct scsi_cmnd *scsicmd) scsicmd->scsi_done(scsicmd); } -/** - * complete_taskmgmt_command - complete task management - * @cmdrsp: Response from the IOVM - * - * Service Partition returned the result of the task management - * command. Wake up anyone waiting for it. - * Returns void - */ -static inline void complete_taskmgmt_command(struct uiscmdrsp *cmdrsp) -{ - /* copy the result of the taskgmgt and - * wake up the error handler that is waiting for this - */ - cmdrsp->vdiskmgmt.notifyresult_handle = cmdrsp->vdiskmgmt.result; - wake_up_all((wait_queue_head_t *)cmdrsp->scsitaskmgmt.notify_handle); -} - static struct work_struct dar_work_queue; static struct diskaddremove *dar_work_queue_head; static spinlock_t dar_work_queue_lock; /* Lock to protet dar_work_queue_head */ @@ -978,7 +1057,8 @@ drain_queue(struct uiscmdrsp *cmdrsp, struct visorhba_devdata *devdata) if (!del_scsipending_ent(devdata, cmdrsp->scsitaskmgmt.handle)) break; - complete_taskmgmt_command(cmdrsp); + complete_taskmgmt_command(&devdata->idr, cmdrsp, + cmdrsp->scsitaskmgmt.result); } else if (cmdrsp->cmdtype == CMD_NOTIFYGUEST_TYPE) { /* The vHba pointer has no meaning in a * guest partition. Let's be safe and set it @@ -1140,6 +1220,8 @@ static int visorhba_probe(struct visor_device *dev) if (err) goto err_scsi_remove_host; + idr_init(&devdata->idr); + devdata->thread_wait_ms = 2; devdata->thread = visor_thread_start(process_incoming_rsps, devdata, "vhba_incoming"); @@ -1176,6 +1258,8 @@ static void visorhba_remove(struct visor_device *dev) scsi_remove_host(scsihost); scsi_host_put(scsihost); + idr_destroy(&devdata->idr); + dev_set_drvdata(&dev->device, NULL); } -- 2.20.1