cxlflash: Fix regression issue with re-ordering patch
authorManoj N. Kumar <manoj@linux.vnet.ibm.com>
Fri, 25 Mar 2016 19:26:34 +0000 (14:26 -0500)
committerMartin K. Petersen <martin.petersen@oracle.com>
Tue, 29 Mar 2016 00:43:17 +0000 (20:43 -0400)
While running 'sg_reset -H' back to back the following exception was seen:

[  735.115695] Faulting instruction address: 0xd0000000098c0864
cpu 0x0: Vector: 300 (Data Access) at [c000000ffffafa80]
    pc: d0000000098c0864: cxlflash_async_err_irq+0x84/0x5c0 [cxlflash]
    lr: c00000000013aed0: handle_irq_event_percpu+0xa0/0x310
    sp: c000000ffffafd00
   msr: 9000000000009033
   dar: 2010000
 dsisr: 40000000
  current = 0xc000000001510880
  paca    = 0xc00000000fb80000   softe: 0        irq_happened: 0x01
    pid   = 0, comm = swapper/0

Linux version 4.5.0-491-26f710d+

enter ? for help
[c000000ffffafe10c00000000013aed0 handle_irq_event_percpu+0xa0/0x310
[c000000ffffafed0c00000000013b1a8 handle_irq_event+0x68/0xc0
[c000000ffffaff00c0000000001404ec handle_fasteoi_irq+0xec/0x2a0
[c000000ffffaff30c00000000013a084 generic_handle_irq+0x54/0x80
[c000000ffffaff60c000000000011130 __do_irq+0x80/0x1d0
[c000000ffffaff90c000000000024d40 call_do_irq+0x14/0x24
[c000000001573a20c000000000011318 do_IRQ+0x98/0x140
[c000000001573a70c000000000002594 hardware_interrupt_common+0x114/0x180

This exception is being hit because the async_err interrupt path performs
an MMIO to read the interrupt status register. The MMIO region in this
case is not available.

Commit 6ded8b3cbd9a ("cxlflash: Unmap problem state area before detaching
master context") re-ordered the sequence in which term_mc() and stop_afu()
are called. This introduces a window for interrupts to come in with the
problem space area unmapped, that did not exist previously.

The fix is to separate the disabling of all AFU interrupts to a distinct
function, term_intr() so that it is the first thing that is done in the
tear down process.

To keep the initialization process symmetric, separate the AFU interrupt
setup also to a distinct function: init_intr().

Fixes: 6ded8b3cbd9a ("cxlflash: Unmap problem state area before detaching master context")
Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
Acked-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Uma Krishnan <ukrishn@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/cxlflash/main.c
drivers/scsi/cxlflash/main.h

index 3879b46d79e1fa6710e48b59feb438ec7b2ca1ae..1d418073d621784023c343fb31b882d6a59ea07d 100644 (file)
@@ -683,28 +683,23 @@ static void stop_afu(struct cxlflash_cfg *cfg)
 }
 
 /**
- * term_mc() - terminates the master context
+ * term_intr() - disables all AFU interrupts
  * @cfg:       Internal structure associated with the host.
  * @level:     Depth of allocation, where to begin waterfall tear down.
  *
  * Safe to call with AFU/MC in partially allocated/initialized state.
  */
-static void term_mc(struct cxlflash_cfg *cfg, enum undo_level level)
+static void term_intr(struct cxlflash_cfg *cfg, enum undo_level level)
 {
-       int rc = 0;
        struct afu *afu = cfg->afu;
        struct device *dev = &cfg->dev->dev;
 
        if (!afu || !cfg->mcctx) {
-               dev_err(dev, "%s: returning from term_mc with NULL afu or MC\n",
-                      __func__);
+               dev_err(dev, "%s: returning with NULL afu or MC\n", __func__);
                return;
        }
 
        switch (level) {
-       case UNDO_START:
-               rc = cxl_stop_context(cfg->mcctx);
-               BUG_ON(rc);
        case UNMAP_THREE:
                cxl_unmap_afu_irq(cfg->mcctx, 3, afu);
        case UNMAP_TWO:
@@ -713,9 +708,34 @@ static void term_mc(struct cxlflash_cfg *cfg, enum undo_level level)
                cxl_unmap_afu_irq(cfg->mcctx, 1, afu);
        case FREE_IRQ:
                cxl_free_afu_irqs(cfg->mcctx);
-       case RELEASE_CONTEXT:
-               cfg->mcctx = NULL;
+               /* fall through */
+       case UNDO_NOOP:
+               /* No action required */
+               break;
+       }
+}
+
+/**
+ * term_mc() - terminates the master context
+ * @cfg:       Internal structure associated with the host.
+ * @level:     Depth of allocation, where to begin waterfall tear down.
+ *
+ * Safe to call with AFU/MC in partially allocated/initialized state.
+ */
+static void term_mc(struct cxlflash_cfg *cfg)
+{
+       int rc = 0;
+       struct afu *afu = cfg->afu;
+       struct device *dev = &cfg->dev->dev;
+
+       if (!afu || !cfg->mcctx) {
+               dev_err(dev, "%s: returning with NULL afu or MC\n", __func__);
+               return;
        }
+
+       rc = cxl_stop_context(cfg->mcctx);
+       WARN_ON(rc);
+       cfg->mcctx = NULL;
 }
 
 /**
@@ -726,10 +746,20 @@ static void term_mc(struct cxlflash_cfg *cfg, enum undo_level level)
  */
 static void term_afu(struct cxlflash_cfg *cfg)
 {
+       /*
+        * Tear down is carefully orchestrated to ensure
+        * no interrupts can come in when the problem state
+        * area is unmapped.
+        *
+        * 1) Disable all AFU interrupts
+        * 2) Unmap the problem state area
+        * 3) Stop the master context
+        */
+       term_intr(cfg, UNMAP_THREE);
        if (cfg->afu)
                stop_afu(cfg);
 
-       term_mc(cfg, UNDO_START);
+       term_mc(cfg);
 
        pr_debug("%s: returning\n", __func__);
 }
@@ -1597,41 +1627,24 @@ static int start_afu(struct cxlflash_cfg *cfg)
 }
 
 /**
- * init_mc() - create and register as the master context
+ * init_intr() - setup interrupt handlers for the master context
  * @cfg:       Internal structure associated with the host.
  *
  * Return: 0 on success, -errno on failure
  */
-static int init_mc(struct cxlflash_cfg *cfg)
+static enum undo_level init_intr(struct cxlflash_cfg *cfg,
+                                struct cxl_context *ctx)
 {
-       struct cxl_context *ctx;
-       struct device *dev = &cfg->dev->dev;
        struct afu *afu = cfg->afu;
+       struct device *dev = &cfg->dev->dev;
        int rc = 0;
-       enum undo_level level;
-
-       ctx = cxl_get_context(cfg->dev);
-       if (unlikely(!ctx))
-               return -ENOMEM;
-       cfg->mcctx = ctx;
-
-       /* Set it up as a master with the CXL */
-       cxl_set_master(ctx);
-
-       /* During initialization reset the AFU to start from a clean slate */
-       rc = cxl_afu_reset(cfg->mcctx);
-       if (unlikely(rc)) {
-               dev_err(dev, "%s: initial AFU reset failed rc=%d\n",
-                       __func__, rc);
-               level = RELEASE_CONTEXT;
-               goto out;
-       }
+       enum undo_level level = UNDO_NOOP;
 
        rc = cxl_allocate_afu_irqs(ctx, 3);
        if (unlikely(rc)) {
                dev_err(dev, "%s: call to allocate_afu_irqs failed rc=%d!\n",
                        __func__, rc);
-               level = RELEASE_CONTEXT;
+               level = UNDO_NOOP;
                goto out;
        }
 
@@ -1661,8 +1674,47 @@ static int init_mc(struct cxlflash_cfg *cfg)
                level = UNMAP_TWO;
                goto out;
        }
+out:
+       return level;
+}
 
-       rc = 0;
+/**
+ * init_mc() - create and register as the master context
+ * @cfg:       Internal structure associated with the host.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static int init_mc(struct cxlflash_cfg *cfg)
+{
+       struct cxl_context *ctx;
+       struct device *dev = &cfg->dev->dev;
+       int rc = 0;
+       enum undo_level level;
+
+       ctx = cxl_get_context(cfg->dev);
+       if (unlikely(!ctx)) {
+               rc = -ENOMEM;
+               goto ret;
+       }
+       cfg->mcctx = ctx;
+
+       /* Set it up as a master with the CXL */
+       cxl_set_master(ctx);
+
+       /* During initialization reset the AFU to start from a clean slate */
+       rc = cxl_afu_reset(cfg->mcctx);
+       if (unlikely(rc)) {
+               dev_err(dev, "%s: initial AFU reset failed rc=%d\n",
+                       __func__, rc);
+               goto ret;
+       }
+
+       level = init_intr(cfg, ctx);
+       if (unlikely(level)) {
+               dev_err(dev, "%s: setting up interrupts failed rc=%d\n",
+                       __func__, rc);
+               goto out;
+       }
 
        /* This performs the equivalent of the CXL_IOCTL_START_WORK.
         * The CXL_IOCTL_GET_PROCESS_ELEMENT is implicit in the process
@@ -1678,7 +1730,7 @@ ret:
        pr_debug("%s: returning rc=%d\n", __func__, rc);
        return rc;
 out:
-       term_mc(cfg, level);
+       term_intr(cfg, level);
        goto ret;
 }
 
@@ -1751,7 +1803,8 @@ out:
 err2:
        kref_put(&afu->mapcount, afu_unmap);
 err1:
-       term_mc(cfg, UNDO_START);
+       term_intr(cfg, UNMAP_THREE);
+       term_mc(cfg);
        goto out;
 }
 
@@ -2502,8 +2555,7 @@ static pci_ers_result_t cxlflash_pci_error_detected(struct pci_dev *pdev,
                if (unlikely(rc))
                        dev_err(dev, "%s: Failed to mark user contexts!(%d)\n",
                                __func__, rc);
-               stop_afu(cfg);
-               term_mc(cfg, UNDO_START);
+               term_afu(cfg);
                return PCI_ERS_RESULT_NEED_RESET;
        case pci_channel_io_perm_failure:
                cfg->state = STATE_FAILTERM;
index 0faed422c7f47eec6ac8f474cb85f7e61dd6ed1d..eb9d8f730b38ec34e86c777d560f3762b48bf3d5 100644 (file)
 #define WWPN_BUF_LEN   (WWPN_LEN + 1)
 
 enum undo_level {
-       RELEASE_CONTEXT = 0,
+       UNDO_NOOP = 0,
        FREE_IRQ,
        UNMAP_ONE,
        UNMAP_TWO,
-       UNMAP_THREE,
-       UNDO_START
+       UNMAP_THREE
 };
 
 struct dev_dependent_vals {