powerpc/eeh: Report frozen parent PE prior to child PE
authorGavin Shan <gwshan@linux.vnet.ibm.com>
Sun, 4 May 2014 23:29:03 +0000 (09:29 +1000)
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 11 Jun 2014 07:04:16 +0000 (17:04 +1000)
When we have the corner case of frozen parent and child PE at the
same time, we have to handle the frozen parent PE prior to the
child. Without clearning the frozen state on parent PE, the child
PE can't be recovered successfully.

The patch searches the EEH PE hierarchy tree and returns the toppest
frozen PE to be handled. It ensures the frozen parent PE will be
handled prior to child PE.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/kernel/eeh.c
arch/powerpc/platforms/powernv/eeh-ioda.c

index 7051ea3101b96af830faf0e44eadeb70884fb6fc..c25064b7d667677ae92acd6ef1e577558dff846b 100644 (file)
@@ -358,10 +358,11 @@ out:
 int eeh_dev_check_failure(struct eeh_dev *edev)
 {
        int ret;
+       int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
        unsigned long flags;
        struct device_node *dn;
        struct pci_dev *dev;
-       struct eeh_pe *pe;
+       struct eeh_pe *pe, *parent_pe;
        int rc = 0;
        const char *location;
 
@@ -439,14 +440,34 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
         */
        if ((ret < 0) ||
            (ret == EEH_STATE_NOT_SUPPORT) ||
-           (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
-           (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+           ((ret & active_flags) == active_flags)) {
                eeh_stats.false_positives++;
                pe->false_positives++;
                rc = 0;
                goto dn_unlock;
        }
 
+       /*
+        * It should be corner case that the parent PE has been
+        * put into frozen state as well. We should take care
+        * that at first.
+        */
+       parent_pe = pe->parent;
+       while (parent_pe) {
+               /* Hit the ceiling ? */
+               if (parent_pe->type & EEH_PE_PHB)
+                       break;
+
+               /* Frozen parent PE ? */
+               ret = eeh_ops->get_state(parent_pe, NULL);
+               if (ret > 0 &&
+                   (ret & active_flags) != active_flags)
+                       pe = parent_pe;
+
+               /* Next parent level */
+               parent_pe = parent_pe->parent;
+       }
+
        eeh_stats.slot_resets++;
 
        /* Avoid repeated reports of this failure, including problems
index e0d6a3a213e2b656db3307cb3a8dbafc3a6e0d3e..68167cd9ea97dee01184e5983354decb6a134d59 100644 (file)
@@ -705,11 +705,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
        struct pci_controller *hose;
        struct pnv_phb *phb;
-       struct eeh_pe *phb_pe;
+       struct eeh_pe *phb_pe, *parent_pe;
        __be64 frozen_pe_no;
        __be16 err_type, severity;
+       int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
        long rc;
-       int ret = EEH_NEXT_ERR_NONE;
+       int state, ret = EEH_NEXT_ERR_NONE;
 
        /*
         * While running here, it's safe to purge the event queue.
@@ -838,6 +839,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
                        ioda_eeh_phb_diag(hose);
                }
 
+               /*
+                * We probably have the frozen parent PE out there and
+                * we need have to handle frozen parent PE firstly.
+                */
+               if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+                       parent_pe = (*pe)->parent;
+                       while (parent_pe) {
+                               /* Hit the ceiling ? */
+                               if (parent_pe->type & EEH_PE_PHB)
+                                       break;
+
+                               /* Frozen parent PE ? */
+                               state = ioda_eeh_get_state(parent_pe);
+                               if (state > 0 &&
+                                   (state & active_flags) != active_flags)
+                                       *pe = parent_pe;
+
+                               /* Next parent level */
+                               parent_pe = parent_pe->parent;
+                       }
+
+                       /* We possibly migrate to another PE */
+                       eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+               }
+
                /*
                 * If we have no errors on the specific PHB or only
                 * informative error there, we continue poking it.