powerpc/eeh: Report frozen parent PE prior to child PE

author Gavin Shan <gwshan@linux.vnet.ibm.com>

Sun, 4 May 2014 23:29:03 +0000 (09:29 +1000)

committer Benjamin Herrenschmidt <benh@kernel.crashing.org>

Wed, 11 Jun 2014 07:04:16 +0000 (17:04 +1000)
author Gavin Shan <gwshan@linux.vnet.ibm.com>
Sun, 4 May 2014 23:29:03 +0000 (09:29 +1000)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 11 Jun 2014 07:04:16 +0000 (17:04 +1000)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c

index 7051ea3101b96af830faf0e44eadeb70884fb6fc..c25064b7d667677ae92acd6ef1e577558dff846b 100644 (file)
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -358,10 +358,11 @@ out:
  int eeh_dev_check_failure(struct eeh_dev *edev)
  {
         int ret;
+       int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
         unsigned long flags;
         struct device_node *dn;
         struct pci_dev *dev;
-       struct eeh_pe *pe;
+       struct eeh_pe *pe, *parent_pe;
         int rc = 0;
         const char *location;
  
@@ -439,14 +440,34 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
          */
         if ((ret < 0) ||
             (ret == EEH_STATE_NOT_SUPPORT) ||
-           (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
-           (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+           ((ret & active_flags) == active_flags)) {
                 eeh_stats.false_positives++;
                 pe->false_positives++;
                 rc = 0;
                 goto dn_unlock;
         }
  
+       /*
+        * In rare cases a parent PE has been frozen as well.
+        * Walk up towards the PHB and report the topmost frozen
+        * parent PE instead, so that it is handled first.
+        */
+       parent_pe = pe->parent;
+       while (parent_pe) {
+               /* Hit the ceiling ? */
+               if (parent_pe->type & EEH_PE_PHB)
+                       break;
+
+               /* Frozen parent PE ? */
+               ret = eeh_ops->get_state(parent_pe, NULL);
+               if (ret > 0 &&
+                   (ret & active_flags) != active_flags)
+                       pe = parent_pe;
+
+               /* Next parent level */
+               parent_pe = parent_pe->parent;
+       }
+
         eeh_stats.slot_resets++;
  
         /* Avoid repeated reports of this failure, including problems
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c

index e0d6a3a213e2b656db3307cb3a8dbafc3a6e0d3e..68167cd9ea97dee01184e5983354decb6a134d59 100644 (file)
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -705,11 +705,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
  {
         struct pci_controller *hose;
         struct pnv_phb *phb;
-       struct eeh_pe *phb_pe;
+       struct eeh_pe *phb_pe, *parent_pe;
         __be64 frozen_pe_no;
         __be16 err_type, severity;
+       int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
         long rc;
-       int ret = EEH_NEXT_ERR_NONE;
+       int state, ret = EEH_NEXT_ERR_NONE;
  
         /*
          * While running here, it's safe to purge the event queue.
@@ -838,6 +839,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
                         ioda_eeh_phb_diag(hose);
                 }
  
+               /*
+                * A parent PE may have been frozen as well; if so,
+                * the frozen parent PE has to be handled first.
+                */
+               if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+                       parent_pe = (*pe)->parent;
+                       while (parent_pe) {
+                               /* Hit the ceiling ? */
+                               if (parent_pe->type & EEH_PE_PHB)
+                                       break;
+
+                               /* Frozen parent PE ? */
+                               state = ioda_eeh_get_state(parent_pe);
+                               if (state > 0 &&
+                                   (state & active_flags) != active_flags)
+                                       *pe = parent_pe;
+
+                               /* Next parent level */
+                               parent_pe = parent_pe->parent;
+                       }
+
+                       /* We possibly migrate to another PE */
+                       eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+               }
+
                 /*
                  * If we have no errors on the specific PHB or only
                  * informative error there, we continue poking it.
author	Gavin Shan <gwshan@linux.vnet.ibm.com>
	Sun, 4 May 2014 23:29:03 +0000 (09:29 +1000)
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>
	Wed, 11 Jun 2014 07:04:16 +0000 (17:04 +1000)
arch/powerpc/kernel/eeh.c		patch \| blob \| blame \| history
arch/powerpc/platforms/powernv/eeh-ioda.c		patch \| blob \| blame \| history