powerpc/eeh: Trace error based on PE from beginning
author Gavin Shan <shangw@linux.vnet.ibm.com>
Fri, 7 Sep 2012 22:44:13 +0000 (22:44 +0000)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
Sun, 9 Sep 2012 23:35:33 +0000 (09:35 +1000)
There are two conditions that trigger EEH error detection: an invalid
value returned from reading I/O space or from reading config space. In
each case, the function eeh_dn_check_failure() is called to initialize
an EEH event and put it into the pool for further processing.

The patch changes the function slightly so that the EEH error is
traced based on the PE rather than on the EEH device. Also, the
function eeh_find_device_pe() has been removed, since the EEH device
now tracks its PE through struct eeh_dev::pe.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/include/asm/ppc-pci.h
arch/powerpc/platforms/pseries/eeh.c
arch/powerpc/platforms/pseries/msi.c

index c7e5bd60bca16cf0c091fa07232d18decde0be4c..3e301b14e09ff0c26d176cfc35f9b582e7bad5ff 100644 (file)
@@ -59,7 +59,6 @@ int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
 void eeh_pe_state_clear(struct eeh_pe *pe, int state);
-struct device_node *eeh_find_device_pe(struct device_node *dn);
 
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
index 9c623c23057fa1ea7074ba46c68a7dd5086dcdf6..f210160c1fdb64445243907b463090f08e0c0558 100644 (file)
@@ -263,21 +263,6 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
        return pa | (token & (PAGE_SIZE-1));
 }
 
-/**
- * eeh_find_device_pe - Retrieve the PE for the given device
- * @dn: device node
- *
- * Return the PE under which this device lies
- */
-struct device_node *eeh_find_device_pe(struct device_node *dn)
-{
-       while (dn->parent && of_node_to_eeh_dev(dn->parent) &&
-              (of_node_to_eeh_dev(dn->parent)->mode & EEH_MODE_SUPPORTED)) {
-               dn = dn->parent;
-       }
-       return dn;
-}
-
 /**
  * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze
  * @dn: device node
@@ -297,6 +282,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 {
        int ret;
        unsigned long flags;
+       struct eeh_pe *pe;
        struct eeh_dev *edev;
        int rc = 0;
        const char *location;
@@ -306,23 +292,26 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        if (!eeh_subsystem_enabled)
                return 0;
 
-       if (!dn) {
+       if (dn) {
+               edev = of_node_to_eeh_dev(dn);
+       } else if (dev) {
+               edev = pci_dev_to_eeh_dev(dev);
+               dn = pci_device_to_OF_node(dev);
+       } else {
                eeh_stats.no_dn++;
                return 0;
        }
-       dn = eeh_find_device_pe(dn);
-       edev = of_node_to_eeh_dev(dn);
+       pe = edev->pe;
 
        /* Access to IO BARs might get this far and still not want checking. */
-       if (!(edev->mode & EEH_MODE_SUPPORTED) ||
-           edev->mode & EEH_MODE_NOCHECK) {
+       if (!pe) {
                eeh_stats.ignored_check++;
-               pr_debug("EEH: Ignored check (%x) for %s %s\n",
-                       edev->mode, eeh_pci_name(dev), dn->full_name);
+               pr_debug("EEH: Ignored check for %s %s\n",
+                       eeh_pci_name(dev), dn->full_name);
                return 0;
        }
 
-       if (!edev->config_addr && !edev->pe_config_addr) {
+       if (!pe->addr && !pe->config_addr) {
                eeh_stats.no_cfg_addr++;
                return 0;
        }
@@ -335,13 +324,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         */
        raw_spin_lock_irqsave(&confirm_error_lock, flags);
        rc = 1;
-       if (edev->mode & EEH_MODE_ISOLATED) {
-               edev->check_count++;
-               if (edev->check_count % EEH_MAX_FAILS == 0) {
+       if (pe->state & EEH_PE_ISOLATED) {
+               pe->check_count++;
+               if (pe->check_count % EEH_MAX_FAILS == 0) {
                        location = of_get_property(dn, "ibm,loc-code", NULL);
                        printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
                                "location=%s driver=%s pci addr=%s\n",
-                               edev->check_count, location,
+                               pe->check_count, location,
                                eeh_driver_name(dev), eeh_pci_name(dev));
                        printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
                                eeh_driver_name(dev));
@@ -357,7 +346,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         * function zero of a multi-function device.
         * In any case they must share a common PHB.
         */
-       ret = eeh_ops->get_state(dn, NULL);
+       ret = eeh_ops->get_state(pe, NULL);
 
        /* Note that config-io to empty slots may fail;
         * they are empty when they don't have children.
@@ -370,7 +359,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
            (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
            (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
                eeh_stats.false_positives++;
-               edev->false_positives ++;
+               pe->false_positives++;
                rc = 0;
                goto dn_unlock;
        }
@@ -381,10 +370,10 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         * with other functions on this device, and functions under
         * bridges.
         */
-       eeh_mark_slot(dn, EEH_MODE_ISOLATED);
+       eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
        raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 
-       eeh_send_failure_event(edev);
+       eeh_send_failure_event(pe);
 
        /* Most EEH events are due to device driver bugs.  Having
         * a stack trace will help the device-driver authors figure
index 8bc89e4ecb5099f7fa90845fc7a558b78e96e712..d19f4977c83492e1174be5456e1d0ff7ee33ca33 100644 (file)
@@ -210,6 +210,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
        struct device_node *dn;
+       struct eeh_dev *edev;
 
        /* Found our PE and assume 8 at that point. */
 
@@ -217,7 +218,10 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
        if (!dn)
                return NULL;
 
-       dn = eeh_find_device_pe(dn);
+       /* Get the top level device in the PE */
+       edev = of_node_to_eeh_dev(dn);
+       edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
+       dn = eeh_dev_to_of_node(edev);
        if (!dn)
                return NULL;