[PATCH] ppc64: escape hatch for spinning interrupt deadlocks

author Linas Vepstas <linas@linas.org>

Fri, 4 Nov 2005 00:49:31 +0000 (18:49 -0600)

committer Paul Mackerras <paulus@samba.org>

Thu, 10 Nov 2005 00:33:33 +0000 (11:33 +1100)
author Linas Vepstas <linas@linas.org>
Fri, 4 Nov 2005 00:49:31 +0000 (18:49 -0600)
committer Paul Mackerras <paulus@samba.org>
Thu, 10 Nov 2005 00:33:33 +0000 (11:33 +1100)
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c

index e7522f6da69d2bd03dc87318f3fa93328385921e..0c52c2de92e02b43d6dbee63c98878c77bb76bfa 100644 (file)
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -78,14 +78,12 @@ DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
  
  static struct notifier_block *eeh_notifier_chain;
  
-/*
- * If a device driver keeps reading an MMIO register in an interrupt
+/* If a device driver keeps reading an MMIO register in an interrupt
   * handler after a slot isolation event has occurred, we assume it
   * is broken and panic.  This sets the threshold for how many read
   * attempts we allow before panicking.
   */
-#define EEH_MAX_FAILS  1000
-static atomic_t eeh_fail_count;
+#define EEH_MAX_FAILS  100000
  
  /* RTAS tokens */
  static int ibm_set_eeh_option;
@@ -521,7 +519,6 @@ static void eeh_event_handler(void *dummy)
                        "%s\n", event->reset_state,
                        pci_name(event->dev));
  
-               atomic_set(&eeh_fail_count, 0);
                 notifier_call_chain (&eeh_notifier_chain,
                                      EEH_NOTIFY_FREEZE, event);
  
@@ -657,12 +654,18 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         spin_lock_irqsave(&confirm_error_lock, flags);
         rc = 1;
         if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
-               atomic_inc(&eeh_fail_count);
-               if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
+               pdn->eeh_check_count ++;
+               if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
+                       printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
+                               pdn->eeh_check_count);
+                       dump_stack();
+                       
                         /* re-read the slot reset state */
                         if (read_slot_reset_state(pdn, rets) != 0)
                                 rets[0] = -1;   /* reset state unknown */
-                       eeh_panic(dev, rets[0]);
+
+                       /* If we are here, then we hit an infinite loop. Stop. */
+                       panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev));
                 }
                 goto dn_unlock;
         }
@@ -808,6 +811,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
         struct pci_dn *pdn = PCI_DN(dn);
  
         pdn->eeh_mode = 0;
+       pdn->eeh_check_count = 0;
+       pdn->eeh_freeze_count = 0;
  
         if (status && strcmp(status, "ok") != 0)
                 return NULL;    /* ignore devices with bad status */
author	Linas Vepstas <linas@linas.org>
	Fri, 4 Nov 2005 00:49:31 +0000 (18:49 -0600)
committer	Paul Mackerras <paulus@samba.org>
	Thu, 10 Nov 2005 00:33:33 +0000 (11:33 +1100)