IB/ipath: HW workaround for case where chip can send but not receive
author: Dave Olson <dave.olson@qlogic.com>
Thu, 17 Apr 2008 04:09:25 +0000 (21:09 -0700)
committer: Roland Dreier <rolandd@cisco.com>
Thu, 17 Apr 2008 04:09:25 +0000 (21:09 -0700)
Work around a QLE7140 problem that, in rare cases, causes flow-control
problems after link recovery: force a link retrain once a recovery has
been detected.  A module parameter (linkrecovery, enabled by default) is
provided so the workaround can be turned off in case it causes problems.

Signed-off-by: Dave Olson <dave.olson@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/hw/ipath/ipath_driver.c
drivers/infiniband/hw/ipath/ipath_intr.c
drivers/infiniband/hw/ipath/ipath_kernel.h

index 5605f4f27521c6de65bdea393834f07d28368055..2cad7335681b6e26902ac64d1c38d8aa3f156f75 100644 (file)
@@ -82,6 +82,10 @@ module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
 MODULE_PARM_DESC(hol_timeout_ms,
        "duration of user app suspension after link failure");
 
+unsigned ipath_linkrecovery = 1;
+module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("QLogic <support@pathscale.com>");
 MODULE_DESCRIPTION("QLogic InfiniPath driver");
index dde5dfc9fcf5023cfbcee6f0e83287dc19c6bffe..ed82ecbb02dadb085dc39df082e43b8347837ff6 100644 (file)
@@ -366,6 +366,22 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
        dd->ipath_ibpollcnt = 0; /* not poll*, now */
        ipath_stats.sps_iblink++;
 
+       if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
+               u64 linkrecov;
+               linkrecov = ipath_snap_cntr(dd,
+                       dd->ipath_cregs->cr_iblinkerrrecovcnt);
+               if (linkrecov != dd->ipath_lastlinkrecov) {
+                       ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
+                               ibcs, ib_linkstate(dd, ibcs),
+                               ipath_ibcstatus_str[ltstate],
+                               linkrecov);
+                       /* and no more until active again */
+                       dd->ipath_lastlinkrecov = 0;
+                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+                       goto skip_ibchange;
+               }
+       }
+
        if (ibstate == init || ibstate == arm || ibstate == active) {
                *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
                if (ibstate == init || ibstate == arm) {
@@ -392,6 +408,8 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
                                IPATH_NOCABLE);
                        ipath_hol_down(dd);
                } else {  /* active */
+                       dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
+                               dd->ipath_cregs->cr_iblinkerrrecovcnt);
                        *dd->ipath_statusp |=
                                IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
                        dd->ipath_flags |= IPATH_LINKACTIVE;
index 960d5b7e7865303cc1f36d6cdabfc0816127067b..b8b81cb745b91499aadf48aa82996311e034c1fc 100644 (file)
@@ -309,6 +309,7 @@ struct ipath_devdata {
        ipath_err_t ipath_lasthwerror;
        /* errors masked because they occur too fast */
        ipath_err_t ipath_maskederrs;
+       u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
        /* time in jiffies at which to re-enable maskederrs */
        unsigned long ipath_unmasktime;
        /* count of egrfull errors, combined for all ports */
@@ -1099,6 +1100,7 @@ dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
 #endif
 
 extern unsigned ipath_debug; /* debugging bit mask */
+extern unsigned ipath_linkrecovery;
 extern unsigned ipath_mtu4096;
 
 #define IPATH_MAX_PARITY_ATTEMPTS 10000 /* max times to try recovery */