wimax/i2400m: add the error recovery mechanism on TX path

author Cindy H Kao <cindy.h.kao@intel.com>

Thu, 8 Apr 2010 03:07:47 +0000 (20:07 -0700)

committer Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>

Tue, 11 May 2010 21:05:39 +0000 (14:05 -0700)
author Cindy H Kao <cindy.h.kao@intel.com>
Thu, 8 Apr 2010 03:07:47 +0000 (20:07 -0700)
committer Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
Tue, 11 May 2010 21:05:39 +0000 (14:05 -0700)
diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c

index 1674dba43f83bac68f1a47e71386f9f00a43cc3e..d83fe84407bf80be627731b178cd5682dc3e56d2 100644 (file)
--- a/drivers/net/wimax/i2400m/driver.c
+++ b/drivers/net/wimax/i2400m/driver.c
@@ -395,6 +395,16 @@ retry:
         result = i2400m_dev_initialize(i2400m);
         if (result < 0)
                 goto error_dev_initialize;
+
+       /* We don't want any additional unwanted error recovery triggered
+        * from any other context so if anything went wrong before we come
+        * here, let's keep i2400m->error_recovery untouched and leave it to
+        * dev_reset_handle(). See dev_reset_handle(). */
+
+       atomic_dec(&i2400m->error_recovery);
+       /* Every thing works so far, ok, now we are ready to
+        * take error recovery if it's required. */
+
         /* At this point, reports will come for the device and set it
          * to the right state if it is different than UNINITIALIZED */
         d_fnend(3, dev, "(net_dev %p [i2400m %p]) = %d\n",
@@ -770,6 +780,66 @@ int i2400m_dev_reset_handle(struct i2400m *i2400m, const char *reason)
  EXPORT_SYMBOL_GPL(i2400m_dev_reset_handle);
  
  
+ /*
+ * The actual work of error recovery.
+ *
+ * The current implementation of error recovery is to trigger a bus reset.
+ */
+static
+void __i2400m_error_recovery(struct work_struct *ws)
+{
+       struct i2400m_work *iw = container_of(ws, struct i2400m_work, ws);
+       struct i2400m *i2400m = iw->i2400m;
+
+       i2400m_reset(i2400m, I2400M_RT_BUS);
+
+       i2400m_put(i2400m);
+       kfree(iw);
+       return;
+}
+
+/*
+ * Schedule a work struct for error recovery.
+ *
+ * The intention of error recovery is to bring back the device to some
+ * known state whenever TX sees -110 (-ETIMEOUT) on copying the data to
+ * the device. The TX failure could mean a device bus stuck, so the current
+ * error recovery implementation is to trigger a bus reset to the device
+ * and hopefully it can bring back the device.
+ *
+ * The actual work of error recovery has to be in a thread context because
+ * it is kicked off in the TX thread (i2400ms->tx_workqueue) which is to be
+ * destroyed by the error recovery mechanism (currently a bus reset).
+ *
+ * Also, there may be already a queue of TX works that all hit
+ * the -ETIMEOUT error condition because the device is stuck already.
+ * Since bus reset is used as the error recovery mechanism and we don't
+ * want consecutive bus resets simply because the multiple TX works
+ * in the queue all hit the same device erratum, the flag "error_recovery"
+ * is introduced for preventing unwanted consecutive bus resets.
+ *
+ * Error recovery shall only be invoked again if previous one was completed.
+ * The flag error_recovery is set when error recovery mechanism is scheduled,
+ * and is checked when we need to schedule another error recovery. If it is
+ * in place already, then we shouldn't schedule another one.
+ */
+void i2400m_error_recovery(struct i2400m *i2400m)
+{
+       struct device *dev = i2400m_dev(i2400m);
+
+       if (atomic_add_return(1, &i2400m->error_recovery) == 1) {
+               if (i2400m_schedule_work(i2400m, __i2400m_error_recovery,
+                       GFP_ATOMIC, NULL, 0) < 0) {
+                       dev_err(dev, "run out of memory for "
+                               "scheduling an error recovery ?\n");
+                       atomic_dec(&i2400m->error_recovery);
+               }
+       } else
+               atomic_dec(&i2400m->error_recovery);
+       return;
+}
+EXPORT_SYMBOL_GPL(i2400m_error_recovery);
+
  /*
   * Alloc the command and ack buffers for boot mode
   *
@@ -839,6 +909,10 @@ void i2400m_init(struct i2400m *i2400m)
         atomic_set(&i2400m->bus_reset_retries, 0);
  
         i2400m->alive = 0;
+
+       /* initialize error_recovery to 1 for denoting we
+        * are not yet ready to take any error recovery */
+       atomic_set(&i2400m->error_recovery, 1);
  }
  EXPORT_SYMBOL_GPL(i2400m_init);
  
diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h

index ad8e6a3be1e3cab423c596458e3ed578f7eff963..7a9c2c5b25cb92a9e5b271d0dbcf6758aecaf046 100644 (file)
--- a/drivers/net/wimax/i2400m/i2400m.h
+++ b/drivers/net/wimax/i2400m/i2400m.h
@@ -545,6 +545,15 @@ struct i2400m_barker_db;
   *     all the device reboot events detected can be still handled properly
   *     by either dev_reset_handle() or .pre_reset/.post_reset as long as
   *     the driver presents. It is set 0 along with @updown in dev_stop().
+ *
+ * @error_recovery: flag to denote if we are ready to take an error recovery.
+ *     0 for ready to take an error recovery; 1 for not ready. It is
+ *     initialized to 1 while probe() since we don't tend to take any error
+ *     recovery during probe(). It is decremented by 1 whenever dev_start()
+ *     succeeds to indicate we are ready to take error recovery from now on.
+ *     It is checked every time we wanna schedule an error recovery. If an
+ *     error recovery is already in place (error_recovery was set 1), we
+ *     should not schedule another one until the last one is done.
   */
  struct i2400m {
         struct wimax_dev wimax_dev;     /* FIRST! See doc */
@@ -625,6 +634,10 @@ struct i2400m {
  
         /* if the device is expected to be alive */
         unsigned alive;
+
+       /* 0 if we are ready for error recovery; 1 if not ready  */
+       atomic_t error_recovery;
+
  };
  
  
@@ -847,6 +860,7 @@ void i2400m_put(struct i2400m *i2400m)
  extern int i2400m_dev_reset_handle(struct i2400m *, const char *);
  extern int i2400m_pre_reset(struct i2400m *);
  extern int i2400m_post_reset(struct i2400m *);
+extern void i2400m_error_recovery(struct i2400m *);
  
  /*
   * _setup()/_release() are called by the probe/disconnect functions of
diff --git a/drivers/net/wimax/i2400m/sdio-tx.c b/drivers/net/wimax/i2400m/sdio-tx.c

index 412b6a8eaef2983028360e862102c2bdd0d40237..b53cd1c80e3e92900a85f81eb7cd85b538f78115 100644 (file)
--- a/drivers/net/wimax/i2400m/sdio-tx.c
+++ b/drivers/net/wimax/i2400m/sdio-tx.c
@@ -98,6 +98,10 @@ void i2400ms_tx_submit(struct work_struct *ws)
                                 tx_msg_size, result);
                 }
  
+               if (result == -ETIMEDOUT) {
+                       i2400m_error_recovery(i2400m);
+                       break;
+               }
                 d_printf(2, dev, "TX: %zub submitted\n", tx_msg_size);
         }
author	Cindy H Kao <cindy.h.kao@intel.com>
	Thu, 8 Apr 2010 03:07:47 +0000 (20:07 -0700)
committer	Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
	Tue, 11 May 2010 21:05:39 +0000 (14:05 -0700)
drivers/net/wimax/i2400m/driver.c		patch \| blob \| blame \| history
drivers/net/wimax/i2400m/i2400m.h		patch \| blob \| blame \| history
drivers/net/wimax/i2400m/sdio-tx.c		patch \| blob \| blame \| history