net: ena: bug fix in lost tx packets detection mechanism
authorNetanel Belgazal <netanel@amazon.com>
Sun, 11 Jun 2017 12:42:50 +0000 (15:42 +0300)
committerDavid S. Miller <davem@davemloft.net>
Sun, 11 Jun 2017 20:36:47 +0000 (16:36 -0400)
check_for_missing_tx_completions() is called from a timer
task and looking for lost tx packets.
The old implementation accumulate all the lost tx packets
and did not check if those packets were retrieved on a later stage.
This cause to a situation where the driver reset
the device for no reason.

Fixes: 1738cd3ed342 ("Add a driver for Amazon Elastic Network Adapters (ENA)")
Signed-off-by: Netanel Belgazal <netanel@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/amazon/ena/ena_ethtool.c
drivers/net/ethernet/amazon/ena/ena_netdev.c
drivers/net/ethernet/amazon/ena/ena_netdev.h

index 533b2fbdeef165811117ba01e36cc412e4ce50be..3ee55e2fd69465e12603890bce1b530be551a2d9 100644 (file)
@@ -80,7 +80,6 @@ static const struct ena_stats ena_stats_tx_strings[] = {
        ENA_STAT_TX_ENTRY(tx_poll),
        ENA_STAT_TX_ENTRY(doorbells),
        ENA_STAT_TX_ENTRY(prepare_ctx_err),
-       ENA_STAT_TX_ENTRY(missing_tx_comp),
        ENA_STAT_TX_ENTRY(bad_req_id),
 };
 
index 3c366bfbbab13db6a902f0f775cba94f114501c7..4f16ed38bcf3a267f84894177da1f89713a0eb3e 100644 (file)
@@ -1995,6 +1995,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
        tx_info->tx_descs = nb_hw_desc;
        tx_info->last_jiffies = jiffies;
+       tx_info->print_once = 0;
 
        tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
                tx_ring->ring_size);
@@ -2564,13 +2565,44 @@ err:
                "Reset attempt failed. Can not reset the device\n");
 }
 
-static void check_for_missing_tx_completions(struct ena_adapter *adapter)
+static int check_missing_comp_in_queue(struct ena_adapter *adapter,
+                                      struct ena_ring *tx_ring)
 {
        struct ena_tx_buffer *tx_buf;
        unsigned long last_jiffies;
+       u32 missed_tx = 0;
+       int i;
+
+       for (i = 0; i < tx_ring->ring_size; i++) {
+               tx_buf = &tx_ring->tx_buffer_info[i];
+               last_jiffies = tx_buf->last_jiffies;
+               if (unlikely(last_jiffies &&
+                            time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
+                       if (!tx_buf->print_once)
+                               netif_notice(adapter, tx_err, adapter->netdev,
+                                            "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
+                                            tx_ring->qid, i);
+
+                       tx_buf->print_once = 1;
+                       missed_tx++;
+
+                       if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
+                               netif_err(adapter, tx_err, adapter->netdev,
+                                         "The number of lost tx completions is above the threshold (%d > %d). Reset the device\n",
+                                         missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
+                               set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+                               return -EIO;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+static void check_for_missing_tx_completions(struct ena_adapter *adapter)
+{
        struct ena_ring *tx_ring;
-       int i, j, budget;
-       u32 missed_tx;
+       int i, budget, rc;
 
        /* Make sure the driver doesn't turn the device in other process */
        smp_rmb();
@@ -2586,31 +2618,9 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
        for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
                tx_ring = &adapter->tx_ring[i];
 
-               for (j = 0; j < tx_ring->ring_size; j++) {
-                       tx_buf = &tx_ring->tx_buffer_info[j];
-                       last_jiffies = tx_buf->last_jiffies;
-                       if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
-                               netif_notice(adapter, tx_err, adapter->netdev,
-                                            "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
-                                            tx_ring->qid, j);
-
-                               u64_stats_update_begin(&tx_ring->syncp);
-                               missed_tx = tx_ring->tx_stats.missing_tx_comp++;
-                               u64_stats_update_end(&tx_ring->syncp);
-
-                               /* Clear last jiffies so the lost buffer won't
-                                * be counted twice.
-                                */
-                               tx_buf->last_jiffies = 0;
-
-                               if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
-                                       netif_err(adapter, tx_err, adapter->netdev,
-                                                 "The number of lost tx completion is above the threshold (%d > %d). Reset the device\n",
-                                                 missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
-                                       set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
-                               }
-                       }
-               }
+               rc = check_missing_comp_in_queue(adapter, tx_ring);
+               if (unlikely(rc))
+                       return;
 
                budget--;
                if (!budget)
index 8828f1d6dd22bac9cdc8efe8a6a4d3ec11e5a071..88b5e56123381e0d0af17ce723f9e7d993c50cd9 100644 (file)
@@ -146,7 +146,18 @@ struct ena_tx_buffer {
        u32 tx_descs;
        /* num of buffers used by this skb */
        u32 num_of_bufs;
-       /* Save the last jiffies to detect missing tx packets */
+
+       /* Used for detect missing tx packets to limit the number of prints */
+       u32 print_once;
+       /* Save the last jiffies to detect missing tx packets
+        *
+        * sets to non zero value on ena_start_xmit and set to zero on
+        * napi and timer_Service_routine.
+        *
+        * while this value is not protected by lock,
+        * a given packet is not expected to be handled by ena_start_xmit
+        * and by napi/timer_service at the same time.
+        */
        unsigned long last_jiffies;
        struct ena_com_buf bufs[ENA_PKT_MAX_BUFS];
 } ____cacheline_aligned;
@@ -170,7 +181,6 @@ struct ena_stats_tx {
        u64 napi_comp;
        u64 tx_poll;
        u64 doorbells;
-       u64 missing_tx_comp;
        u64 bad_req_id;
 };