forcedeth: optimized routines
author Ayaz Abdulla <aabdulla@nvidia.com>
Sun, 21 Jan 2007 23:10:37 +0000 (18:10 -0500)
committer Jeff Garzik <jeff@garzik.org>
Mon, 5 Feb 2007 21:58:48 +0000 (16:58 -0500)
This patch breaks up the routines into two versions, one for legacy
descriptor versions (ver 1 and ver 2) and one for desc ver 3. This will
make the new desc functions leaner, and further reductions will be
made in the next few patches.

Signed-Off-By: Ayaz Abdulla <aabdulla@nvidia.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
drivers/net/forcedeth.c

index 0fc07862509077ed71f9dda50be4fc0b8891ac3e..f28ae12d8569cbd17563d52e0374256650594343 100644 (file)
@@ -1307,50 +1307,57 @@ static struct net_device_stats *nv_get_stats(struct net_device *dev)
 static int nv_alloc_rx(struct net_device *dev)
 {
        struct fe_priv *np = netdev_priv(dev);
-       union ring_type less_rx;
+       struct ring_desc* less_rx;
 
-       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-               less_rx.orig = np->get_rx.orig;
-               if (less_rx.orig-- == np->first_rx.orig)
-                       less_rx.orig = np->last_rx.orig;
-       } else {
-               less_rx.ex = np->get_rx.ex;
-               if (less_rx.ex-- == np->first_rx.ex)
-                       less_rx.ex = np->last_rx.ex;
-       }
+       less_rx = np->get_rx.orig;
+       if (less_rx-- == np->first_rx.orig)
+               less_rx = np->last_rx.orig;
 
-       while (1) {
-               struct sk_buff *skb;
-
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       if (np->put_rx.orig == less_rx.orig)
-                               break;
+       while (np->put_rx.orig != less_rx) {
+               struct sk_buff *skb = dev_alloc_skb(np->rx_buf_sz + NV_RX_ALLOC_PAD);
+               if (skb) {
+                       skb->dev = dev;
+                       np->put_rx_ctx->skb = skb;
+                       np->put_rx_ctx->dma = pci_map_single(np->pci_dev, skb->data,
+                                                            skb->end-skb->data, PCI_DMA_FROMDEVICE);
+                       np->put_rx_ctx->dma_len = skb->end-skb->data;
+                       np->put_rx.orig->buf = cpu_to_le32(np->put_rx_ctx->dma);
+                       wmb();
+                       np->put_rx.orig->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX_AVAIL);
+                       if (np->put_rx.orig++ == np->last_rx.orig)
+                               np->put_rx.orig = np->first_rx.orig;
+                       if (np->put_rx_ctx++ == np->last_rx_ctx)
+                               np->put_rx_ctx = np->first_rx_ctx;
                } else {
-                       if (np->put_rx.ex == less_rx.ex)
-                               break;
+                       return 1;
                }
+       }
+       return 0;
+}
+
+static int nv_alloc_rx_optimized(struct net_device *dev)
+{
+       struct fe_priv *np = netdev_priv(dev);
+       struct ring_desc_ex* less_rx;
+
+       less_rx = np->get_rx.ex;
+       if (less_rx-- == np->first_rx.ex)
+               less_rx = np->last_rx.ex;
 
-               skb = dev_alloc_skb(np->rx_buf_sz + NV_RX_ALLOC_PAD);
+       while (np->put_rx.ex != less_rx) {
+               struct sk_buff *skb = dev_alloc_skb(np->rx_buf_sz + NV_RX_ALLOC_PAD);
                if (skb) {
                        skb->dev = dev;
                        np->put_rx_ctx->skb = skb;
                        np->put_rx_ctx->dma = pci_map_single(np->pci_dev, skb->data,
                                                             skb->end-skb->data, PCI_DMA_FROMDEVICE);
                        np->put_rx_ctx->dma_len = skb->end-skb->data;
-                       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                               np->put_rx.orig->buf = cpu_to_le32(np->put_rx_ctx->dma);
-                               wmb();
-                               np->put_rx.orig->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX_AVAIL);
-                               if (np->put_rx.orig++ == np->last_rx.orig)
-                                       np->put_rx.orig = np->first_rx.orig;
-                       } else {
-                               np->put_rx.ex->bufhigh = cpu_to_le64(np->put_rx_ctx->dma) >> 32;
-                               np->put_rx.ex->buflow = cpu_to_le64(np->put_rx_ctx->dma) & 0x0FFFFFFFF;
-                               wmb();
-                               np->put_rx.ex->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX2_AVAIL);
-                               if (np->put_rx.ex++ == np->last_rx.ex)
-                                       np->put_rx.ex = np->first_rx.ex;
-                       }
+                       np->put_rx.ex->bufhigh = cpu_to_le64(np->put_rx_ctx->dma) >> 32;
+                       np->put_rx.ex->buflow = cpu_to_le64(np->put_rx_ctx->dma) & 0x0FFFFFFFF;
+                       wmb();
+                       np->put_rx.ex->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX2_AVAIL);
+                       if (np->put_rx.ex++ == np->last_rx.ex)
+                               np->put_rx.ex = np->first_rx.ex;
                        if (np->put_rx_ctx++ == np->last_rx_ctx)
                                np->put_rx_ctx = np->first_rx_ctx;
                } else {
@@ -1374,6 +1381,7 @@ static void nv_do_rx_refill(unsigned long data)
 {
        struct net_device *dev = (struct net_device *) data;
        struct fe_priv *np = netdev_priv(dev);
+       int retcode;
 
        if (!using_multi_irqs(dev)) {
                if (np->msi_flags & NV_MSI_X_ENABLED)
@@ -1383,7 +1391,11 @@ static void nv_do_rx_refill(unsigned long data)
        } else {
                disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector);
        }
-       if (nv_alloc_rx(dev)) {
+       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+               retcode = nv_alloc_rx(dev);
+       else
+               retcode = nv_alloc_rx_optimized(dev);
+       if (retcode) {
                spin_lock_irq(&np->lock);
                if (!np->in_shutdown)
                        mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
@@ -1456,9 +1468,14 @@ static void nv_init_tx(struct net_device *dev)
 
 static int nv_init_ring(struct net_device *dev)
 {
+       struct fe_priv *np = netdev_priv(dev);
+
        nv_init_tx(dev);
        nv_init_rx(dev);
-       return nv_alloc_rx(dev);
+       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+               return nv_alloc_rx(dev);
+       else
+               return nv_alloc_rx_optimized(dev);
 }
 
 static int nv_release_txskb(struct net_device *dev, struct nv_skb_map* tx_skb)
@@ -1554,9 +1571,9 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
        u32 entries = (size >> NV_TX2_TSO_MAX_SHIFT) + ((size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
        u32 empty_slots;
        u32 tx_flags_vlan = 0;
-       union ring_type put_tx;
-       union ring_type start_tx;
-       union ring_type prev_tx;
+       struct ring_desc* put_tx;
+       struct ring_desc* start_tx;
+       struct ring_desc* prev_tx;
        struct nv_skb_map* prev_tx_ctx;
 
        /* add fragments to entries count */
@@ -1573,10 +1590,7 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
                return NETDEV_TX_BUSY;
        }
 
-       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
-               start_tx.orig = put_tx.orig = np->put_tx.orig;
-       else
-               start_tx.ex = put_tx.ex = np->put_tx.ex;
+       start_tx = put_tx = np->put_tx.orig;
 
        /* setup the header buffer */
        do {
@@ -1586,24 +1600,13 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
                np->put_tx_ctx->dma = pci_map_single(np->pci_dev, skb->data + offset, bcnt,
                                                PCI_DMA_TODEVICE);
                np->put_tx_ctx->dma_len = bcnt;
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       put_tx.orig->buf = cpu_to_le32(np->put_tx_ctx->dma);
-                       put_tx.orig->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
-               } else {
-                       put_tx.ex->bufhigh = cpu_to_le64(np->put_tx_ctx->dma) >> 32;
-                       put_tx.ex->buflow = cpu_to_le64(np->put_tx_ctx->dma) & 0x0FFFFFFFF;
-                       put_tx.ex->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
-               }
+               put_tx->buf = cpu_to_le32(np->put_tx_ctx->dma);
+               put_tx->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
                tx_flags = np->tx_flags;
                offset += bcnt;
                size -= bcnt;
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       if (put_tx.orig++ == np->last_tx.orig)
-                               put_tx.orig = np->first_tx.orig;
-               } else {
-                       if (put_tx.ex++ == np->last_tx.ex)
-                               put_tx.ex = np->first_tx.ex;
-               }
+               if (put_tx++ == np->last_tx.orig)
+                       put_tx = np->first_tx.orig;
                if (np->put_tx_ctx++ == np->last_tx_ctx)
                        np->put_tx_ctx = np->first_tx_ctx;
        } while (size);
@@ -1622,33 +1625,19 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
                                                           PCI_DMA_TODEVICE);
                        np->put_tx_ctx->dma_len = bcnt;
 
-                       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                               put_tx.orig->buf = cpu_to_le32(np->put_tx_ctx->dma);
-                               put_tx.orig->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
-                       } else {
-                               put_tx.ex->bufhigh = cpu_to_le64(np->put_tx_ctx->dma) >> 32;
-                               put_tx.ex->buflow = cpu_to_le64(np->put_tx_ctx->dma) & 0x0FFFFFFFF;
-                               put_tx.ex->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
-                       }
+                       put_tx->buf = cpu_to_le32(np->put_tx_ctx->dma);
+                       put_tx->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
                        offset += bcnt;
                        size -= bcnt;
-                       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                               if (put_tx.orig++ == np->last_tx.orig)
-                                       put_tx.orig = np->first_tx.orig;
-                       } else {
-                               if (put_tx.ex++ == np->last_tx.ex)
-                                       put_tx.ex = np->first_tx.ex;
-                       }
+                       if (put_tx++ == np->last_tx.orig)
+                               put_tx = np->first_tx.orig;
                        if (np->put_tx_ctx++ == np->last_tx_ctx)
                                np->put_tx_ctx = np->first_tx_ctx;
                } while (size);
        }
 
        /* set last fragment flag  */
-       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
-               prev_tx.orig->flaglen |= cpu_to_le32(tx_flags_extra);
-       else
-               prev_tx.ex->flaglen |= cpu_to_le32(tx_flags_extra);
+       prev_tx->flaglen |= cpu_to_le32(tx_flags_extra);
 
        /* save skb in this slot's context area */
        prev_tx_ctx->skb = skb;
@@ -1667,14 +1656,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
        spin_lock_irq(&np->lock);
 
        /* set tx flags */
-       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-               start_tx.orig->flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
-               np->put_tx.orig = put_tx.orig;
-       } else {
-               start_tx.ex->txvlan = cpu_to_le32(tx_flags_vlan);
-               start_tx.ex->flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
-               np->put_tx.ex = put_tx.ex;
-       }
+       start_tx->flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
+       np->put_tx.orig = put_tx;
 
        spin_unlock_irq(&np->lock);
 
@@ -1696,6 +1679,130 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
        return NETDEV_TX_OK;
 }
 
+static int nv_start_xmit_optimized(struct sk_buff *skb, struct net_device *dev)
+{
+       struct fe_priv *np = netdev_priv(dev);
+       u32 tx_flags = 0;
+       u32 tx_flags_extra = NV_TX2_LASTPACKET;
+       unsigned int fragments = skb_shinfo(skb)->nr_frags;
+       unsigned int i;
+       u32 offset = 0;
+       u32 bcnt;
+       u32 size = skb->len-skb->data_len;
+       u32 entries = (size >> NV_TX2_TSO_MAX_SHIFT) + ((size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
+       u32 empty_slots;
+       u32 tx_flags_vlan = 0;
+       struct ring_desc_ex* put_tx;
+       struct ring_desc_ex* start_tx;
+       struct ring_desc_ex* prev_tx;
+       struct nv_skb_map* prev_tx_ctx;
+
+       /* add fragments to entries count */
+       for (i = 0; i < fragments; i++) {
+               entries += (skb_shinfo(skb)->frags[i].size >> NV_TX2_TSO_MAX_SHIFT) +
+                          ((skb_shinfo(skb)->frags[i].size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
+       }
+
+       empty_slots = nv_get_empty_tx_slots(np);
+       if ((empty_slots - np->tx_limit_stop) <= entries) {
+               spin_lock_irq(&np->lock);
+               netif_stop_queue(dev);
+               spin_unlock_irq(&np->lock);
+               return NETDEV_TX_BUSY;
+       }
+
+       start_tx = put_tx = np->put_tx.ex;
+
+       /* setup the header buffer */
+       do {
+               prev_tx = put_tx;
+               prev_tx_ctx = np->put_tx_ctx;
+               bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
+               np->put_tx_ctx->dma = pci_map_single(np->pci_dev, skb->data + offset, bcnt,
+                                               PCI_DMA_TODEVICE);
+               np->put_tx_ctx->dma_len = bcnt;
+               put_tx->bufhigh = cpu_to_le64(np->put_tx_ctx->dma) >> 32;
+               put_tx->buflow = cpu_to_le64(np->put_tx_ctx->dma) & 0x0FFFFFFFF;
+               put_tx->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
+               tx_flags = np->tx_flags;
+               offset += bcnt;
+               size -= bcnt;
+               if (put_tx++ == np->last_tx.ex)
+                       put_tx = np->first_tx.ex;
+               if (np->put_tx_ctx++ == np->last_tx_ctx)
+                       np->put_tx_ctx = np->first_tx_ctx;
+       } while (size);
+
+       /* setup the fragments */
+       for (i = 0; i < fragments; i++) {
+               skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+               u32 size = frag->size;
+               offset = 0;
+
+               do {
+                       prev_tx = put_tx;
+                       prev_tx_ctx = np->put_tx_ctx;
+                       bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
+                       np->put_tx_ctx->dma = pci_map_page(np->pci_dev, frag->page, frag->page_offset+offset, bcnt,
+                                                          PCI_DMA_TODEVICE);
+                       np->put_tx_ctx->dma_len = bcnt;
+
+                       put_tx->bufhigh = cpu_to_le64(np->put_tx_ctx->dma) >> 32;
+                       put_tx->buflow = cpu_to_le64(np->put_tx_ctx->dma) & 0x0FFFFFFFF;
+                       put_tx->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
+                       offset += bcnt;
+                       size -= bcnt;
+                       if (put_tx++ == np->last_tx.ex)
+                               put_tx = np->first_tx.ex;
+                       if (np->put_tx_ctx++ == np->last_tx_ctx)
+                               np->put_tx_ctx = np->first_tx_ctx;
+               } while (size);
+       }
+
+       /* set last fragment flag  */
+       prev_tx->flaglen |= cpu_to_le32(tx_flags_extra);
+
+       /* save skb in this slot's context area */
+       prev_tx_ctx->skb = skb;
+
+       if (skb_is_gso(skb))
+               tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
+       else
+               tx_flags_extra = skb->ip_summed == CHECKSUM_PARTIAL ?
+                        NV_TX2_CHECKSUM_L3 | NV_TX2_CHECKSUM_L4 : 0;
+
+       /* vlan tag */
+       if (np->vlangrp && vlan_tx_tag_present(skb)) {
+               tx_flags_vlan = NV_TX3_VLAN_TAG_PRESENT | vlan_tx_tag_get(skb);
+       }
+
+       spin_lock_irq(&np->lock);
+
+       /* set tx flags */
+       start_tx->txvlan = cpu_to_le32(tx_flags_vlan);
+       start_tx->flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
+       np->put_tx.ex = put_tx;
+
+       spin_unlock_irq(&np->lock);
+
+       dprintk(KERN_DEBUG "%s: nv_start_xmit_optimized: entries %d queued for transmission. tx_flags_extra: %x\n",
+               dev->name, entries, tx_flags_extra);
+       {
+               int j;
+               for (j=0; j<64; j++) {
+                       if ((j%16) == 0)
+                               dprintk("\n%03x:", j);
+                       dprintk(" %02x", ((unsigned char*)skb->data)[j]);
+               }
+               dprintk("\n");
+       }
+
+       dev->trans_start = jiffies;
+       writel(NVREG_TXRXCTL_KICK|np->txrxctl_bits, get_hwbase(dev) + NvRegTxRxControl);
+       pci_push(get_hwbase(dev));
+       return NETDEV_TX_OK;
+}
+
 /*
  * nv_tx_done: check for completed packets, release the skbs.
  *
@@ -1707,16 +1814,8 @@ static void nv_tx_done(struct net_device *dev)
        u32 flags;
        struct sk_buff *skb;
 
-       while (1) {
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       if (np->get_tx.orig == np->put_tx.orig)
-                               break;
-                       flags = le32_to_cpu(np->get_tx.orig->flaglen);
-               } else {
-                       if (np->get_tx.ex == np->put_tx.ex)
-                               break;
-                       flags = le32_to_cpu(np->get_tx.ex->flaglen);
-               }
+       while (np->get_tx.orig != np->put_tx.orig) {
+               flags = le32_to_cpu(np->get_tx.orig->flaglen);
 
                dprintk(KERN_DEBUG "%s: nv_tx_done: flags 0x%x.\n",
                                        dev->name, flags);
@@ -1754,13 +1853,45 @@ static void nv_tx_done(struct net_device *dev)
                        }
                }
                nv_release_txskb(dev, np->get_tx_ctx);
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       if (np->get_tx.orig++ == np->last_tx.orig)
-                               np->get_tx.orig = np->first_tx.orig;
-               } else {
-                       if (np->get_tx.ex++ == np->last_tx.ex)
-                               np->get_tx.ex = np->first_tx.ex;
+               if (np->get_tx.orig++ == np->last_tx.orig)
+                       np->get_tx.orig = np->first_tx.orig;
+               if (np->get_tx_ctx++ == np->last_tx_ctx)
+                       np->get_tx_ctx = np->first_tx_ctx;
+       }
+       if (nv_get_empty_tx_slots(np) > np->tx_limit_start)
+               netif_wake_queue(dev);
+}
+
+static void nv_tx_done_optimized(struct net_device *dev)
+{
+       struct fe_priv *np = netdev_priv(dev);
+       u32 flags;
+       struct sk_buff *skb;
+       /* walk get_tx.ex up to put_tx.ex; stop early at a descriptor still flagged NV_TX_VALID */
+       while (np->get_tx.ex != np->put_tx.ex) {
+               flags = le32_to_cpu(np->get_tx.ex->flaglen);
+
+               dprintk(KERN_DEBUG "%s: nv_tx_done_optimized: flags 0x%x.\n",
+                                       dev->name, flags);
+               if (flags & NV_TX_VALID)
+                       break;
+               if (flags & NV_TX2_LASTPACKET) {
+                       skb = np->get_tx_ctx->skb;
+                       if (flags & (NV_TX2_RETRYERROR|NV_TX2_CARRIERLOST|NV_TX2_LATECOLLISION|
+                                    NV_TX2_UNDERFLOW|NV_TX2_ERROR)) {
+                               if (flags & NV_TX2_UNDERFLOW)
+                                       np->stats.tx_fifo_errors++;
+                               if (flags & NV_TX2_CARRIERLOST)
+                                       np->stats.tx_carrier_errors++;
+                               np->stats.tx_errors++;
+                       } else {
+                               np->stats.tx_packets++;
+                               np->stats.tx_bytes += skb->len;
+                       }
                }
+               nv_release_txskb(dev, np->get_tx_ctx);
+               if (np->get_tx.ex++ == np->last_tx.ex)
+                       np->get_tx.ex = np->first_tx.ex;
                if (np->get_tx_ctx++ == np->last_tx_ctx)
                        np->get_tx_ctx = np->first_tx_ctx;
        }
@@ -1837,7 +1968,10 @@ static void nv_tx_timeout(struct net_device *dev)
        nv_stop_tx(dev);
 
        /* 2) check that the packets were not sent already: */
-       nv_tx_done(dev);
+       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+               nv_tx_done(dev);
+       else
+               nv_tx_done_optimized(dev);
 
        /* 3) if there are dead entries: clear everything */
        if (np->get_tx_ctx != np->put_tx_ctx) {
@@ -1913,22 +2047,14 @@ static int nv_rx_process(struct net_device *dev, int limit)
        u32 vlanflags = 0;
        int count;
 
-       for (count = 0; count < limit; ++count) {
+       for (count = 0; count < limit; ++count) {
                struct sk_buff *skb;
                int len;
 
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       if (np->get_rx.orig == np->put_rx.orig)
-                               break;  /* we scanned the whole ring - do not continue */
-                       flags = le32_to_cpu(np->get_rx.orig->flaglen);
-                       len = nv_descr_getlength(np->get_rx.orig, np->desc_ver);
-               } else {
-                       if (np->get_rx.ex == np->put_rx.ex)
-                               break;  /* we scanned the whole ring - do not continue */
-                       flags = le32_to_cpu(np->get_rx.ex->flaglen);
-                       len = nv_descr_getlength_ex(np->get_rx.ex, np->desc_ver);
-                       vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
-               }
+               if (np->get_rx.orig == np->put_rx.orig)
+                       break;  /* we scanned the whole ring - do not continue */
+               flags = le32_to_cpu(np->get_rx.orig->flaglen);
+               len = nv_descr_getlength(np->get_rx.orig, np->desc_ver);
 
                dprintk(KERN_DEBUG "%s: nv_rx_process: flags 0x%x.\n",
                                        dev->name, flags);
@@ -2076,13 +2202,133 @@ static int nv_rx_process(struct net_device *dev, int limit)
                np->stats.rx_packets++;
                np->stats.rx_bytes += len;
 next_pkt:
-               if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-                       if (np->get_rx.orig++ == np->last_rx.orig)
-                               np->get_rx.orig = np->first_rx.orig;
-               } else {
-                       if (np->get_rx.ex++ == np->last_rx.ex)
-                               np->get_rx.ex = np->first_rx.ex;
+               if (np->get_rx.orig++ == np->last_rx.orig)
+                       np->get_rx.orig = np->first_rx.orig;
+               if (np->get_rx_ctx++ == np->last_rx_ctx)
+                       np->get_rx_ctx = np->first_rx_ctx;
+       }
+
+       return count;
+}
+
+static int nv_rx_process_optimized(struct net_device *dev, int limit)
+{
+       struct fe_priv *np = netdev_priv(dev);
+       u32 flags;
+       u32 vlanflags = 0;
+       int count;
+
+       for (count = 0; count < limit; ++count) {
+               struct sk_buff *skb;
+               int len;
+
+               if (np->get_rx.ex == np->put_rx.ex)
+                       break;  /* we scanned the whole ring - do not continue */
+               flags = le32_to_cpu(np->get_rx.ex->flaglen);
+               len = nv_descr_getlength_ex(np->get_rx.ex, np->desc_ver);
+               vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
+
+               dprintk(KERN_DEBUG "%s: nv_rx_process_optimized: flags 0x%x.\n",
+                                       dev->name, flags);
+
+               if (flags & NV_RX_AVAIL)
+                       break;  /* still owned by hardware, */
+
+               /*
+                * the packet is for us - immediately tear down the pci mapping.
+                * TODO: check if a prefetch of the first cacheline improves
+                * the performance.
+                */
+               pci_unmap_single(np->pci_dev, np->get_rx_ctx->dma,
+                               np->get_rx_ctx->dma_len,
+                               PCI_DMA_FROMDEVICE);
+               skb = np->get_rx_ctx->skb;
+               np->get_rx_ctx->skb = NULL;
+
+               {
+                       int j;
+                       dprintk(KERN_DEBUG "Dumping packet (flags 0x%x).",flags);
+                       for (j=0; j<64; j++) {
+                               if ((j%16) == 0)
+                                       dprintk("\n%03x:", j);
+                               dprintk(" %02x", ((unsigned char*)skb->data)[j]);
+                       }
+                       dprintk("\n");
                }
+               /* look at what we actually got: */
+               if (!(flags & NV_RX2_DESCRIPTORVALID)) {
+                       dev_kfree_skb(skb);
+                       goto next_pkt;
+               }
+
+               if (flags & NV_RX2_ERROR) {
+                       if (flags & (NV_RX2_ERROR1|NV_RX2_ERROR2|NV_RX2_ERROR3)) {
+                               np->stats.rx_errors++;
+                               dev_kfree_skb(skb);
+                               goto next_pkt;
+                       }
+                       if (flags & NV_RX2_CRCERR) {
+                               np->stats.rx_crc_errors++;
+                               np->stats.rx_errors++;
+                               dev_kfree_skb(skb);
+                               goto next_pkt;
+                       }
+                       if (flags & NV_RX2_OVERFLOW) {
+                               np->stats.rx_over_errors++;
+                               np->stats.rx_errors++;
+                               dev_kfree_skb(skb);
+                               goto next_pkt;
+                       }
+                       if (flags & NV_RX2_ERROR4) {
+                               len = nv_getlen(dev, skb->data, len);
+                               if (len < 0) {
+                                       np->stats.rx_errors++;
+                                       dev_kfree_skb(skb);
+                                       goto next_pkt;
+                               }
+                       }
+                       /* framing errors are soft errors */
+                       if (flags & NV_RX2_FRAMINGERR) {
+                               if (flags & NV_RX2_SUBSTRACT1) {
+                                       len--;
+                               }
+                       }
+               }
+               if (np->rx_csum) {
+                       flags &= NV_RX2_CHECKSUMMASK;
+                       if (flags == NV_RX2_CHECKSUMOK1 ||
+                           flags == NV_RX2_CHECKSUMOK2 ||
+                           flags == NV_RX2_CHECKSUMOK3) {
+                               dprintk(KERN_DEBUG "%s: hw checksum hit!.\n", dev->name);
+                               skb->ip_summed = CHECKSUM_UNNECESSARY;
+                       } else {
+                               dprintk(KERN_DEBUG "%s: hwchecksum miss!.\n", dev->name);
+                       }
+               }
+               /* got a valid packet - forward it to the network core */
+               skb_put(skb, len);
+               skb->protocol = eth_type_trans(skb, dev);
+               dprintk(KERN_DEBUG "%s: nv_rx_process: %d bytes, proto %d accepted.\n",
+                                       dev->name, len, skb->protocol);
+#ifdef CONFIG_FORCEDETH_NAPI
+               if (np->vlangrp && (vlanflags & NV_RX3_VLAN_TAG_PRESENT))
+                       vlan_hwaccel_receive_skb(skb, np->vlangrp,
+                                                vlanflags & NV_RX3_VLAN_TAG_MASK);
+               else
+                       netif_receive_skb(skb);
+#else
+               if (np->vlangrp && (vlanflags & NV_RX3_VLAN_TAG_PRESENT))
+                       vlan_hwaccel_rx(skb, np->vlangrp,
+                                       vlanflags & NV_RX3_VLAN_TAG_MASK);
+               else
+                       netif_rx(skb);
+#endif
+               dev->last_rx = jiffies;
+               np->stats.rx_packets++;
+               np->stats.rx_bytes += len;
+next_pkt:
+               if (np->get_rx.ex++ == np->last_rx.ex)
+                       np->get_rx.ex = np->first_rx.ex;
                if (np->get_rx_ctx++ == np->last_rx_ctx)
                        np->get_rx_ctx = np->first_rx_ctx;
        }
@@ -2655,6 +2901,117 @@ static irqreturn_t nv_nic_irq(int foo, void *data)
        return IRQ_RETVAL(i);
 }
 
+static irqreturn_t nv_nic_irq_optimized(int foo, void *data)
+{
+       struct net_device *dev = (struct net_device *) data;
+       struct fe_priv *np = netdev_priv(dev);
+       u8 __iomem *base = get_hwbase(dev);
+       u32 events;
+       int i;
+
+       dprintk(KERN_DEBUG "%s: nv_nic_irq_optimized\n", dev->name);
+
+       for (i=0; ; i++) {
+               if (!(np->msi_flags & NV_MSI_X_ENABLED)) {
+                       events = readl(base + NvRegIrqStatus) & NVREG_IRQSTAT_MASK;
+                       writel(NVREG_IRQSTAT_MASK, base + NvRegIrqStatus);
+               } else {
+                       events = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK;
+                       writel(NVREG_IRQSTAT_MASK, base + NvRegMSIXIrqStatus);
+               }
+               pci_push(base);
+               dprintk(KERN_DEBUG "%s: irq: %08x\n", dev->name, events);
+               if (!(events & np->irqmask))
+                       break;
+
+               spin_lock(&np->lock);
+               nv_tx_done_optimized(dev);
+               spin_unlock(&np->lock);
+
+               if (events & NVREG_IRQ_LINK) {
+                       spin_lock(&np->lock);
+                       nv_link_irq(dev);
+                       spin_unlock(&np->lock);
+               }
+               if (np->need_linktimer && time_after(jiffies, np->link_timeout)) {
+                       spin_lock(&np->lock);
+                       nv_linkchange(dev);
+                       spin_unlock(&np->lock);
+                       np->link_timeout = jiffies + LINK_TIMEOUT;
+               }
+               if (events & (NVREG_IRQ_TX_ERR)) {
+                       dprintk(KERN_DEBUG "%s: received irq with events 0x%x. Probably TX fail.\n",
+                                               dev->name, events);
+               }
+               if (events & (NVREG_IRQ_UNKNOWN)) {
+                       printk(KERN_DEBUG "%s: received irq with unknown events 0x%x. Please report\n",
+                                               dev->name, events);
+               }
+               if (unlikely(events & NVREG_IRQ_RECOVER_ERROR)) {
+                       spin_lock(&np->lock);
+                       /* disable interrupts on the nic */
+                       if (!(np->msi_flags & NV_MSI_X_ENABLED))
+                               writel(0, base + NvRegIrqMask);
+                       else
+                               writel(np->irqmask, base + NvRegIrqMask);
+                       pci_push(base);
+
+                       if (!np->in_shutdown) {
+                               np->nic_poll_irq = np->irqmask;
+                               np->recover_error = 1;
+                               mod_timer(&np->nic_poll, jiffies + POLL_WAIT);
+                       }
+                       spin_unlock(&np->lock);
+                       break;
+               }
+
+#ifdef CONFIG_FORCEDETH_NAPI
+               if (events & NVREG_IRQ_RX_ALL) {
+                       netif_rx_schedule(dev);
+
+                       /* Disable furthur receive irq's */
+                       spin_lock(&np->lock);
+                       np->irqmask &= ~NVREG_IRQ_RX_ALL;
+
+                       if (np->msi_flags & NV_MSI_X_ENABLED)
+                               writel(NVREG_IRQ_RX_ALL, base + NvRegIrqMask);
+                       else
+                               writel(np->irqmask, base + NvRegIrqMask);
+                       spin_unlock(&np->lock);
+               }
+#else
+               nv_rx_process_optimized(dev, dev->weight);
+               if (nv_alloc_rx_optimized(dev)) {
+                       spin_lock(&np->lock);
+                       if (!np->in_shutdown)
+                               mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
+                       spin_unlock(&np->lock);
+               }
+#endif
+               if (i > max_interrupt_work) {
+                       spin_lock(&np->lock);
+                       /* disable interrupts on the nic */
+                       if (!(np->msi_flags & NV_MSI_X_ENABLED))
+                               writel(0, base + NvRegIrqMask);
+                       else
+                               writel(np->irqmask, base + NvRegIrqMask);
+                       pci_push(base);
+
+                       if (!np->in_shutdown) {
+                               np->nic_poll_irq = np->irqmask;
+                               mod_timer(&np->nic_poll, jiffies + POLL_WAIT);
+                       }
+                       printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq.\n", dev->name, i);
+                       spin_unlock(&np->lock);
+                       break;
+               }
+
+       }
+       dprintk(KERN_DEBUG "%s: nv_nic_irq_optimized completed\n", dev->name);
+
+       return IRQ_RETVAL(i);
+}
+
 static irqreturn_t nv_nic_irq_tx(int foo, void *data)
 {
        struct net_device *dev = (struct net_device *) data;
@@ -2675,7 +3032,7 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data)
                        break;
 
                spin_lock_irqsave(&np->lock, flags);
-               nv_tx_done(dev);
+               nv_tx_done_optimized(dev);
                spin_unlock_irqrestore(&np->lock, flags);
 
                if (events & (NVREG_IRQ_TX_ERR)) {
@@ -2711,7 +3068,10 @@ static int nv_napi_poll(struct net_device *dev, int *budget)
        u8 __iomem *base = get_hwbase(dev);
        unsigned long flags;
 
-       pkts = nv_rx_process(dev, limit);
+       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+               pkts = nv_rx_process(dev, limit);
+       else
+               pkts = nv_rx_process_optimized(dev, limit);
 
        if (nv_alloc_rx(dev)) {
                spin_lock_irqsave(&np->lock, flags);
@@ -2782,8 +3142,8 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data)
                if (!(events & np->irqmask))
                        break;
 
-               nv_rx_process(dev, dev->weight);
-               if (nv_alloc_rx(dev)) {
+               nv_rx_process_optimized(dev, dev->weight);
+               if (nv_alloc_rx_optimized(dev)) {
                        spin_lock_irqsave(&np->lock, flags);
                        if (!np->in_shutdown)
                                mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
@@ -2942,6 +3302,16 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
        u8 __iomem *base = get_hwbase(dev);
        int ret = 1;
        int i;
+       irqreturn_t (*handler)(int foo, void *data);
+
+       if (intr_test) {
+               handler = nv_nic_irq_test;
+       } else {
+               if (np->desc_ver == DESC_VER_3)
+                       handler = nv_nic_irq_optimized;
+               else
+                       handler = nv_nic_irq;
+       }
 
        if (np->msi_flags & NV_MSI_X_CAPABLE) {
                for (i = 0; i < (np->msi_flags & NV_MSI_X_VECTORS_MASK); i++) {
@@ -2979,10 +3349,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
                                set_msix_vector_map(dev, NV_MSI_X_VECTOR_OTHER, NVREG_IRQ_OTHER);
                        } else {
                                /* Request irq for all interrupts */
-                               if ((!intr_test &&
-                                    request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector, &nv_nic_irq, IRQF_SHARED, dev->name, dev) != 0) ||
-                                   (intr_test &&
-                                    request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector, &nv_nic_irq_test, IRQF_SHARED, dev->name, dev) != 0)) {
+                               if (request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector, handler, IRQF_SHARED, dev->name, dev) != 0) {
                                        printk(KERN_INFO "forcedeth: request_irq failed %d\n", ret);
                                        pci_disable_msix(np->pci_dev);
                                        np->msi_flags &= ~NV_MSI_X_ENABLED;
@@ -2998,8 +3365,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
        if (ret != 0 && np->msi_flags & NV_MSI_CAPABLE) {
                if ((ret = pci_enable_msi(np->pci_dev)) == 0) {
                        np->msi_flags |= NV_MSI_ENABLED;
-                       if ((!intr_test && request_irq(np->pci_dev->irq, &nv_nic_irq, IRQF_SHARED, dev->name, dev) != 0) ||
-                           (intr_test && request_irq(np->pci_dev->irq, &nv_nic_irq_test, IRQF_SHARED, dev->name, dev) != 0)) {
+                       if (request_irq(np->pci_dev->irq, handler, IRQF_SHARED, dev->name, dev) != 0) {
                                printk(KERN_INFO "forcedeth: request_irq failed %d\n", ret);
                                pci_disable_msi(np->pci_dev);
                                np->msi_flags &= ~NV_MSI_ENABLED;
@@ -3014,8 +3380,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
                }
        }
        if (ret != 0) {
-               if ((!intr_test && request_irq(np->pci_dev->irq, &nv_nic_irq, IRQF_SHARED, dev->name, dev) != 0) ||
-                   (intr_test && request_irq(np->pci_dev->irq, &nv_nic_irq_test, IRQF_SHARED, dev->name, dev) != 0))
+               if (request_irq(np->pci_dev->irq, handler, IRQF_SHARED, dev->name, dev) != 0)
                        goto out_err;
 
        }
@@ -4629,7 +4994,10 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 
        dev->open = nv_open;
        dev->stop = nv_close;
-       dev->hard_start_xmit = nv_start_xmit;
+       if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+               dev->hard_start_xmit = nv_start_xmit;
+       else
+               dev->hard_start_xmit = nv_start_xmit_optimized;
        dev->get_stats = nv_get_stats;
        dev->change_mtu = nv_change_mtu;
        dev->set_mac_address = nv_set_mac_address;