From 1a1c225b9463038ac68b369ef05e4ee7fd9c82a5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 25 Sep 2012 00:30:52 +0000 Subject: [PATCH] igb: Do not use header split, instead receive all frames into a single buffer This change makes it so that we no longer use header split. The idea is to reduce partial cache line writes by hardware when handling frames larger then header size. We can compensate for the extra overhead of having to memcpy the header buffer by avoiding the cache misses seen by leaving an full skb allocated and sitting on the ring. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 13 +- drivers/net/ethernet/intel/igb/igb_ethtool.c | 31 +- drivers/net/ethernet/intel/igb/igb_main.c | 420 +++++++++++++------ 3 files changed, 312 insertions(+), 152 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index f6a1cd9d72a6..72ab9ac34a30 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -174,11 +174,9 @@ struct igb_tx_buffer { }; struct igb_rx_buffer { - struct sk_buff *skb; dma_addr_t dma; struct page *page; - dma_addr_t page_dma; - u32 page_offset; + unsigned int page_offset; }; struct igb_tx_queue_stats { @@ -251,6 +249,7 @@ struct igb_ring { }; /* RX */ struct { + struct sk_buff *skb; struct igb_rx_queue_stats rx_stats; struct u64_stats_sync rx_syncp; }; @@ -451,13 +450,11 @@ static inline void igb_ptp_rx_hwtstamp(struct igb_q_vector *q_vector, union e1000_adv_rx_desc *rx_desc, struct sk_buff *skb) { - if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { - igb_ptp_rx_pktstamp(q_vector, skb->data, skb); - skb_pull(skb, IGB_TS_HDR_LEN); - } else if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TS)) { + if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TS) && + !igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) igb_ptp_rx_rgtstamp(q_vector, skb); - } } + extern int igb_ptp_hwtstamp_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); #endif /* CONFIG_IGB_PTP */ diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 2ea012849825..0faac423bd5b 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "igb.h" @@ -1685,16 +1686,24 @@ static void igb_create_lbtest_frame(struct sk_buff *skb, memset(&skb->data[frame_size + 12], 0xAF, 1); } -static int igb_check_lbtest_frame(struct sk_buff *skb, unsigned int frame_size) +static int igb_check_lbtest_frame(struct igb_rx_buffer *rx_buffer, + unsigned int frame_size) { - frame_size /= 2; - if (*(skb->data + 3) == 0xFF) { - if ((*(skb->data + frame_size + 10) == 0xBE) && - (*(skb->data + frame_size + 12) == 0xAF)) { - return 0; - } - } - return 13; + unsigned char *data; + bool match = true; + + frame_size >>= 1; + + data = kmap(rx_buffer->page) + rx_buffer->page_offset; + + if (data[3] != 0xFF || + data[frame_size + 10] != 0xBE || + data[frame_size + 12] != 0xAF) + match = false; + + kunmap(rx_buffer->page); + + return match; } static int igb_clean_test_rings(struct igb_ring *rx_ring, @@ -1720,12 +1729,12 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring, /* unmap rx buffer, will be remapped by alloc_rx_buffers */ dma_unmap_single(rx_ring->dev, rx_buffer_info->dma, - IGB_RX_HDR_LEN, + PAGE_SIZE / 2, DMA_FROM_DEVICE); rx_buffer_info->dma = 0; /* verify contents of skb */ - if (!igb_check_lbtest_frame(rx_buffer_info->skb, size)) + if (igb_check_lbtest_frame(rx_buffer_info, size)) count++; /* unmap buffer on tx side */ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index c611cffa788f..665eafa401d4 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -534,25 +534,21 @@ rx_ring_summary: if (staterr & E1000_RXD_STAT_DD) { /* Descriptor Done */ - pr_info("%s[0x%03X] %016llX %016llX -------" - "--------- %p%s\n", "RWB", i, + pr_info("%s[0x%03X] %016llX %016llX ---------------- %s\n", + "RWB", i, le64_to_cpu(u0->a), le64_to_cpu(u0->b), - buffer_info->skb, next_desc); + next_desc); } else { - pr_info("%s[0x%03X] %016llX %016llX %016llX" - " %p%s\n", "R ", i, + pr_info("%s[0x%03X] %016llX %016llX %016llX %s\n", + "R ", i, le64_to_cpu(u0->a), le64_to_cpu(u0->b), (u64)buffer_info->dma, - buffer_info->skb, next_desc); + next_desc); if (netif_msg_pktdata(adapter) && - buffer_info->dma && buffer_info->skb) { - print_hex_dump(KERN_INFO, "", - DUMP_PREFIX_ADDRESS, - 16, 1, buffer_info->skb->data, - IGB_RX_HDR_LEN, true); + buffer_info->dma && buffer_info->page) { print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 1, @@ -3111,7 +3107,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, #else srrctl |= (PAGE_SIZE / 2) >> E1000_SRRCTL_BSIZEPKT_SHIFT; #endif - srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS; + srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; #ifdef CONFIG_IGB_PTP if (hw->mac.type >= e1000_82580) srrctl |= E1000_SRRCTL_TIMESTAMP; @@ -3305,36 +3301,27 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) unsigned long size; u16 i; + if (rx_ring->skb) + dev_kfree_skb(rx_ring->skb); + rx_ring->skb = NULL; + if (!rx_ring->rx_buffer_info) return; /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i]; - if (buffer_info->dma) { - dma_unmap_single(rx_ring->dev, - buffer_info->dma, - IGB_RX_HDR_LEN, - DMA_FROM_DEVICE); - buffer_info->dma = 0; - } - if (buffer_info->skb) { - dev_kfree_skb(buffer_info->skb); - buffer_info->skb = NULL; - } - if (buffer_info->page_dma) { + if (buffer_info->dma) dma_unmap_page(rx_ring->dev, - buffer_info->page_dma, + buffer_info->dma, PAGE_SIZE / 2, DMA_FROM_DEVICE); - buffer_info->page_dma = 0; - } - if (buffer_info->page) { - put_page(buffer_info->page); - buffer_info->page = NULL; - buffer_info->page_offset = 0; - } + buffer_info->dma = 0; + if (buffer_info->page) + __free_page(buffer_info->page); + buffer_info->page = NULL; + buffer_info->page_offset = 0; } size = sizeof(struct igb_rx_buffer) * rx_ring->count; @@ -5906,23 +5893,219 @@ static void igb_rx_vlan(struct igb_ring *ring, } } -static inline u16 igb_get_hlen(union e1000_adv_rx_desc *rx_desc) +/** + * igb_get_headlen - determine size of header for LRO/GRO + * @data: pointer to the start of the headers + * @max_len: total length of section to find headers in + * + * This function is meant to determine the length of headers that will + * be recognized by hardware for LRO, and GRO offloads. The main + * motivation of doing this is to only perform one pull for IPv4 TCP + * packets so that we can do basic things like calculating the gso_size + * based on the average data per packet. + **/ +static unsigned int igb_get_headlen(unsigned char *data, + unsigned int max_len) +{ + union { + unsigned char *network; + /* l2 headers */ + struct ethhdr *eth; + struct vlan_hdr *vlan; + /* l3 headers */ + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + } hdr; + __be16 protocol; + u8 nexthdr = 0; /* default to not TCP */ + u8 hlen; + + /* this should never happen, but better safe than sorry */ + if (max_len < ETH_HLEN) + return max_len; + + /* initialize network frame pointer */ + hdr.network = data; + + /* set first protocol and move network header forward */ + protocol = hdr.eth->h_proto; + hdr.network += ETH_HLEN; + + /* handle any vlan tag if present */ + if (protocol == __constant_htons(ETH_P_8021Q)) { + if ((hdr.network - data) > (max_len - VLAN_HLEN)) + return max_len; + + protocol = hdr.vlan->h_vlan_encapsulated_proto; + hdr.network += VLAN_HLEN; + } + + /* handle L3 protocols */ + if (protocol == __constant_htons(ETH_P_IP)) { + if ((hdr.network - data) > (max_len - sizeof(struct iphdr))) + return max_len; + + /* access ihl as a u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[0] & 0x0F) << 2; + + /* verify hlen meets minimum size requirements */ + if (hlen < sizeof(struct iphdr)) + return hdr.network - data; + + /* record next protocol */ + nexthdr = hdr.ipv4->protocol; + hdr.network += hlen; + } else if (protocol == __constant_htons(ETH_P_IPV6)) { + if ((hdr.network - data) > (max_len - sizeof(struct ipv6hdr))) + return max_len; + + /* record next protocol */ + nexthdr = hdr.ipv6->nexthdr; + hdr.network += sizeof(struct ipv6hdr); + } else { + return hdr.network - data; + } + + /* finally sort out TCP */ + if (nexthdr == IPPROTO_TCP) { + if ((hdr.network - data) > (max_len - sizeof(struct tcphdr))) + return max_len; + + /* access doff as a u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[12] & 0xF0) >> 2; + + /* verify hlen meets minimum size requirements */ + if (hlen < sizeof(struct tcphdr)) + return hdr.network - data; + + hdr.network += hlen; + } else if (nexthdr == IPPROTO_UDP) { + if ((hdr.network - data) > (max_len - sizeof(struct udphdr))) + return max_len; + + hdr.network += sizeof(struct udphdr); + } + + /* + * If everything has gone correctly hdr.network should be the + * data section of the packet and will be the end of the header. + * If not then it probably represents the end of the last recognized + * header. + */ + if ((hdr.network - data) < max_len) + return hdr.network - data; + else + return max_len; +} + +/** + * igb_pull_tail - igb specific version of skb_pull_tail + * @rx_ring: rx descriptor ring packet is being transacted on + * @skb: pointer to current skb being adjusted + * + * This function is an igb specific version of __pskb_pull_tail. The + * main difference between this version and the original function is that + * this function can make several assumptions about the state of things + * that allow for significant optimizations versus the standard function. + * As a result we can do things like drop a frag and maintain an accurate + * truesize for the skb. + */ +static void igb_pull_tail(struct igb_ring *rx_ring, + union e1000_adv_rx_desc *rx_desc, + struct sk_buff *skb) { - /* HW will not DMA in data larger than the given buffer, even if it - * parses the (NFS, of course) header to be larger. In that case, it - * fills the header buffer and spills the rest into the page. + struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; + unsigned char *va; + unsigned int pull_len; + + /* + * it is valid to use page_address instead of kmap since we are + * working with pages allocated out of the lomem pool per + * alloc_page(GFP_ATOMIC) */ - u16 hlen = (le16_to_cpu(rx_desc->wb.lower.lo_dword.hdr_info) & - E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT; - if (hlen > IGB_RX_HDR_LEN) - hlen = IGB_RX_HDR_LEN; - return hlen; + va = skb_frag_address(frag); + +#ifdef CONFIG_IGB_PTP + if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { + /* retrieve timestamp from buffer */ + igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb); + + /* update pointers to remove timestamp header */ + skb_frag_size_sub(frag, IGB_TS_HDR_LEN); + frag->page_offset += IGB_TS_HDR_LEN; + skb->data_len -= IGB_TS_HDR_LEN; + skb->len -= IGB_TS_HDR_LEN; + + /* move va to start of packet data */ + va += IGB_TS_HDR_LEN; + } + +#endif + /* + * we need the header to contain the greater of either ETH_HLEN or + * 60 bytes if the skb->len is less than 60 for skb_pad. + */ + pull_len = igb_get_headlen(va, IGB_RX_HDR_LEN); + + /* align pull length to size of long to optimize memcpy performance */ + skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long))); + + /* update all of the pointers */ + skb_frag_size_sub(frag, pull_len); + frag->page_offset += pull_len; + skb->data_len -= pull_len; + skb->tail += pull_len; +} + +/** + * igb_cleanup_headers - Correct corrupted or empty headers + * @rx_ring: rx descriptor ring packet is being transacted on + * @rx_desc: pointer to the EOP Rx descriptor + * @skb: pointer to current skb being fixed + * + * Address the case where we are pulling data in on pages only + * and as such no data is present in the skb header. + * + * In addition if skb is not at least 60 bytes we need to pad it so that + * it is large enough to qualify as a valid Ethernet frame. + * + * Returns true if an error was encountered and skb was freed. + **/ +static bool igb_cleanup_headers(struct igb_ring *rx_ring, + union e1000_adv_rx_desc *rx_desc, + struct sk_buff *skb) +{ + + if (unlikely((igb_test_staterr(rx_desc, + E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) { + struct net_device *netdev = rx_ring->netdev; + if (!(netdev->features & NETIF_F_RXALL)) { + dev_kfree_skb_any(skb); + return true; + } + } + + /* place header in linear portion of buffer */ + if (skb_is_nonlinear(skb)) + igb_pull_tail(rx_ring, rx_desc, skb); + + /* if skb_pad returns an error the skb was freed */ + if (unlikely(skb->len < 60)) { + int pad_len = 60 - skb->len; + + if (skb_pad(skb, pad_len)) + return true; + __skb_put(skb, pad_len); + } + + return false; } static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget) { struct igb_ring *rx_ring = q_vector->rx.ring; union e1000_adv_rx_desc *rx_desc; + struct sk_buff *skb = rx_ring->skb; const int current_node = numa_node_id(); unsigned int total_bytes = 0, total_packets = 0; u16 cleaned_count = igb_desc_unused(rx_ring); @@ -5932,12 +6115,9 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget) while (igb_test_staterr(rx_desc, E1000_RXD_STAT_DD)) { struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i]; - struct sk_buff *skb = buffer_info->skb; + struct page *page; union e1000_adv_rx_desc *next_rxd; - buffer_info->skb = NULL; - prefetch(skb->data); - i++; if (i == rx_ring->count) i = 0; @@ -5952,52 +6132,57 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget) */ rmb(); - if (!skb_is_nonlinear(skb)) { - __skb_put(skb, igb_get_hlen(rx_desc)); - dma_unmap_single(rx_ring->dev, buffer_info->dma, - IGB_RX_HDR_LEN, - DMA_FROM_DEVICE); - buffer_info->dma = 0; - } - - if (rx_desc->wb.upper.length) { - u16 length = le16_to_cpu(rx_desc->wb.upper.length); + page = buffer_info->page; + prefetchw(page); - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, - buffer_info->page, - buffer_info->page_offset, - length); + if (likely(!skb)) { + void *page_addr = page_address(page) + + buffer_info->page_offset; - skb->len += length; - skb->data_len += length; - skb->truesize += PAGE_SIZE / 2; + /* prefetch first cache line of first page */ + prefetch(page_addr); +#if L1_CACHE_BYTES < 128 + prefetch(page_addr + L1_CACHE_BYTES); +#endif - if ((page_count(buffer_info->page) != 1) || - (page_to_nid(buffer_info->page) != current_node)) - buffer_info->page = NULL; - else - get_page(buffer_info->page); + /* allocate a skb to store the frags */ + skb = netdev_alloc_skb_ip_align(rx_ring->netdev, + IGB_RX_HDR_LEN); + if (unlikely(!skb)) { + rx_ring->rx_stats.alloc_failed++; + break; + } - dma_unmap_page(rx_ring->dev, buffer_info->page_dma, - PAGE_SIZE / 2, DMA_FROM_DEVICE); - buffer_info->page_dma = 0; + /* + * we will be copying header into skb->data in + * pskb_may_pull so it is in our interest to prefetch + * it now to avoid a possible cache miss + */ + prefetchw(skb->data); } - if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP)) { - struct igb_rx_buffer *next_buffer; - next_buffer = &rx_ring->rx_buffer_info[i]; - buffer_info->skb = next_buffer->skb; - buffer_info->dma = next_buffer->dma; - next_buffer->skb = skb; - next_buffer->dma = 0; - goto next_desc; - } + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, + buffer_info->page_offset, + le16_to_cpu(rx_desc->wb.upper.length), + PAGE_SIZE / 2); - if (unlikely((igb_test_staterr(rx_desc, - E1000_RXDEXT_ERR_FRAME_ERR_MASK)) - && !(rx_ring->netdev->features & NETIF_F_RXALL))) { - dev_kfree_skb_any(skb); + if ((page_count(buffer_info->page) != 1) || + (page_to_nid(buffer_info->page) != current_node)) + buffer_info->page = NULL; + else + get_page(buffer_info->page); + + dma_unmap_page(rx_ring->dev, buffer_info->dma, + PAGE_SIZE / 2, DMA_FROM_DEVICE); + buffer_info->dma = 0; + + if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP)) goto next_desc; + + /* verify the packet layout is correct */ + if (igb_cleanup_headers(rx_ring, rx_desc, skb)) { + skb = NULL; + continue; } #ifdef CONFIG_IGB_PTP @@ -6010,10 +6195,14 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget) total_bytes += skb->len; total_packets++; + skb_record_rx_queue(skb, rx_ring->queue_index); skb->protocol = eth_type_trans(skb, rx_ring->netdev); napi_gro_receive(&q_vector->napi, skb); + /* reset skb pointer */ + skb = NULL; + budget--; next_desc: if (!budget) @@ -6030,6 +6219,9 @@ next_desc: rx_desc = next_rxd; } + /* place incomplete frames back on ring for completion */ + rx_ring->skb = skb; + rx_ring->next_to_clean = i; u64_stats_update_begin(&rx_ring->rx_syncp); rx_ring->rx_stats.packets += total_packets; @@ -6044,70 +6236,37 @@ next_desc: return !!budget; } -static bool igb_alloc_mapped_skb(struct igb_ring *rx_ring, - struct igb_rx_buffer *bi) -{ - struct sk_buff *skb = bi->skb; - dma_addr_t dma = bi->dma; - - if (dma) - return true; - - if (likely(!skb)) { - skb = netdev_alloc_skb_ip_align(rx_ring->netdev, - IGB_RX_HDR_LEN); - bi->skb = skb; - if (!skb) { - rx_ring->rx_stats.alloc_failed++; - return false; - } - - /* initialize skb for ring */ - skb_record_rx_queue(skb, rx_ring->queue_index); - } - - dma = dma_map_single(rx_ring->dev, skb->data, - IGB_RX_HDR_LEN, DMA_FROM_DEVICE); - - if (dma_mapping_error(rx_ring->dev, dma)) { - rx_ring->rx_stats.alloc_failed++; - return false; - } - - bi->dma = dma; - return true; -} - static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, struct igb_rx_buffer *bi) { struct page *page = bi->page; - dma_addr_t page_dma = bi->page_dma; + dma_addr_t dma = bi->dma; unsigned int page_offset = bi->page_offset ^ (PAGE_SIZE / 2); - if (page_dma) + if (dma) return true; if (!page) { - page = __skb_alloc_page(GFP_ATOMIC, bi->skb); - bi->page = page; + page = __skb_alloc_page(GFP_ATOMIC | __GFP_COLD, NULL); if (unlikely(!page)) { rx_ring->rx_stats.alloc_failed++; return false; } + bi->page = page; } - page_dma = dma_map_page(rx_ring->dev, page, - page_offset, PAGE_SIZE / 2, - DMA_FROM_DEVICE); + dma = dma_map_page(rx_ring->dev, page, + page_offset, PAGE_SIZE / 2, + DMA_FROM_DEVICE); - if (dma_mapping_error(rx_ring->dev, page_dma)) { + if (dma_mapping_error(rx_ring->dev, dma)) { rx_ring->rx_stats.alloc_failed++; return false; } - bi->page_dma = page_dma; + bi->dma = dma; bi->page_offset = page_offset; + return true; } @@ -6126,17 +6285,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) i -= rx_ring->count; while (cleaned_count--) { - if (!igb_alloc_mapped_skb(rx_ring, bi)) + if (!igb_alloc_mapped_page(rx_ring, bi)) break; /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.hdr_addr = cpu_to_le64(bi->dma); - - if (!igb_alloc_mapped_page(rx_ring, bi)) - break; - - rx_desc->read.pkt_addr = cpu_to_le64(bi->page_dma); + rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); rx_desc++; bi++; -- 2.20.1