Based on patches from Sonny Rao and Milton Miller...
Combined the patches to fix up clean_tx_irq and clean_rx_irq.
The PowerPC architecture does not require loads from independent locations
to be ordered without an explicit barrier.
In ixgbe_clean_rx_irq we load the status bit and then load the packet data.
With packet split disabled, if these loads are reordered we get a stale
packet, but we will notice the bad sequence numbers and drop it.
The problem occurs with packet split enabled, where the TCP/IP header and
the data are in different descriptors. If the reads are reordered we may
end up with data that doesn't match the TCP/IP header. Since we use
hardware checksumming, this bad data is never verified and it makes it
all the way to the application.
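As an illustration (not taken from the patch itself), the consumer-side
pattern that every hunk below applies looks roughly like this; struct
my_desc, struct my_ring, MY_STAT_DD, consume() and next_desc() are
hypothetical names standing in for each driver's own descriptor types:

	/* Sketch only; __le32, rmb(), cpu_to_le32(), le32_to_cpu()
	 * come from the usual kernel headers. */
	#define MY_STAT_DD	0x01	/* hypothetical "descriptor done" bit */

	struct my_desc {
		__le32 status;	/* device sets the DD bit on completion */
		__le32 length;	/* valid only once DD is observed */
	};

	struct my_ring {
		struct my_desc *next_to_clean;
	};

	void consume(struct my_desc *desc, u32 len);	/* hypothetical */
	struct my_desc *next_desc(struct my_ring *ring,	/* hypothetical */
				  struct my_desc *desc);

	static void my_clean_ring(struct my_ring *ring)
	{
		struct my_desc *desc = ring->next_to_clean;

		while (desc->status & cpu_to_le32(MY_STAT_DD)) {
			/*
			 * Without a barrier the CPU may satisfy the loads
			 * below before the status load above, pairing a
			 * set DD bit with stale descriptor contents.
			 */
			rmb();	/* read descriptor fields after status DD */
			consume(desc, le32_to_cpu(desc->length));
			desc = next_desc(ring, desc);
		}
	}

The device writes the descriptor contents before it sets DD, so ordering
the CPU's loads with rmb() is enough to close the race.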
This bug was found during stress testing, and adding this barrier has
been shown to fix it. The bug can manifest as a data integrity issue
(bad payload data) or as a BUG in skb_pull().
This was a nasty bug to hunt down; if people agree with the fix, I think
it's a candidate for stable.
Previously submitted to e1000-devel, for ixgbe only:
http://marc.info/?l=e1000-devel&m=126593062701537&w=3
We've now seen this problem hit with other device drivers (e1000e mostly),
so I'm resubmitting with fixes for other Intel device drivers with
similar issues.
CC: Milton Miller <miltonm@bga.com>
CC: Anton Blanchard <anton@samba.org>
CC: Sonny Rao <sonnyrao@us.ibm.com>
CC: stable <stable@kernel.org>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
 	for (cb = nic->cb_to_clean;
 	    cb->status & cpu_to_le16(cb_complete);
 	    cb = nic->cb_to_clean = cb->next) {
+		rmb(); /* read skb after status */
 		netif_printk(nic, tx_done, KERN_DEBUG, nic->netdev,
 			     "cb[%d]->status = 0x%04X\n",
 			     (int)(((void*)cb - (void*)nic->cbs)/sizeof(struct cb)),

 	netif_printk(nic, rx_status, KERN_DEBUG, nic->netdev,
 		     "status=0x%04X\n", rfd_status);
+	rmb(); /* read size after status bit */
 	/* If data isn't ready, nothing to indicate */
 	if (unlikely(!(rfd_status & cb_complete))) {

 	while ((eop_desc->upper.data & cpu_to_le32(E1000_TXD_STAT_DD)) &&
 	       (count < tx_ring->count)) {
 		bool cleaned = false;
+		rmb(); /* read buffer_info after eop_desc */
 		for ( ; !cleaned; count++) {
 			tx_desc = E1000_TX_DESC(*tx_ring, i);
 			buffer_info = &tx_ring->buffer_info[i];

 		if (*work_done >= work_to_do)
 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		status = rx_desc->status;
 		skb = buffer_info->skb;

 		if (*work_done >= work_to_do)
 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		status = rx_desc->status;
 		skb = buffer_info->skb;

 		if (*work_done >= work_to_do)
 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		status = rx_desc->status;
 		skb = buffer_info->skb;

 	while ((eop_desc->upper.data & cpu_to_le32(E1000_TXD_STAT_DD)) &&
 	       (count < tx_ring->count)) {
 		bool cleaned = false;
+		rmb(); /* read buffer_info after eop_desc */
 		for (; !cleaned; count++) {
 			tx_desc = E1000_TX_DESC(*tx_ring, i);
 			buffer_info = &tx_ring->buffer_info[i];

 			break;
 		(*work_done)++;
 		skb = buffer_info->skb;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		/* in the packet split case this is header only */
 		prefetch(skb->data - NET_IP_ALIGN);

 		if (*work_done >= work_to_do)
 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		status = rx_desc->status;
 		skb = buffer_info->skb;

 	while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&
 	       (count < tx_ring->count)) {
+		rmb(); /* read buffer_info after eop_desc status */
 		for (cleaned = false; !cleaned; count++) {
 			tx_desc = E1000_TX_DESC_ADV(*tx_ring, i);
 			buffer_info = &tx_ring->buffer_info[i];

 		if (*work_done >= budget)
 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		skb = buffer_info->skb;
 		prefetch(skb->data - NET_IP_ALIGN);

 		if (*work_done >= work_to_do)
 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		buffer_info = &rx_ring->buffer_info[i];

 	while ((eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)) &&
 	       (count < tx_ring->count)) {
+		rmb(); /* read buffer_info after eop_desc status */
 		for (cleaned = false; !cleaned; count++) {
 			tx_desc = IGBVF_TX_DESC_ADV(*tx_ring, i);
 			buffer_info = &tx_ring->buffer_info[i];

 	while (eop_desc->status & IXGB_TX_DESC_STATUS_DD) {
+		rmb(); /* read buffer_info after eop_desc */
 		for (cleaned = false; !cleaned; ) {
 			tx_desc = IXGB_TX_DESC(*tx_ring, i);
 			buffer_info = &tx_ring->buffer_info[i];

 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		status = rx_desc->status;
 		skb = buffer_info->skb;
 		buffer_info->skb = NULL;

 	while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
 	       (count < tx_ring->work_limit)) {
 		bool cleaned = false;
+		rmb(); /* read buffer_info after eop_desc */
 		for ( ; !cleaned; count++) {
 			struct sk_buff *skb;
 			tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i);

 	while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
 	       (count < tx_ring->work_limit)) {
 		bool cleaned = false;
+		rmb(); /* read buffer_info after eop_desc */
 		for ( ; !cleaned; count++) {
 			struct sk_buff *skb;
 			tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i);

 			break;
 		(*work_done)++;
+		rmb(); /* read descriptor and rx_buffer_info after status DD */
 		if (adapter->flags & IXGBE_FLAG_RX_PS_ENABLED) {
 			hdr_info = le16_to_cpu(ixgbevf_get_hdr_info(rx_desc));
 			len = (hdr_info & IXGBE_RXDADV_HDRBUFLEN_MASK) >>