From 2768935a46603bb9bdd121864b1f2b2e8a71cccc Mon Sep 17 00:00:00 2001 From: Daniel Pieczko Date: Wed, 13 Feb 2013 10:54:41 +0000 Subject: [PATCH] sfc: reuse pages to avoid DMA mapping/unmapping costs On POWER systems, DMA mapping/unmapping operations are very expensive. These changes reduce these costs by trying to reuse DMA mapped pages. After all the buffers associated with a page have been processed and passed up, the page is placed into a ring (if there is room). For each page that is required for a refill operation, a page in the ring is examined to determine if its page count has fallen to 1, i.e. the kernel has released its reference to these packets. If this is the case, the page can be immediately added back into the RX descriptor ring, without having to re-map it for DMA. If the kernel is still holding a reference to this page, it is removed from the ring and unmapped for DMA. Then a new page, which can immediately be used by RX buffers in the descriptor ring, is allocated and DMA mapped. The time a page needs to spend in the recycle ring before the kernel has released its page references is based on the number of buffers that use this page. As large pages can hold more RX buffers, the RX recycle ring can be shorter. This reduces memory usage on POWER systems, while maintaining the performance gain achieved by recycling pages, following the driver change to pack more than two RX buffers into large pages. When an IOMMU is not present, the recycle ring can be small to reduce memory usage, since DMA mapping operations are inexpensive. With a small recycle ring, attempting to refill the descriptor queue with more buffers than the equivalent size of the recycle ring could ultimately lead to memory leaks if page entries in the recycle ring were overwritten. To prevent this, the check to see if the recycle ring is full is changed to check if the next entry to be written is NULL. 
[bwh: Combine and rebase several commits so this is complete before the following buffer-packing changes. Remove module parameter.] Signed-off-by: Ben Hutchings --- drivers/net/ethernet/sfc/efx.c | 2 + drivers/net/ethernet/sfc/net_driver.h | 19 ++ drivers/net/ethernet/sfc/rx.c | 299 ++++++++++++++++++-------- 3 files changed, 226 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 1213af5024d1..a70c458f3cef 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -661,6 +661,8 @@ static void efx_start_datapath(struct efx_nic *efx) efx->rx_buffer_truesize = PAGE_SIZE << efx->rx_buffer_order; } + efx->rx_bufs_per_page = (rx_buf_len <= PAGE_SIZE / 2) ? 2 : 1; + /* RX filters also have scatter-enabled flags */ if (efx->rx_scatter != old_rx_scatter) efx_filter_update_rx_scatter(efx); diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index e41b54bada7c..370c5bcebad9 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -264,12 +264,22 @@ struct efx_rx_page_state { * @notified_count: Number of buffers given to NIC (<= @added_count). * @removed_count: Number of buffers removed from the receive queue. * @scatter_n: Number of buffers used by current packet + * @page_ring: The ring to store DMA mapped pages for reuse. + * @page_add: Counter to calculate the write pointer for the recycle ring. + * @page_remove: Counter to calculate the read pointer for the recycle ring. + * @page_recycle_count: The number of pages that have been recycled. + * @page_recycle_failed: The number of pages that couldn't be recycled because + * the kernel still held a reference to them. + * @page_recycle_full: The number of pages that were released because the + * recycle ring was full. + * @page_ptr_mask: The number of pages in the RX recycle ring minus 1. 
* @max_fill: RX descriptor maximum fill level (<= ring size) * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill * (<= @max_fill) * @min_fill: RX descriptor minimum non-zero fill level. * This records the minimum fill level observed when a ring * refill was triggered. + * @recycle_count: RX buffer recycle counter. * @slow_fill: Timer used to defer efx_nic_generate_fill_event(). */ struct efx_rx_queue { @@ -285,10 +295,18 @@ struct efx_rx_queue { unsigned int notified_count; unsigned int removed_count; unsigned int scatter_n; + struct page **page_ring; + unsigned int page_add; + unsigned int page_remove; + unsigned int page_recycle_count; + unsigned int page_recycle_failed; + unsigned int page_recycle_full; + unsigned int page_ptr_mask; unsigned int max_fill; unsigned int fast_fill_trigger; unsigned int min_fill; unsigned int min_overfill; + unsigned int recycle_count; struct timer_list slow_fill; unsigned int slow_fill_count; }; @@ -806,6 +824,7 @@ struct efx_nic { unsigned int rx_dma_len; unsigned int rx_buffer_order; unsigned int rx_buffer_truesize; + unsigned int rx_bufs_per_page; u8 rx_hash_key[40]; u32 rx_indir_table[128]; bool rx_scatter; diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 88aa1ff01e3f..eea56f3ec81c 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "net_driver.h" @@ -27,6 +28,13 @@ /* Number of RX descriptors pushed at once. */ #define EFX_RX_BATCH 8 +/* Number of RX buffers to recycle pages for. When creating the RX page recycle + * ring, this number is divided by the number of buffers per page to calculate + * the number of pages to store in the RX page recycle ring. 
+ */ +#define EFX_RECYCLE_RING_SIZE_IOMMU 4096 +#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_BATCH) + /* Maximum length for an RX descriptor sharing a page */ #define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state) \ - EFX_PAGE_IP_ALIGN) @@ -79,6 +87,56 @@ efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf) return rx_buf + 1; } +static inline void efx_sync_rx_buffer(struct efx_nic *efx, + struct efx_rx_buffer *rx_buf, + unsigned int len) +{ + dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len, + DMA_FROM_DEVICE); +} + +/* Return true if this is the last RX buffer using a page. */ +static inline bool efx_rx_is_last_buffer(struct efx_nic *efx, + struct efx_rx_buffer *rx_buf) +{ + return (rx_buf->page_offset >= (PAGE_SIZE >> 1) || + efx->rx_dma_len > EFX_RX_HALF_PAGE); +} + +/* Check the RX page recycle ring for a page that can be reused. */ +static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue) +{ + struct efx_nic *efx = rx_queue->efx; + struct page *page; + struct efx_rx_page_state *state; + unsigned index; + + index = rx_queue->page_remove & rx_queue->page_ptr_mask; + page = rx_queue->page_ring[index]; + if (page == NULL) + return NULL; + + rx_queue->page_ring[index] = NULL; + /* page_remove cannot exceed page_add. */ + if (rx_queue->page_remove != rx_queue->page_add) + ++rx_queue->page_remove; + + /* If page_count is 1 then we hold the only reference to this page. 
*/ + if (page_count(page) == 1) { + ++rx_queue->page_recycle_count; + return page; + } else { + state = page_address(page); + dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + put_page(page); + ++rx_queue->page_recycle_failed; + } + + return NULL; +} + /** * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers * @@ -103,20 +161,28 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue) BUILD_BUG_ON(EFX_RX_BATCH & 1); for (count = 0; count < EFX_RX_BATCH; ++count) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC, - efx->rx_buffer_order); - if (unlikely(page == NULL)) - return -ENOMEM; - dma_addr = dma_map_page(&efx->pci_dev->dev, page, 0, - PAGE_SIZE << efx->rx_buffer_order, - DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(&efx->pci_dev->dev, dma_addr))) { - __free_pages(page, efx->rx_buffer_order); - return -EIO; + page = efx_reuse_page(rx_queue); + if (page == NULL) { + page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC, + efx->rx_buffer_order); + if (unlikely(page == NULL)) + return -ENOMEM; + dma_addr = + dma_map_page(&efx->pci_dev->dev, page, 0, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(&efx->pci_dev->dev, + dma_addr))) { + __free_pages(page, efx->rx_buffer_order); + return -EIO; + } + state = page_address(page); + state->dma_addr = dma_addr; + } else { + state = page_address(page); + dma_addr = state->dma_addr; } - state = page_address(page); - state->refcnt = 0; - state->dma_addr = dma_addr; + get_page(page); dma_addr += sizeof(struct efx_rx_page_state); page_offset = sizeof(struct efx_rx_page_state); @@ -128,9 +194,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue) rx_buf->page = page; rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN; rx_buf->len = efx->rx_dma_len; - rx_buf->flags = 0; ++rx_queue->added_count; - ++state->refcnt; if ((~count & 1) && (efx->rx_dma_len <= EFX_RX_HALF_PAGE)) { /* 
Use the second half of the page */ @@ -145,99 +209,91 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue) return 0; } +/* Unmap a DMA-mapped page. This function is only called for the final RX + * buffer in a page. + */ static void efx_unmap_rx_buffer(struct efx_nic *efx, - struct efx_rx_buffer *rx_buf, - unsigned int used_len) + struct efx_rx_buffer *rx_buf) { - if (rx_buf->page) { - struct efx_rx_page_state *state; - - state = page_address(rx_buf->page); - if (--state->refcnt == 0) { - dma_unmap_page(&efx->pci_dev->dev, - state->dma_addr, - PAGE_SIZE << efx->rx_buffer_order, - DMA_FROM_DEVICE); - } else if (used_len) { - dma_sync_single_for_cpu(&efx->pci_dev->dev, - rx_buf->dma_addr, used_len, - DMA_FROM_DEVICE); - } + struct page *page = rx_buf->page; + + if (page) { + struct efx_rx_page_state *state = page_address(page); + dma_unmap_page(&efx->pci_dev->dev, + state->dma_addr, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); } } -static void efx_free_rx_buffer(struct efx_nic *efx, - struct efx_rx_buffer *rx_buf) +static void efx_free_rx_buffer(struct efx_rx_buffer *rx_buf) { if (rx_buf->page) { - __free_pages(rx_buf->page, efx->rx_buffer_order); + put_page(rx_buf->page); rx_buf->page = NULL; } } -static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, - struct efx_rx_buffer *rx_buf) +/* Attempt to recycle the page if there is an RX recycle ring; the page can + * only be added if this is the final RX buffer, to prevent pages being used in + * the descriptor ring and appearing in the recycle ring simultaneously. 
+ */ +static void efx_recycle_rx_page(struct efx_channel *channel, + struct efx_rx_buffer *rx_buf) { - efx_unmap_rx_buffer(rx_queue->efx, rx_buf, 0); - efx_free_rx_buffer(rx_queue->efx, rx_buf); -} + struct page *page = rx_buf->page; + struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); + struct efx_nic *efx = rx_queue->efx; + unsigned index; -/* Attempt to resurrect the other receive buffer that used to share this page, - * which had previously been passed up to the kernel and freed. */ -static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue, - struct efx_rx_buffer *rx_buf) -{ - struct efx_rx_page_state *state = page_address(rx_buf->page); - struct efx_rx_buffer *new_buf; - unsigned fill_level, index; - - /* +1 because efx_rx_packet() incremented removed_count. +1 because - * we'd like to insert an additional descriptor whilst leaving - * EFX_RXD_HEAD_ROOM for the non-recycle path */ - fill_level = (rx_queue->added_count - rx_queue->removed_count + 2); - if (unlikely(fill_level > rx_queue->max_fill)) { - /* We could place "state" on a list, and drain the list in - * efx_fast_push_rx_descriptors(). For now, this will do. */ + /* Only recycle the page after processing the final buffer. */ + if (!efx_rx_is_last_buffer(efx, rx_buf)) return; - } - ++state->refcnt; - get_page(rx_buf->page); + index = rx_queue->page_add & rx_queue->page_ptr_mask; + if (rx_queue->page_ring[index] == NULL) { + unsigned read_index = rx_queue->page_remove & + rx_queue->page_ptr_mask; - index = rx_queue->added_count & rx_queue->ptr_mask; - new_buf = efx_rx_buffer(rx_queue, index); - new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1); - new_buf->page = rx_buf->page; - new_buf->len = rx_buf->len; - ++rx_queue->added_count; + /* The next slot in the recycle ring is available, but + * increment page_remove if the read pointer currently + * points here. 
+ */ + if (read_index == index) + ++rx_queue->page_remove; + rx_queue->page_ring[index] = page; + ++rx_queue->page_add; + return; + } + ++rx_queue->page_recycle_full; + efx_unmap_rx_buffer(efx, rx_buf); + put_page(rx_buf->page); } -/* Recycle buffers directly back into the rx_queue. There is always - * room to add these buffer, because we've just popped them. - */ +static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, + struct efx_rx_buffer *rx_buf) +{ + /* Release the page reference we hold for the buffer. */ + if (rx_buf->page) + put_page(rx_buf->page); + + /* If this is the last buffer in a page, unmap and free it. */ + if (efx_rx_is_last_buffer(rx_queue->efx, rx_buf)) { + efx_unmap_rx_buffer(rx_queue->efx, rx_buf); + efx_free_rx_buffer(rx_buf); + } + rx_buf->page = NULL; +} + +/* Recycle the pages that are used by buffers that have just been received. */ static void efx_recycle_rx_buffers(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, unsigned int n_frags) { - struct efx_nic *efx = channel->efx; struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); - struct efx_rx_buffer *new_buf; - unsigned index; do { - rx_buf->flags = 0; - - if (efx->rx_dma_len <= EFX_RX_HALF_PAGE && - page_count(rx_buf->page) == 1) - efx_resurrect_rx_buffer(rx_queue, rx_buf); - - index = rx_queue->added_count & rx_queue->ptr_mask; - new_buf = efx_rx_buffer(rx_queue, index); - - memcpy(new_buf, rx_buf, sizeof(*new_buf)); - rx_buf->page = NULL; - ++rx_queue->added_count; - + efx_recycle_rx_page(channel, rx_buf); rx_buf = efx_rx_buf_next(rx_queue, rx_buf); } while (--n_frags); } @@ -451,7 +507,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, struct efx_rx_buffer *rx_buf; rx_buf = efx_rx_buffer(rx_queue, index); - rx_buf->flags |= flags; + rx_buf->flags = flags; /* Validate the number of fragments and completed length */ if (n_frags == 1) { @@ -479,6 +535,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, */ if 
(unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) { efx_rx_flush_packet(channel); + put_page(rx_buf->page); efx_recycle_rx_buffers(channel, rx_buf, n_frags); return; } @@ -486,10 +543,10 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, if (n_frags == 1) rx_buf->len = len; - /* Release and/or sync DMA mapping - assumes all RX buffers - * consumed in-order per RX queue + /* Release and/or sync the DMA mapping - assumes all RX buffers + * consumed in-order per RX queue. */ - efx_unmap_rx_buffer(efx, rx_buf, rx_buf->len); + efx_sync_rx_buffer(efx, rx_buf, rx_buf->len); /* Prefetch nice and early so data will (hopefully) be in cache by * the time we look at it. @@ -509,12 +566,16 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, rx_buf = efx_rx_buf_next(rx_queue, rx_buf); if (--tail_frags == 0) break; - efx_unmap_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE); + efx_sync_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE); } rx_buf->len = len - (n_frags - 1) * EFX_RX_USR_BUF_SIZE; - efx_unmap_rx_buffer(efx, rx_buf, rx_buf->len); + efx_sync_rx_buffer(efx, rx_buf, rx_buf->len); } + /* All fragments have been DMA-synced, so recycle buffers and pages. */ + rx_buf = efx_rx_buffer(rx_queue, index); + efx_recycle_rx_buffers(channel, rx_buf, n_frags); + /* Pipeline receives so that we give time for packet headers to be * prefetched into cache. 
*/ @@ -532,7 +593,7 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh, skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len); if (unlikely(skb == NULL)) { - efx_free_rx_buffer(channel->efx, rx_buf); + efx_free_rx_buffer(rx_buf); return; } skb_record_rx_queue(skb, channel->rx_queue.core_index); @@ -561,7 +622,7 @@ void __efx_rx_packet(struct efx_channel *channel) */ if (unlikely(efx->loopback_selftest)) { efx_loopback_rx_packet(efx, eh, rx_buf->len); - efx_free_rx_buffer(efx, rx_buf); + efx_free_rx_buffer(rx_buf); goto out; } @@ -603,9 +664,32 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) kfree(rx_queue->buffer); rx_queue->buffer = NULL; } + return rc; } +void efx_init_rx_recycle_ring(struct efx_nic *efx, + struct efx_rx_queue *rx_queue) +{ + unsigned int bufs_in_recycle_ring, page_ring_size; + + /* Set the RX recycle ring size */ +#ifdef CONFIG_PPC64 + bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; +#else + if (efx->pci_dev->dev.iommu_group) + bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; + else + bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU; +#endif /* CONFIG_PPC64 */ + + page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring / + efx->rx_bufs_per_page); + rx_queue->page_ring = kcalloc(page_ring_size, + sizeof(*rx_queue->page_ring), GFP_KERNEL); + rx_queue->page_ptr_mask = page_ring_size - 1; +} + void efx_init_rx_queue(struct efx_rx_queue *rx_queue) { struct efx_nic *efx = rx_queue->efx; @@ -619,6 +703,13 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) rx_queue->notified_count = 0; rx_queue->removed_count = 0; rx_queue->min_fill = -1U; + efx_init_rx_recycle_ring(efx, rx_queue); + + rx_queue->page_remove = 0; + rx_queue->page_add = rx_queue->page_ptr_mask + 1; + rx_queue->page_recycle_count = 0; + rx_queue->page_recycle_failed = 0; + rx_queue->page_recycle_full = 0; /* Initialise limit fields */ max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM; @@ -642,6 +733,7 @@ void efx_init_rx_queue(struct 
efx_rx_queue *rx_queue) void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) { int i; + struct efx_nic *efx = rx_queue->efx; struct efx_rx_buffer *rx_buf; netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, @@ -653,13 +745,32 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) del_timer_sync(&rx_queue->slow_fill); efx_nic_fini_rx(rx_queue); - /* Release RX buffers NB start at index 0 not current HW ptr */ + /* Release RX buffers from the current read ptr to the write ptr */ if (rx_queue->buffer) { - for (i = 0; i <= rx_queue->ptr_mask; i++) { - rx_buf = efx_rx_buffer(rx_queue, i); + for (i = rx_queue->removed_count; i < rx_queue->added_count; + i++) { + unsigned index = i & rx_queue->ptr_mask; + rx_buf = efx_rx_buffer(rx_queue, index); efx_fini_rx_buffer(rx_queue, rx_buf); } } + + /* Unmap and release the pages in the recycle ring. Remove the ring. */ + for (i = 0; i <= rx_queue->page_ptr_mask; i++) { + struct page *page = rx_queue->page_ring[i]; + struct efx_rx_page_state *state; + + if (page == NULL) + continue; + + state = page_address(page); + dma_unmap_page(&efx->pci_dev->dev, state->dma_addr, + PAGE_SIZE << efx->rx_buffer_order, + DMA_FROM_DEVICE); + put_page(page); + } + kfree(rx_queue->page_ring); + rx_queue->page_ring = NULL; } void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) -- 2.20.1