net/mlx5e: Build RX SKB on demand
authorSaeed Mahameed <saeedm@mellanox.com>
Wed, 21 Sep 2016 09:19:42 +0000 (12:19 +0300)
committerDavid S. Miller <davem@davemloft.net>
Thu, 22 Sep 2016 06:51:40 +0000 (02:51 -0400)
For non-striding RQ configuration before this patch we had a ring
with pre-allocated SKBs and mapped the SKB->data buffers for
device.

For robustness and better RX data buffers management, we allocate a
page per packet and build_skb around it.

This patch (which is a prerequisite for XDP) will actually reduce
performance for normal stack usage, because we are now hitting a bottleneck
in the page allocator. We use the page-cache to restore or even improve
performance in comparison to the old RX scheme.

Packet rate performance testing was done with pktgen 64B packets on xmit
side and TC ingress dropping action on RX side.

CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Comparison is done between:
 1.Baseline, before 'net/mlx5e: Build RX SKB on demand'
 2.Build SKB with RX page cache (This patch)

RX Cores  Baseline    Build SKB+page-cache    Improvement
-----------------------------------------------------------
1          4.16Mpps       5.33Mpps                28%
2          7.16Mpps      10.24Mpps                43%
4         13.61Mpps      20.51Mpps                51%
8         25.32Mpps      32.00Mpps                26%

All respective cores were 100% utilized.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

index 7dd4763e726e50b69cd24c0e5cc65bed597c9464..4d06c1b9348b8ad0b23ed0cc95c481cb16b35512 100644 (file)
@@ -65,6 +65,8 @@
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW            0x3
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW            0x6
 
+#define MLX5_RX_HEADROOM NET_SKB_PAD
+
 #define MLX5_MPWRQ_LOG_STRIDE_SIZE             6  /* >= 6, HW restriction */
 #define MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS        8  /* >= 6, HW restriction */
 #define MLX5_MPWRQ_LOG_WQE_SZ                  18
@@ -302,10 +304,14 @@ struct mlx5e_page_cache {
 struct mlx5e_rq {
        /* data path */
        struct mlx5_wq_ll      wq;
-       u32                    wqe_sz;
-       struct sk_buff       **skb;
+
+       struct mlx5e_dma_info *dma_info;
        struct mlx5e_mpw_info *wqe_info;
        void                  *mtt_no_align;
+       struct {
+               u8             page_order;
+               u32            wqe_sz;    /* wqe data buffer size */
+       } buff;
        __be32                 mkey_be;
 
        struct device         *pdev;
index 8595b507e200914a0371a070f4ea1231b191c401..d09588b54c386498e5c32138ed59043c84957f09 100644 (file)
@@ -408,6 +408,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
        void *rqc = param->rqc;
        void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
        u32 byte_count;
+       u32 frag_sz;
+       int npages;
        int wq_sz;
        int err;
        int i;
@@ -442,29 +444,40 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 
                rq->mpwqe_stride_sz = BIT(priv->params.mpwqe_log_stride_sz);
                rq->mpwqe_num_strides = BIT(priv->params.mpwqe_log_num_strides);
-               rq->wqe_sz = rq->mpwqe_stride_sz * rq->mpwqe_num_strides;
-               byte_count = rq->wqe_sz;
+
+               rq->buff.wqe_sz = rq->mpwqe_stride_sz * rq->mpwqe_num_strides;
+               byte_count = rq->buff.wqe_sz;
                rq->mkey_be = cpu_to_be32(c->priv->umr_mkey.key);
                err = mlx5e_rq_alloc_mpwqe_info(rq, c);
                if (err)
                        goto err_rq_wq_destroy;
                break;
        default: /* MLX5_WQ_TYPE_LINKED_LIST */
-               rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
-                                      cpu_to_node(c->cpu));
-               if (!rq->skb) {
+               rq->dma_info = kzalloc_node(wq_sz * sizeof(*rq->dma_info),
+                                           GFP_KERNEL, cpu_to_node(c->cpu));
+               if (!rq->dma_info) {
                        err = -ENOMEM;
                        goto err_rq_wq_destroy;
                }
+
                rq->handle_rx_cqe = mlx5e_handle_rx_cqe;
                rq->alloc_wqe = mlx5e_alloc_rx_wqe;
                rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
 
-               rq->wqe_sz = (priv->params.lro_en) ?
+               rq->buff.wqe_sz = (priv->params.lro_en) ?
                                priv->params.lro_wqe_sz :
                                MLX5E_SW2HW_MTU(priv->netdev->mtu);
-               rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz);
-               byte_count = rq->wqe_sz;
+               byte_count = rq->buff.wqe_sz;
+
+               /* calc the required page order */
+               frag_sz = MLX5_RX_HEADROOM +
+                         byte_count /* packet data */ +
+                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+               frag_sz = SKB_DATA_ALIGN(frag_sz);
+
+               npages = DIV_ROUND_UP(frag_sz, PAGE_SIZE);
+               rq->buff.page_order = order_base_2(npages);
+
                byte_count |= MLX5_HW_START_PADDING;
                rq->mkey_be = c->mkey_be;
        }
@@ -499,7 +512,7 @@ static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
                mlx5e_rq_free_mpwqe_info(rq);
                break;
        default: /* MLX5_WQ_TYPE_LINKED_LIST */
-               kfree(rq->skb);
+               kfree(rq->dma_info);
        }
 
        for (i = rq->page_cache.head; i != rq->page_cache.tail;
index dc8677933f7628212198a8af4331ffa670a87905..d017829b77ebd1fcba6c6fb8e9dfa50b613da4bc 100644 (file)
@@ -179,50 +179,99 @@ unlock:
        mutex_unlock(&priv->state_lock);
 }
 
-int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
+
+static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
+                                     struct mlx5e_dma_info *dma_info)
 {
-       struct sk_buff *skb;
-       dma_addr_t dma_addr;
+       struct mlx5e_page_cache *cache = &rq->page_cache;
+       u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
 
-       skb = napi_alloc_skb(rq->cq.napi, rq->wqe_sz);
-       if (unlikely(!skb))
-               return -ENOMEM;
+       if (tail_next == cache->head) {
+               rq->stats.cache_full++;
+               return false;
+       }
+
+       cache->page_cache[cache->tail] = *dma_info;
+       cache->tail = tail_next;
+       return true;
+}
+
+static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
+                                     struct mlx5e_dma_info *dma_info)
+{
+       struct mlx5e_page_cache *cache = &rq->page_cache;
+
+       if (unlikely(cache->head == cache->tail)) {
+               rq->stats.cache_empty++;
+               return false;
+       }
+
+       if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
+               rq->stats.cache_busy++;
+               return false;
+       }
+
+       *dma_info = cache->page_cache[cache->head];
+       cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
+       rq->stats.cache_reuse++;
+
+       dma_sync_single_for_device(rq->pdev, dma_info->addr,
+                                  RQ_PAGE_SIZE(rq),
+                                  DMA_FROM_DEVICE);
+       return true;
+}
 
-       dma_addr = dma_map_single(rq->pdev,
-                                 /* hw start padding */
-                                 skb->data,
-                                 /* hw end padding */
-                                 rq->wqe_sz,
-                                 DMA_FROM_DEVICE);
+static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
+                                         struct mlx5e_dma_info *dma_info)
+{
+       struct page *page;
 
-       if (unlikely(dma_mapping_error(rq->pdev, dma_addr)))
-               goto err_free_skb;
+       if (mlx5e_rx_cache_get(rq, dma_info))
+               return 0;
 
-       *((dma_addr_t *)skb->cb) = dma_addr;
-       wqe->data.addr = cpu_to_be64(dma_addr);
+       page = dev_alloc_pages(rq->buff.page_order);
+       if (unlikely(!page))
+               return -ENOMEM;
 
-       rq->skb[ix] = skb;
+       dma_info->page = page;
+       dma_info->addr = dma_map_page(rq->pdev, page, 0,
+                                     RQ_PAGE_SIZE(rq), DMA_FROM_DEVICE);
+       if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
+               put_page(page);
+               return -ENOMEM;
+       }
 
        return 0;
+}
 
-err_free_skb:
-       dev_kfree_skb(skb);
+void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
+                       bool recycle)
+{
+       if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
+               return;
+
+       dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
+                      DMA_FROM_DEVICE);
+       put_page(dma_info->page);
+}
+
+int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+{
+       struct mlx5e_dma_info *di = &rq->dma_info[ix];
 
-       return -ENOMEM;
+       if (unlikely(mlx5e_page_alloc_mapped(rq, di)))
+               return -ENOMEM;
+
+       wqe->data.addr = cpu_to_be64(di->addr + MLX5_RX_HEADROOM);
+       return 0;
 }
 
 void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 {
-       struct sk_buff *skb = rq->skb[ix];
+       struct mlx5e_dma_info *di = &rq->dma_info[ix];
 
-       if (skb) {
-               rq->skb[ix] = NULL;
-               dma_unmap_single(rq->pdev,
-                                *((dma_addr_t *)skb->cb),
-                                rq->wqe_sz,
-                                DMA_FROM_DEVICE);
-               dev_kfree_skb(skb);
-       }
+       mlx5e_page_release(rq, di, true);
 }
 
 static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq)
@@ -305,79 +354,6 @@ static inline void mlx5e_post_umr_wqe(struct mlx5e_rq *rq, u16 ix)
        mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0);
 }
 
-static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
-                                     struct mlx5e_dma_info *dma_info)
-{
-       struct mlx5e_page_cache *cache = &rq->page_cache;
-       u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
-
-       if (tail_next == cache->head) {
-               rq->stats.cache_full++;
-               return false;
-       }
-
-       cache->page_cache[cache->tail] = *dma_info;
-       cache->tail = tail_next;
-       return true;
-}
-
-static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
-                                     struct mlx5e_dma_info *dma_info)
-{
-       struct mlx5e_page_cache *cache = &rq->page_cache;
-
-       if (unlikely(cache->head == cache->tail)) {
-               rq->stats.cache_empty++;
-               return false;
-       }
-
-       if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
-               rq->stats.cache_busy++;
-               return false;
-       }
-
-       *dma_info = cache->page_cache[cache->head];
-       cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
-       rq->stats.cache_reuse++;
-
-       dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
-                                  DMA_FROM_DEVICE);
-       return true;
-}
-
-static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
-                                         struct mlx5e_dma_info *dma_info)
-{
-       struct page *page;
-
-       if (mlx5e_rx_cache_get(rq, dma_info))
-               return 0;
-
-       page = dev_alloc_page();
-       if (unlikely(!page))
-               return -ENOMEM;
-
-       dma_info->page = page;
-       dma_info->addr = dma_map_page(rq->pdev, page, 0, PAGE_SIZE,
-                                     DMA_FROM_DEVICE);
-       if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
-               put_page(page);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
-                       bool recycle)
-{
-       if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
-               return;
-
-       dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, DMA_FROM_DEVICE);
-       put_page(dma_info->page);
-}
-
 static int mlx5e_alloc_rx_umr_mpwqe(struct mlx5e_rq *rq,
                                    struct mlx5e_rx_wqe *wqe,
                                    u16 ix)
@@ -448,7 +424,7 @@ void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
        mlx5_wq_ll_update_db_record(wq);
 }
 
-int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe,        u16 ix)
+int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 {
        int err;
 
@@ -658,31 +634,46 @@ static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+       struct mlx5e_dma_info *di;
        struct mlx5e_rx_wqe *wqe;
-       struct sk_buff *skb;
        __be16 wqe_counter_be;
+       struct sk_buff *skb;
        u16 wqe_counter;
        u32 cqe_bcnt;
+       void *va;
 
        wqe_counter_be = cqe->wqe_counter;
        wqe_counter    = be16_to_cpu(wqe_counter_be);
        wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
-       skb            = rq->skb[wqe_counter];
-       prefetch(skb->data);
-       rq->skb[wqe_counter] = NULL;
+       di             = &rq->dma_info[wqe_counter];
+       va             = page_address(di->page);
 
-       dma_unmap_single(rq->pdev,
-                        *((dma_addr_t *)skb->cb),
-                        rq->wqe_sz,
-                        DMA_FROM_DEVICE);
+       dma_sync_single_range_for_cpu(rq->pdev,
+                                     di->addr,
+                                     MLX5_RX_HEADROOM,
+                                     rq->buff.wqe_sz,
+                                     DMA_FROM_DEVICE);
+       prefetch(va + MLX5_RX_HEADROOM);
 
        if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
                rq->stats.wqe_err++;
-               dev_kfree_skb(skb);
+               mlx5e_page_release(rq, di, true);
                goto wq_ll_pop;
        }
 
+       skb = build_skb(va, RQ_PAGE_SIZE(rq));
+       if (unlikely(!skb)) {
+               rq->stats.buff_alloc_err++;
+               mlx5e_page_release(rq, di, true);
+               goto wq_ll_pop;
+       }
+
+       /* queue up for recycling ..*/
+       page_ref_inc(di->page);
+       mlx5e_page_release(rq, di, true);
+
        cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+       skb_reserve(skb, MLX5_RX_HEADROOM);
        skb_put(skb, cqe_bcnt);
 
        mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);