net/mlx5e: Introduce RX Page-Reuse
author     Tariq Toukan <tariqt@mellanox.com>
Sun, 29 Jan 2017 15:42:26 +0000 (17:42 +0200)
committer  Saeed Mahameed <saeedm@mellanox.com>
Thu, 22 Jun 2017 11:30:13 +0000 (14:30 +0300)
Introduce a Page-Reuse mechanism in the non-Striding RQ RX datapath.

A WQE (RX descriptor) buffer is a page that, in most cases, is mostly
wasted: the received packet is much smaller than the page, yet a new
page is required for the next round.

In this patch, we implement a page-reuse mechanism that resembles a
`SW Striding RQ`.
We let the WQE reuse its allocated page for as long as possible, until
the page is fully consumed.  In each round, the WQE is capable of
receiving a packet of maximal size (MTU).  Yet, upon reception of a
packet, the WQE knows the actual packet size and consumes only the
exact amount of memory needed to build a linear SKB.  It then advances
the buffer offset within the page accordingly, for the next round.
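A simplified sketch of the per-completion flow (illustrative only; it
condenses the skb_from_cqe() / mlx5e_free_rx_wqe_reuse() changes below
and is not the literal driver code):

	/* On each completed WQE: consume only what the packet needed,
	 * then decide whether the page can serve the next round.
	 */
	u32 frag_size = MLX5_SKB_FRAG_SZ(rq->rx_headroom + cqe_bcnt);

	wi->offset += frag_size;

	if (rq->wqe.page_reuse &&
	    wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq) &&
	    !mlx5e_page_is_reserved(wi->di.page)) {
		/* room left for another MTU-sized frag: keep the page */
		rq->stats.page_reuse++;
	} else {
		/* page exhausted (or reserved/remote): release and start fresh */
		mlx5e_page_release(rq, &wi->di, true);
		wi->di.page = NULL;
	}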

The feature is mutually exclusive with XDP (which works packet-per-page)
and with LRO (whose session size is a power of two and needs an unused
page).
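Accordingly, reuse is enabled only when neither feature is active; the
en_main.c hunk below sets:

	rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en;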

Performance tests:
iperf TCP tests show a huge gain:

--------------------------------------------
num streams | BW before | BW after | ratio |
          1 |      22.2 |     30.9 | 1.39x |
          8 |      64.2 |     93.6 | 1.46x |
         64 |      56.7 |     91.4 | 1.61x |
--------------------------------------------

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 00ff2a1651e7e50b7434a0930a9afcc29f111567..709f500ef16f2c4c565162425e1d4cf074113bf6 100644
@@ -448,6 +448,11 @@ struct mlx5e_dma_info {
        dma_addr_t      addr;
 };
 
+struct mlx5e_wqe_frag_info {
+       struct mlx5e_dma_info di;
+       u32 offset;
+};
+
 struct mlx5e_umr_dma_info {
        __be64                *mtt;
        dma_addr_t             mtt_addr;
@@ -509,7 +514,12 @@ struct mlx5e_rq {
        struct mlx5_wq_ll      wq;
 
        union {
-               struct mlx5e_dma_info *dma_info;
+               struct {
+                       struct mlx5e_wqe_frag_info *frag_info;
+                       u32 frag_sz;    /* max possible skb frag_sz */
+                       bool page_reuse;
+                       bool xdp_xmit;
+               } wqe;
                struct {
                        struct mlx5e_mpw_info *info;
                        void                  *mtt_no_align;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 66173e5545ce84a05bc37a3c6935054568d2d75b..9f99f624004fd874240beae13ef7ea42d21d279c 100644
@@ -200,6 +200,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
                s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
                s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks;
                s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts;
+               s->rx_page_reuse  += rq_stats->page_reuse;
                s->rx_cache_reuse += rq_stats->cache_reuse;
                s->rx_cache_full  += rq_stats->cache_full;
                s->rx_cache_empty += rq_stats->cache_empty;
@@ -550,7 +551,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
        void *rqc = rqp->rqc;
        void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
        u32 byte_count;
-       u32 frag_sz;
        int npages;
        int wq_sz;
        int err;
@@ -614,9 +614,10 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                        goto err_destroy_umr_mkey;
                break;
        default: /* MLX5_WQ_TYPE_LINKED_LIST */
-               rq->dma_info = kzalloc_node(wq_sz * sizeof(*rq->dma_info),
-                                           GFP_KERNEL, cpu_to_node(c->cpu));
-               if (!rq->dma_info) {
+               rq->wqe.frag_info =
+                       kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
+                                    GFP_KERNEL, cpu_to_node(c->cpu));
+               if (!rq->wqe.frag_info) {
                        err = -ENOMEM;
                        goto err_rq_wq_destroy;
                }
@@ -625,7 +626,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
                rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe;
                if (!rq->handle_rx_cqe) {
-                       kfree(rq->dma_info);
+                       kfree(rq->wqe.frag_info);
                        err = -EINVAL;
                        netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err);
                        goto err_rq_wq_destroy;
@@ -634,11 +635,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                rq->buff.wqe_sz = params->lro_en  ?
                                params->lro_wqe_sz :
                                MLX5E_SW2HW_MTU(c->priv, c->netdev->mtu);
+               rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en;
                byte_count = rq->buff.wqe_sz;
 
                /* calc the required page order */
-               frag_sz = MLX5_SKB_FRAG_SZ(rq->rx_headroom + byte_count);
-               npages = DIV_ROUND_UP(frag_sz, PAGE_SIZE);
+               rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->rx_headroom + byte_count);
+               npages = DIV_ROUND_UP(rq->wqe.frag_sz, PAGE_SIZE);
                rq->buff.page_order = order_base_2(npages);
 
                byte_count |= MLX5_HW_START_PADDING;
@@ -683,7 +685,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
                mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
                break;
        default: /* MLX5_WQ_TYPE_LINKED_LIST */
-               kfree(rq->dma_info);
+               kfree(rq->wqe.frag_info);
        }
 
        for (i = rq->page_cache.head; i != rq->page_cache.tail;
@@ -865,6 +867,16 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
                mlx5_wq_ll_pop(&rq->wq, wqe_ix_be,
                               &wqe->next.next_wqe_index);
        }
+
+       if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST && rq->wqe.page_reuse) {
+               /* Clean outstanding pages on handled WQEs that decided to do page-reuse
+                * but were not yet re-posted.
+                */
+               int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+
+               for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
+                       rq->dealloc_wqe(rq, wqe_ix);
+       }
 }
 
 static int mlx5e_open_rq(struct mlx5e_channel *c,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 2eef4e701ab3f3750e4013ac8cf96c84a8b9e43d..5f3c138c948d3b7bfefa7867786096a74de36743 100644
@@ -160,6 +160,11 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
 
 #define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
 
+static inline bool mlx5e_page_is_reserved(struct page *page)
+{
+       return page_is_pfmemalloc(page) || page_to_nid(page) != numa_node_id();
+}
+
 static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
                                      struct mlx5e_dma_info *dma_info)
 {
@@ -238,22 +243,54 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
        put_page(dma_info->page);
 }
 
+static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
+                                   struct mlx5e_wqe_frag_info *wi)
+{
+       return rq->wqe.page_reuse && wi->di.page &&
+               (wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) &&
+               !mlx5e_page_is_reserved(wi->di.page);
+}
+
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 {
-       struct mlx5e_dma_info *di = &rq->dma_info[ix];
+       struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
-       if (unlikely(mlx5e_page_alloc_mapped(rq, di)))
-               return -ENOMEM;
+       /* check if page exists, hence can be reused */
+       if (!wi->di.page) {
+               if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di)))
+                       return -ENOMEM;
+               wi->offset = 0;
+       }
 
-       wqe->data.addr = cpu_to_be64(di->addr + rq->rx_headroom);
+       wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset +
+                                    rq->rx_headroom);
        return 0;
 }
 
+static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
+                                    struct mlx5e_wqe_frag_info *wi)
+{
+       mlx5e_page_release(rq, &wi->di, true);
+       wi->di.page = NULL;
+}
+
+static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
+                                          struct mlx5e_wqe_frag_info *wi)
+{
+       if (mlx5e_page_reuse(rq, wi)) {
+               rq->stats.page_reuse++;
+               return;
+       }
+
+       mlx5e_free_rx_wqe(rq, wi);
+}
+
 void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 {
-       struct mlx5e_dma_info *di = &rq->dma_info[ix];
+       struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
-       mlx5e_page_release(rq, di, true);
+       if (wi->di.page)
+               mlx5e_free_rx_wqe(rq, wi);
 }
 
 static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq)
@@ -650,7 +687,6 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
        if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE ||
                     MLX5E_SW2HW_MTU(rq->channel->priv, rq->netdev->mtu) < dma_len)) {
                rq->stats.xdp_drop++;
-               mlx5e_page_release(rq, di, true);
                return false;
        }
 
@@ -661,7 +697,6 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
                        sq->db.doorbell = false;
                }
                rq->stats.xdp_tx_full++;
-               mlx5e_page_release(rq, di, true);
                return false;
        }
 
@@ -686,10 +721,15 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 
        cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
 
+       /* move the page reference to sq responsibility,
+        * and mark it so it's not put back in the page-cache.
+        */
+       rq->wqe.xdp_xmit = true;
        sq->db.di[pi] = *di;
        sq->pc++;
 
        sq->db.doorbell = true;
+
        rq->stats.xdp_tx++;
        return true;
 }
@@ -726,36 +766,34 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
                trace_xdp_exception(rq->netdev, prog, act);
        case XDP_DROP:
                rq->stats.xdp_drop++;
-               mlx5e_page_release(rq, di, true);
                return true;
        }
 }
 
 static inline
 struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
-                            u16 wqe_counter, u32 cqe_bcnt)
+                            struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
-       struct mlx5e_dma_info *di;
+       struct mlx5e_dma_info *di = &wi->di;
        struct sk_buff *skb;
        void *va, *data;
        u16 rx_headroom = rq->rx_headroom;
        bool consumed;
        u32 frag_size;
 
-       di             = &rq->dma_info[wqe_counter];
-       va             = page_address(di->page);
+       va             = page_address(di->page) + wi->offset;
        data           = va + rx_headroom;
+       frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
 
        dma_sync_single_range_for_cpu(rq->pdev,
-                                     di->addr,
-                                     rx_headroom,
-                                     rq->buff.wqe_sz,
+                                     di->addr + wi->offset,
+                                     0, frag_size,
                                      DMA_FROM_DEVICE);
        prefetch(data);
+       wi->offset += frag_size;
 
        if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
                rq->stats.wqe_err++;
-               mlx5e_page_release(rq, di, true);
                return NULL;
        }
 
@@ -765,17 +803,14 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
        if (consumed)
                return NULL; /* page/packet was consumed by XDP */
 
-       frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
        skb = build_skb(va, frag_size);
        if (unlikely(!skb)) {
                rq->stats.buff_alloc_err++;
-               mlx5e_page_release(rq, di, true);
                return NULL;
        }
 
-       /* queue up for recycling ..*/
+       /* queue up for recycling/reuse */
        page_ref_inc(di->page);
-       mlx5e_page_release(rq, di, true);
 
        skb_reserve(skb, rx_headroom);
        skb_put(skb, cqe_bcnt);
@@ -785,6 +820,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+       struct mlx5e_wqe_frag_info *wi;
        struct mlx5e_rx_wqe *wqe;
        __be16 wqe_counter_be;
        struct sk_buff *skb;
@@ -794,15 +830,27 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        wqe_counter_be = cqe->wqe_counter;
        wqe_counter    = be16_to_cpu(wqe_counter_be);
        wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+       wi             = &rq->wqe.frag_info[wqe_counter];
        cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
-       skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
-       if (!skb)
+       skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+       if (!skb) {
+               /* probably for XDP */
+               if (rq->wqe.xdp_xmit) {
+                       wi->di.page = NULL;
+                       rq->wqe.xdp_xmit = false;
+                       /* do not return page to cache, it will be returned on XDP_TX completion */
+                       goto wq_ll_pop;
+               }
+               /* probably an XDP_DROP, save the page-reuse checks */
+               mlx5e_free_rx_wqe(rq, wi);
                goto wq_ll_pop;
+       }
 
        mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
        napi_gro_receive(rq->cq.napi, skb);
 
+       mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
        mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
                       &wqe->next.next_wqe_index);
@@ -814,6 +862,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        struct mlx5e_priv *priv = netdev_priv(netdev);
        struct mlx5e_rep_priv *rpriv  = priv->ppriv;
        struct mlx5_eswitch_rep *rep = rpriv->rep;
+       struct mlx5e_wqe_frag_info *wi;
        struct mlx5e_rx_wqe *wqe;
        struct sk_buff *skb;
        __be16 wqe_counter_be;
@@ -823,11 +872,21 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        wqe_counter_be = cqe->wqe_counter;
        wqe_counter    = be16_to_cpu(wqe_counter_be);
        wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+       wi             = &rq->wqe.frag_info[wqe_counter];
        cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
-       skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
-       if (!skb)
+       skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+       if (!skb) {
+               if (rq->wqe.xdp_xmit) {
+                       wi->di.page = NULL;
+                       rq->wqe.xdp_xmit = false;
+                       /* do not return page to cache, it will be returned on XDP_TX completion */
+                       goto wq_ll_pop;
+               }
+               /* probably an XDP_DROP, save the page-reuse checks */
+               mlx5e_free_rx_wqe(rq, wi);
                goto wq_ll_pop;
+       }
 
        mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
@@ -836,6 +895,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
        napi_gro_receive(rq->cq.napi, skb);
 
+       mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
        mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
                       &wqe->next.next_wqe_index);
@@ -1096,6 +1156,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+       struct mlx5e_wqe_frag_info *wi;
        struct mlx5e_rx_wqe *wqe;
        __be16 wqe_counter_be;
        struct sk_buff *skb;
@@ -1105,16 +1166,18 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        wqe_counter_be = cqe->wqe_counter;
        wqe_counter    = be16_to_cpu(wqe_counter_be);
        wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+       wi             = &rq->wqe.frag_info[wqe_counter];
        cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
-       skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
+       skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
        if (!skb)
-               goto wq_ll_pop;
+               goto wq_free_wqe;
 
        mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
        napi_gro_receive(rq->cq.napi, skb);
 
-wq_ll_pop:
+wq_free_wqe:
+       mlx5e_free_rx_wqe_reuse(rq, wi);
        mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
                       &wqe->next.next_wqe_index);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index fda247587ff6f6160b7103b1748c8e2986af6a96..e65517eafc58226b35bcf18e45a67bdc831a16b3 100644
@@ -79,6 +79,7 @@ struct mlx5e_sw_stats {
        u64 rx_buff_alloc_err;
        u64 rx_cqe_compress_blks;
        u64 rx_cqe_compress_pkts;
+       u64 rx_page_reuse;
        u64 rx_cache_reuse;
        u64 rx_cache_full;
        u64 rx_cache_empty;
@@ -117,6 +118,7 @@ static const struct counter_desc sw_stats_desc[] = {
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_page_reuse) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_reuse) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_full) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_empty) },
@@ -319,6 +321,7 @@ struct mlx5e_rq_stats {
        u64 buff_alloc_err;
        u64 cqe_compress_blks;
        u64 cqe_compress_pkts;
+       u64 page_reuse;
        u64 cache_reuse;
        u64 cache_full;
        u64 cache_empty;
@@ -341,6 +344,7 @@ static const struct counter_desc rq_stats_desc[] = {
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
+       { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, page_reuse) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_reuse) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_full) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_empty) },