net/mlx5e: XDP fast RX drop bpf programs support
author Rana Shahout <ranas@mellanox.com>
Wed, 21 Sep 2016 09:19:46 +0000 (12:19 +0300)
committer David S. Miller <davem@davemloft.net>
Thu, 22 Sep 2016 06:51:41 +0000 (02:51 -0400)
Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx5e driver.

When XDP is on we make sure to change channels RQs type to
MLX5_WQ_TYPE_LINKED_LIST rather than "striding RQ" type to
ensure "page per packet".

On XDP set, we fail if HW LRO is set and ask the user to turn it
off.  Since on ConnectX4-LX HW LRO is always on by default, this will be
annoying, but we prefer not to enforce LRO off from the XDP set function.

Full channels reset (close/open) is required only when setting XDP
on/off.

When XDP set is called just to exchange programs, we update each
RQ's xdp program on the fly. To synchronize with the current data
path RX activity of that RQ, we temporarily disable that RQ, ensure
the RX path is not running, quickly update the program and re-enable
the RQ. For that we do:
- rq.state = disabled
- napi_synchronize
- xchg(rq->xdp_prg)
- rq.state = enabled
- napi_schedule // Just in case we've missed an IRQ

Packet rate performance testing was done with pktgen 64B packets on the
TX side, and TC drop action on the RX side compared to XDP fast drop.

CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Comparison is done between:
1. Baseline, Before this patch with TC drop action
2. This patch with TC drop action
3. This patch with XDP RX fast drop

RX Cores  Baseline(TC drop)    TC drop    XDP fast Drop
--------------------------------------------------------------
1            5.3Mpps           5.3Mpps     16.5Mpps
2           10.2Mpps          10.2Mpps     31.3Mpps
4           20.5Mpps          19.9Mpps     36.3Mpps*

*My xmitter was limited to 36.3Mpps, so it is the bottleneck.
It seems that receive side can handle more.

Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_stats.h

index e3331237ce0e17884843c1e0f4dbd3c9482296a8..5e8e669f69c0ebc34f2c30cbc9e52e5c96be5835 100644 (file)
@@ -334,6 +334,7 @@ struct mlx5e_rq {
        int                    ix;
 
        struct mlx5e_rx_am     am; /* Adaptive Moderation */
+       struct bpf_prog       *xdp_prog;
 
        /* control */
        struct mlx5_wq_ctrl    wq_ctrl;
@@ -627,6 +628,7 @@ struct mlx5e_priv {
        /* priv data path fields - start */
        struct mlx5e_sq            **txq_to_sq_map;
        int channeltc_to_txq_map[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC];
+       struct bpf_prog *xdp_prog;
        /* priv data path fields - end */
 
        unsigned long              state;
index ff520ad27f5ecd4f887fcee4227e0f281bf7e3ce..6e95a16226f89c3f2e5c3b97a6ec4b7c2cbbc91d 100644 (file)
@@ -34,6 +34,7 @@
 #include <net/pkt_cls.h>
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
+#include <linux/bpf.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -104,7 +105,8 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type)
 
 static void mlx5e_set_rq_priv_params(struct mlx5e_priv *priv)
 {
-       u8 rq_type = mlx5e_check_fragmented_striding_rq_cap(priv->mdev) ?
+       u8 rq_type = mlx5e_check_fragmented_striding_rq_cap(priv->mdev) &&
+                   !priv->xdp_prog ?
                    MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
                    MLX5_WQ_TYPE_LINKED_LIST;
        mlx5e_set_rq_type_params(priv, rq_type);
@@ -177,6 +179,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
                s->rx_csum_none += rq_stats->csum_none;
                s->rx_csum_complete += rq_stats->csum_complete;
                s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner;
+               s->rx_xdp_drop += rq_stats->xdp_drop;
                s->rx_wqe_err   += rq_stats->wqe_err;
                s->rx_mpwqe_filler += rq_stats->mpwqe_filler;
                s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
@@ -473,6 +476,7 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
        rq->channel = c;
        rq->ix      = c->ix;
        rq->priv    = c->priv;
+       rq->xdp_prog = priv->xdp_prog;
 
        switch (priv->params.rq_wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
@@ -536,6 +540,9 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
        rq->page_cache.head = 0;
        rq->page_cache.tail = 0;
 
+       if (rq->xdp_prog)
+               bpf_prog_add(rq->xdp_prog, 1);
+
        return 0;
 
 err_rq_wq_destroy:
@@ -548,6 +555,9 @@ static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
        int i;
 
+       if (rq->xdp_prog)
+               bpf_prog_put(rq->xdp_prog);
+
        switch (rq->wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
                mlx5e_rq_free_mpwqe_info(rq);
@@ -2955,6 +2965,92 @@ static void mlx5e_tx_timeout(struct net_device *dev)
                schedule_work(&priv->tx_timeout_work);
 }
 
+static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
+{
+       struct mlx5e_priv *priv = netdev_priv(netdev);
+       struct bpf_prog *old_prog;
+       int err = 0;
+       bool reset, was_opened;
+       int i;
+
+       mutex_lock(&priv->state_lock);
+
+       if ((netdev->features & NETIF_F_LRO) && prog) {
+               netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n");
+               err = -EINVAL;
+               goto unlock;
+       }
+
+       was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+       /* no need for full reset when exchanging programs */
+       reset = (!priv->xdp_prog || !prog);
+
+       if (was_opened && reset)
+               mlx5e_close_locked(netdev);
+
+       /* exchange programs */
+       old_prog = xchg(&priv->xdp_prog, prog);
+       if (prog)
+               bpf_prog_add(prog, 1);
+       if (old_prog)
+               bpf_prog_put(old_prog);
+
+       if (reset) /* change RQ type according to priv->xdp_prog */
+               mlx5e_set_rq_priv_params(priv);
+
+       if (was_opened && reset)
+               mlx5e_open_locked(netdev);
+
+       if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || reset)
+               goto unlock;
+
+       /* exchanging programs w/o reset, we update ref counts on behalf
+        * of the channels RQs here.
+        */
+       bpf_prog_add(prog, priv->params.num_channels);
+       for (i = 0; i < priv->params.num_channels; i++) {
+               struct mlx5e_channel *c = priv->channel[i];
+
+               set_bit(MLX5E_RQ_STATE_FLUSH, &c->rq.state);
+               napi_synchronize(&c->napi);
+               /* prevent mlx5e_poll_rx_cq from accessing rq->xdp_prog */
+
+               old_prog = xchg(&c->rq.xdp_prog, prog);
+
+               clear_bit(MLX5E_RQ_STATE_FLUSH, &c->rq.state);
+               /* napi_schedule in case we have missed anything */
+               set_bit(MLX5E_CHANNEL_NAPI_SCHED, &c->flags);
+               napi_schedule(&c->napi);
+
+               if (old_prog)
+                       bpf_prog_put(old_prog);
+       }
+
+unlock:
+       mutex_unlock(&priv->state_lock);
+       return err;
+}
+
+static bool mlx5e_xdp_attached(struct net_device *dev)
+{
+       struct mlx5e_priv *priv = netdev_priv(dev);
+
+       return !!priv->xdp_prog;
+}
+
+static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+       switch (xdp->command) {
+       case XDP_SETUP_PROG:
+               return mlx5e_xdp_set(dev, xdp->prog);
+       case XDP_QUERY_PROG:
+               xdp->prog_attached = mlx5e_xdp_attached(dev);
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
 static const struct net_device_ops mlx5e_netdev_ops_basic = {
        .ndo_open                = mlx5e_open,
        .ndo_stop                = mlx5e_close,
@@ -2974,6 +3070,7 @@ static const struct net_device_ops mlx5e_netdev_ops_basic = {
        .ndo_rx_flow_steer       = mlx5e_rx_flow_steer,
 #endif
        .ndo_tx_timeout          = mlx5e_tx_timeout,
+       .ndo_xdp                 = mlx5e_xdp,
 };
 
 static const struct net_device_ops mlx5e_netdev_ops_sriov = {
@@ -3005,6 +3102,7 @@ static const struct net_device_ops mlx5e_netdev_ops_sriov = {
        .ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
        .ndo_get_vf_stats        = mlx5e_get_vf_stats,
        .ndo_tx_timeout          = mlx5e_tx_timeout,
+       .ndo_xdp                 = mlx5e_xdp,
 };
 
 static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
index a403a797ebef427c2068706b21ac46f6be2a7901..96f6317a95ea1d29c040b179ccd53eacc1d74639 100644 (file)
@@ -632,8 +632,20 @@ static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
        napi_gro_receive(rq->cq.napi, skb);
 }
 
+static inline enum xdp_action mlx5e_xdp_handle(struct mlx5e_rq *rq,
+                                              const struct bpf_prog *prog,
+                                              void *data, u32 len)
+{
+       struct xdp_buff xdp;
+
+       xdp.data = data;
+       xdp.data_end = xdp.data + len;
+       return bpf_prog_run_xdp(prog, &xdp);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+       struct bpf_prog *xdp_prog = READ_ONCE(rq->xdp_prog);
        struct mlx5e_dma_info *di;
        struct mlx5e_rx_wqe *wqe;
        __be16 wqe_counter_be;
@@ -654,6 +666,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
                                      rq->buff.wqe_sz,
                                      DMA_FROM_DEVICE);
        prefetch(va + MLX5_RX_HEADROOM);
+       cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
        if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
                rq->stats.wqe_err++;
@@ -661,6 +674,18 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
                goto wq_ll_pop;
        }
 
+       if (xdp_prog) {
+               enum xdp_action act =
+                       mlx5e_xdp_handle(rq, xdp_prog, va + MLX5_RX_HEADROOM,
+                                        cqe_bcnt);
+
+               if (act != XDP_PASS) {
+                       rq->stats.xdp_drop++;
+                       mlx5e_page_release(rq, di, true);
+                       goto wq_ll_pop;
+               }
+       }
+
        skb = build_skb(va, RQ_PAGE_SIZE(rq));
        if (unlikely(!skb)) {
                rq->stats.buff_alloc_err++;
@@ -672,7 +697,6 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        page_ref_inc(di->page);
        mlx5e_page_release(rq, di, true);
 
-       cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
        skb_reserve(skb, MLX5_RX_HEADROOM);
        skb_put(skb, cqe_bcnt);
 
index 6af8d79e8c2a470507a277fecf053265a484c232..084d6c893a6e0174eef9095af6bc96e14cc59199 100644 (file)
@@ -65,6 +65,7 @@ struct mlx5e_sw_stats {
        u64 rx_csum_none;
        u64 rx_csum_complete;
        u64 rx_csum_unnecessary_inner;
+       u64 rx_xdp_drop;
        u64 tx_csum_partial;
        u64 tx_csum_partial_inner;
        u64 tx_queue_stopped;
@@ -100,6 +101,7 @@ static const struct counter_desc sw_stats_desc[] = {
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) },
+       { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) },
        { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
@@ -278,6 +280,7 @@ struct mlx5e_rq_stats {
        u64 csum_none;
        u64 lro_packets;
        u64 lro_bytes;
+       u64 xdp_drop;
        u64 wqe_err;
        u64 mpwqe_filler;
        u64 buff_alloc_err;
@@ -295,6 +298,7 @@ static const struct counter_desc rq_stats_desc[] = {
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) },
+       { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_packets) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_bytes) },
        { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, wqe_err) },