From e3f42f8453902d1520dcdc9909731f274c6d52c1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Nov 2016 15:56:10 -0800 Subject: [PATCH] mlx4: reorganize struct mlx4_en_tx_ring Goal is to reorganize this critical structure to increase performance. ndo_start_xmit() should only dirty one cache line, and access as few cache lines as possible. Add sp_ (Slow Path) prefix to fields that are not used in fast path, to make clear what is going on. After this patch pahole reports something much better, as all ndo_start_xmit() needed fields are packed into two cache lines instead of seven or eight struct mlx4_en_tx_ring { u32 last_nr_txbb; /* 0 0x4 */ u32 cons; /* 0x4 0x4 */ long unsigned int wake_queue; /* 0x8 0x8 */ struct netdev_queue * tx_queue; /* 0x10 0x8 */ u32 (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /* 0x18 0x8 */ struct mlx4_en_rx_ring * recycle_ring; /* 0x20 0x8 */ /* XXX 24 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 prod; /* 0x40 0x4 */ unsigned int tx_dropped; /* 0x44 0x4 */ long unsigned int bytes; /* 0x48 0x8 */ long unsigned int packets; /* 0x50 0x8 */ long unsigned int tx_csum; /* 0x58 0x8 */ long unsigned int tso_packets; /* 0x60 0x8 */ long unsigned int xmit_more; /* 0x68 0x8 */ struct mlx4_bf bf; /* 0x70 0x18 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ __be32 doorbell_qpn; /* 0x88 0x4 */ __be32 mr_key; /* 0x8c 0x4 */ u32 size; /* 0x90 0x4 */ u32 size_mask; /* 0x94 0x4 */ u32 full_size; /* 0x98 0x4 */ u32 buf_size; /* 0x9c 0x4 */ void * buf; /* 0xa0 0x8 */ struct mlx4_en_tx_info * tx_info; /* 0xa8 0x8 */ int qpn; /* 0xb0 0x4 */ u8 queue_index; /* 0xb4 0x1 */ bool bf_enabled; /* 0xb5 0x1 */ bool bf_alloced; /* 0xb6 0x1 */ u8 hwtstamp_tx_type; /* 0xb7 0x1 */ u8 * bounce_buf; /* 0xb8 0x8 */ /* --- cacheline 3 boundary (192 bytes) --- */ long unsigned int queue_stopped; /* 0xc0 0x8 */ struct mlx4_hwq_resources sp_wqres; /* 0xc8 0x58 */ /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */ struct mlx4_qp sp_qp; /* 0x120 0x30 */ /* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */ struct mlx4_qp_context sp_context; /* 0x150 0xf8 */ /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */ cpumask_t sp_affinity_mask; /* 0x248 0x20 */ enum mlx4_qp_state sp_qp_state; /* 0x268 0x4 */ u16 sp_stride; /* 0x26c 0x2 */ u16 sp_cqn; /* 0x26e 0x2 */ /* size: 640, cachelines: 10, members: 36 */ /* sum members: 600, holes: 1, sum holes: 24 */ /* padding: 16 */ }; Instead of this silly placement : struct mlx4_en_tx_ring { u32 last_nr_txbb; /* 0 0x4 */ u32 cons; /* 0x4 0x4 */ long unsigned int wake_queue; /* 0x8 0x8 */ /* XXX 48 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 prod; /* 0x40 0x4 */ /* XXX 4 bytes hole, try to pack */ long unsigned int bytes; /* 0x48 0x8 */ long unsigned int packets; /* 0x50 0x8 */ long unsigned int tx_csum; /* 0x58 0x8 */ long unsigned int tso_packets; /* 0x60 0x8 */ long unsigned int xmit_more; /* 0x68 0x8 */ unsigned int tx_dropped; /* 0x70 0x4 */ /* XXX 4 bytes hole, try to pack */ struct mlx4_bf bf; /* 0x78 0x18 */ /* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */ long unsigned int queue_stopped; /* 0x90 0x8 */ cpumask_t affinity_mask; /* 0x98 0x10 */ struct mlx4_qp qp; /* 0xa8 0x30 */ /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ struct mlx4_hwq_resources wqres; /* 0xd8 0x58 */ /* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */ u32 size; /* 0x130 0x4 */ u32 size_mask; /* 0x134 0x4 */ u16 stride; /* 0x138 0x2 */ /* XXX 2 bytes hole, try to pack */ u32 full_size; /* 0x13c 0x4 */ /* --- cacheline 5 boundary (320 bytes) --- */ u16 cqn; /* 0x140 0x2 */ /* XXX 2 bytes hole, try to pack */ u32 buf_size; /* 0x144 0x4 */ __be32 doorbell_qpn; /* 0x148 0x4 */ __be32 mr_key; /* 0x14c 0x4 */ void * buf; /* 0x150 0x8 */ struct mlx4_en_tx_info * tx_info; /* 0x158 0x8 */ struct mlx4_en_rx_ring * recycle_ring; /* 0x160 0x8 */ u32 (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /* 0x168 0x8 */ u8 * bounce_buf; /* 0x170 0x8 */ struct mlx4_qp_context context; /* 0x178 0xf8 */ /* --- cacheline 9 boundary (576 bytes) was 48 bytes ago --- */ int qpn; /* 0x270 0x4 */ enum mlx4_qp_state qp_state; /* 0x274 0x4 */ u8 queue_index; /* 0x278 0x1 */ bool bf_enabled; /* 0x279 0x1 */ bool bf_alloced; /* 0x27a 0x1 */ /* XXX 5 bytes hole, try to pack */ /* --- cacheline 10 boundary (640 bytes) --- */ struct netdev_queue * tx_queue; /* 0x280 0x8 */ int hwtstamp_tx_type; /* 0x288 0x4 */ /* size: 704, cachelines: 11, members: 36 */ /* sum members: 587, holes: 6, sum holes: 65 */ /* padding: 52 */ }; Signed-off-by: Eric Dumazet Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 48 +++++++++---------- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 42 ++++++++-------- 3 files changed, 48 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 9a807e93c9fd..9018bb1b2e12 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1305,7 +1305,7 @@ static void mlx4_en_tx_timeout(struct net_device *dev) if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i))) continue; en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n", - i, tx_ring->qpn, tx_ring->cqn, + i, tx_ring->qpn, tx_ring->sp_cqn, tx_ring->cons, tx_ring->prod); } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 5de3cbe24f2b..4b597dca5c52 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -66,7 +66,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->size = size; ring->size_mask = size - 1; - ring->stride = stride; + ring->sp_stride = stride; ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS; tmp = size * sizeof(struct mlx4_en_tx_info); @@ -90,22 +90,22 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, goto err_info; } } - ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); + ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ set_dev_node(&mdev->dev->persist->pdev->dev, node); - err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); + err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size); set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_bounce; } - ring->buf = ring->wqres.buf.direct.buf; + ring->buf = ring->sp_wqres.buf.direct.buf; en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, - (unsigned long long) ring->wqres.buf.direct.map); + (unsigned long long) ring->sp_wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, MLX4_RESERVE_ETH_BF_QP); @@ -114,12 +114,12 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, goto err_hwq_res; } - err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL); + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; } - ring->qp.event = mlx4_en_sqp_event; + ring->sp_qp.event = mlx4_en_sqp_event; err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); if (err) { @@ -141,7 +141,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, if (queue_index < priv->num_tx_rings_p_up) cpumask_set_cpu(cpumask_local_spread(queue_index, priv->mdev->dev->numa_node), - &ring->affinity_mask); + &ring->sp_affinity_mask); *pring = ring; return 0; @@ -149,7 +149,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, err_reserve: mlx4_qp_release_range(mdev->dev, ring->qpn, 1); err_hwq_res: - mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); + mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size); err_bounce: kfree(ring->bounce_buf); ring->bounce_buf = NULL; @@ -171,10 +171,10 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, if (ring->bf_alloced) mlx4_bf_free(mdev->dev, &ring->bf); - mlx4_qp_remove(mdev->dev, &ring->qp); - mlx4_qp_free(mdev->dev, &ring->qp); + mlx4_qp_remove(mdev->dev, &ring->sp_qp); + mlx4_qp_free(mdev->dev, &ring->sp_qp); mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1); - mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); + mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size); kfree(ring->bounce_buf); ring->bounce_buf = NULL; kvfree(ring->tx_info); @@ -190,7 +190,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_dev *mdev = priv->mdev; int err; - ring->cqn = cq; + ring->sp_cqn = cq; ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; @@ -198,21 +198,21 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, memset(ring->buf, 0, ring->buf_size); ring->free_tx_desc = mlx4_en_free_tx_desc; - ring->qp_state = MLX4_QP_STATE_RST; - ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8); + ring->sp_qp_state = MLX4_QP_STATE_RST; + ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8); ring->mr_key = cpu_to_be32(mdev->mr.key); - mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, - ring->cqn, user_prio, &ring->context); + mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn, + ring->sp_cqn, user_prio, &ring->sp_context); if (ring->bf_alloced) - ring->context.usr_page = + ring->sp_context.usr_page = cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev, ring->bf.uar->index)); - err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, - &ring->qp, &ring->qp_state); - if (!cpumask_empty(&ring->affinity_mask)) - netif_set_xps_queue(priv->dev, &ring->affinity_mask, + err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context, + &ring->sp_qp, &ring->sp_qp_state); + if (!cpumask_empty(&ring->sp_affinity_mask)) + netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask, ring->queue_index); return err; @@ -223,8 +223,8 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, { struct mlx4_en_dev *mdev = priv->mdev; - mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, - MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp); + mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state, + MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp); } static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index eff21651b673..574bcbb1b38f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -281,46 +281,50 @@ struct mlx4_en_tx_ring { u32 last_nr_txbb; u32 cons; unsigned long wake_queue; + struct netdev_queue *tx_queue; + u32 (*free_tx_desc)(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, + u64 timestamp, int napi_mode); + struct mlx4_en_rx_ring *recycle_ring; /* cache line used and dirtied in mlx4_en_xmit() */ u32 prod ____cacheline_aligned_in_smp; + unsigned int tx_dropped; unsigned long bytes; unsigned long packets; unsigned long tx_csum; unsigned long tso_packets; unsigned long xmit_more; - unsigned int tx_dropped; struct mlx4_bf bf; - unsigned long queue_stopped; /* Following part should be mostly read */ - cpumask_t affinity_mask; - struct mlx4_qp qp; - struct mlx4_hwq_resources wqres; + __be32 doorbell_qpn; + __be32 mr_key; u32 size; /* number of TXBBs */ u32 size_mask; - u16 stride; u32 full_size; - u16 cqn; /* index of port CQ associated with this ring */ u32 buf_size; - __be32 doorbell_qpn; - __be32 mr_key; void *buf; struct mlx4_en_tx_info *tx_info; - struct mlx4_en_rx_ring *recycle_ring; - u32 (*free_tx_desc)(struct mlx4_en_priv *priv, - struct mlx4_en_tx_ring *ring, - int index, u8 owner, - u64 timestamp, int napi_mode); - u8 *bounce_buf; - struct mlx4_qp_context context; int qpn; - enum mlx4_qp_state qp_state; u8 queue_index; bool bf_enabled; bool bf_alloced; - struct netdev_queue *tx_queue; - int hwtstamp_tx_type; + u8 hwtstamp_tx_type; + u8 *bounce_buf; + + /* Not used in fast path + * Only queue_stopped might be used if BQL is not properly working. + */ + unsigned long queue_stopped; + struct mlx4_hwq_resources sp_wqres; + struct mlx4_qp sp_qp; + struct mlx4_qp_context sp_context; + cpumask_t sp_affinity_mask; + enum mlx4_qp_state sp_qp_state; + u16 sp_stride; + u16 sp_cqn; /* index of port CQ associated with this ring */ } ____cacheline_aligned_in_smp; struct mlx4_en_rx_desc { -- 2.20.1