mlx4: reorganize struct mlx4_en_tx_ring
authorEric Dumazet <edumazet@google.com>
Tue, 22 Nov 2016 23:56:10 +0000 (15:56 -0800)
committerDavid S. Miller <davem@davemloft.net>
Thu, 24 Nov 2016 21:03:37 +0000 (16:03 -0500)
Goal is to reorganize this critical structure to increase performance.

ndo_start_xmit() should only dirty one cache line, and access as few
cache lines as possible.

Add sp_ (Slow Path) prefix to fields that are not used in fast path,
to make clear what is going on.

After this patch pahole reports something much better, as all
ndo_start_xmit() needed fields are packed into two cache lines instead
of seven or eight.

struct mlx4_en_tx_ring {
u32                        last_nr_txbb;         /*     0   0x4 */
u32                        cons;                 /*   0x4   0x4 */
long unsigned int          wake_queue;           /*   0x8   0x8 */
struct netdev_queue *      tx_queue;             /*  0x10   0x8 */
u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /*  0x18   0x8 */
struct mlx4_en_rx_ring *   recycle_ring;         /*  0x20   0x8 */

/* XXX 24 bytes hole, try to pack */

/* --- cacheline 1 boundary (64 bytes) --- */
u32                        prod;                 /*  0x40   0x4 */
unsigned int               tx_dropped;           /*  0x44   0x4 */
long unsigned int          bytes;                /*  0x48   0x8 */
long unsigned int          packets;              /*  0x50   0x8 */
long unsigned int          tx_csum;              /*  0x58   0x8 */
long unsigned int          tso_packets;          /*  0x60   0x8 */
long unsigned int          xmit_more;            /*  0x68   0x8 */
struct mlx4_bf             bf;                   /*  0x70  0x18 */
/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
__be32                     doorbell_qpn;         /*  0x88   0x4 */
__be32                     mr_key;               /*  0x8c   0x4 */
u32                        size;                 /*  0x90   0x4 */
u32                        size_mask;            /*  0x94   0x4 */
u32                        full_size;            /*  0x98   0x4 */
u32                        buf_size;             /*  0x9c   0x4 */
void *                     buf;                  /*  0xa0   0x8 */
struct mlx4_en_tx_info *   tx_info;              /*  0xa8   0x8 */
int                        qpn;                  /*  0xb0   0x4 */
u8                         queue_index;          /*  0xb4   0x1 */
bool                       bf_enabled;           /*  0xb5   0x1 */
bool                       bf_alloced;           /*  0xb6   0x1 */
u8                         hwtstamp_tx_type;     /*  0xb7   0x1 */
u8 *                       bounce_buf;           /*  0xb8   0x8 */
/* --- cacheline 3 boundary (192 bytes) --- */
long unsigned int          queue_stopped;        /*  0xc0   0x8 */
struct mlx4_hwq_resources  sp_wqres;             /*  0xc8  0x58 */
/* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
struct mlx4_qp             sp_qp;                /* 0x120  0x30 */
/* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
struct mlx4_qp_context     sp_context;           /* 0x150  0xf8 */
/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
cpumask_t                  sp_affinity_mask;     /* 0x248  0x20 */
enum mlx4_qp_state         sp_qp_state;          /* 0x268   0x4 */
u16                        sp_stride;            /* 0x26c   0x2 */
u16                        sp_cqn;               /* 0x26e   0x2 */

/* size: 640, cachelines: 10, members: 36 */
/* sum members: 600, holes: 1, sum holes: 24 */
/* padding: 16 */
};

Instead of this silly placement:

struct mlx4_en_tx_ring {
u32                        last_nr_txbb;         /*     0   0x4 */
u32                        cons;                 /*   0x4   0x4 */
long unsigned int          wake_queue;           /*   0x8   0x8 */

/* XXX 48 bytes hole, try to pack */

/* --- cacheline 1 boundary (64 bytes) --- */
u32                        prod;                 /*  0x40   0x4 */

/* XXX 4 bytes hole, try to pack */

long unsigned int          bytes;                /*  0x48   0x8 */
long unsigned int          packets;              /*  0x50   0x8 */
long unsigned int          tx_csum;              /*  0x58   0x8 */
long unsigned int          tso_packets;          /*  0x60   0x8 */
long unsigned int          xmit_more;            /*  0x68   0x8 */
unsigned int               tx_dropped;           /*  0x70   0x4 */

/* XXX 4 bytes hole, try to pack */

struct mlx4_bf             bf;                   /*  0x78  0x18 */
/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
long unsigned int          queue_stopped;        /*  0x90   0x8 */
cpumask_t                  affinity_mask;        /*  0x98  0x10 */
struct mlx4_qp             qp;                   /*  0xa8  0x30 */
/* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
struct mlx4_hwq_resources  wqres;                /*  0xd8  0x58 */
/* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
u32                        size;                 /* 0x130   0x4 */
u32                        size_mask;            /* 0x134   0x4 */
u16                        stride;               /* 0x138   0x2 */

/* XXX 2 bytes hole, try to pack */

u32                        full_size;            /* 0x13c   0x4 */
/* --- cacheline 5 boundary (320 bytes) --- */
u16                        cqn;                  /* 0x140   0x2 */

/* XXX 2 bytes hole, try to pack */

u32                        buf_size;             /* 0x144   0x4 */
__be32                     doorbell_qpn;         /* 0x148   0x4 */
__be32                     mr_key;               /* 0x14c   0x4 */
void *                     buf;                  /* 0x150   0x8 */
struct mlx4_en_tx_info *   tx_info;              /* 0x158   0x8 */
struct mlx4_en_rx_ring *   recycle_ring;         /* 0x160   0x8 */
u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /* 0x168   0x8 */
u8 *                       bounce_buf;           /* 0x170   0x8 */
struct mlx4_qp_context     context;              /* 0x178  0xf8 */
/* --- cacheline 9 boundary (576 bytes) was 48 bytes ago --- */
int                        qpn;                  /* 0x270   0x4 */
enum mlx4_qp_state         qp_state;             /* 0x274   0x4 */
u8                         queue_index;          /* 0x278   0x1 */
bool                       bf_enabled;           /* 0x279   0x1 */
bool                       bf_alloced;           /* 0x27a   0x1 */

/* XXX 5 bytes hole, try to pack */

/* --- cacheline 10 boundary (640 bytes) --- */
struct netdev_queue *      tx_queue;             /* 0x280   0x8 */
int                        hwtstamp_tx_type;     /* 0x288   0x4 */

/* size: 704, cachelines: 11, members: 36 */
/* sum members: 587, holes: 6, sum holes: 65 */
/* padding: 52 */
};

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h

index 9a807e93c9fdd81e61e561208aa1480a244d0bdb..9018bb1b2e12142e048281a9d28ddf95e0023a61 100644 (file)
@@ -1305,7 +1305,7 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
                if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i)))
                        continue;
                en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n",
-                       i, tx_ring->qpn, tx_ring->cqn,
+                       i, tx_ring->qpn, tx_ring->sp_cqn,
                        tx_ring->cons, tx_ring->prod);
        }
 
index 5de3cbe24f2bf467f9a8f7d499e131b6d2a1844c..4b597dca5c52d114344d638895275ed0d378bd96 100644 (file)
@@ -66,7 +66,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 
        ring->size = size;
        ring->size_mask = size - 1;
-       ring->stride = stride;
+       ring->sp_stride = stride;
        ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
 
        tmp = size * sizeof(struct mlx4_en_tx_info);
@@ -90,22 +90,22 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
                        goto err_info;
                }
        }
-       ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
+       ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE);
 
        /* Allocate HW buffers on provided NUMA node */
        set_dev_node(&mdev->dev->persist->pdev->dev, node);
-       err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+       err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
        set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
        if (err) {
                en_err(priv, "Failed allocating hwq resources\n");
                goto err_bounce;
        }
 
-       ring->buf = ring->wqres.buf.direct.buf;
+       ring->buf = ring->sp_wqres.buf.direct.buf;
 
        en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
               ring, ring->buf, ring->size, ring->buf_size,
-              (unsigned long long) ring->wqres.buf.direct.map);
+              (unsigned long long) ring->sp_wqres.buf.direct.map);
 
        err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
                                    MLX4_RESERVE_ETH_BF_QP);
@@ -114,12 +114,12 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
                goto err_hwq_res;
        }
 
-       err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
+       err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp, GFP_KERNEL);
        if (err) {
                en_err(priv, "Failed allocating qp %d\n", ring->qpn);
                goto err_reserve;
        }
-       ring->qp.event = mlx4_en_sqp_event;
+       ring->sp_qp.event = mlx4_en_sqp_event;
 
        err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
        if (err) {
@@ -141,7 +141,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
        if (queue_index < priv->num_tx_rings_p_up)
                cpumask_set_cpu(cpumask_local_spread(queue_index,
                                                     priv->mdev->dev->numa_node),
-                               &ring->affinity_mask);
+                               &ring->sp_affinity_mask);
 
        *pring = ring;
        return 0;
@@ -149,7 +149,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 err_reserve:
        mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
 err_hwq_res:
-       mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+       mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
 err_bounce:
        kfree(ring->bounce_buf);
        ring->bounce_buf = NULL;
@@ -171,10 +171,10 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 
        if (ring->bf_alloced)
                mlx4_bf_free(mdev->dev, &ring->bf);
-       mlx4_qp_remove(mdev->dev, &ring->qp);
-       mlx4_qp_free(mdev->dev, &ring->qp);
+       mlx4_qp_remove(mdev->dev, &ring->sp_qp);
+       mlx4_qp_free(mdev->dev, &ring->sp_qp);
        mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
-       mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+       mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
        kfree(ring->bounce_buf);
        ring->bounce_buf = NULL;
        kvfree(ring->tx_info);
@@ -190,7 +190,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
        struct mlx4_en_dev *mdev = priv->mdev;
        int err;
 
-       ring->cqn = cq;
+       ring->sp_cqn = cq;
        ring->prod = 0;
        ring->cons = 0xffffffff;
        ring->last_nr_txbb = 1;
@@ -198,21 +198,21 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
        memset(ring->buf, 0, ring->buf_size);
        ring->free_tx_desc = mlx4_en_free_tx_desc;
 
-       ring->qp_state = MLX4_QP_STATE_RST;
-       ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
+       ring->sp_qp_state = MLX4_QP_STATE_RST;
+       ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8);
        ring->mr_key = cpu_to_be32(mdev->mr.key);
 
-       mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
-                               ring->cqn, user_prio, &ring->context);
+       mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn,
+                               ring->sp_cqn, user_prio, &ring->sp_context);
        if (ring->bf_alloced)
-               ring->context.usr_page =
+               ring->sp_context.usr_page =
                        cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
                                                         ring->bf.uar->index));
 
-       err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
-                              &ring->qp, &ring->qp_state);
-       if (!cpumask_empty(&ring->affinity_mask))
-               netif_set_xps_queue(priv->dev, &ring->affinity_mask,
+       err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context,
+                              &ring->sp_qp, &ring->sp_qp_state);
+       if (!cpumask_empty(&ring->sp_affinity_mask))
+               netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask,
                                    ring->queue_index);
 
        return err;
@@ -223,8 +223,8 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
 {
        struct mlx4_en_dev *mdev = priv->mdev;
 
-       mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
-                      MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
+       mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state,
+                      MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
 }
 
 static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
index eff21651b67308a17fe3d60d236cd0b6800a3fd2..574bcbb1b38fc4758511d8f7bd17a87b0a507a73 100644 (file)
@@ -281,46 +281,50 @@ struct mlx4_en_tx_ring {
        u32                     last_nr_txbb;
        u32                     cons;
        unsigned long           wake_queue;
+       struct netdev_queue     *tx_queue;
+       u32                     (*free_tx_desc)(struct mlx4_en_priv *priv,
+                                               struct mlx4_en_tx_ring *ring,
+                                               int index, u8 owner,
+                                               u64 timestamp, int napi_mode);
+       struct mlx4_en_rx_ring  *recycle_ring;
 
        /* cache line used and dirtied in mlx4_en_xmit() */
        u32                     prod ____cacheline_aligned_in_smp;
+       unsigned int            tx_dropped;
        unsigned long           bytes;
        unsigned long           packets;
        unsigned long           tx_csum;
        unsigned long           tso_packets;
        unsigned long           xmit_more;
-       unsigned int            tx_dropped;
        struct mlx4_bf          bf;
-       unsigned long           queue_stopped;
 
        /* Following part should be mostly read */
-       cpumask_t               affinity_mask;
-       struct mlx4_qp          qp;
-       struct mlx4_hwq_resources wqres;
+       __be32                  doorbell_qpn;
+       __be32                  mr_key;
        u32                     size; /* number of TXBBs */
        u32                     size_mask;
-       u16                     stride;
        u32                     full_size;
-       u16                     cqn;    /* index of port CQ associated with this ring */
        u32                     buf_size;
-       __be32                  doorbell_qpn;
-       __be32                  mr_key;
        void                    *buf;
        struct mlx4_en_tx_info  *tx_info;
-       struct mlx4_en_rx_ring  *recycle_ring;
-       u32                     (*free_tx_desc)(struct mlx4_en_priv *priv,
-                                               struct mlx4_en_tx_ring *ring,
-                                               int index, u8 owner,
-                                               u64 timestamp, int napi_mode);
-       u8                      *bounce_buf;
-       struct mlx4_qp_context  context;
        int                     qpn;
-       enum mlx4_qp_state      qp_state;
        u8                      queue_index;
        bool                    bf_enabled;
        bool                    bf_alloced;
-       struct netdev_queue     *tx_queue;
-       int                     hwtstamp_tx_type;
+       u8                      hwtstamp_tx_type;
+       u8                      *bounce_buf;
+
+       /* Not used in fast path
+        * Only queue_stopped might be used if BQL is not properly working.
+        */
+       unsigned long           queue_stopped;
+       struct mlx4_hwq_resources sp_wqres;
+       struct mlx4_qp          sp_qp;
+       struct mlx4_qp_context  sp_context;
+       cpumask_t               sp_affinity_mask;
+       enum mlx4_qp_state      sp_qp_state;
+       u16                     sp_stride;
+       u16                     sp_cqn; /* index of port CQ associated with this ring */
 } ____cacheline_aligned_in_smp;
 
 struct mlx4_en_rx_desc {