[1/1] mbuf: move pool pointer in first half

Message ID 20201107155306.463148-1-thomas@monjalon.net (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [1/1] mbuf: move pool pointer in first half

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/travis-robot success Travis build: passed
ci/iol-mellanox-Performance success Performance Testing PASS
ci/Intel-compilation success Compilation OK

Commit Message

Thomas Monjalon Nov. 7, 2020, 3:53 p.m. UTC
  The mempool pointer in the mbuf struct is moved
from the second to the first half.
It should increase performance on most systems having a 64-byte cache line,
i.e. where the mbuf is split across two cache lines.

Due to this change, tx_offload is moved, so some vector data paths
may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!

Moving this field gives more space to dynfield1.

This is what the mbuf layout looks like (pahole-style):

word  type                              name                byte  size
 0    void *                            buf_addr;         /*   0 +  8 */
 1    rte_iova_t                        buf_iova;         /*   8 +  8 */
      /* --- RTE_MARKER64               rearm_data;                   */
 2    uint16_t                          data_off;         /*  16 +  2 */
      uint16_t                          refcnt;           /*  18 +  2 */
      uint16_t                          nb_segs;          /*  20 +  2 */
      uint16_t                          port;             /*  22 +  2 */
 3    uint64_t                          ol_flags;         /*  24 +  8 */
      /* --- RTE_MARKER                 rx_descriptor_fields1;        */
 4    uint32_t             union        packet_type;      /*  32 +  4 */
      uint32_t                          pkt_len;          /*  36 +  4 */
 5    uint16_t                          data_len;         /*  40 +  2 */
      uint16_t                          vlan_tci;         /*  42 +  2 */
 5.5  uint64_t             union        hash;             /*  44 +  8 */
 6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
      uint16_t                          buf_len;          /*  54 +  2 */
 7    struct rte_mempool *              pool;             /*  56 +  8 */
      /* --- RTE_MARKER                 cacheline1;                   */
 8    struct rte_mbuf *                 next;             /*  64 +  8 */
 9    uint64_t             union        tx_offload;       /*  72 +  8 */
10    struct rte_mbuf_ext_shared_info * shinfo;           /*  80 +  8 */
11    uint16_t                          priv_size;        /*  88 +  2 */
      uint16_t                          timesync;         /*  90 +  2 */
11.5  uint32_t                          dynfield1[9];     /*  92 + 36 */
16    /* --- END                                             128      */

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 doc/guides/rel_notes/deprecation.rst | 7 -------
 drivers/net/octeontx2/otx2_ethdev.c  | 2 --
 lib/librte_kni/rte_kni_common.h      | 3 ++-
 lib/librte_mbuf/rte_mbuf.h           | 1 -
 lib/librte_mbuf/rte_mbuf_core.h      | 5 ++---
 lib/librte_mbuf/rte_mbuf_dyn.c       | 1 -
 6 files changed, 4 insertions(+), 15 deletions(-)
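
As a reference point for the layout shown in the commit message, here is a
minimal sketch of how the new offsets could be checked at build time. This is
illustrative only and not part of the patch; the helper name is made up, and
the offsets assume a 64-bit build with 64-byte cache lines.

#include <stddef.h>
#include <rte_common.h>
#include <rte_mbuf.h>

/* Illustrative sketch: build-time asserts mirroring the pahole-style layout
 * above (pool at byte 56, next opening the second cache line, 128-byte mbuf).
 * Not part of the patch; offsets assume a 64-bit, 64-byte cache line build. */
static inline void
mbuf_layout_sanity_check(void)
{
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pool) != 56);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, next) !=
			 RTE_CACHE_LINE_MIN_SIZE);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) != 72);
	RTE_BUILD_BUG_ON(sizeof(struct rte_mbuf) != 2 * RTE_CACHE_LINE_MIN_SIZE);
}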
  

Comments

Jerin Jacob Nov. 7, 2020, 5:12 p.m. UTC | #1
On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> The mempool pointer in the mbuf struct is moved
> from the second to the first half.
> It should increase performance on most systems having 64-byte cache line,

> i.e. mbuf is split in two cache lines.

But in any event, Tx needs to touch the pool to free the mbuf back to the
pool upon Tx completion, right?
I am not able to understand the motivation for moving it to the first 64B cache line.
The gain varies from driver to driver. For example, a typical
ARM-based NPU does not need to
touch the pool in Rx, as it has been filled by HW, whereas it needs to
touch it in Tx if the reference count is implemented.
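
As a minimal sketch (not from the patch) of the point being made here, this is
where m->pool is read on a generic per-mbuf Tx completion path when reference
counting is in play; the function name is illustrative.

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Illustrative sketch of a generic Tx completion free:
 * rte_pktmbuf_prefree_seg() handles refcnt/detach, then the pool
 * pointer is read to return the buffer to its mempool. */
static inline void
tx_completion_free(struct rte_mbuf *m)
{
	m = rte_pktmbuf_prefree_seg(m);
	if (m != NULL)
		rte_mempool_put(m->pool, m); /* m->pool touched on Tx free */
}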


>
> Due to this change, tx_offload is moved, so some vector data paths
> may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!

It will break the Tx path. Please don't remove the static
assert without adjusting the code.

>
> Moving this field gives more space to dynfield1.
>
> This is how the mbuf layout looks like (pahole-style):
>
> word  type                              name                byte  size
>  0    void *                            buf_addr;         /*   0 +  8 */
>  1    rte_iova_t                        buf_iova          /*   8 +  8 */
>       /* --- RTE_MARKER64               rearm_data;                   */
>  2    uint16_t                          data_off;         /*  16 +  2 */
>       uint16_t                          refcnt;           /*  18 +  2 */
>       uint16_t                          nb_segs;          /*  20 +  2 */
>       uint16_t                          port;             /*  22 +  2 */
>  3    uint64_t                          ol_flags;         /*  24 +  8 */
>       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
>  4    uint32_t             union        packet_type;      /*  32 +  4 */
>       uint32_t                          pkt_len;          /*  36 +  4 */
>  5    uint16_t                          data_len;         /*  40 +  2 */
>       uint16_t                          vlan_tci;         /*  42 +  2 */
>  5.5  uint64_t             union        hash;             /*  44 +  8 */
>  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
>       uint16_t                          buf_len;          /*  54 +  2 */
>  7    struct rte_mempool *              pool;             /*  56 +  8 */
>       /* --- RTE_MARKER                 cacheline1;                   */
>  8    struct rte_mbuf *                 next;             /*  64 +  8 */
>  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> 10    struct rte_mbuf_ext_shared_info * shinfo;           /*  80 +  8 */
> 11    uint16_t                          priv_size;        /*  88 +  2 */
>       uint16_t                          timesync;         /*  90 +  2 */
> 11.5  uint32_t                          dynfield1[9];     /*  92 + 36 */
> 16    /* --- END                                             128      */
>
> Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
> ---
>  doc/guides/rel_notes/deprecation.rst | 7 -------
>  drivers/net/octeontx2/otx2_ethdev.c  | 2 --
>  lib/librte_kni/rte_kni_common.h      | 3 ++-
>  lib/librte_mbuf/rte_mbuf.h           | 1 -
>  lib/librte_mbuf/rte_mbuf_core.h      | 5 ++---
>  lib/librte_mbuf/rte_mbuf_dyn.c       | 1 -
>  6 files changed, 4 insertions(+), 15 deletions(-)
>
> diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
> index f3258eb3f7..efb09f0c5e 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -81,13 +81,6 @@ Deprecation Notices
>    us extending existing enum/define.
>    One solution can be using a fixed size array instead of ``.*MAX.*`` value.
>
> -* mbuf: Some fields will be converted to dynamic API in DPDK 20.11
> -  in order to reserve more space for the dynamic fields, as explained in
> -  `this presentation <https://www.youtube.com/watch?v=Ttl6MlhmzWY>`_.
> -  As a consequence, the layout of the ``struct rte_mbuf`` will be re-arranged,
> -  avoiding impact on vectorized implementation of the driver datapaths,
> -  while evaluating performance gains of a better use of the first cache line.
> -
>  * ethdev: The flow director API, including ``rte_eth_conf.fdir_conf`` field,
>    and the related structures (``rte_fdir_*`` and ``rte_eth_fdir_*``),
>    will be removed in DPDK 20.11.
> diff --git a/drivers/net/octeontx2/otx2_ethdev.c b/drivers/net/octeontx2/otx2_ethdev.c
> index 6cebbe677d..d6e0f1dd03 100644
> --- a/drivers/net/octeontx2/otx2_ethdev.c
> +++ b/drivers/net/octeontx2/otx2_ethdev.c
> @@ -748,8 +748,6 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
>                          offsetof(struct rte_mbuf, buf_iova) + 16);
>         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
>                          offsetof(struct rte_mbuf, ol_flags) + 12);
> -       RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
> -                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
>
>         if (conf & DEV_TX_OFFLOAD_VLAN_INSERT ||
>             conf & DEV_TX_OFFLOAD_QINQ_INSERT)
> diff --git a/lib/librte_kni/rte_kni_common.h b/lib/librte_kni/rte_kni_common.h
> index 36d66e2ffa..ffb3182731 100644
> --- a/lib/librte_kni/rte_kni_common.h
> +++ b/lib/librte_kni/rte_kni_common.h
> @@ -84,10 +84,11 @@ struct rte_kni_mbuf {
>         char pad2[4];
>         uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
>         uint16_t data_len;      /**< Amount of data in segment buffer. */
> +       char pad3[14];
> +       void *pool;
>
>         /* fields on second cache line */
>         __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)))
> -       void *pool;
>         void *next;             /**< Physical address of next mbuf in kernel. */
>  };
>
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 3190a29cce..c4c9ebfaa0 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -1107,7 +1107,6 @@ rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
>  static inline void
>  rte_mbuf_dynfield_copy(struct rte_mbuf *mdst, const struct rte_mbuf *msrc)
>  {
> -       memcpy(&mdst->dynfield0, msrc->dynfield0, sizeof(mdst->dynfield0));
>         memcpy(&mdst->dynfield1, msrc->dynfield1, sizeof(mdst->dynfield1));
>  }
>
> diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
> index debaace95a..567551deab 100644
> --- a/lib/librte_mbuf/rte_mbuf_core.h
> +++ b/lib/librte_mbuf/rte_mbuf_core.h
> @@ -586,12 +586,11 @@ struct rte_mbuf {
>
>         uint16_t buf_len;         /**< Length of segment buffer. */
>
> -       uint64_t dynfield0[1]; /**< Reserved for dynamic fields. */
> +       struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
>
>         /* second cache line - fields only used in slow path or on TX */
>         RTE_MARKER cacheline1 __rte_cache_min_aligned;
>
> -       struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
>         struct rte_mbuf *next;    /**< Next segment of scattered packet. */
>
>         /* fields to support TX offloads */
> @@ -645,7 +644,7 @@ struct rte_mbuf {
>         /** Timesync flags for use with IEEE1588. */
>         uint16_t timesync;
>
> -       uint32_t dynfield1[7]; /**< Reserved for dynamic fields. */
> +       uint32_t dynfield1[9]; /**< Reserved for dynamic fields. */
>  } __rte_cache_aligned;
>
>  /**
> diff --git a/lib/librte_mbuf/rte_mbuf_dyn.c b/lib/librte_mbuf/rte_mbuf_dyn.c
> index fd3e019a22..7d5e942bf0 100644
> --- a/lib/librte_mbuf/rte_mbuf_dyn.c
> +++ b/lib/librte_mbuf/rte_mbuf_dyn.c
> @@ -125,7 +125,6 @@ init_shared_mem(void)
>                  * rte_mbuf_dynfield_copy().
>                  */
>                 memset(shm, 0, sizeof(*shm));
> -               mark_free(dynfield0);
>                 mark_free(dynfield1);
>
>                 /* init free_flags */
> --
> 2.28.0
>
  
Thomas Monjalon Nov. 7, 2020, 6:39 p.m. UTC | #2
07/11/2020 18:12, Jerin Jacob:
> On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> >
> > The mempool pointer in the mbuf struct is moved
> > from the second to the first half.
> > It should increase performance on most systems having 64-byte cache line,
> 
> > i.e. mbuf is split in two cache lines.
> 
> But In any event, Tx needs to touch the pool to freeing back to the
> pool upon  Tx completion. Right?
> Not able to understand the motivation for moving it to the first 64B cache line?
> The gain varies from driver to driver. For example, a Typical
> ARM-based NPU does not need to
> touch the pool in Rx and its been filled by HW. Whereas it needs to
> touch in Tx if the reference count is implemented.
> 
> > Due to this change, tx_offload is moved, so some vector data paths
> > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> 
> It will be breaking the Tx path, Please just don't remove the static
> assert without adjusting the code.

Of course not.
I looked at the vector Tx path of OCTEON TX2,
it's close to impossible to understand :)
Please help!
  
Morten Brørup Nov. 7, 2020, 6:57 p.m. UTC | #3
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Saturday, November 7, 2020 4:53 PM
> 
> The mempool pointer in the mbuf struct is moved
> from the second to the first half.
> It should increase performance on most systems having 64-byte cache line,
> i.e. mbuf is split in two cache lines.

Perhaps with the #define DEV_TX_OFFLOAD_MBUF_FAST_FREE mentioned by Konstantin, it might be better to move m->next instead, at least for applications with the opportunity to set this flag (e.g. applications with only one mbuf pool).

Unfortunately, the information about the DEV_TX_OFFLOAD_MBUF_FAST_FREE flag came to light after the techboard meeting, and I don't have any benchmarks to support this suggestion, so I guess it's hard to change the techboard's decision to move the pool pointer.

> 
> Due to this change, tx_offload is moved, so some vector data paths
> may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!

Regardless of whether m->pool or m->next is moved, it will not affect any issues related to m->tx_offload moving to a new location in the second cache line.

> 
> Moving this field gives more space to dynfield1.
> 
> This is how the mbuf layout looks like (pahole-style):
> 
> word  type                              name                byte  size
>  0    void *                            buf_addr;         /*   0 +  8 */
>  1    rte_iova_t                        buf_iova          /*   8 +  8 */
>       /* --- RTE_MARKER64               rearm_data;                   */
>  2    uint16_t                          data_off;         /*  16 +  2 */
>       uint16_t                          refcnt;           /*  18 +  2 */
>       uint16_t                          nb_segs;          /*  20 +  2 */
>       uint16_t                          port;             /*  22 +  2 */
>  3    uint64_t                          ol_flags;         /*  24 +  8 */
>       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
>  4    uint32_t             union        packet_type;      /*  32 +  4 */
>       uint32_t                          pkt_len;          /*  36 +  4 */
>  5    uint16_t                          data_len;         /*  40 +  2 */
>       uint16_t                          vlan_tci;         /*  42 +  2 */
>  5.5  uint64_t             union        hash;             /*  44 +  8 */
>  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
>       uint16_t                          buf_len;          /*  54 +  2 */
>  7    struct rte_mempool *              pool;             /*  56 +  8 */
>       /* --- RTE_MARKER                 cacheline1;                   */
>  8    struct rte_mbuf *                 next;             /*  64 +  8 */
>  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> 10    struct rte_mbuf_ext_shared_info * shinfo;           /*  80 +  8 */
> 11    uint16_t                          priv_size;        /*  88 +  2 */
>       uint16_t                          timesync;         /*  90 +  2 */
> 11.5  uint32_t                          dynfield1[9];     /*  92 + 36 */
> 16    /* --- END                                             128      */
> 
> Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
> ---
>  doc/guides/rel_notes/deprecation.rst | 7 -------
>  drivers/net/octeontx2/otx2_ethdev.c  | 2 --
>  lib/librte_kni/rte_kni_common.h      | 3 ++-
>  lib/librte_mbuf/rte_mbuf.h           | 1 -
>  lib/librte_mbuf/rte_mbuf_core.h      | 5 ++---
>  lib/librte_mbuf/rte_mbuf_dyn.c       | 1 -
>  6 files changed, 4 insertions(+), 15 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/deprecation.rst
> b/doc/guides/rel_notes/deprecation.rst
> index f3258eb3f7..efb09f0c5e 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -81,13 +81,6 @@ Deprecation Notices
>    us extending existing enum/define.
>    One solution can be using a fixed size array instead of ``.*MAX.*``
> value.
> 
> -* mbuf: Some fields will be converted to dynamic API in DPDK 20.11
> -  in order to reserve more space for the dynamic fields, as explained in
> -  `this presentation <https://www.youtube.com/watch?v=Ttl6MlhmzWY>`_.
> -  As a consequence, the layout of the ``struct rte_mbuf`` will be re-
> arranged,
> -  avoiding impact on vectorized implementation of the driver datapaths,
> -  while evaluating performance gains of a better use of the first cache
> line.
> -
>  * ethdev: The flow director API, including ``rte_eth_conf.fdir_conf``
> field,
>    and the related structures (``rte_fdir_*`` and ``rte_eth_fdir_*``),
>    will be removed in DPDK 20.11.
> diff --git a/drivers/net/octeontx2/otx2_ethdev.c
> b/drivers/net/octeontx2/otx2_ethdev.c
> index 6cebbe677d..d6e0f1dd03 100644
> --- a/drivers/net/octeontx2/otx2_ethdev.c
> +++ b/drivers/net/octeontx2/otx2_ethdev.c
> @@ -748,8 +748,6 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
>  			 offsetof(struct rte_mbuf, buf_iova) + 16);
>  	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
>  			 offsetof(struct rte_mbuf, ol_flags) + 12);
> -	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
> -			 offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
> 
>  	if (conf & DEV_TX_OFFLOAD_VLAN_INSERT ||
>  	    conf & DEV_TX_OFFLOAD_QINQ_INSERT)
> diff --git a/lib/librte_kni/rte_kni_common.h
> b/lib/librte_kni/rte_kni_common.h
> index 36d66e2ffa..ffb3182731 100644
> --- a/lib/librte_kni/rte_kni_common.h
> +++ b/lib/librte_kni/rte_kni_common.h
> @@ -84,10 +84,11 @@ struct rte_kni_mbuf {
>  	char pad2[4];
>  	uint32_t pkt_len;       /**< Total pkt len: sum of all segment
> data_len. */
>  	uint16_t data_len;      /**< Amount of data in segment buffer. */
> +	char pad3[14];
> +	void *pool;
> 
>  	/* fields on second cache line */
>  	__attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)))
> -	void *pool;
>  	void *next;             /**< Physical address of next mbuf in kernel.
> */
>  };
> 
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 3190a29cce..c4c9ebfaa0 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -1107,7 +1107,6 @@ rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void
> *buf_addr,
>  static inline void
>  rte_mbuf_dynfield_copy(struct rte_mbuf *mdst, const struct rte_mbuf *msrc)
>  {
> -	memcpy(&mdst->dynfield0, msrc->dynfield0, sizeof(mdst->dynfield0));
>  	memcpy(&mdst->dynfield1, msrc->dynfield1, sizeof(mdst->dynfield1));
>  }
> 
> diff --git a/lib/librte_mbuf/rte_mbuf_core.h
> b/lib/librte_mbuf/rte_mbuf_core.h
> index debaace95a..567551deab 100644
> --- a/lib/librte_mbuf/rte_mbuf_core.h
> +++ b/lib/librte_mbuf/rte_mbuf_core.h
> @@ -586,12 +586,11 @@ struct rte_mbuf {
> 
>  	uint16_t buf_len;         /**< Length of segment buffer. */
> 
> -	uint64_t dynfield0[1]; /**< Reserved for dynamic fields. */
> +	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
> 
>  	/* second cache line - fields only used in slow path or on TX */
>  	RTE_MARKER cacheline1 __rte_cache_min_aligned;
> 
> -	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
>  	struct rte_mbuf *next;    /**< Next segment of scattered packet. */
> 
>  	/* fields to support TX offloads */
> @@ -645,7 +644,7 @@ struct rte_mbuf {
>  	/** Timesync flags for use with IEEE1588. */
>  	uint16_t timesync;
> 
> -	uint32_t dynfield1[7]; /**< Reserved for dynamic fields. */
> +	uint32_t dynfield1[9]; /**< Reserved for dynamic fields. */
>  } __rte_cache_aligned;
> 
>  /**
> diff --git a/lib/librte_mbuf/rte_mbuf_dyn.c
> b/lib/librte_mbuf/rte_mbuf_dyn.c
> index fd3e019a22..7d5e942bf0 100644
> --- a/lib/librte_mbuf/rte_mbuf_dyn.c
> +++ b/lib/librte_mbuf/rte_mbuf_dyn.c
> @@ -125,7 +125,6 @@ init_shared_mem(void)
>  		 * rte_mbuf_dynfield_copy().
>  		 */
>  		memset(shm, 0, sizeof(*shm));
> -		mark_free(dynfield0);
>  		mark_free(dynfield1);
> 
>  		/* init free_flags */
> --
> 2.28.0
>
  
Jerin Jacob Nov. 7, 2020, 7:05 p.m. UTC | #4
On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 07/11/2020 18:12, Jerin Jacob:
> > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > >
> > > The mempool pointer in the mbuf struct is moved
> > > from the second to the first half.
> > > It should increase performance on most systems having 64-byte cache line,
> >
> > > i.e. mbuf is split in two cache lines.
> >
> > But In any event, Tx needs to touch the pool to freeing back to the
> > pool upon  Tx completion. Right?
> > Not able to understand the motivation for moving it to the first 64B cache line?
> > The gain varies from driver to driver. For example, a Typical
> > ARM-based NPU does not need to
> > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > touch in Tx if the reference count is implemented.

See below.

> >
> > > Due to this change, tx_offload is moved, so some vector data paths
> > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> >
> > It will be breaking the Tx path, Please just don't remove the static
> > assert without adjusting the code.
>
> Of course not.
> I looked at the vector Tx path of OCTEON TX2,
> it's close to be impossible to understand :)
> Please help!

Of course. Could you check the above section and share the rationale
for this change,
and where it helps and how much it helps?


>
>
  
Thomas Monjalon Nov. 7, 2020, 8:33 p.m. UTC | #5
07/11/2020 20:05, Jerin Jacob:
> On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > 07/11/2020 18:12, Jerin Jacob:
> > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > >
> > > > The mempool pointer in the mbuf struct is moved
> > > > from the second to the first half.
> > > > It should increase performance on most systems having 64-byte cache line,
> > >
> > > > i.e. mbuf is split in two cache lines.
> > >
> > > But In any event, Tx needs to touch the pool to freeing back to the
> > > pool upon  Tx completion. Right?
> > > Not able to understand the motivation for moving it to the first 64B cache line?
> > > The gain varies from driver to driver. For example, a Typical
> > > ARM-based NPU does not need to
> > > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > > touch in Tx if the reference count is implemented.
> 
> See below.
> 
> > >
> > > > Due to this change, tx_offload is moved, so some vector data paths
> > > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> > >
> > > It will be breaking the Tx path, Please just don't remove the static
> > > assert without adjusting the code.
> >
> > Of course not.
> > I looked at the vector Tx path of OCTEON TX2,
> > it's close to be impossible to understand :)
> > Please help!
> 
> Off course. Could you check the above section any share the rationale
> for this change
> and where it helps and how much it helps?

It has been concluded in the techboard meeting you were part of.
I don't understand why we restart this discussion again.
I won't have the energy to restart this process myself.
If you don't want to apply the techboard decision, then please
do the necessary to request another quick decision.
  
Jerin Jacob Nov. 9, 2020, 5:18 a.m. UTC | #6
On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 07/11/2020 20:05, Jerin Jacob:
> > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > 07/11/2020 18:12, Jerin Jacob:
> > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > >
> > > > > The mempool pointer in the mbuf struct is moved
> > > > > from the second to the first half.
> > > > > It should increase performance on most systems having 64-byte cache line,
> > > >
> > > > > i.e. mbuf is split in two cache lines.
> > > >
> > > > But In any event, Tx needs to touch the pool to freeing back to the
> > > > pool upon  Tx completion. Right?
> > > > Not able to understand the motivation for moving it to the first 64B cache line?
> > > > The gain varies from driver to driver. For example, a Typical
> > > > ARM-based NPU does not need to
> > > > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > > > touch in Tx if the reference count is implemented.
> >
> > See below.
> >
> > > >
> > > > > Due to this change, tx_offload is moved, so some vector data paths
> > > > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> > > >
> > > > It will be breaking the Tx path, Please just don't remove the static
> > > > assert without adjusting the code.
> > >
> > > Of course not.
> > > I looked at the vector Tx path of OCTEON TX2,
> > > it's close to be impossible to understand :)
> > > Please help!
> >
> > Off course. Could you check the above section any share the rationale
> > for this change
> > and where it helps and how much it helps?
>
> It has been concluded in the techboard meeting you were part of.
> I don't understand why we restart this discussion again.
> I won't have the energy to restart this process myself.
> If you don't want to apply the techboard decision, then please
> do the necessary to request another quick decision.

Yes. Initially, I thought it was OK as we have a 128B cache line. After looking
into Thomas's change, I realized
it is not good for ARM64 NPUs based on 64B cache lines, because:
- A typical ARM-based NPU does not need to touch the pool in Rx, as
it has been filled by HW, whereas it needs to
touch it in Tx if the reference count is implemented.
- It will also affect existing vector routines.

I request to reconsider the tech board decision.



>
>
>
  
Thomas Monjalon Nov. 9, 2020, 8:04 a.m. UTC | #7
09/11/2020 06:18, Jerin Jacob:
> On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > 07/11/2020 20:05, Jerin Jacob:
> > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > >
> > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > from the second to the first half.
> > > > > > It should increase performance on most systems having 64-byte cache line,
> > > > >
> > > > > > i.e. mbuf is split in two cache lines.
> > > > >
> > > > > But In any event, Tx needs to touch the pool to freeing back to the
> > > > > pool upon  Tx completion. Right?
> > > > > Not able to understand the motivation for moving it to the first 64B cache line?
> > > > > The gain varies from driver to driver. For example, a Typical
> > > > > ARM-based NPU does not need to
> > > > > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > > > > touch in Tx if the reference count is implemented.
> > >
> > > See below.
> > >
> > > > >
> > > > > > Due to this change, tx_offload is moved, so some vector data paths
> > > > > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> > > > >
> > > > > It will be breaking the Tx path, Please just don't remove the static
> > > > > assert without adjusting the code.
> > > >
> > > > Of course not.
> > > > I looked at the vector Tx path of OCTEON TX2,
> > > > it's close to be impossible to understand :)
> > > > Please help!
> > >
> > > Off course. Could you check the above section any share the rationale
> > > for this change
> > > and where it helps and how much it helps?
> >
> > It has been concluded in the techboard meeting you were part of.
> > I don't understand why we restart this discussion again.
> > I won't have the energy to restart this process myself.
> > If you don't want to apply the techboard decision, then please
> > do the necessary to request another quick decision.
> 
> Yes. Initially, I thought it is OK as we have 128B CL, After looking
> into Thomas's change, I realized
> it is not good for ARM64 64B catchlines based NPU as
> - A Typical  ARM-based NPU does not need to touch the pool in Rx and
> its been filled by HW. Whereas it needs to
> touch in Tx if the reference count is implemented.

Small note: It is not true for all Arm platforms.

> - Also it will be effecting exiting vector routines
> 
> I request to reconsider the tech board decision.
  
Morten Brørup Nov. 9, 2020, 8:16 a.m. UTC | #8
+CC techboard

> From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> Sent: Monday, November 9, 2020 6:18 AM
> 
> On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net>
> wrote:
> >
> > 07/11/2020 20:05, Jerin Jacob:
> > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon
> <thomas@monjalon.net> wrote:
> > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon
> <thomas@monjalon.net> wrote:
> > > > > >
> > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > from the second to the first half.
> > > > > > It should increase performance on most systems having 64-byte
> cache line,
> > > > >
> > > > > > i.e. mbuf is split in two cache lines.
> > > > >
> > > > > But In any event, Tx needs to touch the pool to freeing back to
> the
> > > > > pool upon  Tx completion. Right?
> > > > > Not able to understand the motivation for moving it to the
> first 64B cache line?
> > > > > The gain varies from driver to driver. For example, a Typical
> > > > > ARM-based NPU does not need to
> > > > > touch the pool in Rx and its been filled by HW. Whereas it
> needs to
> > > > > touch in Tx if the reference count is implemented.
> > >
> > > See below.
> > >
> > > > >
> > > > > > Due to this change, tx_offload is moved, so some vector data
> paths
> > > > > > may need to be adjusted. Note: OCTEON TX2 check is removed
> temporarily!
> > > > >
> > > > > It will be breaking the Tx path, Please just don't remove the
> static
> > > > > assert without adjusting the code.
> > > >
> > > > Of course not.
> > > > I looked at the vector Tx path of OCTEON TX2,
> > > > it's close to be impossible to understand :)
> > > > Please help!
> > >
> > > Off course. Could you check the above section any share the
> rationale
> > > for this change
> > > and where it helps and how much it helps?
> >
> > It has been concluded in the techboard meeting you were part of.
> > I don't understand why we restart this discussion again.
> > I won't have the energy to restart this process myself.
> > If you don't want to apply the techboard decision, then please
> > do the necessary to request another quick decision.
> 
> Yes. Initially, I thought it is OK as we have 128B CL, After looking
> into Thomas's change, I realized
> it is not good for ARM64 64B catchlines based NPU as
> - A Typical  ARM-based NPU does not need to touch the pool in Rx and
> its been filled by HW. Whereas it needs to
> touch in Tx if the reference count is implemented.

Jerin, I don't understand what the problem is here...

Since RX doesn't touch m->pool, it shouldn't matter for RX which cache line m->pool resides in. I get that.

You are saying that TX needs to touch m->pool if the reference count is implemented. I get that. But I don't understand why it is worse having m->pool in the first cache line than in the second cache line; can you please clarify?

> - Also it will be effecting exiting vector routines

That is unavoidable if we move something from the second to the first cache line.

It may require some rework on the vector routines, but it shouldn't be too difficult for whoever wrote these vector routines.

> 
> I request to reconsider the tech board decision.

I was on the techboard meeting as an observer (or whatever the correct term would be for non-members), and this is my impression of the decision on the meeting:

The techboard clearly decided not to move any dynamic fields into the first cache line, on the grounds that if we move them away again in a later version, DPDK users utilizing a dynamic field in the first cache line might experience a performance drop at that later time. And this will be a very bad user experience, causing grief and complaints. To me, this seemed like a firm decision, based on solid arguments.

Then the techboard discussed which other field to move to the freed-up space in the first cache line. There were no performance reports showing any improvements by moving any of the suggested fields (m->pool, m->next, m->tx_offload), and there was a performance report showing no improvements by moving m->next in a test case with large segmented packets. The techboard decided to move m->pool as originally suggested. To me, this seemed like a somewhat random choice between A, B and C, on the grounds that moving one of them is probably better than moving none of them.

The techboard made its decision based on the information available at that time.

Unfortunately, I do not have the resources to test the performance improvement by moving m->next to the first cache line instead of m->pool and utilizing the DEV_TX_OFFLOAD_MBUF_FAST_FREE flag mentioned by Konstantin.

If no new information comes to light, we cannot expect the techboard to change a decision it has already made.

In any case, I am grateful for the joint effort put into nurturing the mbuf, and especially Thomas' unrelenting hard work in this area!


Med venlig hilsen / kind regards
- Morten Brørup
  
Jerin Jacob Nov. 9, 2020, 8:27 a.m. UTC | #9
On Mon, Nov 9, 2020 at 1:34 PM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 09/11/2020 06:18, Jerin Jacob:
> > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > 07/11/2020 20:05, Jerin Jacob:
> > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > > >
> > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > from the second to the first half.
> > > > > > > It should increase performance on most systems having 64-byte cache line,
> > > > > >
> > > > > > > i.e. mbuf is split in two cache lines.
> > > > > >
> > > > > > But In any event, Tx needs to touch the pool to freeing back to the
> > > > > > pool upon  Tx completion. Right?
> > > > > > Not able to understand the motivation for moving it to the first 64B cache line?
> > > > > > The gain varies from driver to driver. For example, a Typical
> > > > > > ARM-based NPU does not need to
> > > > > > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > > > > > touch in Tx if the reference count is implemented.
> > > >
> > > > See below.
> > > >
> > > > > >
> > > > > > > Due to this change, tx_offload is moved, so some vector data paths
> > > > > > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> > > > > >
> > > > > > It will be breaking the Tx path, Please just don't remove the static
> > > > > > assert without adjusting the code.
> > > > >
> > > > > Of course not.
> > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > it's close to be impossible to understand :)
> > > > > Please help!
> > > >
> > > > Off course. Could you check the above section any share the rationale
> > > > for this change
> > > > and where it helps and how much it helps?
> > >
> > > It has been concluded in the techboard meeting you were part of.
> > > I don't understand why we restart this discussion again.
> > > I won't have the energy to restart this process myself.
> > > If you don't want to apply the techboard decision, then please
> > > do the necessary to request another quick decision.
> >
> > Yes. Initially, I thought it is OK as we have 128B CL, After looking
> > into Thomas's change, I realized
> > it is not good for ARM64 64B catchlines based NPU as
> > - A Typical  ARM-based NPU does not need to touch the pool in Rx and
> > its been filled by HW. Whereas it needs to
> > touch in Tx if the reference count is implemented.
>
> Small note: It is not true for all Arm platforms.

Yes. Generally, it is not specific to the Arm platform; it applies to any platform
that has a HW-accelerated mempool.

>
> > - Also it will be effecting exiting vector routines
> >
> > I request to reconsider the tech board decision.
>
>
>
  
Bruce Richardson Nov. 9, 2020, 9:47 a.m. UTC | #10
On Mon, Nov 09, 2020 at 01:57:26PM +0530, Jerin Jacob wrote:
> On Mon, Nov 9, 2020 at 1:34 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> >
> > 09/11/2020 06:18, Jerin Jacob:
> > > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > 07/11/2020 20:05, Jerin Jacob:
> > > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > > > >
> > > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > > from the second to the first half.
> > > > > > > > It should increase performance on most systems having 64-byte cache line,
> > > > > > >
> > > > > > > > i.e. mbuf is split in two cache lines.
> > > > > > >
> > > > > > > But In any event, Tx needs to touch the pool to freeing back to the
> > > > > > > pool upon  Tx completion. Right?
> > > > > > > Not able to understand the motivation for moving it to the first 64B cache line?
> > > > > > > The gain varies from driver to driver. For example, a Typical
> > > > > > > ARM-based NPU does not need to
> > > > > > > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > > > > > > touch in Tx if the reference count is implemented.
> > > > >
> > > > > See below.
> > > > >
> > > > > > >
> > > > > > > > Due to this change, tx_offload is moved, so some vector data paths
> > > > > > > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> > > > > > >
> > > > > > > It will be breaking the Tx path, Please just don't remove the static
> > > > > > > assert without adjusting the code.
> > > > > >
> > > > > > Of course not.
> > > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > > it's close to be impossible to understand :)
> > > > > > Please help!
> > > > >
> > > > > Off course. Could you check the above section any share the rationale
> > > > > for this change
> > > > > and where it helps and how much it helps?
> > > >
> > > > It has been concluded in the techboard meeting you were part of.
> > > > I don't understand why we restart this discussion again.
> > > > I won't have the energy to restart this process myself.
> > > > If you don't want to apply the techboard decision, then please
> > > > do the necessary to request another quick decision.
> > >
> > > Yes. Initially, I thought it is OK as we have 128B CL, After looking
> > > into Thomas's change, I realized
> > > it is not good for ARM64 64B catchlines based NPU as
> > > - A Typical  ARM-based NPU does not need to touch the pool in Rx and
> > > its been filled by HW. Whereas it needs to
> > > touch in Tx if the reference count is implemented.
> >
> > Small note: It is not true for all Arm platforms.
> 
> Yes. Generally, it is not specific to Arm platform. Any platform that
> has HW accelerated mempool.
> 

Hi Jerin, Thomas,

For platforms without HW mempool support too, the same holds: the pool
pointer should never need to be touched on RX. However, as we discussed on
the tech board, moving the pointer may help a little in some cases,
especially for less optimized drivers, and there seemed to be no downside to
it. Jerin, can you please clarify why moving the pool pointer would cause a
problem? Is it going to cause things to be slower on some systems for some
reason?

Regards,
/Bruce
  
Bruce Richardson Nov. 9, 2020, 10:06 a.m. UTC | #11
On Mon, Nov 09, 2020 at 09:16:27AM +0100, Morten Brørup wrote:
> +CC techboard
> 
> > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > Sent: Monday, November 9, 2020 6:18 AM
> > 
> > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net>
> > wrote:
> > >
> > > 07/11/2020 20:05, Jerin Jacob:
> > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon
> > <thomas@monjalon.net> wrote:
> > > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon
> > <thomas@monjalon.net> wrote:
> > > > > > >
> > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > from the second to the first half.
> > > > > > > It should increase performance on most systems having 64-byte
> > cache line,
> > > > > >
> > > > > > > i.e. mbuf is split in two cache lines.
> > > > > >
> > > > > > But In any event, Tx needs to touch the pool to freeing back to
> > the
> > > > > > pool upon  Tx completion. Right?
> > > > > > Not able to understand the motivation for moving it to the
> > first 64B cache line?
> > > > > > The gain varies from driver to driver. For example, a Typical
> > > > > > ARM-based NPU does not need to
> > > > > > touch the pool in Rx and its been filled by HW. Whereas it
> > needs to
> > > > > > touch in Tx if the reference count is implemented.
> > > >
> > > > See below.
> > > >
> > > > > >
> > > > > > > Due to this change, tx_offload is moved, so some vector data
> > paths
> > > > > > > may need to be adjusted. Note: OCTEON TX2 check is removed
> > temporarily!
> > > > > >
> > > > > > It will be breaking the Tx path, Please just don't remove the
> > static
> > > > > > assert without adjusting the code.
> > > > >
> > > > > Of course not.
> > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > it's close to be impossible to understand :)
> > > > > Please help!
> > > >
> > > > Off course. Could you check the above section any share the
> > rationale
> > > > for this change
> > > > and where it helps and how much it helps?
> > >
> > > It has been concluded in the techboard meeting you were part of.
> > > I don't understand why we restart this discussion again.
> > > I won't have the energy to restart this process myself.
> > > If you don't want to apply the techboard decision, then please
> > > do the necessary to request another quick decision.
> > 
> > Yes. Initially, I thought it is OK as we have 128B CL, After looking
> > into Thomas's change, I realized
> > it is not good for ARM64 64B catchlines based NPU as
> > - A Typical  ARM-based NPU does not need to touch the pool in Rx and
> > its been filled by HW. Whereas it needs to
> > touch in Tx if the reference count is implemented.
> 
> Jerin, I don't understand what the problem is here...
> 
> Since RX doesn't touch m->pool, it shouldn't matter for RX which cache line m->pool resides in. I get that.
> 
> You are saying that TX needs to touch m->pool if the reference count is implemented. I get that. But I don't understand why it is worse having m->pool in the first cache line than in the second cache line; can you please clarify?
> 
> > - Also it will be effecting exiting vector routines
> 
> That is unavoidable if we move something from the second to the first cache line.
> 
> It may require some rework on the vector routines, but it shouldn't be too difficult for whoever wrote these vector routines.
> 
> > 
> > I request to reconsider the tech board decision.
> 
> I was on the techboard meeting as an observer (or whatever the correct term would be for non-members), and this is my impression of the decision on the meeting:
> 
> The techboard clearly decided not to move any dynamic fields in the first cache line, on the grounds that if we move them away again in a later version, DPDK users utilizing a dynamic field in the first cache line might experience a performance drop at that later time. And this will be a very bad user experience, causing grief and complaints. To me, this seemed like a firm decision, based on solid arguments.
> 
> Then the techboard discussed which other field to move to the freed up space in the first cache line. There were no performance reports showing any improvements by moving the any of the suggested fields (m->pool, m->next, m->tx_offload), and there was a performance report showing no improvements by moving m->next in a test case with large segmented packets. The techboard decided to move m->pool as originally suggested. To me, this seemed like a somewhat random choice between A, B and C, on the grounds that moving one of them is probably better than moving none of them.
>

This largely tallies with what I remember of the discussion too.

I'd also add though that the choice between the next pointer and the pool
pointer came down to the fact that the next pointer is only used for
chained, multi-segment, packets - which also tend to be larger packets -
while the pool pointer is of relevance to all packets, big and small,
single and multi-segment.

/Bruce
  
Bruce Richardson Nov. 9, 2020, 10:08 a.m. UTC | #12
On Sat, Nov 07, 2020 at 07:57:05PM +0100, Morten Brørup wrote:
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Saturday, November 7, 2020 4:53 PM
> > 
> > The mempool pointer in the mbuf struct is moved
> > from the second to the first half.
> > It should increase performance on most systems having 64-byte cache line,
> > i.e. mbuf is split in two cache lines.
> 
> Perhaps with the #define DEV_TX_OFFLOAD_MBUF_FAST_FREE mentioned by Konstantin, it might be better moving m->next instead, at least for applications with the opportunity to set this flag (e.g. applications with only one mbuf pool).
> 
> Unfortunately, the information about the DEV_TX_OFFLOAD_MBUF_FAST_FREE flag came to light after the techboard meeting, and I don't have any benchmarks to support this suggestion, so I guess it's hard to change the techboard's decision to move the pool pointer.
> 

The thing with FAST_FREE is that it doesn't just indicate that there is
only one mbuf pool, but rather that none of the packets are chained mbufs
either. Therefore, the flag applies equally to both next and pool pointers
for optimization. [And the flag should perhaps be two flags!]

/Bruce
  
Morten Brørup Nov. 9, 2020, 10:21 a.m. UTC | #13
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Monday, November 9, 2020 11:06 AM
> 
> On Mon, Nov 09, 2020 at 09:16:27AM +0100, Morten Brørup wrote:
> > +CC techboard
> >
> > > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > > Sent: Monday, November 9, 2020 6:18 AM
> > >
> > > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon
> <thomas@monjalon.net>
> > > wrote:
> > > >
> > > > 07/11/2020 20:05, Jerin Jacob:
> > > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon
> > > <thomas@monjalon.net> wrote:
> > > > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon
> > > <thomas@monjalon.net> wrote:
> > > > > > > >
> > > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > > from the second to the first half.
> > > > > > > > It should increase performance on most systems having 64-
> byte
> > > cache line,
> > > > > > >
> > > > > > > > i.e. mbuf is split in two cache lines.
> > > > > > >
> > > > > > > But In any event, Tx needs to touch the pool to freeing
> back to
> > > the
> > > > > > > pool upon  Tx completion. Right?
> > > > > > > Not able to understand the motivation for moving it to the
> > > first 64B cache line?
> > > > > > > The gain varies from driver to driver. For example, a
> Typical
> > > > > > > ARM-based NPU does not need to
> > > > > > > touch the pool in Rx and its been filled by HW. Whereas it
> > > needs to
> > > > > > > touch in Tx if the reference count is implemented.
> > > > >
> > > > > See below.
> > > > >
> > > > > > >
> > > > > > > > Due to this change, tx_offload is moved, so some vector
> data
> > > paths
> > > > > > > > may need to be adjusted. Note: OCTEON TX2 check is
> removed
> > > temporarily!
> > > > > > >
> > > > > > > It will be breaking the Tx path, Please just don't remove
> the
> > > static
> > > > > > > assert without adjusting the code.
> > > > > >
> > > > > > Of course not.
> > > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > > it's close to be impossible to understand :)
> > > > > > Please help!
> > > > >
> > > > > Off course. Could you check the above section any share the
> > > rationale
> > > > > for this change
> > > > > and where it helps and how much it helps?
> > > >
> > > > It has been concluded in the techboard meeting you were part of.
> > > > I don't understand why we restart this discussion again.
> > > > I won't have the energy to restart this process myself.
> > > > If you don't want to apply the techboard decision, then please
> > > > do the necessary to request another quick decision.
> > >
> > > Yes. Initially, I thought it is OK as we have 128B CL, After
> looking
> > > into Thomas's change, I realized
> > > it is not good for ARM64 64B catchlines based NPU as
> > > - A Typical  ARM-based NPU does not need to touch the pool in Rx
> and
> > > its been filled by HW. Whereas it needs to
> > > touch in Tx if the reference count is implemented.
> >
> > Jerin, I don't understand what the problem is here...
> >
> > Since RX doesn't touch m->pool, it shouldn't matter for RX which
> cache line m->pool resides in. I get that.
> >
> > You are saying that TX needs to touch m->pool if the reference count
> is implemented. I get that. But I don't understand why it is worse
> having m->pool in the first cache line than in the second cache line;
> can you please clarify?
> >
> > > - Also it will be effecting exiting vector routines
> >
> > That is unavoidable if we move something from the second to the first
> cache line.
> >
> > It may require some rework on the vector routines, but it shouldn't
> be too difficult for whoever wrote these vector routines.
> >
> > >
> > > I request to reconsider the tech board decision.
> >
> > I was on the techboard meeting as an observer (or whatever the
> correct term would be for non-members), and this is my impression of
> the decision on the meeting:
> >
> > The techboard clearly decided not to move any dynamic fields in the
> first cache line, on the grounds that if we move them away again in a
> later version, DPDK users utilizing a dynamic field in the first cache
> line might experience a performance drop at that later time. And this
> will be a very bad user experience, causing grief and complaints. To
> me, this seemed like a firm decision, based on solid arguments.
> >
> > Then the techboard discussed which other field to move to the freed
> up space in the first cache line. There were no performance reports
> showing any improvements by moving the any of the suggested fields (m-
> >pool, m->next, m->tx_offload), and there was a performance report
> showing no improvements by moving m->next in a test case with large
> segmented packets. The techboard decided to move m->pool as originally
> suggested. To me, this seemed like a somewhat random choice between A,
> B and C, on the grounds that moving one of them is probably better than
> moving none of them.
> >
> 
> This largely tallies with what I remember of the discussion too.
> 
> I'd also add though that the choice between the next pointer and the
> pool
> pointer came down to the fact that the next pointer is only used for
> chained, multi-segment, packets - which also tend to be larger packets
> -
> while the pool pointer is of relevance to all packets, big and small,
> single and multi-segment.

I wish that was the case, but I am not so sure...

It is true that m->next is NULL for non-segmented packets.

However, m->next is read in the likely path of rte_pktmbuf_prefree_seg(), which is called from e.g. rte_pktmbuf_free_seg(), and again from rte_pktmbuf_free(). And I assume that these functions are quite commonly used.

Not being a PMD developer, I might be wrong about this assumption.
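
As a paraphrased sketch of the likely path being referred to (the real
rte_pktmbuf_prefree_seg() in rte_mbuf.h also handles refcnt > 1 and
indirect/external mbufs, which are omitted here):

#include <rte_branch_prediction.h>
#include <rte_mbuf.h>

/* Simplified, paraphrased sketch of rte_pktmbuf_prefree_seg(); the
 * refcnt > 1 and indirect/extbuf handling of the real function is
 * omitted. The m->next read sits in the likely branch. */
static inline struct rte_mbuf *
prefree_seg_sketch(struct rte_mbuf *m)
{
	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
		if (!RTE_MBUF_DIRECT(m))
			rte_pktmbuf_detach(m);
		if (m->next != NULL) {  /* m->next read on the free path */
			m->next = NULL;
			m->nb_segs = 1;
		}
		return m;
	}
	return NULL; /* simplified: real code also updates the refcnt here */
}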

> 
> /Bruce
  
Morten Brørup Nov. 9, 2020, 10:30 a.m. UTC | #14
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Monday, November 9, 2020 11:09 AM
> 
> On Sat, Nov 07, 2020 at 07:57:05PM +0100, Morten Brørup wrote:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Sent: Saturday, November 7, 2020 4:53 PM
> > >
> > > The mempool pointer in the mbuf struct is moved
> > > from the second to the first half.
> > > It should increase performance on most systems having 64-byte cache
> line,
> > > i.e. mbuf is split in two cache lines.
> >
> > Perhaps with the #define DEV_TX_OFFLOAD_MBUF_FAST_FREE mentioned by
> Konstantin, it might be better moving m->next instead, at least for
> applications with the opportunity to set this flag (e.g. applications
> with only one mbuf pool).
> >
> > Unfortunately, the information about the
> DEV_TX_OFFLOAD_MBUF_FAST_FREE flag came to light after the techboard
> meeting, and I don't have any benchmarks to support this suggestion, so
> I guess it's hard to change the techboard's decision to move the pool
> pointer.
> >
> 
> The thing with FAST_FREE is that it doesn't just indicate that there is
> only one mbuf pool, but rather that none of the packets are chained
> mbufs
> either. Therefore, the flag applies equally to both next and pool
> pointers
> for optimization. [And the flag should perhaps be two flags!]

I guess we could offer the same optimization to applications by adding a rte_mbuf_raw_free_bulk() function to the mbuf library. I will put that on my ToDo-list. :-)
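
A hedged sketch of what such a helper could look like; this function is
hypothetical and does not exist in the mbuf library at the time of this
thread, and it assumes the caller provides the same guarantees as
DEV_TX_OFFLOAD_MBUF_FAST_FREE (all mbufs direct, same pool, refcnt == 1).

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Hypothetical helper, sketched for illustration only. Caller must
 * guarantee all mbufs are direct, come from mempool 'mp' and have
 * refcnt == 1 (the FAST_FREE contract). */
static inline void
mbuf_raw_free_bulk_sketch(struct rte_mempool *mp, struct rte_mbuf **mbufs,
			  unsigned int n)
{
	rte_mempool_put_bulk(mp, (void **)mbufs, n);
}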
  
Ananyev, Konstantin Nov. 9, 2020, 10:33 a.m. UTC | #15
> On Sat, Nov 07, 2020 at 07:57:05PM +0100, Morten Brørup wrote:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Sent: Saturday, November 7, 2020 4:53 PM
> > >
> > > The mempool pointer in the mbuf struct is moved
> > > from the second to the first half.
> > > It should increase performance on most systems having 64-byte cache line,
> > > i.e. mbuf is split in two cache lines.
> >
> > Perhaps with the #define DEV_TX_OFFLOAD_MBUF_FAST_FREE mentioned by Konstantin, it might be better moving m->next instead, at
> least for applications with the opportunity to set this flag (e.g. applications with only one mbuf pool).
> >
> > Unfortunately, the information about the DEV_TX_OFFLOAD_MBUF_FAST_FREE flag came to light after the techboard meeting, and I
> don't have any benchmarks to support this suggestion, so I guess it's hard to change the techboard's decision to move the pool pointer.
> >
> 
> The thing with FAST_FREE is that it doesn't just indicate that there is
> only one mbuf pool, but rather that none of the packets are chained mbufs
> either.

Hmm, I wonder where such an assumption comes from?
There is nothing about forbidding chained mbufs in rte_ethdev.h right now:
#define DEV_TX_OFFLOAD_MBUF_FAST_FREE   0x00010000
/**< Device supports optimization for fast release of mbufs.
 *   When set application must guarantee that per-queue all mbufs comes from
 *   the same mempool and has refcnt = 1.
 */ 

> Therefore, the flag applies equally to both next and pool pointers
> for optimization. [And the flag should perhaps be two flags!]
> 

I think what Morten means here:
For FAST_FREE, the PMD can store the pool pointer somewhere in its queue metadata
and doesn't need to read it from the mbuf.
So if we move 'next' to the first cache line, the mbuf_free code path for the FAST_FREE case can
avoid touching the second cache line in the mbuf.
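
A minimal sketch of that idea follows; the queue structure, field and function
names are assumptions for illustration, not taken from any real PMD.

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Illustrative sketch of the FAST_FREE fast path described above: the
 * PMD remembers the single mempool in its own Tx queue metadata, so the
 * completion path never dereferences mbuf->pool. */
struct sketch_txq {
	struct rte_mempool *fast_free_pool; /* learned from the first burst */
	/* ... real queues also carry descriptors, counters, etc. ... */
};

static inline void
sketch_tx_fast_free(struct sketch_txq *txq, struct rte_mbuf **pkts,
		    unsigned int n)
{
	/* FAST_FREE contract: one pool per queue and refcnt == 1, so a
	 * straight bulk put is enough; no per-mbuf pool/refcnt reads. */
	rte_mempool_put_bulk(txq->fast_free_pool, (void **)pkts, n);
}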
  
Bruce Richardson Nov. 9, 2020, 10:36 a.m. UTC | #16
On Mon, Nov 09, 2020 at 10:33:41AM +0000, Ananyev, Konstantin wrote:
> 
> 
> > On Sat, Nov 07, 2020 at 07:57:05PM +0100, Morten Brørup wrote:
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > Sent: Saturday, November 7, 2020 4:53 PM
> > > >
> > > > The mempool pointer in the mbuf struct is moved
> > > > from the second to the first half.
> > > > It should increase performance on most systems having 64-byte cache line,
> > > > i.e. mbuf is split in two cache lines.
> > >
> > > Perhaps with the #define DEV_TX_OFFLOAD_MBUF_FAST_FREE mentioned by Konstantin, it might be better moving m->next instead, at
> > least for applications with the opportunity to set this flag (e.g. applications with only one mbuf pool).
> > >
> > > Unfortunately, the information about the DEV_TX_OFFLOAD_MBUF_FAST_FREE flag came to light after the techboard meeting, and I
> > don't have any benchmarks to support this suggestion, so I guess it's hard to change the techboard's decision to move the pool pointer.
> > >
> >
> > The thing with FAST_FREE is that it doesn't just indicate that there is
> > only one mbuf pool, but rather that none of the packets are chained mbufs
> > either.
> 
> Hmm, wonder where such assumption comes from?
> There is nothing about forbidding chained mbufs in rte_ethdev.h right now:
> #define DEV_TX_OFFLOAD_MBUF_FAST_FREE   0x00010000
> /**< Device supports optimization for fast release of mbufs.
>  *   When set application must guarantee that per-queue all mbufs comes from
>  *   the same mempool and has refcnt = 1.
>  */
> 

I should have checked before mailing. I was mixing up the refcnt = 1 bit
with chained mbuf support.

/Bruce
  
Ananyev, Konstantin Nov. 9, 2020, 11:24 a.m. UTC | #17
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > Sent: Saturday, November 7, 2020 4:53 PM
> > > >
> > > > The mempool pointer in the mbuf struct is moved
> > > > from the second to the first half.
> > > > It should increase performance on most systems having 64-byte cache line,
> > > > i.e. mbuf is split in two cache lines.
> > >
> > > Perhaps with the #define DEV_TX_OFFLOAD_MBUF_FAST_FREE mentioned by Konstantin, it might be better moving m->next instead, at
> > least for applications with the opportunity to set this flag (e.g. applications with only one mbuf pool).
> > >
> > > Unfortunately, the information about the DEV_TX_OFFLOAD_MBUF_FAST_FREE flag came to light after the techboard meeting, and I
> > don't have any benchmarks to support this suggestion, so I guess it's hard to change the techboard's decision to move the pool pointer.
> > >
> >
> > The thing with FAST_FREE is that it doesn't just indicate that there is
> > only one mbuf pool, but rather that none of the packets are chained mbufs
> > either.
> 
> Hmm, wonder where such assumption comes from?
> There is nothing about forbidding chained mbufs in rte_ethdev.h right now:
> #define DEV_TX_OFFLOAD_MBUF_FAST_FREE   0x00010000
> /**< Device supports optimization for fast release of mbufs.
>  *   When set application must guarantee that per-queue all mbufs comes from
>  *   the same mempool and has refcnt = 1.
>  */
> 
> > Therefore, the flag applies equally to both next and pool pointers
> > for optimization. [And the flag should perhaps be two flags!]
> >
> 
> I think what Morten means here:
> For FAST_FREE PMD can store pool pointer somewhere in its queue metadata
> and doesn't need to read it from the mbuf.
> So if we move next to first line - mbuf_free code-path for FAST_FREE case can
> avoid touching  second cache line in the mbuf.

Just as a different thought:
if we move 'next' to the first cache line, can we get rid of 'nb_segs' field completely?
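
For illustration only (not an existing API), the count could then be derived from the chain itself when it is really needed:

#include <rte_mbuf.h>

/* Illustrative sketch: if nb_segs were dropped, the segment count could
 * be derived by walking the next chain; with next in the first cache
 * line, a single-segment mbuf never touches cache line 1 here.
 */
static inline uint16_t
mbuf_nb_segs_from_chain(const struct rte_mbuf *m)
{
	uint16_t n = 0;

	while (m != NULL) {
		n++;
		m = m->next;
	}
	return n;
}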
  
Jerin Jacob Nov. 9, 2020, 12:01 p.m. UTC | #18
On Mon, Nov 9, 2020 at 3:17 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Mon, Nov 09, 2020 at 01:57:26PM +0530, Jerin Jacob wrote:
> > On Mon, Nov 9, 2020 at 1:34 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > >
> > > 09/11/2020 06:18, Jerin Jacob:
> > > > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > 07/11/2020 20:05, Jerin Jacob:
> > > > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > > > > > > >
> > > > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > > > from the second to the first half.
> > > > > > > > > It should increase performance on most systems having 64-byte cache line,
> > > > > > > >
> > > > > > > > > i.e. mbuf is split in two cache lines.
> > > > > > > >
> > > > > > > > But In any event, Tx needs to touch the pool to freeing back to the
> > > > > > > > pool upon  Tx completion. Right?
> > > > > > > > Not able to understand the motivation for moving it to the first 64B cache line?
> > > > > > > > The gain varies from driver to driver. For example, a Typical
> > > > > > > > ARM-based NPU does not need to
> > > > > > > > touch the pool in Rx and its been filled by HW. Whereas it needs to
> > > > > > > > touch in Tx if the reference count is implemented.
> > > > > >
> > > > > > See below.
> > > > > >
> > > > > > > >
> > > > > > > > > Due to this change, tx_offload is moved, so some vector data paths
> > > > > > > > > may need to be adjusted. Note: OCTEON TX2 check is removed temporarily!
> > > > > > > >
> > > > > > > > It will be breaking the Tx path, Please just don't remove the static
> > > > > > > > assert without adjusting the code.
> > > > > > >
> > > > > > > Of course not.
> > > > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > > > it's close to be impossible to understand :)
> > > > > > > Please help!
> > > > > >
> > > > > > Off course. Could you check the above section any share the rationale
> > > > > > for this change
> > > > > > and where it helps and how much it helps?
> > > > >
> > > > > It has been concluded in the techboard meeting you were part of.
> > > > > I don't understand why we restart this discussion again.
> > > > > I won't have the energy to restart this process myself.
> > > > > If you don't want to apply the techboard decision, then please
> > > > > do the necessary to request another quick decision.
> > > >
> > > > Yes. Initially, I thought it is OK as we have 128B CL, After looking
> > > > into Thomas's change, I realized
> > > > it is not good for ARM64 64B catchlines based NPU as
> > > > - A Typical  ARM-based NPU does not need to touch the pool in Rx and
> > > > its been filled by HW. Whereas it needs to
> > > > touch in Tx if the reference count is implemented.
> > >
> > > Small note: It is not true for all Arm platforms.
> >
> > Yes. Generally, it is not specific to Arm platform. Any platform that
> > has HW accelerated mempool.
> >
>
> Hi Jerin, Thomas,
>
> For platforms without hw mempool support too, the same holds that the pool
> pointer should never need to be touched on RX. However, as we discussed on
> the tech board, moving the pointer may help a little some cases, and
> especially less optimized drivers, and there seemed to be no downside to
> it. Jerin, can you please clarify why moving the pool pointer would cause a
> problem?  Is it going to cause things to be slower on some systems for some
> reasons?

I overlooked that part. No specific issue with moving it to the first cache line.

Hi @Thomas Monjalon

Any specific reason why you removed the static assert from octeontx2?
I am not able to see a compilation issue with that static assert.

The current vector driver assumes pool and tx_offload are 2 DWORDS apart,
which is the case before and after your change.

Please remove that static assert change; no issue from my side on this patch.


In general, it is too much effort to re-verify and measure the performance impact
with all the cases after RC2; I hope this will be the last mbuf change in this release.

>
> Regards,
> /Bruce
  
Thomas Monjalon Nov. 9, 2020, 12:59 p.m. UTC | #19
09/11/2020 13:01, Jerin Jacob:
> Hi @Thomas Monjalon
> 
> Any specific reason why you removed the static assert from octeontx2.

I have a build failure when cross-compiling for octeontx2.

> I am not able to compilation issue with that static assert.

There is no issue when compiling for x86.

> The current vector driver assumes pool and tx offload needs to 2 DWORDS apart,
> Which is the case before and after your change.

You're right, pool and tx_offload are moved together.
The only difference is that the pair now crosses the cache line boundary.

> Please remove that static assert change, No issue from my side on this patch.

I cannot remove it without fixing something else,
maybe an issue in cache line alignment?

> In general, it is too much effort to re-verify and measure performance
> impact with
> all the cases after RC2, I hope this will last mbuf change in this release.

Yes it is the last change to mbuf layout,
sorry for pushing all these changes so late.
  
Jerin Jacob Nov. 9, 2020, 1:35 p.m. UTC | #20
On Mon, Nov 9, 2020 at 6:29 PM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 09/11/2020 13:01, Jerin Jacob:
> > Hi @Thomas Monjalon
> >
> > Any specific reason why you removed the static assert from octeontx2.
>
> I have a build failure when cross-compiling for octeontx2.

I am trying the below command, and I am not able to see any issue:
meson build --cross-file config/arm/arm64_octeontx2_linux_gcc

Are you facing the issue with 32bit? Could you share the steps to
reproduce and gcc version?

>
> > I am not able to compilation issue with that static assert.
>
> There is no issue when compiling for x86.
>
> > The current vector driver assumes pool and tx offload needs to 2 DWORDS apart,
> > Which is the case before and after your change.
>
> You're right, pool and tx_offload are moved together.
> The only difference is passing the cache line frontier.
>
> > Please remove that static assert change, No issue from my side on this patch.
>
> I cannot remove it without fixing something else,
> maybe an issue in cache line alignment?

You can try the below fix if the issue is seen with 32-bit.
diff --git a/drivers/net/octeontx2/otx2_ethdev.c b/drivers/net/octeontx2/otx2_ethdev.c
index 6cebbe677..66a4d429d 100644
--- a/drivers/net/octeontx2/otx2_ethdev.c
+++ b/drivers/net/octeontx2/otx2_ethdev.c
@@ -749,7 +749,7 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
                         offsetof(struct rte_mbuf, ol_flags) + 12);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
-                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
+                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(uint64_t));

>
> > In general, it is too much effort to re-verify and measure performance
> > impact with
> > all the cases after RC2, I hope this will last mbuf change in this release.
>
> Yes it is the last change to mbuf layout,
> sorry for pushing all these changes so late.
>
>
>
  
Thomas Monjalon Nov. 9, 2020, 2:02 p.m. UTC | #21
09/11/2020 14:35, Jerin Jacob:
> On Mon, Nov 9, 2020 at 6:29 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> >
> > 09/11/2020 13:01, Jerin Jacob:
> > > Hi @Thomas Monjalon
> > >
> > > Any specific reason why you removed the static assert from octeontx2.
> >
> > I have a build failure when cross-compiling for octeontx2.

I was wrong here.

> I am trying the below command, I am not able to see any issue
> meson build --cross-file config/arm/arm64_octeontx2_linux_gcc
> 
> Are you facing the issue with 32bit? Could you share the steps to
> reproduce and gcc version?

Oh you're right, the issue was with 32-bit build,
sorry for the confusion.

> --- a/drivers/net/octeontx2/otx2_ethdev.c
> +++ b/drivers/net/octeontx2/otx2_ethdev.c
> @@ -749,7 +749,7 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
>         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
>                          offsetof(struct rte_mbuf, ol_flags) + 12);
>         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
> -                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
> +                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(uint64_t));

The actual "fix" is
offsetof(struct rte_mbuf, pool) + sizeof(uint64_t) + sizeof(void *)

I don't understand the octeontx2 vector code.
Please check what is the impact of this offset change.
BTW, is 32-bit build really supported with octeontx2?
  
Jerin Jacob Nov. 9, 2020, 2:08 p.m. UTC | #22
On Mon, Nov 9, 2020 at 7:32 PM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 09/11/2020 14:35, Jerin Jacob:
> > On Mon, Nov 9, 2020 at 6:29 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > >
> > > 09/11/2020 13:01, Jerin Jacob:
> > > > Hi @Thomas Monjalon
> > > >
> > > > Any specific reason why you removed the static assert from octeontx2.
> > >
> > > I have a build failure when cross-compiling for octeontx2.
>
> I was wrong here.
>
> > I am trying the below command, I am not able to see any issue
> > meson build --cross-file config/arm/arm64_octeontx2_linux_gcc
> >
> > Are you facing the issue with 32bit? Could you share the steps to
> > reproduce and gcc version?
>
> Oh you're right, the issue was with 32-bit build,

Thanks

> sorry for the confusion.

>
> > --- a/drivers/net/octeontx2/otx2_ethdev.c
> > +++ b/drivers/net/octeontx2/otx2_ethdev.c
> > @@ -749,7 +749,7 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
> >         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
> >                          offsetof(struct rte_mbuf, ol_flags) + 12);
> >         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
> > -                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
> > +                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(uint64_t));
>
> The actual "fix" is
> offsetof(struct rte_mbuf, pool) + sizeof(uint64_t) + sizeof(void *)
>
> I don't understand the octeontx2 vector code.
> Please check what is the impact of this offset change.

Tested the changes, no issue seen. All the expectations of the vector code
are expressed with RTE_BUILD_BUG_ON.

> BTW, is 32-bit build really supported with octeontx2?

No. I think keeping the assert as "sizeof(void *)" (same as now) and removing build
support for 32-bit works too for octeontx2.
We will add it when really required.


>
>
  
Thomas Monjalon Nov. 9, 2020, 2:42 p.m. UTC | #23
09/11/2020 15:08, Jerin Jacob:
> On Mon, Nov 9, 2020 at 7:32 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > 09/11/2020 14:35, Jerin Jacob:
> > > Are you facing the issue with 32bit? Could you share the steps to
> > > reproduce and gcc version?
> >
> > Oh you're right, the issue was with 32-bit build,
> 
> Thanks
> 
> > sorry for the confusion.
> 
> >
> > > --- a/drivers/net/octeontx2/otx2_ethdev.c
> > > +++ b/drivers/net/octeontx2/otx2_ethdev.c
> > > @@ -749,7 +749,7 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
> > >         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
> > >                          offsetof(struct rte_mbuf, ol_flags) + 12);
> > >         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
> > > -                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
> > > +                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(uint64_t));
> >
> > The actual "fix" is
> > offsetof(struct rte_mbuf, pool) + sizeof(uint64_t) + sizeof(void *)
> >
> > I don't understand the octeontx2 vector code.
> > Please check what is the impact of this offset change.
> 
> Tested the changes, No issue seen. All the expectation of vector code
> is expressed with RTE_BUILD_BUG_ON.
> 
> > BTW, is 32-bit build really supported with octeontx2?
> 
> No. I think, keeping assert as "sizeof(void *)"(Same as now) and remove build
> support for 32bit works too for octeontx2.

OK, I think it's better than tweaking RTE_BUILD_BUG_ON for
something not really supported.

> We will add it when really required.

Then I'm going to send a patch to disable octeontx2 drivers on 32-bit.

Note there is another build issue with octeontx2 drivers on CentOS/RHEL 7
with Arm GCC 4.8.
  
Jerin Jacob Nov. 9, 2020, 2:53 p.m. UTC | #24
On Mon, Nov 9, 2020 at 8:12 PM Thomas Monjalon <thomas@monjalon.net> wrote:
>
> 09/11/2020 15:08, Jerin Jacob:
> > On Mon, Nov 9, 2020 at 7:32 PM Thomas Monjalon <thomas@monjalon.net> wrote:
> > > 09/11/2020 14:35, Jerin Jacob:
> > > > Are you facing the issue with 32bit? Could you share the steps to
> > > > reproduce and gcc version?
> > >
> > > Oh you're right, the issue was with 32-bit build,
> >
> > Thanks
> >
> > > sorry for the confusion.
> >
> > >
> > > > --- a/drivers/net/octeontx2/otx2_ethdev.c
> > > > +++ b/drivers/net/octeontx2/otx2_ethdev.c
> > > > @@ -749,7 +749,7 @@ nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
> > > >         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
> > > >                          offsetof(struct rte_mbuf, ol_flags) + 12);
> > > >         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
> > > > -                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
> > > > +                        offsetof(struct rte_mbuf, pool) + 2 * sizeof(uint64_t));
> > >
> > > The actual "fix" is
> > > offsetof(struct rte_mbuf, pool) + sizeof(uint64_t) + sizeof(void *)
> > >
> > > I don't understand the octeontx2 vector code.
> > > Please check what is the impact of this offset change.
> >
> > Tested the changes, No issue seen. All the expectation of vector code
> > is expressed with RTE_BUILD_BUG_ON.
> >
> > > BTW, is 32-bit build really supported with octeontx2?
> >
> > No. I think, keeping assert as "sizeof(void *)"(Same as now) and remove build
> > support for 32bit works too for octeontx2.
>
> OK, I think it's better than tweaking RTE_BUILD_BUG_ON for
> something not really supported.
>
> > We will add it when really required.
>
> Then I'm going to send a patch to disable octeontx2 drivers on 32-bit.

OK.

>
> Note there is another build issue with octeontx2 drivers on CentOS/RHEL 7
> with Arm GGC 4.8.

It is a segfault from the compiler. In the Makefile build, we have skipped
building with GCC < 4.9; not sure what needs to be done for meson.



>
>
  
Stephen Hemminger Nov. 9, 2020, 6:04 p.m. UTC | #25
On Mon, 9 Nov 2020 11:21:02 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Monday, November 9, 2020 11:06 AM
> > 
> > On Mon, Nov 09, 2020 at 09:16:27AM +0100, Morten Brørup wrote:  
> > > +CC techboard
> > >  
> > > > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > > > Sent: Monday, November 9, 2020 6:18 AM
> > > >
> > > > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon  
> > <thomas@monjalon.net>  
> > > > wrote:  
> > > > >
> > > > > 07/11/2020 20:05, Jerin Jacob:  
> > > > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon  
> > > > <thomas@monjalon.net> wrote:  
> > > > > > > 07/11/2020 18:12, Jerin Jacob:  
> > > > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon  
> > > > <thomas@monjalon.net> wrote:  
> > > > > > > > >
> > > > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > > > from the second to the first half.
> > > > > > > > > It should increase performance on most systems having 64-  
> > byte  
> > > > cache line,  
> > > > > > > >  
> > > > > > > > > i.e. mbuf is split in two cache lines.  
> > > > > > > >
> > > > > > > > But In any event, Tx needs to touch the pool to freeing  
> > back to  
> > > > the  
> > > > > > > > pool upon  Tx completion. Right?
> > > > > > > > Not able to understand the motivation for moving it to the  
> > > > first 64B cache line?  
> > > > > > > > The gain varies from driver to driver. For example, a  
> > Typical  
> > > > > > > > ARM-based NPU does not need to
> > > > > > > > touch the pool in Rx and its been filled by HW. Whereas it  
> > > > needs to  
> > > > > > > > touch in Tx if the reference count is implemented.  
> > > > > >
> > > > > > See below.
> > > > > >  
> > > > > > > >  
> > > > > > > > > Due to this change, tx_offload is moved, so some vector  
> > data  
> > > > paths  
> > > > > > > > > may need to be adjusted. Note: OCTEON TX2 check is  
> > removed  
> > > > temporarily!  
> > > > > > > >
> > > > > > > > It will be breaking the Tx path, Please just don't remove  
> > the  
> > > > static  
> > > > > > > > assert without adjusting the code.  
> > > > > > >
> > > > > > > Of course not.
> > > > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > > > it's close to be impossible to understand :)
> > > > > > > Please help!  
> > > > > >
> > > > > > Off course. Could you check the above section any share the  
> > > > rationale  
> > > > > > for this change
> > > > > > and where it helps and how much it helps?  
> > > > >
> > > > > It has been concluded in the techboard meeting you were part of.
> > > > > I don't understand why we restart this discussion again.
> > > > > I won't have the energy to restart this process myself.
> > > > > If you don't want to apply the techboard decision, then please
> > > > > do the necessary to request another quick decision.  
> > > >
> > > > Yes. Initially, I thought it is OK as we have 128B CL, After  
> > looking  
> > > > into Thomas's change, I realized
> > > > it is not good for ARM64 64B catchlines based NPU as
> > > > - A Typical  ARM-based NPU does not need to touch the pool in Rx  
> > and  
> > > > its been filled by HW. Whereas it needs to
> > > > touch in Tx if the reference count is implemented.  
> > >
> > > Jerin, I don't understand what the problem is here...
> > >
> > > Since RX doesn't touch m->pool, it shouldn't matter for RX which  
> > cache line m->pool resides in. I get that.  
> > >
> > > You are saying that TX needs to touch m->pool if the reference count  
> > is implemented. I get that. But I don't understand why it is worse
> > having m->pool in the first cache line than in the second cache line;
> > can you please clarify?  
> > >  
> > > > - Also it will be effecting exiting vector routines  
> > >
> > > That is unavoidable if we move something from the second to the first  
> > cache line.  
> > >
> > > It may require some rework on the vector routines, but it shouldn't  
> > be too difficult for whoever wrote these vector routines.  
> > >  
> > > >
> > > > I request to reconsider the tech board decision.  
> > >
> > > I was on the techboard meeting as an observer (or whatever the  
> > correct term would be for non-members), and this is my impression of
> > the decision on the meeting:  
> > >
> > > The techboard clearly decided not to move any dynamic fields in the  
> > first cache line, on the grounds that if we move them away again in a
> > later version, DPDK users utilizing a dynamic field in the first cache
> > line might experience a performance drop at that later time. And this
> > will be a very bad user experience, causing grief and complaints. To
> > me, this seemed like a firm decision, based on solid arguments.  
> > >
> > > Then the techboard discussed which other field to move to the freed  
> > up space in the first cache line. There were no performance reports
> > showing any improvements by moving the any of the suggested fields (m-  
> > >pool, m->next, m->tx_offload), and there was a performance report  
> > showing no improvements by moving m->next in a test case with large
> > segmented packets. The techboard decided to move m->pool as originally
> > suggested. To me, this seemed like a somewhat random choice between A,
> > B and C, on the grounds that moving one of them is probably better than
> > moving none of them.  
> > >  
> > 
> > This largely tallies with what I remember of the discussion too.
> > 
> > I'd also add though that the choice between the next pointer and the
> > pool
> > pointer came down to the fact that the next pointer is only used for
> > chained, multi-segment, packets - which also tend to be larger packets
> > -
> > while the pool pointer is of relevance to all packets, big and small,
> > single and multi-segment.  
> 
> I wish that was the case, but I am not so sure...
> 
> It is true that m->next is NULL for non-segmented packets.

Yes m->next is NULL for non-segmented packets.
Do we need to modify rte_mbuf_check() to enforce these checks?
  
Morten Brørup Nov. 10, 2020, 7:15 a.m. UTC | #26
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Monday, November 9, 2020 7:05 PM
> 
> On Mon, 9 Nov 2020 11:21:02 +0100
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce
> Richardson
> > > Sent: Monday, November 9, 2020 11:06 AM
> > >
> > > On Mon, Nov 09, 2020 at 09:16:27AM +0100, Morten Brørup wrote:
> > > > +CC techboard
> > > >
> > > > > From: Jerin Jacob [mailto:jerinjacobk@gmail.com]
> > > > > Sent: Monday, November 9, 2020 6:18 AM
> > > > >
> > > > > On Sun, Nov 8, 2020 at 2:03 AM Thomas Monjalon
> > > <thomas@monjalon.net>
> > > > > wrote:
> > > > > >
> > > > > > 07/11/2020 20:05, Jerin Jacob:
> > > > > > > On Sun, Nov 8, 2020 at 12:09 AM Thomas Monjalon
> > > > > <thomas@monjalon.net> wrote:
> > > > > > > > 07/11/2020 18:12, Jerin Jacob:
> > > > > > > > > On Sat, Nov 7, 2020 at 10:04 PM Thomas Monjalon
> > > > > <thomas@monjalon.net> wrote:
> > > > > > > > > >
> > > > > > > > > > The mempool pointer in the mbuf struct is moved
> > > > > > > > > > from the second to the first half.
> > > > > > > > > > It should increase performance on most systems having
> 64-
> > > byte
> > > > > cache line,
> > > > > > > > >
> > > > > > > > > > i.e. mbuf is split in two cache lines.
> > > > > > > > >
> > > > > > > > > But In any event, Tx needs to touch the pool to freeing
> > > back to
> > > > > the
> > > > > > > > > pool upon  Tx completion. Right?
> > > > > > > > > Not able to understand the motivation for moving it to
> the
> > > > > first 64B cache line?
> > > > > > > > > The gain varies from driver to driver. For example, a
> > > Typical
> > > > > > > > > ARM-based NPU does not need to
> > > > > > > > > touch the pool in Rx and its been filled by HW. Whereas
> it
> > > > > needs to
> > > > > > > > > touch in Tx if the reference count is implemented.
> > > > > > >
> > > > > > > See below.
> > > > > > >
> > > > > > > > >
> > > > > > > > > > Due to this change, tx_offload is moved, so some
> vector
> > > data
> > > > > paths
> > > > > > > > > > may need to be adjusted. Note: OCTEON TX2 check is
> > > removed
> > > > > temporarily!
> > > > > > > > >
> > > > > > > > > It will be breaking the Tx path, Please just don't
> remove
> > > the
> > > > > static
> > > > > > > > > assert without adjusting the code.
> > > > > > > >
> > > > > > > > Of course not.
> > > > > > > > I looked at the vector Tx path of OCTEON TX2,
> > > > > > > > it's close to be impossible to understand :)
> > > > > > > > Please help!
> > > > > > >
> > > > > > > Off course. Could you check the above section any share the
> > > > > rationale
> > > > > > > for this change
> > > > > > > and where it helps and how much it helps?
> > > > > >
> > > > > > It has been concluded in the techboard meeting you were part
> of.
> > > > > > I don't understand why we restart this discussion again.
> > > > > > I won't have the energy to restart this process myself.
> > > > > > If you don't want to apply the techboard decision, then
> please
> > > > > > do the necessary to request another quick decision.
> > > > >
> > > > > Yes. Initially, I thought it is OK as we have 128B CL, After
> > > looking
> > > > > into Thomas's change, I realized
> > > > > it is not good for ARM64 64B catchlines based NPU as
> > > > > - A Typical  ARM-based NPU does not need to touch the pool in
> Rx
> > > and
> > > > > its been filled by HW. Whereas it needs to
> > > > > touch in Tx if the reference count is implemented.
> > > >
> > > > Jerin, I don't understand what the problem is here...
> > > >
> > > > Since RX doesn't touch m->pool, it shouldn't matter for RX which
> > > cache line m->pool resides in. I get that.
> > > >
> > > > You are saying that TX needs to touch m->pool if the reference
> count
> > > is implemented. I get that. But I don't understand why it is worse
> > > having m->pool in the first cache line than in the second cache
> line;
> > > can you please clarify?
> > > >
> > > > > - Also it will be effecting exiting vector routines
> > > >
> > > > That is unavoidable if we move something from the second to the
> first
> > > cache line.
> > > >
> > > > It may require some rework on the vector routines, but it
> shouldn't
> > > be too difficult for whoever wrote these vector routines.
> > > >
> > > > >
> > > > > I request to reconsider the tech board decision.
> > > >
> > > > I was on the techboard meeting as an observer (or whatever the
> > > correct term would be for non-members), and this is my impression
> of
> > > the decision on the meeting:
> > > >
> > > > The techboard clearly decided not to move any dynamic fields in
> the
> > > first cache line, on the grounds that if we move them away again in
> a
> > > later version, DPDK users utilizing a dynamic field in the first
> cache
> > > line might experience a performance drop at that later time. And
> this
> > > will be a very bad user experience, causing grief and complaints.
> To
> > > me, this seemed like a firm decision, based on solid arguments.
> > > >
> > > > Then the techboard discussed which other field to move to the
> freed
> > > up space in the first cache line. There were no performance reports
> > > showing any improvements by moving the any of the suggested fields
> (m-
> > > >pool, m->next, m->tx_offload), and there was a performance report
> > > showing no improvements by moving m->next in a test case with large
> > > segmented packets. The techboard decided to move m->pool as
> originally
> > > suggested. To me, this seemed like a somewhat random choice between
> A,
> > > B and C, on the grounds that moving one of them is probably better
> than
> > > moving none of them.
> > > >
> > >
> > > This largely tallies with what I remember of the discussion too.
> > >
> > > I'd also add though that the choice between the next pointer and
> the
> > > pool
> > > pointer came down to the fact that the next pointer is only used
> for
> > > chained, multi-segment, packets - which also tend to be larger
> packets
> > > -
> > > while the pool pointer is of relevance to all packets, big and
> small,
> > > single and multi-segment.
> >
> > I wish that was the case, but I am not so sure...
> >
> > It is true that m->next is NULL for non-segmented packets.
> 
> Yes m->next is NULL for non-segmented packets.
> Do we need to modify rte_mbuf_check() to enforce these checks?

I went through rte_mbuf_check() in my head, and it already enforces this check:
When called with is_header set, the function proceeds to count the number of segments by following the m->next chain (while m->next is not NULL), and checks that the count matches m->nb_segs.
The loop in the function (used when called with is_header set) is implemented to also check that m->nb_segs == 1 when m->next == NULL.
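
In pseudo-C, the segment accounting I am referring to boils down to something like this (a simplified sketch, not the exact code in rte_mbuf.c):

#include <rte_mbuf.h>

/* Simplified sketch of the check rte_mbuf_check() performs with
 * is_header set: walk the next chain and require the number of segments
 * found to match m->nb_segs, so a single-segment mbuf must have
 * next == NULL and nb_segs == 1.
 */
static int
mbuf_check_nb_segs_sketch(const struct rte_mbuf *m)
{
	uint16_t count = 0;
	const uint16_t nb_segs = m->nb_segs;

	while (m != NULL) {
		count++;
		m = m->next;
	}
	return count == nb_segs ? 0 : -1;
}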
  

Patch

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index f3258eb3f7..efb09f0c5e 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -81,13 +81,6 @@  Deprecation Notices
   us extending existing enum/define.
   One solution can be using a fixed size array instead of ``.*MAX.*`` value.
 
-* mbuf: Some fields will be converted to dynamic API in DPDK 20.11
-  in order to reserve more space for the dynamic fields, as explained in
-  `this presentation <https://www.youtube.com/watch?v=Ttl6MlhmzWY>`_.
-  As a consequence, the layout of the ``struct rte_mbuf`` will be re-arranged,
-  avoiding impact on vectorized implementation of the driver datapaths,
-  while evaluating performance gains of a better use of the first cache line.
-
 * ethdev: The flow director API, including ``rte_eth_conf.fdir_conf`` field,
   and the related structures (``rte_fdir_*`` and ``rte_eth_fdir_*``),
   will be removed in DPDK 20.11.
diff --git a/drivers/net/octeontx2/otx2_ethdev.c b/drivers/net/octeontx2/otx2_ethdev.c
index 6cebbe677d..d6e0f1dd03 100644
--- a/drivers/net/octeontx2/otx2_ethdev.c
+++ b/drivers/net/octeontx2/otx2_ethdev.c
@@ -748,8 +748,6 @@  nix_tx_offload_flags(struct rte_eth_dev *eth_dev)
 			 offsetof(struct rte_mbuf, buf_iova) + 16);
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
 			 offsetof(struct rte_mbuf, ol_flags) + 12);
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, tx_offload) !=
-			 offsetof(struct rte_mbuf, pool) + 2 * sizeof(void *));
 
 	if (conf & DEV_TX_OFFLOAD_VLAN_INSERT ||
 	    conf & DEV_TX_OFFLOAD_QINQ_INSERT)
diff --git a/lib/librte_kni/rte_kni_common.h b/lib/librte_kni/rte_kni_common.h
index 36d66e2ffa..ffb3182731 100644
--- a/lib/librte_kni/rte_kni_common.h
+++ b/lib/librte_kni/rte_kni_common.h
@@ -84,10 +84,11 @@  struct rte_kni_mbuf {
 	char pad2[4];
 	uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
 	uint16_t data_len;      /**< Amount of data in segment buffer. */
+	char pad3[14];
+	void *pool;
 
 	/* fields on second cache line */
 	__attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)))
-	void *pool;
 	void *next;             /**< Physical address of next mbuf in kernel. */
 };
 
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 3190a29cce..c4c9ebfaa0 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -1107,7 +1107,6 @@  rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
 static inline void
 rte_mbuf_dynfield_copy(struct rte_mbuf *mdst, const struct rte_mbuf *msrc)
 {
-	memcpy(&mdst->dynfield0, msrc->dynfield0, sizeof(mdst->dynfield0));
 	memcpy(&mdst->dynfield1, msrc->dynfield1, sizeof(mdst->dynfield1));
 }
 
diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
index debaace95a..567551deab 100644
--- a/lib/librte_mbuf/rte_mbuf_core.h
+++ b/lib/librte_mbuf/rte_mbuf_core.h
@@ -586,12 +586,11 @@  struct rte_mbuf {
 
 	uint16_t buf_len;         /**< Length of segment buffer. */
 
-	uint64_t dynfield0[1]; /**< Reserved for dynamic fields. */
+	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
 
 	/* second cache line - fields only used in slow path or on TX */
 	RTE_MARKER cacheline1 __rte_cache_min_aligned;
 
-	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
 	struct rte_mbuf *next;    /**< Next segment of scattered packet. */
 
 	/* fields to support TX offloads */
@@ -645,7 +644,7 @@  struct rte_mbuf {
 	/** Timesync flags for use with IEEE1588. */
 	uint16_t timesync;
 
-	uint32_t dynfield1[7]; /**< Reserved for dynamic fields. */
+	uint32_t dynfield1[9]; /**< Reserved for dynamic fields. */
 } __rte_cache_aligned;
 
 /**
diff --git a/lib/librte_mbuf/rte_mbuf_dyn.c b/lib/librte_mbuf/rte_mbuf_dyn.c
index fd3e019a22..7d5e942bf0 100644
--- a/lib/librte_mbuf/rte_mbuf_dyn.c
+++ b/lib/librte_mbuf/rte_mbuf_dyn.c
@@ -125,7 +125,6 @@  init_shared_mem(void)
 		 * rte_mbuf_dynfield_copy().
 		 */
 		memset(shm, 0, sizeof(*shm));
-		mark_free(dynfield0);
 		mark_free(dynfield1);
 
 		/* init free_flags */