[15/15] mbuf: move pool pointer in hotter first half

Message ID 20201029092751.3837177-16-thomas@monjalon.net (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: remove mbuf timestamp

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply issues

Commit Message

Thomas Monjalon Oct. 29, 2020, 9:27 a.m. UTC
  The mempool pointer in the mbuf struct is moved
from the second half to the first half.
It should increase performance on most systems, which have a 64-byte
cache line, i.e. where the mbuf is split across two cache lines.
On such systems, the first half (also called the first cache line) is hotter
than the second one, where the pool pointer was.

Moving this field gives more space to dynfield1.

This is how the mbuf layout looks (pahole-style):

word  type                              name                byte  size
 0    void *                            buf_addr;         /*   0 +  8 */
 1    rte_iova_t                        buf_iova;         /*   8 +  8 */
      /* --- RTE_MARKER64               rearm_data;                   */
 2    uint16_t                          data_off;         /*  16 +  2 */
      uint16_t                          refcnt;           /*  18 +  2 */
      uint16_t                          nb_segs;          /*  20 +  2 */
      uint16_t                          port;             /*  22 +  2 */
 3    uint64_t                          ol_flags;         /*  24 +  8 */
      /* --- RTE_MARKER                 rx_descriptor_fields1;        */
 4    uint32_t             union        packet_type;      /*  32 +  4 */
      uint32_t                          pkt_len;          /*  36 +  4 */
 5    uint16_t                          data_len;         /*  40 +  2 */
      uint16_t                          vlan_tci;         /*  42 +  2 */
 5.5  uint64_t             union        hash;             /*  44 +  8 */
 6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
      uint16_t                          buf_len;          /*  54 +  2 */
 7    struct rte_mempool *              pool;             /*  56 +  8 */
      /* --- RTE_MARKER                 cacheline1;                   */
 8    struct rte_mbuf *                 next;             /*  64 +  8 */
 9    uint64_t             union        tx_offload;       /*  72 +  8 */
10    uint16_t                          priv_size;        /*  80 +  2 */
      uint16_t                          timesync;         /*  82 +  2 */
      uint32_t                          seqn;             /*  84 +  4 */
11    struct rte_mbuf_ext_shared_info * shinfo;           /*  88 +  8 */
12    uint64_t                          dynfield1[4];     /*  96 + 32 */
16    /* --- END                                             128      */

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 doc/guides/rel_notes/deprecation.rst | 5 -----
 lib/librte_kni/rte_kni_common.h      | 3 ++-
 lib/librte_mbuf/rte_mbuf_core.h      | 5 ++---
 3 files changed, 4 insertions(+), 9 deletions(-)
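
As a quick sanity check, the offsets in the layout above can be asserted at
build time. A minimal sketch, assuming the patched rte_mbuf_core.h, a 64-byte
cache line and a C11 compiler:

    #include <assert.h>
    #include <stddef.h>
    #include <rte_mbuf_core.h>

    /* Offsets taken from the pahole-style layout in the commit message. */
    static_assert(offsetof(struct rte_mbuf, pool) == 56,
                  "pool should now end the first cache line");
    static_assert(offsetof(struct rte_mbuf, next) == 64,
                  "next should start the second cache line");
    static_assert(sizeof(((struct rte_mbuf *)0)->dynfield1) == 32,
                  "dynfield1 should grow to 4 * 8 bytes");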
  

Comments

Andrew Rybchenko Oct. 29, 2020, 10:50 a.m. UTC | #1
On 10/29/20 12:27 PM, Thomas Monjalon wrote:
> The mempool pointer in the mbuf struct is moved
> from the second to the first half.
> It should increase performance on most systems having 64-byte cache line,
> i.e. mbuf is split in two cache lines.
> On such system, the first half (also called first cache line) is hotter
> than the second one where the pool pointer was.
> 
> Moving this field gives more space to dynfield1.
> 
> This is how the mbuf layout looks like (pahole-style):
> 
> word  type                              name                byte  size
>  0    void *                            buf_addr;         /*   0 +  8 */
>  1    rte_iova_t                        buf_iova          /*   8 +  8 */
>       /* --- RTE_MARKER64               rearm_data;                   */
>  2    uint16_t                          data_off;         /*  16 +  2 */
>       uint16_t                          refcnt;           /*  18 +  2 */
>       uint16_t                          nb_segs;          /*  20 +  2 */
>       uint16_t                          port;             /*  22 +  2 */
>  3    uint64_t                          ol_flags;         /*  24 +  8 */
>       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
>  4    uint32_t             union        packet_type;      /*  32 +  4 */
>       uint32_t                          pkt_len;          /*  36 +  4 */
>  5    uint16_t                          data_len;         /*  40 +  2 */
>       uint16_t                          vlan_tci;         /*  42 +  2 */
>  5.5  uint64_t             union        hash;             /*  44 +  8 */
>  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
>       uint16_t                          buf_len;          /*  54 +  2 */
>  7    struct rte_mempool *              pool;             /*  56 +  8 */
>       /* --- RTE_MARKER                 cacheline1;                   */
>  8    struct rte_mbuf *                 next;             /*  64 +  8 */
>  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> 10    uint16_t                          priv_size;        /*  80 +  2 */
>       uint16_t                          timesync;         /*  82 +  2 */
>       uint32_t                          seqn;             /*  84 +  4 */
> 11    struct rte_mbuf_ext_shared_info * shinfo;           /*  88 +  8 */
> 12    uint64_t                          dynfield1[4];     /*  96 + 32 */
> 16    /* --- END                                             128      */
> 
> Signed-off-by: Thomas Monjalon <thomas@monjalon.net>

I'd like to understand why pool is chosen instead of, for
example, next pointer.

Pool is used for housekeeping when the driver refills the Rx ring or
frees completed Tx mbufs. Free thresholds try to avoid doing it on
every Rx/Tx burst (if possible).

Next is used for multi-segment Tx and scattered (and buffer
split) Rx. IMHO the key question here is whether we consider these
use cases common and a priority to optimize. If yes, I'd
vote to have next on the first cacheline.

I'm not sure. Just trying to hear a bit more about it.
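
For illustration, here is a simplified sketch of a segment-free loop (not the
actual rte_pktmbuf_free() implementation; refcounting and external buffers are
ignored), showing where the two fields are read:

    #include <rte_mbuf.h>
    #include <rte_mempool.h>

    static inline void
    free_chain_sketch(struct rte_mbuf *m)
    {
        while (m != NULL) {
            struct rte_mbuf *next = m->next; /* second cache line today */
            rte_mempool_put(m->pool, m);     /* field moved by this patch */
            m = next;
        }
    }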
  
Thomas Monjalon Oct. 29, 2020, 10:56 a.m. UTC | #2
29/10/2020 11:50, Andrew Rybchenko:
> On 10/29/20 12:27 PM, Thomas Monjalon wrote:
> > The mempool pointer in the mbuf struct is moved
> > from the second to the first half.
> > It should increase performance on most systems having 64-byte cache line,
> > i.e. mbuf is split in two cache lines.
> > On such system, the first half (also called first cache line) is hotter
> > than the second one where the pool pointer was.
> > 
> > Moving this field gives more space to dynfield1.
> > 
> > This is how the mbuf layout looks like (pahole-style):
> > 
> > word  type                              name                byte  size
> >  0    void *                            buf_addr;         /*   0 +  8 */
> >  1    rte_iova_t                        buf_iova          /*   8 +  8 */
> >       /* --- RTE_MARKER64               rearm_data;                   */
> >  2    uint16_t                          data_off;         /*  16 +  2 */
> >       uint16_t                          refcnt;           /*  18 +  2 */
> >       uint16_t                          nb_segs;          /*  20 +  2 */
> >       uint16_t                          port;             /*  22 +  2 */
> >  3    uint64_t                          ol_flags;         /*  24 +  8 */
> >       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
> >  4    uint32_t             union        packet_type;      /*  32 +  4 */
> >       uint32_t                          pkt_len;          /*  36 +  4 */
> >  5    uint16_t                          data_len;         /*  40 +  2 */
> >       uint16_t                          vlan_tci;         /*  42 +  2 */
> >  5.5  uint64_t             union        hash;             /*  44 +  8 */
> >  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
> >       uint16_t                          buf_len;          /*  54 +  2 */
> >  7    struct rte_mempool *              pool;             /*  56 +  8 */
> >       /* --- RTE_MARKER                 cacheline1;                   */
> >  8    struct rte_mbuf *                 next;             /*  64 +  8 */
> >  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> > 10    uint16_t                          priv_size;        /*  80 +  2 */
> >       uint16_t                          timesync;         /*  82 +  2 */
> >       uint32_t                          seqn;             /*  84 +  4 */
> > 11    struct rte_mbuf_ext_shared_info * shinfo;           /*  88 +  8 */
> > 12    uint64_t                          dynfield1[4];     /*  96 + 32 */
> > 16    /* --- END                                             128      */
> > 
> > Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
> 
> I'd like to understand why pool is chosen instead of, for
> example, next pointer.
> 
> Pool is used on housekeeping when driver refills Rx ring or
> free completed Tx mbufs. Free thresholds try to avoid it on
> every Rx/Tx burst (if possible).
> 
> Next is used for multi-segment Tx and scattered (and buffer
> split) Rx. IMHO the key question here is we consider these
> use cases as common and priority to optimize. If yes, I'd
> vote to have next on the first cacheline.
> 
> I'm not sure. Just trying to hear a bit more about it.

That's a good question.
Clearly pool and next are good options.
The best would be to have some benchmarks.
If one use case shows no benefit, the decision is easier.

If you prefer, we can leave this last patch for -rc3.
  
Ananyev, Konstantin Oct. 29, 2020, 2:15 p.m. UTC | #3
> 
> 29/10/2020 11:50, Andrew Rybchenko:
> > On 10/29/20 12:27 PM, Thomas Monjalon wrote:
> > > The mempool pointer in the mbuf struct is moved
> > > from the second to the first half.
> > > It should increase performance on most systems having 64-byte cache line,
> > > i.e. mbuf is split in two cache lines.
> > > On such system, the first half (also called first cache line) is hotter
> > > than the second one where the pool pointer was.
> > >
> > > Moving this field gives more space to dynfield1.
> > >
> > > This is how the mbuf layout looks like (pahole-style):
> > >
> > > word  type                              name                byte  size
> > >  0    void *                            buf_addr;         /*   0 +  8 */
> > >  1    rte_iova_t                        buf_iova          /*   8 +  8 */
> > >       /* --- RTE_MARKER64               rearm_data;                   */
> > >  2    uint16_t                          data_off;         /*  16 +  2 */
> > >       uint16_t                          refcnt;           /*  18 +  2 */
> > >       uint16_t                          nb_segs;          /*  20 +  2 */
> > >       uint16_t                          port;             /*  22 +  2 */
> > >  3    uint64_t                          ol_flags;         /*  24 +  8 */
> > >       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
> > >  4    uint32_t             union        packet_type;      /*  32 +  4 */
> > >       uint32_t                          pkt_len;          /*  36 +  4 */
> > >  5    uint16_t                          data_len;         /*  40 +  2 */
> > >       uint16_t                          vlan_tci;         /*  42 +  2 */
> > >  5.5  uint64_t             union        hash;             /*  44 +  8 */
> > >  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
> > >       uint16_t                          buf_len;          /*  54 +  2 */
> > >  7    struct rte_mempool *              pool;             /*  56 +  8 */
> > >       /* --- RTE_MARKER                 cacheline1;                   */
> > >  8    struct rte_mbuf *                 next;             /*  64 +  8 */
> > >  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> > > 10    uint16_t                          priv_size;        /*  80 +  2 */
> > >       uint16_t                          timesync;         /*  82 +  2 */
> > >       uint32_t                          seqn;             /*  84 +  4 */
> > > 11    struct rte_mbuf_ext_shared_info * shinfo;           /*  88 +  8 */
> > > 12    uint64_t                          dynfield1[4];     /*  96 + 32 */
> > > 16    /* --- END                                             128      */
> > >
> > > Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
> >
> > I'd like to understand why pool is chosen instead of, for
> > example, next pointer.
> >
> > Pool is used on housekeeping when driver refills Rx ring or
> > free completed Tx mbufs. Free thresholds try to avoid it on
> > every Rx/Tx burst (if possible).
> >
> > Next is used for multi-segment Tx and scattered (and buffer
> > split) Rx. IMHO the key question here is we consider these
> > use cases as common and priority to optimize. If yes, I'd
> > vote to have next on the first cacheline.

Between these two I also would probably lean towards *next*
(after all _free_ also has to access/update next).
As another alternative to consider: tx_offload.
It is also used quite widely. 

> >
> > I'm not sure. Just trying to hear a bit more about it.
> .
> That's a good question.
> Clearly pool and next are good options.
> The best would be to have some benchmarks.
> If one use case shows no benefit, the decision is easier.
> 
> If you prefer, we can leave this last patch for -rc3.
>
  
Ray Kinsella Oct. 29, 2020, 2:42 p.m. UTC | #4
On 29/10/2020 09:27, Thomas Monjalon wrote:
> The mempool pointer in the mbuf struct is moved
> from the second to the first half.
> It should increase performance on most systems having 64-byte cache line,
> i.e. mbuf is split in two cache lines.
> On such system, the first half (also called first cache line) is hotter
> than the second one where the pool pointer was.
> 
> Moving this field gives more space to dynfield1.
> 
> This is how the mbuf layout looks like (pahole-style):
> 
> word  type                              name                byte  size
>  0    void *                            buf_addr;         /*   0 +  8 */
>  1    rte_iova_t                        buf_iova          /*   8 +  8 */
>       /* --- RTE_MARKER64               rearm_data;                   */
>  2    uint16_t                          data_off;         /*  16 +  2 */
>       uint16_t                          refcnt;           /*  18 +  2 */
>       uint16_t                          nb_segs;          /*  20 +  2 */
>       uint16_t                          port;             /*  22 +  2 */
>  3    uint64_t                          ol_flags;         /*  24 +  8 */
>       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
>  4    uint32_t             union        packet_type;      /*  32 +  4 */
>       uint32_t                          pkt_len;          /*  36 +  4 */
>  5    uint16_t                          data_len;         /*  40 +  2 */
>       uint16_t                          vlan_tci;         /*  42 +  2 */
>  5.5  uint64_t             union        hash;             /*  44 +  8 */
>  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
>       uint16_t                          buf_len;          /*  54 +  2 */
>  7    struct rte_mempool *              pool;             /*  56 +  8 */
>       /* --- RTE_MARKER                 cacheline1;                   */
>  8    struct rte_mbuf *                 next;             /*  64 +  8 */
>  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> 10    uint16_t                          priv_size;        /*  80 +  2 */
>       uint16_t                          timesync;         /*  82 +  2 */
>       uint32_t                          seqn;             /*  84 +  4 */
> 11    struct rte_mbuf_ext_shared_info * shinfo;           /*  88 +  8 */
> 12    uint64_t                          dynfield1[4];     /*  96 + 32 */
> 16    /* --- END                                             128      */
> 
> Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
> ---
>  doc/guides/rel_notes/deprecation.rst | 5 -----
>  lib/librte_kni/rte_kni_common.h      | 3 ++-
>  lib/librte_mbuf/rte_mbuf_core.h      | 5 ++---
>  3 files changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
> index 72dbb25b83..07ca1dcbb2 100644
> --- a/doc/guides/rel_notes/deprecation.rst
> +++ b/doc/guides/rel_notes/deprecation.rst
> @@ -88,11 +88,6 @@ Deprecation Notices
>  
>    - ``seqn``
>  
> -  As a consequence, the layout of the ``struct rte_mbuf`` will be re-arranged,
> -  avoiding impact on vectorized implementation of the driver datapaths,
> -  while evaluating performance gains of a better use of the first cache line.
> -
> -
>  * ethdev: the legacy filter API, including
>    ``rte_eth_dev_filter_supported()``, ``rte_eth_dev_filter_ctrl()`` as well
>    as filter types MACVLAN, ETHERTYPE, FLEXIBLE, SYN, NTUPLE, TUNNEL, FDIR,
> diff --git a/lib/librte_kni/rte_kni_common.h b/lib/librte_kni/rte_kni_common.h
> index 36d66e2ffa..ffb3182731 100644
> --- a/lib/librte_kni/rte_kni_common.h
> +++ b/lib/librte_kni/rte_kni_common.h
> @@ -84,10 +84,11 @@ struct rte_kni_mbuf {
>  	char pad2[4];
>  	uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
>  	uint16_t data_len;      /**< Amount of data in segment buffer. */
> +	char pad3[14];
> +	void *pool;
>  
>  	/* fields on second cache line */
>  	__attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)))
> -	void *pool;
>  	void *next;             /**< Physical address of next mbuf in kernel. */
>  };
>  
> diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
> index 52ca1c842f..ee185fa32b 100644
> --- a/lib/librte_mbuf/rte_mbuf_core.h
> +++ b/lib/librte_mbuf/rte_mbuf_core.h
> @@ -584,12 +584,11 @@ struct rte_mbuf {
>  
>  	uint16_t buf_len;         /**< Length of segment buffer. */
>  
> -	uint64_t unused;
> +	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
>  
>  	/* second cache line - fields only used in slow path or on TX */
>  	RTE_MARKER cacheline1 __rte_cache_min_aligned;
>  
> -	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
>  	struct rte_mbuf *next;    /**< Next segment of scattered packet. */
>  
>  	/* fields to support TX offloads */
> @@ -646,7 +645,7 @@ struct rte_mbuf {
>  	 */
>  	struct rte_mbuf_ext_shared_info *shinfo;
>  
> -	uint64_t dynfield1[3]; /**< Reserved for dynamic fields. */
> +	uint64_t dynfield1[4]; /**< Reserved for dynamic fields. */
>  } __rte_cache_aligned;
>  
>  /**
> 

I will let others chime in on the merits of the cache-line positioning of the
mempool pointer.

From the ABI PoV, the deprecation notice has been observed, and since mbuf affects
everything, doing this outside of an ABI breakage window is impossible, so it's now
or never.

Ray K
  
Ajit Khaparde Oct. 29, 2020, 6:45 p.m. UTC | #5
On Thu, Oct 29, 2020 at 7:15 AM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
>
>
>
> >
> > 29/10/2020 11:50, Andrew Rybchenko:
> > > On 10/29/20 12:27 PM, Thomas Monjalon wrote:
> > > > The mempool pointer in the mbuf struct is moved
> > > > from the second to the first half.
> > > > It should increase performance on most systems having 64-byte cache line,
> > > > i.e. mbuf is split in two cache lines.
> > > > On such system, the first half (also called first cache line) is hotter
> > > > than the second one where the pool pointer was.
> > > >
> > > > Moving this field gives more space to dynfield1.
> > > >
> > > > This is how the mbuf layout looks like (pahole-style):
> > > >
> > > > word  type                              name                byte  size
> > > >  0    void *                            buf_addr;         /*   0 +  8 */
> > > >  1    rte_iova_t                        buf_iova          /*   8 +  8 */
> > > >       /* --- RTE_MARKER64               rearm_data;                   */
> > > >  2    uint16_t                          data_off;         /*  16 +  2 */
> > > >       uint16_t                          refcnt;           /*  18 +  2 */
> > > >       uint16_t                          nb_segs;          /*  20 +  2 */
> > > >       uint16_t                          port;             /*  22 +  2 */
> > > >  3    uint64_t                          ol_flags;         /*  24 +  8 */
> > > >       /* --- RTE_MARKER                 rx_descriptor_fields1;        */
> > > >  4    uint32_t             union        packet_type;      /*  32 +  4 */
> > > >       uint32_t                          pkt_len;          /*  36 +  4 */
> > > >  5    uint16_t                          data_len;         /*  40 +  2 */
> > > >       uint16_t                          vlan_tci;         /*  42 +  2 */
> > > >  5.5  uint64_t             union        hash;             /*  44 +  8 */
> > > >  6.5  uint16_t                          vlan_tci_outer;   /*  52 +  2 */
> > > >       uint16_t                          buf_len;          /*  54 +  2 */
> > > >  7    struct rte_mempool *              pool;             /*  56 +  8 */
> > > >       /* --- RTE_MARKER                 cacheline1;                   */
> > > >  8    struct rte_mbuf *                 next;             /*  64 +  8 */
> > > >  9    uint64_t             union        tx_offload;       /*  72 +  8 */
> > > > 10    uint16_t                          priv_size;        /*  80 +  2 */
> > > >       uint16_t                          timesync;         /*  82 +  2 */
> > > >       uint32_t                          seqn;             /*  84 +  4 */
> > > > 11    struct rte_mbuf_ext_shared_info * shinfo;           /*  88 +  8 */
> > > > 12    uint64_t                          dynfield1[4];     /*  96 + 32 */
> > > > 16    /* --- END                                             128      */
> > > >
> > > > Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
> > >
> > > I'd like to understand why pool is chosen instead of, for
> > > example, next pointer.
> > >
> > > Pool is used on housekeeping when driver refills Rx ring or
> > > free completed Tx mbufs. Free thresholds try to avoid it on
> > > every Rx/Tx burst (if possible).
> > >
> > > Next is used for multi-segment Tx and scattered (and buffer
> > > split) Rx. IMHO the key question here is we consider these
> > > use cases as common and priority to optimize. If yes, I'd
> > > vote to have next on the first cacheline.
>
> Between these two I also would probably lean towards *next*
> (after all _free_ also has to access/update next).
+1

> As another alternative to consider: tx_offload.
> It is also used quite widely.
>
> > >
> > > I'm not sure. Just trying to hear a bit more about it.
> > .
> > That's a good question.
> > Clearly pool and next are good options.
> > The best would be to have some benchmarks.
> > If one use case shows no benefit, the decision is easier.
> >
> > If you prefer, we can leave this last patch for -rc3.
> >
>
  
Morten Brørup Oct. 31, 2020, 6:20 p.m. UTC | #6
Thomas,

Adding my thoughts to the already detailed feedback on this important patch...

The first cache line is not inherently "hotter" than the second. The hotness depends on their usage.

The mbuf cacheline1 marker has the following comment:
/* second cache line - fields only used in slow path or on TX */

In other words, the second cache line is intended not to be touched in fast path RX.

I do not think this is true anymore. Not even with simple non-scattered RX. And regression testing probably didn't catch this, because the tests perform TX after RX, so the cache miss moved from TX to RX and became a cache hit in TX instead. (I may be wrong about this claim, but it's not important for the discussion.)

I think the right question for this patch is: Can we achieve this - not using the second cache line for fast path RX - again by putting the right fields in the first cache line?

Probably not in all cases, but perhaps for some...

Consider the application scenarios.

When a packet is received, one of three things happens to it:
1. It is immediately transmitted on one or more ports.
2. It is immediately discarded, e.g. by a firewall rule.
3. It is put in some sort of queue, e.g. a ring for the next pipeline stage, or in a QoS queue.

1. If the packet is immediately transmitted, the m->tx_offload field in the second cache line will be touched by the application and TX function anyway, so we don't need to optimize the mbuf layout for this scenario.

2. The second scenario touches m->pool no matter how it is implemented. The application can avoid touching m->next by using rte_mbuf_raw_free(), knowing that the mbuf came directly from RX and thus no other fields have been touched. In this scenario, we want m->pool in the first cache line.
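
A hedged sketch of that discard case, assuming the mbuf came straight from RX (single segment, refcnt 1, no external buffer), so only m->pool is needed:

    #include <rte_mbuf.h>

    static inline void
    drop_rx_mbuf(struct rte_mbuf *m)
    {
        /* Returns the mbuf to m->pool without reading m->next. */
        rte_mbuf_raw_free(m);
    }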

3. Now, let's consider the third scenario, where RX is followed by enqueue into a ring. If the application does nothing but put the packet into a ring, we don't need to move anything into the first cache line. But applications usually do more... So it is application specific what would be good to move to the first cache line:

A. If the application does not use segmented mbufs, and performs analysis and preparation for transmission in the initial pipeline stages, and only the last pipeline stage performs TX, we could move m->tx_offload to the first cache line, which would keep the second cache line cold until the actual TX happens in the last pipeline stage - maybe even after the packet has waited in a QoS queue for a long time, and its cache lines have gone cold.

B. If the application uses segmented mbufs on RX, it might make sense moving m->next to the first cache line. (We don't use segmented mbufs, so I'm not sure about this.)


However, reality perhaps beats theory:

Looking at the E1000 PMD, it seems like even its non-scattered RX function, eth_igb_recv_pkts(), sets m->next. If it only kept its own free pool pre-initialized instead... I haven't investigated other PMDs, except briefly looking at the mlx5 PMD, and it seems like it doesn't touch m->next in RX.

I haven't looked deeper into how m->pool is being used by RX in PMDs, but I suppose that it isn't touched in RX.

<rant on>
If only we had a performance test where RX was not immediately followed by TX, but the packets were passed through a large queue in-between, so RX cache misses were not free of charge because they transform TX cache misses into cache hits instead...
<rant off>

Whatever you choose, I am sure that most applications will find it more useful than the timestamp. :-)


Med venlig hilsen / kind regards
- Morten Brørup
  
Thomas Monjalon Oct. 31, 2020, 8:40 p.m. UTC | #7
Thanks for the thoughts Morten.
I believe we need benchmarks of different scenarios with different drivers.


31/10/2020 19:20, Morten Brørup:
> Thomas,
> 
> Adding my thoughts to the already detailed feedback on this important patch...
> 
> The first cache line is not inherently "hotter" than the second. The hotness depends on their usage.
> 
> The mbuf cacheline1 marker has the following comment:
> /* second cache line - fields only used in slow path or on TX */
> 
> In other words, the second cache line is intended not to be touched in fast path RX.
> 
> I do not think this is true anymore. Not even with simple non-scattered RX. And regression testing probably didn't catch this, because the tests perform TX after RX, so the cache miss moved from TX to RX and became a cache hit in TX instead. (I may be wrong about this claim, but it's not important for the discussion.)
> 
> I think the right question for this patch is: Can we achieve this - not using the second cache line for fast path RX - again by putting the right fields in the first cache line?
> 
> Probably not in all cases, but perhaps for some...
> 
> Consider the application scenarios.
> 
> When a packet is received, one of three things happens to it:
> 1. It is immediately transmitted on one or more ports.
> 2. It is immediately discarded, e.g. by a firewall rule.
> 3. It is put in some sort of queue, e.g. a ring for the next pipeline stage, or in a QoS queue.
> 
> 1. If the packet is immediately transmitted, the m->tx_offload field in the second cache line will be touched by the application and TX function anyway, so we don't need to optimize the mbuf layout for this scenario.
> 
> 2. The second scenario touches m->pool no matter how it is implemented. The application can avoid touching m->next by using rte_mbuf_raw_free(), knowing that the mbuf came directly from RX and thus no other fields have been touched. In this scenario, we want m->pool in the first cache line.
> 
> 3. Now, let's consider the third scenario, where RX is followed by enqueue into a ring. If the application does nothing but put the packet into a ring, we don't need to move anything into the first cache line. But applications usually does more... So it is application specific what would be good to move to the first cache line:
> 
> A. If the application does not use segmented mbufs, and performs analysis and preparation for transmission in the initial pipeline stages, and only the last pipeline stage performs TX, we could move m->tx_offload to the first cache line, which would keep the second cache line cold until the actual TX happens in the last pipeline stage - maybe even after the packet has waited in a QoS queue for a long time, and its cache lines have gone cold.
> 
> B. If the application uses segmented mbufs on RX, it might make sense moving m->next to the first cache line. (We don't use segmented mbufs, so I'm not sure about this.)
> 
> 
> However, reality perhaps beats theory:
> 
> Looking at the E1000 PMD, it seems like even its non-scattered RX function, eth_igb_recv_pkts(), sets m->next. If it only kept its own free pool pre-initialized instead... I haven't investigated other PMDs, except briefly looking at the mlx5 PMD, and it seems like it doesn't touch m->next in RX.
> 
> I haven't looked deeper into how m->pool is being used by RX in PMDs, but I suppose that it isn't touched in RX.
> 
> <rant on>
> If only we had a performance test where RX was not immediately followed by TX, but the packets were passed through a large queue in-between, so RX cache misses were not free of charge because they transform TX cache misses into cache hits instead...
> <rant off>
> 
> Whatever you choose, I am sure that most applications will find it more useful than the timestamp. :-)
  
Morten Brørup Nov. 1, 2020, 9:12 a.m. UTC | #8
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Saturday, October 31, 2020 9:41 PM
> 
> 31/10/2020 19:20, Morten Brørup:
> > Thomas,
> >
> > Adding my thoughts to the already detailed feedback on this important
> patch...
> >
> > The first cache line is not inherently "hotter" than the second. The
> hotness depends on their usage.
> >
> > The mbuf cacheline1 marker has the following comment:
> > /* second cache line - fields only used in slow path or on TX */
> >
> > In other words, the second cache line is intended not to be touched in
> fast path RX.
> >
> > I do not think this is true anymore. Not even with simple non-scattered
> RX. And regression testing probably didn't catch this, because the tests
> perform TX after RX, so the cache miss moved from TX to RX and became a
> cache hit in TX instead. (I may be wrong about this claim, but it's not
> important for the discussion.)
> >
> > I think the right question for this patch is: Can we achieve this - not
> using the second cache line for fast path RX - again by putting the right
> fields in the first cache line?
> >
> > Probably not in all cases, but perhaps for some...
> >
> > Consider the application scenarios.
> >
> > When a packet is received, one of three things happens to it:
> > 1. It is immediately transmitted on one or more ports.
> > 2. It is immediately discarded, e.g. by a firewall rule.
> > 3. It is put in some sort of queue, e.g. a ring for the next pipeline
> stage, or in a QoS queue.
> >
> > 1. If the packet is immediately transmitted, the m->tx_offload field in
> the second cache line will be touched by the application and TX function
> anyway, so we don't need to optimize the mbuf layout for this scenario.
> >
> > 2. The second scenario touches m->pool no matter how it is implemented.
> The application can avoid touching m->next by using rte_mbuf_raw_free(),
> knowing that the mbuf came directly from RX and thus no other fields have
> been touched. In this scenario, we want m->pool in the first cache line.
> >
> > 3. Now, let's consider the third scenario, where RX is followed by
> enqueue into a ring. If the application does nothing but put the packet
> into a ring, we don't need to move anything into the first cache line. But
> applications usually does more... So it is application specific what would
> be good to move to the first cache line:
> >
> > A. If the application does not use segmented mbufs, and performs analysis
> and preparation for transmission in the initial pipeline stages, and only
> the last pipeline stage performs TX, we could move m->tx_offload to the
> first cache line, which would keep the second cache line cold until the
> actual TX happens in the last pipeline stage - maybe even after the packet
> has waited in a QoS queue for a long time, and its cache lines have gone
> cold.
> >
> > B. If the application uses segmented mbufs on RX, it might make sense
> moving m->next to the first cache line. (We don't use segmented mbufs, so
> I'm not sure about this.)
> >
> >
> > However, reality perhaps beats theory:
> >
> > Looking at the E1000 PMD, it seems like even its non-scattered RX
> function, eth_igb_recv_pkts(), sets m->next. If it only kept its own free
> pool pre-initialized instead... I haven't investigated other PMDs, except
> briefly looking at the mlx5 PMD, and it seems like it doesn't touch m->next
> in RX.
> >
> > I haven't looked deeper into how m->pool is being used by RX in PMDs, but
> I suppose that it isn't touched in RX.
> >
> > <rant on>
> > If only we had a performance test where RX was not immediately followed
> by TX, but the packets were passed through a large queue in-between, so RX
> cache misses were not free of charge because they transform TX cache misses
> into cache hits instead...
> > <rant off>
> >
> > Whatever you choose, I am sure that most applications will find it more
> useful than the timestamp. :-)
> 
> Thanks for the thoughts Morten.
> I believe we need benchmarks of different scenarios with different drivers.
>

If we are only allowed to modify the mbuf structure this one more time, we should look forward, not backwards!

If we move m->tx_offload to the first cache line, applications using simple, non-scattered packet mbufs would never even need to touch the second cache line, except for freeing the mbuf (which needs to read m->pool).

And this leads to my next suggestion...

One thing has always puzzled me: Why do we use 64 bits to indicate which memory pool an mbuf belongs to? The portid only uses 16 bits and an indirection index. Why don't we use the same kind of indirection index for mbuf pools?

I can easily imagine using one mbuf pool (or perhaps a few pools) per CPU socket (or per physical memory bus closest to an attached NIC), but not more than 256 mbuf memory pools in total. So, let's introduce an mbufpoolid like the portid, and cut this mbuf field down from 64 to 8 bits.

If we also cut down m->pkt_len from 32 to 24 bits, we can get the 8 bit mbuf pool index into the first cache line at no additional cost.

In other words: This would free up another 64 bit field in the mbuf structure!
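
A hypothetical sketch of such an indirection; all names below are invented for illustration, nothing like this exists in DPDK today:

    #include <stdint.h>
    #include <rte_mempool.h>

    #define MBUF_MAX_POOLS 256 /* fits an 8-bit pool id */

    static struct rte_mempool *mbuf_pool_table[MBUF_MAX_POOLS];

    /* One extra load, most likely served from a hot cache line. */
    static inline struct rte_mempool *
    mbuf_pool_from_id(uint8_t pool_id)
    {
        return mbuf_pool_table[pool_id];
    }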


And even though the m->next pointer for scattered packets resides in the second cache line, the libraries and applications know that m->next is NULL when m->nb_segs is 1. This shows that my suggestion would make touching the second cache line unnecessary (in simple cases), even for re-initializing the mbuf.


And now I will proceed out on a tangent with two more independent thoughts, so feel free to ignore.

Consider a multi-CPU-socket system with one mbuf pool per CPU socket, where the NICs attached to each CPU socket use an RX mbuf pool with RAM on the same CPU socket. I would imagine that (re-)initializing these mbufs could be faster if performed only on a CPU on the same socket. If this is the case, mbufs should be re-initialized as part of the RX preparation at ingress, not as part of the mbuf free at egress.

Perhaps some microarchitectures are faster to compare nb_segs==0 than nb_segs==1. If so, nb_segs could be redefined to mean number of additional segments, rather than number of segments.


PS: I have added two more mlx5 maintainers to the discussion; they might have qualified opinions about how PMDs could benefit from this.


Med venlig hilsen / kind regards
- Morten Brørup
  
Thomas Monjalon Nov. 1, 2020, 4:21 p.m. UTC | #9
That's very interesting food for thoughts.
I hope we will have a good community discussion on this list
during this week to make some decisions.


01/11/2020 10:12, Morten Brørup:
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Saturday, October 31, 2020 9:41 PM
> > 
> > 31/10/2020 19:20, Morten Brørup:
> > > Thomas,
> > >
> > > Adding my thoughts to the already detailed feedback on this important
> > patch...
> > >
> > > The first cache line is not inherently "hotter" than the second. The
> > hotness depends on their usage.
> > >
> > > The mbuf cacheline1 marker has the following comment:
> > > /* second cache line - fields only used in slow path or on TX */
> > >
> > > In other words, the second cache line is intended not to be touched in
> > fast path RX.
> > >
> > > I do not think this is true anymore. Not even with simple non-scattered
> > RX. And regression testing probably didn't catch this, because the tests
> > perform TX after RX, so the cache miss moved from TX to RX and became a
> > cache hit in TX instead. (I may be wrong about this claim, but it's not
> > important for the discussion.)
> > >
> > > I think the right question for this patch is: Can we achieve this - not
> > using the second cache line for fast path RX - again by putting the right
> > fields in the first cache line?
> > >
> > > Probably not in all cases, but perhaps for some...
> > >
> > > Consider the application scenarios.
> > >
> > > When a packet is received, one of three things happens to it:
> > > 1. It is immediately transmitted on one or more ports.
> > > 2. It is immediately discarded, e.g. by a firewall rule.
> > > 3. It is put in some sort of queue, e.g. a ring for the next pipeline
> > stage, or in a QoS queue.
> > >
> > > 1. If the packet is immediately transmitted, the m->tx_offload field in
> > the second cache line will be touched by the application and TX function
> > anyway, so we don't need to optimize the mbuf layout for this scenario.
> > >
> > > 2. The second scenario touches m->pool no matter how it is implemented.
> > The application can avoid touching m->next by using rte_mbuf_raw_free(),
> > knowing that the mbuf came directly from RX and thus no other fields have
> > been touched. In this scenario, we want m->pool in the first cache line.
> > >
> > > 3. Now, let's consider the third scenario, where RX is followed by
> > enqueue into a ring. If the application does nothing but put the packet
> > into a ring, we don't need to move anything into the first cache line. But
> > applications usually does more... So it is application specific what would
> > be good to move to the first cache line:
> > >
> > > A. If the application does not use segmented mbufs, and performs analysis
> > and preparation for transmission in the initial pipeline stages, and only
> > the last pipeline stage performs TX, we could move m->tx_offload to the
> > first cache line, which would keep the second cache line cold until the
> > actual TX happens in the last pipeline stage - maybe even after the packet
> > has waited in a QoS queue for a long time, and its cache lines have gone
> > cold.
> > >
> > > B. If the application uses segmented mbufs on RX, it might make sense
> > moving m->next to the first cache line. (We don't use segmented mbufs, so
> > I'm not sure about this.)
> > >
> > >
> > > However, reality perhaps beats theory:
> > >
> > > Looking at the E1000 PMD, it seems like even its non-scattered RX
> > function, eth_igb_recv_pkts(), sets m->next. If it only kept its own free
> > pool pre-initialized instead... I haven't investigated other PMDs, except
> > briefly looking at the mlx5 PMD, and it seems like it doesn't touch m->next
> > in RX.
> > >
> > > I haven't looked deeper into how m->pool is being used by RX in PMDs, but
> > I suppose that it isn't touched in RX.
> > >
> > > <rant on>
> > > If only we had a performance test where RX was not immediately followed
> > by TX, but the packets were passed through a large queue in-between, so RX
> > cache misses were not free of charge because they transform TX cache misses
> > into cache hits instead...
> > > <rant off>
> > >
> > > Whatever you choose, I am sure that most applications will find it more
> > useful than the timestamp. :-)
> > 
> > Thanks for the thoughts Morten.
> > I believe we need benchmarks of different scenarios with different drivers.
> >
> 
> If we are only allowed to modify the mbuf structure this one more time, we should look forward, not backwards!
> 
> If we move m->tx_offload to the first cache line, applications using simple, non-scattered packet mbufs would never even need to touch the second cache line, except for freeing the mbuf (which needs to read m->pool).
> 
> And this leads to my next suggestion...
> 
> One thing has always puzzled me: Why do we use 64 bits to indicate which memory pool an mbuf belongs to? The portid only uses 16 bits and an indirection index. Why don't we use the same kind of indirection index for mbuf pools?
> 
> I can easily imagine using one mbuf pool (or perhaps a few pools) per CPU socket (or per physical memory bus closest to an attached NIC), but not more than 256 mbuf memory pools in total. So, let's introduce an mbufpoolid like the portid, and cut this mbuf field down from 64 to 8 bits.
> 
> If we also cut down m->pkt_len from 32 to 24 bits, we can get the 8 bit mbuf pool index into the first cache line at no additional cost.
> 
> In other words: This would free up another 64 bit field in the mbuf structure!
> 
> 
> And even though the m->next pointer for scattered packets resides in the second cache line, the libraries and application knows that m->next is NULL when m->nb_segs is 1. This proves that my suggestion would make touching the second cache line unnecessary (in simple cases), even for re-initializing the mbuf.
> 
> 
> And now I will proceed out on a tangent with two more independent thoughts, so feel free to ignore.
> 
> Consider a multi CPU socket system with one mbuf pool per CPU socket, the NICs attached to each CPU socket use an RX mbuf pool with RAM on the same CPU socket. I would imagine that (re-)initializing these mbufs could be faster if performed only on a CPU on the same socket. If this is the case, mbufs should be re-initialized as part of the RX preparation at ingress, not as part of the mbuf free at egress.
> 
> Perhaps some microarchitectures are faster to compare nb_segs==0 than nb_segs==1. If so, nb_segs could be redefined to mean number of additional segments, rather than number of segments.
  
Thomas Monjalon Nov. 1, 2020, 4:38 p.m. UTC | #10
01/11/2020 10:12, Morten Brørup:
> One thing has always puzzled me:
> Why do we use 64 bits to indicate which memory pool
> an mbuf belongs to?
> The portid only uses 16 bits and an indirection index.
> Why don't we use the same kind of indirection index for mbuf pools?

I wonder what would be the cost of indirection. Probably negligible.
I think it is a good proposal...
... for next year, after a deprecation notice.

> I can easily imagine using one mbuf pool (or perhaps a few pools)
> per CPU socket (or per physical memory bus closest to an attached NIC),
> but not more than 256 mbuf memory pools in total.
> So, let's introduce an mbufpoolid like the portid,
> and cut this mbuf field down from 64 to 8 bits.
> 
> If we also cut down m->pkt_len from 32 to 24 bits,

Who is using packets larger than 64k? Are 16 bits enough?

> we can get the 8 bit mbuf pool index into the first cache line
> at no additional cost.

I like the idea.
It means we don't need to move the pool pointer now,
i.e. it does not have to replace the timestamp field.

> In other words: This would free up another 64 bit field in the mbuf structure!

That would be great!


> And even though the m->next pointer for scattered packets resides
> in the second cache line, the libraries and application knows
> that m->next is NULL when m->nb_segs is 1.
> This proves that my suggestion would make touching
> the second cache line unnecessary (in simple cases),
> even for re-initializing the mbuf.

So you think the "next" pointer should stay in the second half of mbuf?

I feel you would like to move the Tx offloads in the first half
to improve performance of very simple apps.
I am thinking the opposite: we could have some dynamic fields space
in the first half to improve performance of complex Rx.
Note: we can add a flag hint for field registration in this first half.
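
For reference, a sketch of how a dynamic field is registered today;
the first-half hint flag mentioned below is purely hypothetical:

    #include <rte_mbuf_dyn.h>

    static int
    register_example_dynfield(void)
    {
        static const struct rte_mbuf_dynfield desc = {
            .name = "example_dynfield",
            .size = sizeof(uint32_t),
            .align = __alignof__(uint32_t),
            .flags = 0, /* hypothetical: a "first cache line" hint */
        };

        /* Returns the field offset within the mbuf, or -1 on error. */
        return rte_mbuf_dynfield_register(&desc);
    }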


> And now I will proceed out on a tangent with two more
> independent thoughts, so feel free to ignore.
> 
> Consider a multi CPU socket system with one mbuf pool
> per CPU socket, the NICs attached to each CPU socket
> use an RX mbuf pool with RAM on the same CPU socket.
> I would imagine that (re-)initializing these mbufs could be faster
> if performed only on a CPU on the same socket.
> If this is the case, mbufs should be re-initialized
> as part of the RX preparation at ingress,
> not as part of the mbuf free at egress.
> 
> Perhaps some microarchitectures are faster to compare
> nb_segs==0 than nb_segs==1.
> If so, nb_segs could be redefined to mean number of
> additional segments, rather than number of segments.
  
Morten Brørup Nov. 1, 2020, 8:59 p.m. UTC | #11
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Sunday, November 1, 2020 5:38 PM
> 
> 01/11/2020 10:12, Morten Brørup:
> > One thing has always puzzled me:
> > Why do we use 64 bits to indicate which memory pool
> > an mbuf belongs to?
> > The portid only uses 16 bits and an indirection index.
> > Why don't we use the same kind of indirection index for mbuf pools?
> 
> I wonder what would be the cost of indirection. Probably neglectible.

Probably. The portid does it, and that indirection is heavily used everywhere.

The size of mbuf memory pool indirection array should be compile time configurable, like the size of the portid indirection array.

And for reference, the indirection array will fit into one cache line if we default to 8 mbuf pools, thus supporting an 8 CPU socket system with one mbuf pool per CPU socket, or a 4 CPU socket system with two mbuf pools per CPU socket.

(And as a side note: Our application is optimized for single-socket systems, and we only use one mbuf pool. I guess many applications were developed without carefully optimizing for multi-socket systems, and also just use one mbuf pool. In these cases, the mbuf structure doesn't really need a pool field. But it is still there, and the DPDK libraries use it, so we didn't bother removing it.)

> I think it is a good proposal...
> ... for next year, after a deprecation notice.
> 
> > I can easily imagine using one mbuf pool (or perhaps a few pools)
> > per CPU socket (or per physical memory bus closest to an attached NIC),
> > but not more than 256 mbuf memory pools in total.
> > So, let's introduce an mbufpoolid like the portid,
> > and cut this mbuf field down from 64 to 8 bits.
> >
> > If we also cut down m->pkt_len from 32 to 24 bits,
> 
> Who is using packets larger than 64k? Are 16 bits enough?

I personally consider 64k a reasonable packet size limit. Exotic applications with even larger packets would have to live with this constraint. But let's see if there are any objections. For reference, 64k corresponds to ca. 44 Ethernet (1500 byte) packets.

(The limit could be 65535 bytes, to avoid translation of the value 0 into 65536 bytes.)

This modification would go nicely hand in hand with the mbuf pool indirection modification.

... after yet another round of ABI stability discussions, deprecation notices, and so on. :-)

> 
> > we can get the 8 bit mbuf pool index into the first cache line
> > at no additional cost.
> 
> I like the idea.
> It means we don't need to move the pool pointer now,
> i.e. it does not have to replace the timestamp field.

Agreed! Don't move m->pool to the first cache line; it is not used for RX.

> 
> > In other words: This would free up another 64 bit field in the mbuf
> structure!
> 
> That would be great!
> 
> 
> > And even though the m->next pointer for scattered packets resides
> > in the second cache line, the libraries and application knows
> > that m->next is NULL when m->nb_segs is 1.
> > This proves that my suggestion would make touching
> > the second cache line unnecessary (in simple cases),
> > even for re-initializing the mbuf.
> 
> So you think the "next" pointer should stay in the second half of mbuf?
> 
> I feel you would like to move the Tx offloads in the first half
> to improve performance of very simple apps.

"Very simple apps" sounds like a minority of apps. I would rather say "very simple packet handling scenarios", e.g. forwarding of normal size non-segmented packets. I would guess that the vast majority of packets handled by DPDK applications actually match this scenario. So I'm proposing to optimize for what I think is the most common scenario.

If segmented packets are common, then m->next could be moved to the first cache line. But it will only improve the pure RX steps of the pipeline. When preparing the packet for TX, m->tx_offloads will need to be set, and the second cache line comes into play. So I'm wondering how big the benefit of having m->next in the first cache line really is - assuming that m->nb_segs will be checked before accessing m->next.

> I am thinking the opposite: we could have some dynamic fields space
> in the first half to improve performance of complex Rx.
> Note: we can add a flag hint for field registration in this first half.
> 

I have had the same thoughts. However, I would prefer being able to forward ordinary packets without using the second mbuf cache line at all (although only in specific scenarios like my example above).

Furthermore, the application can abuse the 64 bit m->tx_offload field for private purposes until it is time to prepare the packet for TX and pass it on to the driver. This hack somewhat resembles a dynamic field in the first cache line, and will not be possible if the m->pool or m->next field is moved there.
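
A hedged sketch of that idea; the helper names are invented for the example:

    #include <stdint.h>
    #include <rte_mbuf.h>

    /* Stash an application-private value in m->tx_offload; it is only
     * valid until the real Tx offloads are written before transmission. */
    static inline void
    stash_app_cookie(struct rte_mbuf *m, uint64_t app_cookie)
    {
        m->tx_offload = app_cookie;
    }

    static inline void
    prepare_tx_offloads(struct rte_mbuf *m, uint16_t l2_len, uint16_t l3_len)
    {
        m->tx_offload = 0; /* the private value is discarded here */
        m->l2_len = l2_len;
        m->l3_len = l3_len;
    }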
  
Thomas Monjalon Nov. 2, 2020, 3:58 p.m. UTC | #12
+Cc techboard

We need benchmark numbers in order to take a decision.
Please all, prepare some arguments and numbers so we can discuss
the mbuf layout in the next techboard meeting.


01/11/2020 21:59, Morten Brørup:
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Sunday, November 1, 2020 5:38 PM
> > 
> > 01/11/2020 10:12, Morten Brørup:
> > > One thing has always puzzled me:
> > > Why do we use 64 bits to indicate which memory pool
> > > an mbuf belongs to?
> > > The portid only uses 16 bits and an indirection index.
> > > Why don't we use the same kind of indirection index for mbuf pools?
> > 
> > I wonder what would be the cost of indirection. Probably neglectible.
> 
> Probably. The portid does it, and that indirection is heavily used everywhere.
> 
> The size of mbuf memory pool indirection array should be compile time configurable, like the size of the portid indirection array.
> 
> And for reference, the indirection array will fit into one cache line if we default to 8 mbuf pools, thus supporting an 8 CPU socket system with one mbuf pool per CPU socket, or a 4 CPU socket system with two mbuf pools per CPU socket.
> 
> (And as a side note: Our application is optimized for single-socket systems, and we only use one mbuf pool. I guess many applications were developed without carefully optimizing for multi-socket systems, and also just use one mbuf pool. In these cases, the mbuf structure doesn't really need a pool field. But it is still there, and the DPDK libraries use it, so we didn't bother removing it.)
> 
> > I think it is a good proposal...
> > ... for next year, after a deprecation notice.
> > 
> > > I can easily imagine using one mbuf pool (or perhaps a few pools)
> > > per CPU socket (or per physical memory bus closest to an attached NIC),
> > > but not more than 256 mbuf memory pools in total.
> > > So, let's introduce an mbufpoolid like the portid,
> > > and cut this mbuf field down from 64 to 8 bits.

We will need to measure the perf of the solution.
There is a chance that the cost will be too high.


> > > If we also cut down m->pkt_len from 32 to 24 bits,
> > 
> > Who is using packets larger than 64k? Are 16 bits enough?
> 
> I personally consider 64k a reasonable packet size limit. Exotic applications with even larger packets would have to live with this constraint. But let's see if there are any objections. For reference, 64k corresponds to ca. 44 Ethernet (1500 byte) packets.
> 
> (The limit could be 65535 bytes, to avoid translation of the value 0 into 65536 bytes.)
> 
> This modification would go nicely hand in hand with the mbuf pool indirection modification.
> 
> ... after yet another round of ABI stability discussions, depreciation notices, and so on. :-)

After more thoughts, I'm afraid 64k is too small in some cases.
And 24-bit manipulation would probably break performance.
I'm afraid we are stuck with 32-bit length.


> > > we can get the 8 bit mbuf pool index into the first cache line
> > > at no additional cost.
> > 
> > I like the idea.
> > It means we don't need to move the pool pointer now,
> > i.e. it does not have to replace the timestamp field.
> 
> Agreed! Don't move m->pool to the first cache line; it is not used for RX.
> 
> > 
> > > In other words: This would free up another 64 bit field in the mbuf
> > structure!
> > 
> > That would be great!
> > 
> > 
> > > And even though the m->next pointer for scattered packets resides
> > > in the second cache line, the libraries and application knows
> > > that m->next is NULL when m->nb_segs is 1.
> > > This proves that my suggestion would make touching
> > > the second cache line unnecessary (in simple cases),
> > > even for re-initializing the mbuf.
> > 
> > So you think the "next" pointer should stay in the second half of mbuf?
> > 
> > I feel you would like to move the Tx offloads in the first half
> > to improve performance of very simple apps.
> 
> "Very simple apps" sounds like a minority of apps. I would rather say "very simple packet handling scenarios", e.g. forwarding of normal size non-segmented packets. I would guess that the vast majority of packets handled by DPDK applications actually match this scenario. So I'm proposing to optimize for what I think is the most common scenario.
> 
> If segmented packets are common, then m->next could be moved to the first cache line. But it will only improve the pure RX steps of the pipeline. When preparing the packet for TX, m->tx_offloads will need to be set, and the second cache line comes into play. So I'm wondering how big the benefit of having m->next in the first cache line really is - assuming that m->nb_segs will be checked before accessing m->next.
> 
> > I am thinking the opposite: we could have some dynamic fields space
> > in the first half to improve performance of complex Rx.
> > Note: we can add a flag hint for field registration in this first half.
> > 
> 
> I have had the same thoughts. However, I would prefer being able to forward ordinary packets without using the second mbuf cache line at all (although only in specific scenarios like my example above).
> 
> Furthermore, the application can abuse the 64 bit m->tx_offload field for private purposes until it is time to prepare the packet for TX and pass it on to the driver. This hack somewhat resembles a dynamic field in the first cache line, and will not be possible if the m->pool or m->next field is moved there.
  
Morten Brørup Nov. 3, 2020, 12:10 p.m. UTC | #13
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Monday, November 2, 2020 4:58 PM
> 
> +Cc techboard
> 
> We need benchmark numbers in order to take a decision.
> Please all, prepare some arguments and numbers so we can discuss
> the mbuf layout in the next techboard meeting.

I propose that the techboard considers this from two angles:

1. Long term goals and their relative priority. I.e. what can be
achieved with wide-ranging modifications, requiring yet another ABI
break and due notices.

2. Short term goals, i.e. what can be achieved for this release.


My suggestions follow...

1. Regarding long term goals:

I have argued that simple forwarding of non-segmented packets using
only the first mbuf cache line can be achieved by making three
modifications:

a) Move m->tx_offload to the first cache line.
b) Use an 8 bit pktmbuf mempool index in the first cache line,
   instead of the 64 bit m->pool pointer in the second cache line
   (a rough sketch follows after this list).
c) Do not access m->next when we know that it is NULL.
   We can use m->nb_segs == 1 or some other invariant as the gate.
   It can be implemented by adding an m->next accessor function:
   struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
   {
       return m->nb_segs == 1 ? NULL : m->next;
   }
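
A minimal sketch of the pool-index indirection in (b), with invented names
(pool_idx is not an existing mbuf field, and the table is not an existing
DPDK API):

    /* Sketch only: a global table maps an 8 bit pool index to the pool,
     * analogous to the port id indirection. "pool_idx" would be a new
     * 8 bit field in the first cache line of the mbuf. */
    #define RTE_MBUF_MAX_POOLS 256

    extern struct rte_mempool *rte_mbuf_pools[RTE_MBUF_MAX_POOLS];

    static inline struct rte_mempool *
    rte_mbuf_pool(const struct rte_mbuf *m)
    {
        return rte_mbuf_pools[m->pool_idx];
    }

    /* Freeing would then look like: rte_mempool_put(rte_mbuf_pool(m), m); */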

Regarding the priority of this goal, I guess that simple forwarding
of non-segmented packets is probably the path taken by the majority
of packets handled by DPDK.


An alternative goal could be:
Do not touch the second cache line during RX.
A comment in the mbuf structure says so, but it is not true anymore.

(I guess that regression testing didn't catch this because the tests
perform TX immediately after RX, so the cache miss just moves from
the TX to the RX part of the test application.)


2. Regarding short term goals:

The current DPDK source code looks to me like m->next is the most
frequently accessed field in the second cache line, so it makes sense
moving this to the first cache line, rather than m->pool.
Benchmarking may help here.

If we - without breaking the ABI - can introduce a gate to avoid
accessing m->next when we know that it is NULL, we should keep it in
the second cache line.

In this case, I would prefer to move m->tx_offload to the first cache
line, thereby providing a field available for application use, until
the application prepares the packet for transmission.
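
As a purely hypothetical illustration of such application use (the field is
only "free" until the TX offload fields must be filled in; app_ctx is an
invented name):

    /* Stash an application pointer in the otherwise unused m->tx_offload
     * field while the packet travels through the pipeline... */
    m->tx_offload = (uint64_t)(uintptr_t)app_ctx;
    /* ...and retrieve and clear it just before preparing the packet for TX. */
    struct app_ctx *ctx = (struct app_ctx *)(uintptr_t)m->tx_offload;
    m->tx_offload = 0;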


> 
> 
> 01/11/2020 21:59, Morten Brørup:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Sent: Sunday, November 1, 2020 5:38 PM
> > >
> > > 01/11/2020 10:12, Morten Brørup:
> > > > One thing has always puzzled me:
> > > > Why do we use 64 bits to indicate which memory pool
> > > > an mbuf belongs to?
> > > > The portid only uses 16 bits and an indirection index.
> > > > Why don't we use the same kind of indirection index for mbuf
> pools?
> > >
> > > I wonder what would be the cost of indirection. Probably
> neglectible.
> >
> > Probably. The portid does it, and that indirection is heavily used
> everywhere.
> >
> > The size of mbuf memory pool indirection array should be compile time
> configurable, like the size of the portid indirection array.
> >
> > And for reference, the indirection array will fit into one cache line
> if we default to 8 mbuf pools, thus supporting an 8 CPU socket system
> with one mbuf pool per CPU socket, or a 4 CPU socket system with two
> mbuf pools per CPU socket.
> >
> > (And as a side note: Our application is optimized for single-socket
> systems, and we only use one mbuf pool. I guess many applications were
> developed without carefully optimizing for multi-socket systems, and
> also just use one mbuf pool. In these cases, the mbuf structure doesn't
> really need a pool field. But it is still there, and the DPDK libraries
> use it, so we didn't bother removing it.)
> >
> > > I think it is a good proposal...
> > > ... for next year, after a deprecation notice.
> > >
> > > > I can easily imagine using one mbuf pool (or perhaps a few pools)
> > > > per CPU socket (or per physical memory bus closest to an attached
> NIC),
> > > > but not more than 256 mbuf memory pools in total.
> > > > So, let's introduce an mbufpoolid like the portid,
> > > > and cut this mbuf field down from 64 to 8 bits.
> 
> We will need to measure the perf of the solution.
> There is a chance for the cost to be too much high.
> 
> 
> > > > If we also cut down m->pkt_len from 32 to 24 bits,
> > >
> > > Who is using packets larger than 64k? Are 16 bits enough?
> >
> > I personally consider 64k a reasonable packet size limit. Exotic
> applications with even larger packets would have to live with this
> constraint. But let's see if there are any objections. For reference,
> 64k corresponds to ca. 44 Ethernet (1500 byte) packets.
> >
> > (The limit could be 65535 bytes, to avoid translation of the value 0
> into 65536 bytes.)
> >
> > This modification would go nicely hand in hand with the mbuf pool
> indirection modification.
> >
> > ... after yet another round of ABI stability discussions,
> depreciation notices, and so on. :-)
> 
> After more thoughts, I'm afraid 64k is too small in some cases.
> And 24-bit manipulation would probably break performance.
> I'm afraid we are stuck with 32-bit length.

Yes, 24 bit manipulation would probably break performance.

Perhaps a solution exists with 16 bits (least significant bits) for
the common cases, and 8 bits more (most significant bits) for the less
common cases. Just thinking out loud here...

> 
> > > > we can get the 8 bit mbuf pool index into the first cache line
> > > > at no additional cost.
> > >
> > > I like the idea.
> > > It means we don't need to move the pool pointer now,
> > > i.e. it does not have to replace the timestamp field.
> >
> > Agreed! Don't move m->pool to the first cache line; it is not used
> for RX.
> >
> > >
> > > > In other words: This would free up another 64 bit field in the
> mbuf
> > > structure!
> > >
> > > That would be great!
> > >
> > >
> > > > And even though the m->next pointer for scattered packets resides
> > > > in the second cache line, the libraries and application knows
> > > > that m->next is NULL when m->nb_segs is 1.
> > > > This proves that my suggestion would make touching
> > > > the second cache line unnecessary (in simple cases),
> > > > even for re-initializing the mbuf.
> > >
> > > So you think the "next" pointer should stay in the second half of
> mbuf?
> > >
> > > I feel you would like to move the Tx offloads in the first half
> > > to improve performance of very simple apps.
> >
> > "Very simple apps" sounds like a minority of apps. I would rather say
> "very simple packet handling scenarios", e.g. forwarding of normal size
> non-segmented packets. I would guess that the vast majority of packets
> handled by DPDK applications actually match this scenario. So I'm
> proposing to optimize for what I think is the most common scenario.
> >
> > If segmented packets are common, then m->next could be moved to the
> first cache line. But it will only improve the pure RX steps of the
> pipeline. When preparing the packet for TX, m->tx_offloads will need to
> be set, and the second cache line comes into play. So I'm wondering how
> big the benefit of having m->next in the first cache line really is -
> assuming that m->nb_segs will be checked before accessing m->next.
> >
> > > I am thinking the opposite: we could have some dynamic fields space
> > > in the first half to improve performance of complex Rx.
> > > Note: we can add a flag hint for field registration in this first
> half.
> > >
> >
> > I have had the same thoughts. However, I would prefer being able to
> forward ordinary packets without using the second mbuf cache line at
> all (although only in specific scenarios like my example above).
> >
> > Furthermore, the application can abuse the 64 bit m->tx_offload field
> for private purposes until it is time to prepare the packet for TX and
> pass it on to the driver. This hack somewhat resembles a dynamic field
> in the first cache line, and will not be possible if the m->pool or m-
> >next field is moved there.
> 
> 
>
  
Bruce Richardson Nov. 3, 2020, 12:25 p.m. UTC | #14
On Tue, Nov 03, 2020 at 01:10:05PM +0100, Morten Brørup wrote:
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Monday, November 2, 2020 4:58 PM
> > 
> > +Cc techboard
> > 
> > We need benchmark numbers in order to take a decision.
> > Please all, prepare some arguments and numbers so we can discuss
> > the mbuf layout in the next techboard meeting.
> 
> I propose that the techboard considers this from two angels:
> 
> 1. Long term goals and their relative priority. I.e. what can be
> achieved with wide-ranging modifications, requiring yet another ABI
> break and due notices.
> 
> 2. Short term goals, i.e. what can be achieved for this release.
> 
> 
> My suggestions follow...
> 
> 1. Regarding long term goals:
> 
> I have argued that simple forwarding of non-segmented packets using
> only the first mbuf cache line can be achieved by making three
> modifications:
> 
> a) Move m->tx_offload to the first cache line.
> b) Use an 8 bit pktmbuf mempool index in the first cache line,
>    instead of the 64 bit m->pool pointer in the second cache line.
> c) Do not access m->next when we know that it is NULL.
>    We can use m->nb_segs == 1 or some other invariant as the gate.
>    It can be implemented by adding an m->next accessor function:
>    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
>    {
>        return m->nb_segs == 1 ? NULL : m->next;
>    }
> 
> Regarding the priority of this goal, I guess that simple forwarding
> of non-segmented packets is probably the path taken by the majority
> of packets handled by DPDK.
> 
> 
> An alternative goal could be:
> Do not touch the second cache line during RX.
> A comment in the mbuf structure says so, but it is not true anymore.
>

The comment should be true for non-scattered RX, I believe. I'm not aware
of any use of second cacheline for the fast-path RXs for many drivers. Am I
missing something that has changed recently here?

/Bruce
  
Morten Brørup Nov. 3, 2020, 1:46 p.m. UTC | #15
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Tuesday, November 3, 2020 1:26 PM
> 
> On Tue, Nov 03, 2020 at 01:10:05PM +0100, Morten Brørup wrote:
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Sent: Monday, November 2, 2020 4:58 PM
> > >
> > > +Cc techboard
> > >
> > > We need benchmark numbers in order to take a decision.
> > > Please all, prepare some arguments and numbers so we can discuss
> > > the mbuf layout in the next techboard meeting.
> >
> > I propose that the techboard considers this from two angels:
> >
> > 1. Long term goals and their relative priority. I.e. what can be
> > achieved with wide-ranging modifications, requiring yet another ABI
> > break and due notices.
> >
> > 2. Short term goals, i.e. what can be achieved for this release.
> >
> >
> > My suggestions follow...
> >
> > 1. Regarding long term goals:
> >
> > I have argued that simple forwarding of non-segmented packets using
> > only the first mbuf cache line can be achieved by making three
> > modifications:
> >
> > a) Move m->tx_offload to the first cache line.
> > b) Use an 8 bit pktmbuf mempool index in the first cache line,
> >    instead of the 64 bit m->pool pointer in the second cache line.
> > c) Do not access m->next when we know that it is NULL.
> >    We can use m->nb_segs == 1 or some other invariant as the gate.
> >    It can be implemented by adding an m->next accessor function:
> >    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
> >    {
> >        return m->nb_segs == 1 ? NULL : m->next;
> >    }
> >
> > Regarding the priority of this goal, I guess that simple forwarding
> > of non-segmented packets is probably the path taken by the majority
> > of packets handled by DPDK.
> >
> >
> > An alternative goal could be:
> > Do not touch the second cache line during RX.
> > A comment in the mbuf structure says so, but it is not true anymore.
> >
> 
> The comment should be true for non-scattered RX, I believe.

You are correct.

My suggestion was unclear: Extend this remark to include segmented packets.

This could be a priority if the techboard considers RX segmented packets more important than my suggestion for single cache line forwarding of non-segmented packets.


> I'm not aware of any use of second cacheline for the fast-path RXs for many drivers.
> Am I missing something that has changed recently here?

Check out eth_igb_recv_pkts() in the E1000 driver: rxm->next = NULL;
Or pmd_rx_burst() in the TAP driver: new_tail->next = seg->next;

Perhaps the documentation should describe best practices for implementing RX and TX functions in drivers, including allocating/freeing mbufs. Or an example dummy Ethernet driver could do it.
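
Something along these lines could go into such a best-practice note
(illustrative only; the mbuf field names are real, everything else is
invented):

    /* Simple, non-scattered RX: write only first-cache-line fields and
     * leave rxm->next alone, relying on the invariant that a freshly
     * allocated mbuf already has next == NULL and nb_segs == 1. */
    rxm->data_off = RTE_PKTMBUF_HEADROOM;
    rxm->nb_segs = 1;
    rxm->port = rxq->port_id;
    rxm->pkt_len = pkt_len;
    rxm->data_len = pkt_len;
    rxm->ol_flags = 0;
    /* no "rxm->next = NULL" here: the second cache line stays cold */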
  
Bruce Richardson Nov. 3, 2020, 1:50 p.m. UTC | #16
On Tue, Nov 03, 2020 at 02:46:17PM +0100, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Tuesday, November 3, 2020 1:26 PM
> > 
> > On Tue, Nov 03, 2020 at 01:10:05PM +0100, Morten Brørup wrote:
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > Sent: Monday, November 2, 2020 4:58 PM
> > > >
> > > > +Cc techboard
> > > >
> > > > We need benchmark numbers in order to take a decision.
> > > > Please all, prepare some arguments and numbers so we can discuss
> > > > the mbuf layout in the next techboard meeting.
> > >
> > > I propose that the techboard considers this from two angels:
> > >
> > > 1. Long term goals and their relative priority. I.e. what can be
> > > achieved with wide-ranging modifications, requiring yet another ABI
> > > break and due notices.
> > >
> > > 2. Short term goals, i.e. what can be achieved for this release.
> > >
> > >
> > > My suggestions follow...
> > >
> > > 1. Regarding long term goals:
> > >
> > > I have argued that simple forwarding of non-segmented packets using
> > > only the first mbuf cache line can be achieved by making three
> > > modifications:
> > >
> > > a) Move m->tx_offload to the first cache line.
> > > b) Use an 8 bit pktmbuf mempool index in the first cache line,
> > >    instead of the 64 bit m->pool pointer in the second cache line.
> > > c) Do not access m->next when we know that it is NULL.
> > >    We can use m->nb_segs == 1 or some other invariant as the gate.
> > >    It can be implemented by adding an m->next accessor function:
> > >    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
> > >    {
> > >        return m->nb_segs == 1 ? NULL : m->next;
> > >    }
> > >
> > > Regarding the priority of this goal, I guess that simple forwarding
> > > of non-segmented packets is probably the path taken by the majority
> > > of packets handled by DPDK.
> > >
> > >
> > > An alternative goal could be:
> > > Do not touch the second cache line during RX.
> > > A comment in the mbuf structure says so, but it is not true anymore.
> > >
> > 
> > The comment should be true for non-scattered RX, I believe.
> 
> You are correct.
> 
> My suggestion was unclear: Extend this remark to include segmented packets.
> 
> This could be a priority if the techboard considers RX segmented packets more important than my suggestion for single cache line forwarding of non-segmented packets.
> 
> 
> > I'm not aware of any use of second cacheline for the fast-path RXs for many drivers.
> > Am I missing something that has changed recently here?
> 
> Check out eth_igb_recv_pkts() in the E1000 driver: rxm->next = NULL;
> Or pmd_rx_burst() in the TAP driver: new_tail->next = seg->next;
> 
> Perhaps the documentation should describe best practices for implementing RX and TX functions in drivers, including allocating/freeing mbufs. Or an example dummy Ethernet driver could do it.
> 

Yes, perhaps I should be clearer about the "fast-path", because I was
thinking of the optimized RX/TX paths for those nics at 10G and above.
Probably the documentation should indeed have an update clarifying things a
bit, since using only the first cacheline is possible but not mandatory for
simple RX.

/Bruce
  
Slava Ovsiienko Nov. 3, 2020, 2:02 p.m. UTC | #17
Hi, Morten

> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: Tuesday, November 3, 2020 14:10
> To: NBU-Contact-Thomas Monjalon <thomas@monjalon.net>; dev@dpdk.org;
> techboard@dpdk.org
> Cc: Ajit Khaparde <ajit.khaparde@broadcom.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Andrew Rybchenko
> <andrew.rybchenko@oktetlabs.ru>; dev@dpdk.org; Yigit, Ferruh
> <ferruh.yigit@intel.com>; david.marchand@redhat.com; Richardson, Bruce
> <bruce.richardson@intel.com>; olivier.matz@6wind.com; jerinj@marvell.com;
> Slava Ovsiienko <viacheslavo@nvidia.com>; honnappa.nagarahalli@arm.com;
> maxime.coquelin@redhat.com; stephen@networkplumber.org;
> hemant.agrawal@nxp.com; Slava Ovsiienko <viacheslavo@nvidia.com>; Matan
> Azrad <matan@nvidia.com>; Shahaf Shuler <shahafs@nvidia.com>
> Subject: RE: [dpdk-dev] [PATCH 15/15] mbuf: move pool pointer in hotterfirst
> half
> 
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Monday, November 2, 2020 4:58 PM
> >
> > +Cc techboard
> >
> > We need benchmark numbers in order to take a decision.
> > Please all, prepare some arguments and numbers so we can discuss the
> > mbuf layout in the next techboard meeting.
> 
> I propose that the techboard considers this from two angels:
> 
> 1. Long term goals and their relative priority. I.e. what can be achieved with
> wide-ranging modifications, requiring yet another ABI break and due notices.
> 
> 2. Short term goals, i.e. what can be achieved for this release.
> 
> 
> My suggestions follow...
> 
> 1. Regarding long term goals:
> 
> I have argued that simple forwarding of non-segmented packets using only the
> first mbuf cache line can be achieved by making three
> modifications:
> 
> a) Move m->tx_offload to the first cache line.
Not all PMDs use this field on Tx. HW might support the checksum offloads
directly, not requiring these fields at all. 


> b) Use an 8 bit pktmbuf mempool index in the first cache line,
>    instead of the 64 bit m->pool pointer in the second cache line.
256 mempools look like enough to me. Regarding the indirect access to the pool
(via some table) - it might introduce some performance impact. For example,
the mlx5 PMD strongly relies on the pool field for allocating mbufs in the Rx datapath.
We're going to update this (o-o, we found a point to optimize), but for now it does.

> c) Do not access m->next when we know that it is NULL.
>    We can use m->nb_segs == 1 or some other invariant as the gate.
>    It can be implemented by adding an m->next accessor function:
>    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
>    {
>        return m->nb_segs == 1 ? NULL : m->next;
>    }

Sorry, not sure about this. IIRC, nb_segs is valid in the first segment/mbuf only.
If we have 4 segments in the pkt we see nb_segs=4 in the first one, and nb_segs=1
in the others. The next field is NULL in the last mbuf only. Am I wrong and missing something?

> Regarding the priority of this goal, I guess that simple forwarding of non-
> segmented packets is probably the path taken by the majority of packets
> handled by DPDK.
> 
> An alternative goal could be:
> Do not touch the second cache line during RX.
> A comment in the mbuf structure says so, but it is not true anymore.
> 
> (I guess that regression testing didn't catch this because the tests perform TX
> immediately after RX, so the cache miss just moves from the TX to the RX part
> of the test application.)
> 
> 
> 2. Regarding short term goals:
> 
> The current DPDK source code looks to me like m->next is the most frequently
> accessed field in the second cache line, so it makes sense moving this to the
> first cache line, rather than m->pool.
> Benchmarking may help here.

Moreover, for segmented packets the packet size is supposed to be large,
which implies a relatively low packet rate, so the benefit of moving next to
the 1st cache line is probably negligible. Just compare 148Mpps of
64B pkts and 4Mpps of 3000B pkts over a 100Gbps link. We are currently benchmarking
and have not yet succeeded in finding a difference. The benefit can't be expressed
as an mpps delta; we should measure CPU clocks, but the Rx queue is almost always
empty - we have empty loops. So, if there is a boost, it is extremely hard to catch.
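
For reference, the arithmetic behind those rates: a 64B frame occupies
64 + 20 bytes (preamble, SFD and inter-frame gap) = 672 bits on the wire,
so 100Gbps / 672 bits ~= 148.8Mpps; a 3000B frame occupies 3020 bytes =
24160 bits, so 100Gbps / 24160 bits ~= 4.1Mpps.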

With best regards, Slava

>
> 
> If we - without breaking the ABI - can introduce a gate to avoid accessing m-
> >next when we know that it is NULL, we should keep it in the second cache
> line.
> 
> In this case, I would prefer to move m->tx_offload to the first cache line,
> thereby providing a field available for application use, until the application
> prepares the packet for transmission.
> 
> 
> >
> >
> > 01/11/2020 21:59, Morten Brørup:
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > Sent: Sunday, November 1, 2020 5:38 PM
> > > >
> > > > 01/11/2020 10:12, Morten Brørup:
> > > > > One thing has always puzzled me:
> > > > > Why do we use 64 bits to indicate which memory pool an mbuf
> > > > > belongs to?
> > > > > The portid only uses 16 bits and an indirection index.
> > > > > Why don't we use the same kind of indirection index for mbuf
> > pools?
> > > >
> > > > I wonder what would be the cost of indirection. Probably
> > neglectible.
> > >
> > > Probably. The portid does it, and that indirection is heavily used
> > everywhere.
> > >
> > > The size of mbuf memory pool indirection array should be compile
> > > time
> > configurable, like the size of the portid indirection array.
> > >
> > > And for reference, the indirection array will fit into one cache
> > > line
> > if we default to 8 mbuf pools, thus supporting an 8 CPU socket system
> > with one mbuf pool per CPU socket, or a 4 CPU socket system with two
> > mbuf pools per CPU socket.
> > >
> > > (And as a side note: Our application is optimized for single-socket
> > systems, and we only use one mbuf pool. I guess many applications were
> > developed without carefully optimizing for multi-socket systems, and
> > also just use one mbuf pool. In these cases, the mbuf structure
> > doesn't really need a pool field. But it is still there, and the DPDK
> > libraries use it, so we didn't bother removing it.)
> > >
> > > > I think it is a good proposal...
> > > > ... for next year, after a deprecation notice.
> > > >
> > > > > I can easily imagine using one mbuf pool (or perhaps a few
> > > > > pools) per CPU socket (or per physical memory bus closest to an
> > > > > attached
> > NIC),
> > > > > but not more than 256 mbuf memory pools in total.
> > > > > So, let's introduce an mbufpoolid like the portid, and cut this
> > > > > mbuf field down from 64 to 8 bits.
> >
> > We will need to measure the perf of the solution.
> > There is a chance for the cost to be too much high.
> >
> >
> > > > > If we also cut down m->pkt_len from 32 to 24 bits,
> > > >
> > > > Who is using packets larger than 64k? Are 16 bits enough?
> > >
> > > I personally consider 64k a reasonable packet size limit. Exotic
> > applications with even larger packets would have to live with this
> > constraint. But let's see if there are any objections. For reference,
> > 64k corresponds to ca. 44 Ethernet (1500 byte) packets.
> > >
> > > (The limit could be 65535 bytes, to avoid translation of the value 0
> > into 65536 bytes.)
> > >
> > > This modification would go nicely hand in hand with the mbuf pool
> > indirection modification.
> > >
> > > ... after yet another round of ABI stability discussions,
> > depreciation notices, and so on. :-)
> >
> > After more thoughts, I'm afraid 64k is too small in some cases.
> > And 24-bit manipulation would probably break performance.
> > I'm afraid we are stuck with 32-bit length.
> 
> Yes, 24 bit manipulation would probably break performance.
> 
> Perhaps a solution exists with 16 bits (least significant bits) for the common
> cases, and 8 bits more (most significant bits) for the less common cases. Just
> thinking out loud here...
> 
> >
> > > > > we can get the 8 bit mbuf pool index into the first cache line
> > > > > at no additional cost.
> > > >
> > > > I like the idea.
> > > > It means we don't need to move the pool pointer now, i.e. it does
> > > > not have to replace the timestamp field.
> > >
> > > Agreed! Don't move m->pool to the first cache line; it is not used
> > for RX.
> > >
> > > >
> > > > > In other words: This would free up another 64 bit field in the
> > mbuf
> > > > structure!
> > > >
> > > > That would be great!
> > > >
> > > >
> > > > > And even though the m->next pointer for scattered packets
> > > > > resides in the second cache line, the libraries and application
> > > > > knows that m->next is NULL when m->nb_segs is 1.
> > > > > This proves that my suggestion would make touching the second
> > > > > cache line unnecessary (in simple cases), even for
> > > > > re-initializing the mbuf.
> > > >
> > > > So you think the "next" pointer should stay in the second half of
> > mbuf?
> > > >
> > > > I feel you would like to move the Tx offloads in the first half to
> > > > improve performance of very simple apps.
> > >
> > > "Very simple apps" sounds like a minority of apps. I would rather
> > > say
> > "very simple packet handling scenarios", e.g. forwarding of normal
> > size non-segmented packets. I would guess that the vast majority of
> > packets handled by DPDK applications actually match this scenario. So
> > I'm proposing to optimize for what I think is the most common scenario.
> > >
> > > If segmented packets are common, then m->next could be moved to the
> > first cache line. But it will only improve the pure RX steps of the
> > pipeline. When preparing the packet for TX, m->tx_offloads will need
> > to be set, and the second cache line comes into play. So I'm wondering
> > how big the benefit of having m->next in the first cache line really
> > is - assuming that m->nb_segs will be checked before accessing m->next.
> > >
> > > > I am thinking the opposite: we could have some dynamic fields
> > > > space in the first half to improve performance of complex Rx.
> > > > Note: we can add a flag hint for field registration in this first
> > half.
> > > >
> > >
> > > I have had the same thoughts. However, I would prefer being able to
> > forward ordinary packets without using the second mbuf cache line at
> > all (although only in specific scenarios like my example above).
> > >
> > > Furthermore, the application can abuse the 64 bit m->tx_offload
> > > field
> > for private purposes until it is time to prepare the packet for TX and
> > pass it on to the driver. This hack somewhat resembles a dynamic field
> > in the first cache line, and will not be possible if the m->pool or m-
> > >next field is moved there.
> >
> >
> >
  
Morten Brørup Nov. 3, 2020, 2:03 p.m. UTC | #18
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Tuesday, November 3, 2020 2:50 PM
> 
> On Tue, Nov 03, 2020 at 02:46:17PM +0100, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Tuesday, November 3, 2020 1:26 PM
> > >
> > > On Tue, Nov 03, 2020 at 01:10:05PM +0100, Morten Brørup wrote:
> > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > Sent: Monday, November 2, 2020 4:58 PM
> > > > >
> > > > > +Cc techboard
> > > > >
> > > > > We need benchmark numbers in order to take a decision.
> > > > > Please all, prepare some arguments and numbers so we can
> discuss
> > > > > the mbuf layout in the next techboard meeting.
> > > >
> > > > I propose that the techboard considers this from two angels:
> > > >
> > > > 1. Long term goals and their relative priority. I.e. what can be
> > > > achieved with wide-ranging modifications, requiring yet another
> ABI
> > > > break and due notices.
> > > >
> > > > 2. Short term goals, i.e. what can be achieved for this release.
> > > >
> > > >
> > > > My suggestions follow...
> > > >
> > > > 1. Regarding long term goals:
> > > >
> > > > I have argued that simple forwarding of non-segmented packets
> using
> > > > only the first mbuf cache line can be achieved by making three
> > > > modifications:
> > > >
> > > > a) Move m->tx_offload to the first cache line.
> > > > b) Use an 8 bit pktmbuf mempool index in the first cache line,
> > > >    instead of the 64 bit m->pool pointer in the second cache
> line.
> > > > c) Do not access m->next when we know that it is NULL.
> > > >    We can use m->nb_segs == 1 or some other invariant as the
> gate.
> > > >    It can be implemented by adding an m->next accessor function:
> > > >    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
> > > >    {
> > > >        return m->nb_segs == 1 ? NULL : m->next;
> > > >    }
> > > >
> > > > Regarding the priority of this goal, I guess that simple
> forwarding
> > > > of non-segmented packets is probably the path taken by the
> majority
> > > > of packets handled by DPDK.
> > > >
> > > >
> > > > An alternative goal could be:
> > > > Do not touch the second cache line during RX.
> > > > A comment in the mbuf structure says so, but it is not true
> anymore.
> > > >
> > >
> > > The comment should be true for non-scattered RX, I believe.
> >
> > You are correct.
> >
> > My suggestion was unclear: Extend this remark to include segmented
> packets.
> >
> > This could be a priority if the techboard considers RX segmented
> packets more important than my suggestion for single cache line
> forwarding of non-segmented packets.
> >
> >
> > > I'm not aware of any use of second cacheline for the fast-path RXs
> for many drivers.
> > > Am I missing something that has changed recently here?
> >
> > Check out eth_igb_recv_pkts() in the E1000 driver: rxm->next = NULL;
> > Or pmd_rx_burst() in the TAP driver: new_tail->next = seg->next;
> >
> > Perhaps the documentation should describe best practices for
> implementing RX and TX functions in drivers, including
> allocating/freeing mbufs. Or an example dummy Ethernet driver could do
> it.
> >
> 
> Yes, perhaps I should be clearer about the "fast-path", because I was
> thinking of the optimized RX/TX paths for those nics at 10G and above.
> Probably the documentation should indeed have an update clarifying
> things a
> bit, since using the first cacheline only possible but not mandatory
> for
> simple RX.

I sometimes look at the source code of the simple drivers for reference, as they are easier to understand than the advanced vector drivers.
I suppose new PMD developers also would. :-)

Anyway, it is probably a good idea to add a clarifying note to the documentation, thus reflecting reality.

Just make sure that it says that the second cache line is supposed to be untouched by RX of high performance drivers, so application developers still consider it cold.
  
Morten Brørup Nov. 3, 2020, 3:03 p.m. UTC | #19
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Slava Ovsiienko
> Sent: Tuesday, November 3, 2020 3:03 PM
> 
> Hi, Morten
> 
> > From: Morten Brørup <mb@smartsharesystems.com>
> > Sent: Tuesday, November 3, 2020 14:10
> >
> > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Sent: Monday, November 2, 2020 4:58 PM
> > >
> > > +Cc techboard
> > >
> > > We need benchmark numbers in order to take a decision.
> > > Please all, prepare some arguments and numbers so we can discuss
> the
> > > mbuf layout in the next techboard meeting.
> >
> > I propose that the techboard considers this from two angels:
> >
> > 1. Long term goals and their relative priority. I.e. what can be
> achieved with
> > wide-ranging modifications, requiring yet another ABI break and due
> notices.
> >
> > 2. Short term goals, i.e. what can be achieved for this release.
> >
> >
> > My suggestions follow...
> >
> > 1. Regarding long term goals:
> >
> > I have argued that simple forwarding of non-segmented packets using
> only the
> > first mbuf cache line can be achieved by making three
> > modifications:
> >
> > a) Move m->tx_offload to the first cache line.
> Not all PMDs use this field on Tx. HW might support the checksum
> offloads
> directly, not requiring these fields at all.
> 
> 
> > b) Use an 8 bit pktmbuf mempool index in the first cache line,
> >    instead of the 64 bit m->pool pointer in the second cache line.
> 256 mpool looks enough, as for me. Regarding the indirect access to the
> pool
> (via some table) - it might introduce some performance impact.

It might, but I hope that it is negligible, so the benefits outweigh the disadvantages.

It would have to be measured, though.

And m->pool is only used for free()'ing (and detach()'ing) mbufs.

> For example,
> mlx5 PMD strongly relies on pool field for allocating mbufs in Rx
> datapath.
> We're going to update (o-o, we found point to optimize), but for now it
> does.

Without looking at the source code, I don't think the PMD is using m->pool in the RX datapath; I think it is using a pool dedicated to a receive queue, used for RX descriptors in the PMD (i.e. driver->queue->pool).

> 
> > c) Do not access m->next when we know that it is NULL.
> >    We can use m->nb_segs == 1 or some other invariant as the gate.
> >    It can be implemented by adding an m->next accessor function:
> >    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
> >    {
> >        return m->nb_segs == 1 ? NULL : m->next;
> >    }
> 
> Sorry, not sure about this. IIRC, nb_segs is valid in the first
> segment/mbuf  only.
> If we have the 4 segments in the pkt we see nb_seg=4 in the first one,
> and the nb_seg=1
> in the others. The next field is NULL in the last mbuf only. Am I wrong
> and miss something ?

You are correct.

This would have to be updated too, either by increasing m->nb_segs in the following segments, or by splitting up the relevant functions into functions for working on first segments (incl. non-segmented packets) and functions for working on following segments of segmented packets.

> 
> > Regarding the priority of this goal, I guess that simple forwarding
> of non-
> > segmented packets is probably the path taken by the majority of
> packets
> > handled by DPDK.
> >
> > An alternative goal could be:
> > Do not touch the second cache line during RX.
> > A comment in the mbuf structure says so, but it is not true anymore.
> >
> > (I guess that regression testing didn't catch this because the tests
> perform TX
> > immediately after RX, so the cache miss just moves from the TX to the
> RX part
> > of the test application.)
> >
> >
> > 2. Regarding short term goals:
> >
> > The current DPDK source code looks to me like m->next is the most
> frequently
> > accessed field in the second cache line, so it makes sense moving
> this to the
> > first cache line, rather than m->pool.
> > Benchmarking may help here.
> 
> Moreover, for the segmented packets the packet size is supposed to be
> large,
> and it imposes the relatively low packet rate, so probably optimization
> of
> moving next to the 1st cache line might be negligible at all. Just
> compare 148Mpps of
> 64B pkts and 4Mpps of 3000B pkts over 100Gbps link. Currently we are on
> benchmarking
> and did not succeed yet on difference finding. The benefit can't be
> expressed in mpps delta,
> we should measure CPU clocks, but Rx queue is almost always empty - we
> have an empty
> loops. So, if we have the boost - it is extremely hard to catch one.

Very good point regarding the value of such an optimization, Slava!

And when free()'ing packets, both m->next and m->pool are touched.

So perhaps the free()/detach() functions in the mbuf library can be modified to handle first segments (and non-segmented packets) and following segments differently, so accessing m->next can be avoided for non-segmented packets. Then m->pool should be moved to the first cache line.
  
Olivier Matz Nov. 4, 2020, 3 p.m. UTC | #20
Hi,

On Tue, Nov 03, 2020 at 04:03:46PM +0100, Morten Brørup wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Slava Ovsiienko
> > Sent: Tuesday, November 3, 2020 3:03 PM
> > 
> > Hi, Morten
> > 
> > > From: Morten Brørup <mb@smartsharesystems.com>
> > > Sent: Tuesday, November 3, 2020 14:10
> > >
> > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > Sent: Monday, November 2, 2020 4:58 PM
> > > >
> > > > +Cc techboard
> > > >
> > > > We need benchmark numbers in order to take a decision.
> > > > Please all, prepare some arguments and numbers so we can discuss
> > the
> > > > mbuf layout in the next techboard meeting.

I did some quick tests, and it appears to me that just moving the pool
pointer to the first cache line does not have a significant impact.

However, I agree with Morten that there is some room for optimization
around m->pool: I did a hack in the ixgbe driver to assume there is only
one mbuf pool. This simplifies the freeing of mbufs in Tx a lot, because
we don't have to group them in bulks that share the same pool (see
ixgbe_tx_free_bufs()). The impact of this hack is quite good: +~5% on a
real-life forwarding use case.

It may be possible to store the pool in the sw ring to avoid a later
access to m->pool. Having a pool index as suggested by Morten would also
help to reduce the room used in the sw ring in this case. But this is a bit
off-topic :)



> > > I propose that the techboard considers this from two angels:
> > >
> > > 1. Long term goals and their relative priority. I.e. what can be
> > achieved with
> > > wide-ranging modifications, requiring yet another ABI break and due
> > notices.
> > >
> > > 2. Short term goals, i.e. what can be achieved for this release.
> > >
> > >
> > > My suggestions follow...
> > >
> > > 1. Regarding long term goals:
> > >
> > > I have argued that simple forwarding of non-segmented packets using
> > only the
> > > first mbuf cache line can be achieved by making three
> > > modifications:
> > >
> > > a) Move m->tx_offload to the first cache line.
> > Not all PMDs use this field on Tx. HW might support the checksum
> > offloads
> > directly, not requiring these fields at all.

To me, a driver should use m->tx_offload, because the application
specifies the offset where the checksum has to be done, in case the hw
is not able to recognize the protocol.

> > > b) Use an 8 bit pktmbuf mempool index in the first cache line,
> > >    instead of the 64 bit m->pool pointer in the second cache line.
> > 256 mpool looks enough, as for me. Regarding the indirect access to the
> > pool
> > (via some table) - it might introduce some performance impact.
> 
> It might, but I hope that it is negligible, so the benefits outweigh the disadvantages.
> 
> It would have to be measured, though.
> 
> And m->pool is only used for free()'ing (and detach()'ing) mbufs.
> 
> > For example,
> > mlx5 PMD strongly relies on pool field for allocating mbufs in Rx
> > datapath.
> > We're going to update (o-o, we found point to optimize), but for now it
> > does.
> 
> Without looking at the source code, I don't think the PMD is using m->pool in the RX datapath, I think it is using a pool dedicated to a receive queue used for RX descriptors in the PMD (i.e. driver->queue->pool).
> 
> > 
> > > c) Do not access m->next when we know that it is NULL.
> > >    We can use m->nb_segs == 1 or some other invariant as the gate.
> > >    It can be implemented by adding an m->next accessor function:
> > >    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
> > >    {
> > >        return m->nb_segs == 1 ? NULL : m->next;
> > >    }
> > 
> > Sorry, not sure about this. IIRC, nb_segs is valid in the first
> > segment/mbuf  only.
> > If we have the 4 segments in the pkt we see nb_seg=4 in the first one,
> > and the nb_seg=1
> > in the others. The next field is NULL in the last mbuf only. Am I wrong
> > and miss something ?
> 
> You are correct.
> 
> This would have to be updated too. Either by increasing m->nb_seg in the following segments, or by splitting up relevant functions into functions for working on first segments (incl. non-segmented packets), and functions for working on following segments of segmented packets.

Instead of maintaining a valid nb_segs, a HAS_NEXT flag would be easier
to implement. However it means that an accessor needs to be used instead
of any m->next access.
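
For illustration, such an accessor could look like the sketch below (the
flag name and bit are invented; a real implementation would need a free
ol_flags bit):

    #define PKT_HAS_NEXT (1ULL << 63) /* placeholder bit, for illustration */

    /* Read m->next (second cache line) only for truly segmented mbufs. */
    static inline struct rte_mbuf *
    rte_mbuf_next(const struct rte_mbuf *m)
    {
        return (m->ol_flags & PKT_HAS_NEXT) ? m->next : NULL;
    }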

> > > Regarding the priority of this goal, I guess that simple forwarding
> > of non-
> > > segmented packets is probably the path taken by the majority of
> > packets
> > > handled by DPDK.
> > >
> > > An alternative goal could be:
> > > Do not touch the second cache line during RX.
> > > A comment in the mbuf structure says so, but it is not true anymore.
> > >
> > > (I guess that regression testing didn't catch this because the tests
> > perform TX
> > > immediately after RX, so the cache miss just moves from the TX to the
> > RX part
> > > of the test application.)
> > >
> > >
> > > 2. Regarding short term goals:
> > >
> > > The current DPDK source code looks to me like m->next is the most
> > frequently
> > > accessed field in the second cache line, so it makes sense moving
> > this to the
> > > first cache line, rather than m->pool.
> > > Benchmarking may help here.
> > 
> > Moreover, for the segmented packets the packet size is supposed to be
> > large,
> > and it imposes the relatively low packet rate, so probably optimization
> > of
> > moving next to the 1st cache line might be negligible at all. Just
> > compare 148Mpps of
> > 64B pkts and 4Mpps of 3000B pkts over 100Gbps link. Currently we are on
> > benchmarking
> > and did not succeed yet on difference finding. The benefit can't be
> > expressed in mpps delta,
> > we should measure CPU clocks, but Rx queue is almost always empty - we
> > have an empty
> > loops. So, if we have the boost - it is extremely hard to catch one.
> 
> Very good point regarding the value of such an optimization, Slava!
> 
> And when free()'ing packets, both m->next and m->pool are touched.
> 
> So perhaps the free()/detach() functions in the mbuf library can be modified to handle first segments (and non-segmented packets) and following segments differently, so accessing m->next can be avoided for non-segmented packets. Then m->pool should be moved to the first cache line.
> 

I also think that moving m->pool without doing something else about
m->next is probably useless. And it's too late for 20.11 to do
additional changes, so I suggest postponing the field move to 21.11,
once we have a clearer view of possible optimizations.

Olivier
  
Ananyev, Konstantin Nov. 5, 2020, 12:25 a.m. UTC | #21
> 
> Hi,
> 
> On Tue, Nov 03, 2020 at 04:03:46PM +0100, Morten Brørup wrote:
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Slava Ovsiienko
> > > Sent: Tuesday, November 3, 2020 3:03 PM
> > >
> > > Hi, Morten
> > >
> > > > From: Morten Brørup <mb@smartsharesystems.com>
> > > > Sent: Tuesday, November 3, 2020 14:10
> > > >
> > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > Sent: Monday, November 2, 2020 4:58 PM
> > > > >
> > > > > +Cc techboard
> > > > >
> > > > > We need benchmark numbers in order to take a decision.
> > > > > Please all, prepare some arguments and numbers so we can discuss
> > > the
> > > > > mbuf layout in the next techboard meeting.
> 
> I did some quick tests, and it appears to me that just moving the pool
> pointer to the first cache line has not a significant impact.

Hmm, as I remember Thomas mentioned about a 5%+ improvement
with that change. Though I suppose a lot depends on the actual test-case.
It would be good to know when it does help and when it doesn't.

> 
> However, I agree with Morten that there is some room for optimization
> around m->pool: I did a hack in the ixgbe driver to assume there is only
> one mbuf pool. This simplifies a lot the freeing of mbufs in Tx, because
> we don't have to group them in bulks that shares the same pool (see
> ixgbe_tx_free_bufs()). The impact of this hack is quite good: +~5% on a
> real-life forwarding use case.

I think we already have such optimization ability within DPDK:
#define DEV_TX_OFFLOAD_MBUF_FAST_FREE   0x00010000
/**< Device supports optimization for fast release of mbufs.
 *   When set application must guarantee that per-queue all mbufs comes from
 *   the same mempool and has refcnt = 1.
 */

Seems over-optimistic to me, but many PMDs do support it.
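
Roughly, this is what the flag buys a PMD on the TX free path (a sketch,
not taken from any particular driver; txq, txep and nb are invented names):

    /* With MBUF_FAST_FREE the driver may assume that all mbufs of the
     * queue come from one pool and have refcnt == 1, so the cleaned-up
     * mbufs can be returned to the pool in a single bulk call. */
    if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
        rte_mempool_put_bulk(txep[0]->pool, (void **)txep, nb);
    } else {
        for (i = 0; i < nb; i++)
            rte_pktmbuf_free_seg(txep[i]);
    }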

> 
> It is maybe possible to store the pool in the sw ring to avoid a later
> access to m->pool. Having a pool index as suggested by Morten would also
> help to reduce used room in sw ring in this case. But this is a bit
> off-topic :)
> 
> 
> 
> > > > I propose that the techboard considers this from two angels:
> > > >
> > > > 1. Long term goals and their relative priority. I.e. what can be
> > > achieved with
> > > > wide-ranging modifications, requiring yet another ABI break and due
> > > notices.
> > > >
> > > > 2. Short term goals, i.e. what can be achieved for this release.
> > > >
> > > >
> > > > My suggestions follow...
> > > >
> > > > 1. Regarding long term goals:
> > > >
> > > > I have argued that simple forwarding of non-segmented packets using
> > > only the
> > > > first mbuf cache line can be achieved by making three
> > > > modifications:
> > > >
> > > > a) Move m->tx_offload to the first cache line.
> > > Not all PMDs use this field on Tx. HW might support the checksum
> > > offloads
> > > directly, not requiring these fields at all.
> 
> To me, a driver should use m->tx_offload, because the application
> specifies the offset where the checksum has to be done, in case the hw
> is not able to recognize the protocol.
> 
> > > > b) Use an 8 bit pktmbuf mempool index in the first cache line,
> > > >    instead of the 64 bit m->pool pointer in the second cache line.
> > > 256 mpool looks enough, as for me. Regarding the indirect access to the
> > > pool
> > > (via some table) - it might introduce some performance impact.
> >
> > It might, but I hope that it is negligible, so the benefits outweigh the disadvantages.
> >
> > It would have to be measured, though.
> >
> > And m->pool is only used for free()'ing (and detach()'ing) mbufs.
> >
> > > For example,
> > > mlx5 PMD strongly relies on pool field for allocating mbufs in Rx
> > > datapath.
> > > We're going to update (o-o, we found point to optimize), but for now it
> > > does.
> >
> > Without looking at the source code, I don't think the PMD is using m->pool in the RX datapath, I think it is using a pool dedicated to a
> receive queue used for RX descriptors in the PMD (i.e. driver->queue->pool).
> >
> > >
> > > > c) Do not access m->next when we know that it is NULL.
> > > >    We can use m->nb_segs == 1 or some other invariant as the gate.
> > > >    It can be implemented by adding an m->next accessor function:
> > > >    struct rte_mbuf * rte_mbuf_next(struct rte_mbuf * m)
> > > >    {
> > > >        return m->nb_segs == 1 ? NULL : m->next;
> > > >    }
> > >
> > > Sorry, not sure about this. IIRC, nb_segs is valid in the first
> > > segment/mbuf  only.
> > > If we have the 4 segments in the pkt we see nb_seg=4 in the first one,
> > > and the nb_seg=1
> > > in the others. The next field is NULL in the last mbuf only. Am I wrong
> > > and miss something ?
> >
> > You are correct.
> >
> > This would have to be updated too. Either by increasing m->nb_seg in the following segments, or by splitting up relevant functions into
> functions for working on first segments (incl. non-segmented packets), and functions for working on following segments of segmented
> packets.
> 
> Instead of maintaining a valid nb_segs, a HAS_NEXT flag would be easier
> to implement. However it means that an accessor needs to be used instead
> of any m->next access.
> 
> > > > Regarding the priority of this goal, I guess that simple forwarding
> > > of non-
> > > > segmented packets is probably the path taken by the majority of
> > > packets
> > > > handled by DPDK.
> > > >
> > > > An alternative goal could be:
> > > > Do not touch the second cache line during RX.
> > > > A comment in the mbuf structure says so, but it is not true anymore.
> > > >
> > > > (I guess that regression testing didn't catch this because the tests
> > > perform TX
> > > > immediately after RX, so the cache miss just moves from the TX to the
> > > RX part
> > > > of the test application.)
> > > >
> > > >
> > > > 2. Regarding short term goals:
> > > >
> > > > The current DPDK source code looks to me like m->next is the most
> > > frequently
> > > > accessed field in the second cache line, so it makes sense moving
> > > this to the
> > > > first cache line, rather than m->pool.
> > > > Benchmarking may help here.
> > >
> > > Moreover, for the segmented packets the packet size is supposed to be
> > > large,
> > > and it imposes the relatively low packet rate, so probably optimization
> > > of
> > > moving next to the 1st cache line might be negligible at all. Just
> > > compare 148Mpps of
> > > 64B pkts and 4Mpps of 3000B pkts over 100Gbps link. Currently we are on
> > > benchmarking
> > > and did not succeed yet on difference finding. The benefit can't be
> > > expressed in mpps delta,
> > > we should measure CPU clocks, but Rx queue is almost always empty - we
> > > have an empty
> > > loops. So, if we have the boost - it is extremely hard to catch one.
> >
> > Very good point regarding the value of such an optimization, Slava!
> >
> > And when free()'ing packets, both m->next and m->pool are touched.
> >
> > So perhaps the free()/detach() functions in the mbuf library can be modified to handle first segments (and non-segmented packets) and
> following segments differently, so accessing m->next can be avoided for non-segmented packets. Then m->pool should be moved to the
> first cache line.
> >
> 
> I also think that Moving m->pool without doing something else about
> m->next is probably useless. And it's too late for 20.11 to do
> additionnal changes, so I suggest to postpone the field move to 21.11,
> once we have a clearer view of possible optimizations.
> 
> Olivier
  
Morten Brørup Nov. 5, 2020, 9:04 a.m. UTC | #22
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev,
> Konstantin
> Sent: Thursday, November 5, 2020 1:26 AM
> 
> >
> > Hi,
> >
> > On Tue, Nov 03, 2020 at 04:03:46PM +0100, Morten Brørup wrote:
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Slava
> Ovsiienko
> > > > Sent: Tuesday, November 3, 2020 3:03 PM
> > > >
> > > > Hi, Morten
> > > >
> > > > > From: Morten Brørup <mb@smartsharesystems.com>
> > > > > Sent: Tuesday, November 3, 2020 14:10
> > > > >
> > > > > > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > > > > Sent: Monday, November 2, 2020 4:58 PM
> > > > > >
> > > > > > +Cc techboard
> > > > > >
> > > > > > We need benchmark numbers in order to take a decision.
> > > > > > Please all, prepare some arguments and numbers so we can
> discuss
> > > > the
> > > > > > mbuf layout in the next techboard meeting.
> >
> > I did some quick tests, and it appears to me that just moving the
> pool
> > pointer to the first cache line has not a significant impact.
> 
> Hmm, as I remember Thomas mentioned about 5%+ improvement
> with that change. Though I suppose a lot depends from actual test-case.
> Would be good to know when it does help and when it doesn't.
> 
> >
> > However, I agree with Morten that there is some room for optimization
> > around m->pool: I did a hack in the ixgbe driver to assume there is
> only
> > one mbuf pool. This simplifies a lot the freeing of mbufs in Tx,
> because
> > we don't have to group them in bulks that shares the same pool (see
> > ixgbe_tx_free_bufs()). The impact of this hack is quite good: +~5% on
> a
> > real-life forwarding use case.
> 
> I think we already have such optimization ability within DPDK:
> #define DEV_TX_OFFLOAD_MBUF_FAST_FREE   0x00010000
> /**< Device supports optimization for fast release of mbufs.
>  *   When set application must guarantee that per-queue all mbufs comes
> from
>  *   the same mempool and has refcnt = 1.
>  */
> 
> Seems over-optimistic to me, but many PMDs do support it.

Looking at a few drivers using this flag, the Intel drivers use rte_mempool_put(m->pool), and thus still read the second cache line. Only ThunderX seems to take advantage of the optimization and uses rte_mempool_put_bulk(q->pool).

I would rather see a generic optimization of free()'ing non-segmented packets in the mbuf library, where free() and free_seg() take advantage of knowing whether they are working on the first segment or not - like the is_header indicator in many of the mbuf check functions - and, when working on the first segment, gate access to m->next by m->nb_segs > 1.

Concept:

static inline void
rte_pktmbuf_free(struct rte_mbuf *m)
{
    struct rte_mbuf *m_next;

    /* NOTE: Sanity check of header has moved to __rte_pktmbuf_prefree_seg(). */

    if (m != NULL) {
        if (m->nb_segs == 1) {
            __rte_pktmbuf_free_seg(m, 1);
        } else {
            m_next = m->next;
            __rte_pktmbuf_free_seg(m, 1);
            m = m_next;
            while (m != NULL) {
                m_next = m->next;
                __rte_pktmbuf_free_seg(m, 0);
                m = m_next;
            }
        }
    }
}

static __rte_always_inline void
__rte_pktmbuf_free_seg(struct rte_mbuf *m, int is_header)
{
    m = __rte_pktmbuf_prefree_seg(m, is_header);
    if (likely(m != NULL))
        rte_mbuf_raw_free(m);
}

static __rte_always_inline struct rte_mbuf *
__rte_pktmbuf_prefree_seg(struct rte_mbuf *m, int is_header)
{
    __rte_mbuf_sanity_check(m, is_header);

    if (likely(rte_mbuf_refcnt_read(m) == 1)) {

        if (!RTE_MBUF_DIRECT(m)) {
            rte_pktmbuf_detach(m);
            if (RTE_MBUF_HAS_EXTBUF(m) &&
                RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
                __rte_pktmbuf_pinned_extbuf_decref(m))
                return NULL;
        }

        if (is_header && m->nb_segs == 1)
            return m; /* NOTE: Avoid touching (writing to) the second cache line. */

        if (m->next != NULL) {
            m->next = NULL;
            m->nb_segs = 1;
        }

        return m;

    } else if (__rte_mbuf_refcnt_update(m, -1) == 0) {

        if (!RTE_MBUF_DIRECT(m)) {
            rte_pktmbuf_detach(m);
            if (RTE_MBUF_HAS_EXTBUF(m) &&
                RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
                __rte_pktmbuf_pinned_extbuf_decref(m))
                return NULL;
        }

        if (is_header && m->nb_segs == 1) {
            /* NOTE: Avoid touching (writing to) the second cache line. */
            rte_mbuf_refcnt_set(m, 1);
            return m;
        }

        if (m->next != NULL) {
            m->next = NULL;
            m->nb_segs = 1;
        }
        rte_mbuf_refcnt_set(m, 1);

        return m;
    }
    return NULL;
}

Furthermore, this concept might provide an additional performance improvement by moving m->pool to the first cache line, so rte_mempool_put() in rte_mbuf_raw_free() wouldn't have to touch the second cache line either.
  
Morten Brørup Nov. 5, 2020, 9:35 a.m. UTC | #23
There is a simple alternative for applications with a single mbuf pool to avoid accessing m->pool.

We could add a global variable pointing to the single mbuf pool.

It would be NULL by default.

It would be set by rte_pktmbuf_pool_create() on first invocation, and reset back to NULL on following invocations. (There would need to be a counter too, to prevent setting it again on the third invocation.)

All functions accessing m->pool would use the global mbuf pool pointer if set, and otherwise use the m->pool pointer, like this:

- rte_mempool_put(m->pool, m);
+ rte_mempool_put(global_mbuf_pool ? global_mbuf_pool : m->pool, m);

This optimization can be implemented without ABI breakage:

Since m->pool is initialized as always, functions that are not modified to use the global_mbuf_pool will simply continue using m->pool, not knowing that a global mbuf pool exists.
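
A minimal sketch of the bookkeeping described above (names invented; not an
existing API):

    /* Set on the first rte_pktmbuf_pool_create(), cleared again as soon
     * as a second pool is created, and never set again afterwards. */
    struct rte_mempool *global_mbuf_pool; /* NULL by default */
    static unsigned int mbuf_pool_count;

    static void
    mbuf_pool_created(struct rte_mempool *mp)
    {
        if (++mbuf_pool_count == 1)
            global_mbuf_pool = mp;
        else
            global_mbuf_pool = NULL;
    }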


Med venlig hilsen / kind regards
- Morten Brørup
  
Bruce Richardson Nov. 5, 2020, 10:29 a.m. UTC | #24
On Thu, Nov 05, 2020 at 10:35:45AM +0100, Morten Brørup wrote:
> There is a simple alternative for applications with a single mbuf pool to avoid accessing m->pool.
> 
> We could add a global variable pointing to the single mbuf pool.
> 
> It would be NULL by default.
> 
> It would be set by rte_pktmbuf_pool_create() on first invocation, and reset back to NULL on following invocations. (There would need to be a counter too, to prevent setting it again on the third invocation.)
> 
> All functions accessing m->pool would use the global mbuf pool pointer if set, and otherwise use the m->pool pointer, like this:
> 
> - rte_mempool_put(m->pool, m);
> + rte_mempool_put(global_mbuf_pool ? global_mbuf_pool : m->pool, m);
> 
> This optimization can be implemented without ABI breakage:
> 
> Since m->pool is initialized as always, functions that are not modified to use the global_mbuf_pool will simply continue using m->pool, not knowing that a global mbuf pool exists.
> 
Very interesting idea. Definitely worth considering. A TX function would
only have to check the global variable once at the start of cleanup too,
and if set, it can use bulk frees without any additional work.

/Bruce
  

Patch

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index 72dbb25b83..07ca1dcbb2 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -88,11 +88,6 @@  Deprecation Notices
 
   - ``seqn``
 
-  As a consequence, the layout of the ``struct rte_mbuf`` will be re-arranged,
-  avoiding impact on vectorized implementation of the driver datapaths,
-  while evaluating performance gains of a better use of the first cache line.
-
-
 * ethdev: the legacy filter API, including
   ``rte_eth_dev_filter_supported()``, ``rte_eth_dev_filter_ctrl()`` as well
   as filter types MACVLAN, ETHERTYPE, FLEXIBLE, SYN, NTUPLE, TUNNEL, FDIR,
diff --git a/lib/librte_kni/rte_kni_common.h b/lib/librte_kni/rte_kni_common.h
index 36d66e2ffa..ffb3182731 100644
--- a/lib/librte_kni/rte_kni_common.h
+++ b/lib/librte_kni/rte_kni_common.h
@@ -84,10 +84,11 @@  struct rte_kni_mbuf {
 	char pad2[4];
 	uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
 	uint16_t data_len;      /**< Amount of data in segment buffer. */
+	char pad3[14];
+	void *pool;
 
 	/* fields on second cache line */
 	__attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)))
-	void *pool;
 	void *next;             /**< Physical address of next mbuf in kernel. */
 };
 
diff --git a/lib/librte_mbuf/rte_mbuf_core.h b/lib/librte_mbuf/rte_mbuf_core.h
index 52ca1c842f..ee185fa32b 100644
--- a/lib/librte_mbuf/rte_mbuf_core.h
+++ b/lib/librte_mbuf/rte_mbuf_core.h
@@ -584,12 +584,11 @@  struct rte_mbuf {
 
 	uint16_t buf_len;         /**< Length of segment buffer. */
 
-	uint64_t unused;
+	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
 
 	/* second cache line - fields only used in slow path or on TX */
 	RTE_MARKER cacheline1 __rte_cache_min_aligned;
 
-	struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
 	struct rte_mbuf *next;    /**< Next segment of scattered packet. */
 
 	/* fields to support TX offloads */
@@ -646,7 +645,7 @@  struct rte_mbuf {
 	 */
 	struct rte_mbuf_ext_shared_info *shinfo;
 
-	uint64_t dynfield1[3]; /**< Reserved for dynamic fields. */
+	uint64_t dynfield1[4]; /**< Reserved for dynamic fields. */
 } __rte_cache_aligned;
 
 /**