[v3,1/2] net/mlx5: optimize mprq memcpy

Message ID 20201010090034.1797958-1-aman.kumar@vvdntech.in (mailing list archive)
State Superseded, archived
Delegated to: Raslan Darawsheh
Headers
Series [v3,1/2] net/mlx5: optimize mprq memcpy |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Aman Kumar Oct. 10, 2020, 9 a.m. UTC
  add non temporal load and temporal store for mprq memcpy.
define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in build
configuration to enable this optimization.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 drivers/net/mlx5/meson.build |   1 +
 drivers/net/mlx5/mlx5.c      |  12 ++++
 drivers/net/mlx5/mlx5.h      |   3 +
 drivers/net/mlx5/mlx5_rxq.c  |   3 +
 drivers/net/mlx5/mlx5_rxtx.c | 116 ++++++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |   3 +
 meson_options.txt            |   2 +
 7 files changed, 138 insertions(+), 2 deletions(-)
  

Comments

Slava Ovsiienko Feb. 4, 2021, 2:14 p.m. UTC | #1
Hi,

I'm sorry for asking the questions very late.
Is still this patch set actual and should it be updated and considered? 

As I can understand this one optimizes the memory writes in some way using the instructions with the hints.
Is this specific for some CPU families? Is this more common? I suppose it should we considered and discussed
more widely, possible on EAL level. I would propose to introduce these special memory routines on EAL level
to give advantage to all PMDs, not specifically to mlx5.

With best regards,
Slava

> -----Original Message-----
> From: Aman Kumar <aman.kumar@vvdntech.in>
> Sent: Saturday, October 10, 2020 12:01
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; keesang.song@amd.com;
> Asaf Penso <asafp@nvidia.com>; Shy Shyman <shys@nvidia.com>; Slava
> Ovsiienko <viacheslavo@nvidia.com>; Alexander Kozyrev
> <akozyrev@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> aman.kumar@vvdntech.in
> Subject: [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
> 
> add non temporal load and temporal store for mprq memcpy.
> define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in build
> configuration to enable this optimization.
> 
> Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> ---
>  drivers/net/mlx5/meson.build |   1 +
>  drivers/net/mlx5/mlx5.c      |  12 ++++
>  drivers/net/mlx5/mlx5.h      |   3 +
>  drivers/net/mlx5/mlx5_rxq.c  |   3 +
>  drivers/net/mlx5/mlx5_rxtx.c | 116
> ++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx5/mlx5_rxtx.h |   3 +
>  meson_options.txt            |   2 +
>  7 files changed, 138 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
> index 9a97bb9c8..38e93fdc1 100644
> --- a/drivers/net/mlx5/meson.build
> +++ b/drivers/net/mlx5/meson.build
> @@ -47,6 +47,7 @@ foreach option:cflags_options
>  		cflags += option
>  	endif
>  endforeach
> +dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY',
> +get_option('mlx5_ntload_tstore'))
>  if get_option('buildtype').contains('debug')
>  	cflags += [ '-pedantic', '-DPEDANTIC' ]  else diff --git
> a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 01ead6e6a..a2796eaa5 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -160,6 +160,11 @@
>  /* Configure timeout of LRO session (in microseconds). */  #define
> MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
> 
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +/* mprq_tstore_memcpy */
> +#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
> +#endif
> +
>  /*
>   * Device parameter to configure the total data buffer size for a single
>   * hairpin queue (logarithm value).
> @@ -1623,6 +1628,10 @@ mlx5_args_check(const char *key, const char
> *val, void *opaque)
>  		config->sys_mem_en = !!tmp;
>  	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
>  		config->decap_en = !!tmp;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
> +		config->mprq_tstore_memcpy = tmp;
> +#endif
>  	} else {
>  		DRV_LOG(WARNING, "%s: unknown parameter", key);
>  		rte_errno = EINVAL;
> @@ -1683,6 +1692,9 @@ mlx5_args(struct mlx5_dev_config *config, struct
> rte_devargs *devargs)
>  		MLX5_RECLAIM_MEM,
>  		MLX5_SYS_MEM_EN,
>  		MLX5_DECAP_EN,
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +		MLX5_MPRQ_TSTORE_MEMCPY,
> +#endif
>  		NULL,
>  	};
>  	struct rte_kvargs *kvlist;
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 43da9a1fb..1eb305650 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -234,6 +234,9 @@ struct mlx5_dev_config {
>  	int tx_skew; /* Tx scheduling skew between WQE and data on wire.
> */
>  	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
>  	struct mlx5_lro_config lro; /* LRO configuration. */
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	unsigned int mprq_tstore_memcpy:1;
> +#endif
>  };
> 
> 
> diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index
> c059e216d..c8db59a12 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> idx, uint16_t desc,
>  	tmpl->socket = socket;
>  	if (dev->data->dev_conf.intr_conf.rxq)
>  		tmpl->irq = 1;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
> #endif
>  	mprq_stride_nums = config->mprq.stride_num_n ?
>  		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
>  	mprq_stride_size = non_scatter_min_mbuf_size <= diff --git
> a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> 0b87be15b..f59e30d82 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -123,6 +123,97 @@ uint8_t mlx5_swp_types_table[1 << 10]
> __rte_cache_aligned;  uint64_t rte_net_mlx5_dynf_inline_mask;  #define
> PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
> 
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +static void copy16B_ts(void *dst, void *src) {
> +	__m128i var128;
> +
> +	var128 = _mm_stream_load_si128((__m128i *)src);
> +	_mm_storeu_si128((__m128i *)dst, var128); }
> +
> +static void copy32B_ts(void *dst, void *src) {
> +	__m256i ymm0;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	_mm256_storeu_si256((__m256i *)dst, ymm0); }
> +
> +static void copy64B_ts(void *dst, void *src) {
> +	__m256i ymm0, ymm1;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 32));
> +	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1); }
> +
> +static void copy128B_ts(void *dst, void *src) {
> +	__m256i ymm0, ymm1, ymm2, ymm3;
> +
> +	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> +	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 32));
> +	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 64));
> +	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> *)src + 96));
> +	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
> +	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3); }
> +
> +static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int
> +len) {
> +	void *dest = dst;
> +
> +	while (len >= 128) {
> +		copy128B_ts(dst, src);
> +		dst = (uint8_t *)dst + 128;
> +		src = (uint8_t *)src + 128;
> +		len -= 128;
> +	}
> +	while (len >= 64) {
> +		copy64B_ts(dst, src);
> +		dst = (uint8_t *)dst + 64;
> +		src = (uint8_t *)src + 64;
> +		len -= 64;
> +	}
> +	while (len >= 32) {
> +		copy32B_ts(dst, src);
> +		dst = (uint8_t *)dst + 32;
> +		src = (uint8_t *)src + 32;
> +		len -= 32;
> +	}
> +	if (len >= 16) {
> +		copy16B_ts(dst, src);
> +		dst = (uint8_t *)dst + 16;
> +		src = (uint8_t *)src + 16;
> +		len -= 16;
> +	}
> +	if (len >= 8) {
> +		*(uint64_t *)dst = *(const uint64_t *)src;
> +		dst = (uint8_t *)dst + 8;
> +		src = (uint8_t *)src + 8;
> +		len -= 8;
> +	}
> +	if (len >= 4) {
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +		dst = (uint8_t *)dst + 4;
> +		src = (uint8_t *)src + 4;
> +		len -= 4;
> +	}
> +	if (len != 0) {
> +		dst = (uint8_t *)dst - (4 - len);
> +		src = (uint8_t *)src - (4 - len);
> +		*(uint32_t *)dst = *(const uint32_t *)src;
> +	}
> +
> +	return dest;
> +}
> +#endif
> +
>  /**
>   * Build a table to translate Rx completion flags to packet type.
>   *
> @@ -1707,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> rte_mbuf **pkts, uint16_t pkts_n)
>  		int32_t hdrm_overlap;
>  		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
>  		uint32_t rss_hash_res = 0;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +		uintptr_t data_addr;
> +#endif
> 
>  		if (consumed_strd == strd_n) {
>  			/* Replace WQE only if the buffer is still in use. */
> @@ -1772,12 +1866,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> rte_mbuf **pkts, uint16_t pkts_n)
>  		 * - Out of buffer in the Mempool for Multi-Packet RQ.
>  		 * - The packet's stride overlaps a headroom and scatter is
> off.
>  		 */
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +		if (unlikely(!rxq->mprq_tstore_memcpy) &&
> +			len <= rxq->mprq_max_memcpy_len) {
> +			rte_prefetch1(addr);
> +			if (len > RTE_CACHE_LINE_SIZE)
> +				rte_prefetch2((void *)((uintptr_t)addr +
> RTE_CACHE_LINE_SIZE));
> +		}
> +#endif
>  		if (len <= rxq->mprq_max_memcpy_len ||
>  		    rxq->mprq_repl == NULL ||
>  		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
>  			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
> -				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
> -					   addr, len);
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +				data_addr =
> (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
> +				if (!(rxq->mprq_tstore_memcpy))
> +					rte_memcpy((void *)data_addr,
> addr, len);
> +				else if ((rxq->mprq_tstore_memcpy) &&
> +					   !((data_addr | (uintptr_t)addr) &
> ALIGNMENT_MASK))
> +
> 	memcpy_aligned_rx_tstore_16B((void *)data_addr,
> +							addr, len);
> +				else
> +#endif
> +					rte_memcpy(rte_pktmbuf_mtod(pkt,
> void *),
> +							addr, len);
>  				DATA_LEN(pkt) = len;
>  			} else if (rxq->strd_scatter_en) {
>  				struct rte_mbuf *prev = pkt;
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 9ffa028d2..a8ea1a795 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -153,6 +153,9 @@ struct mlx5_rxq_data {
>  	uint32_t tunnel; /* Tunnel information. */
>  	uint64_t flow_meta_mask;
>  	int32_t flow_meta_offset;
> +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> +	unsigned int mprq_tstore_memcpy:1;
> +#endif
>  } __rte_cache_aligned;
> 
>  enum mlx5_rxq_type {
> diff --git a/meson_options.txt b/meson_options.txt index
> 9bf18ab6b..a4bc565d2 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
>  	description: 'maximum number of cores/threads supported by EAL')
> option('max_numa_nodes', type: 'integer', value: 4,
>  	description: 'maximum number of NUMA nodes supported by EAL')
> +option('mlx5_ntload_tstore', type: 'boolean', value: false,
> +	description: 'to enable optimized MPRQ in RX datapath')
>  option('enable_trace_fp', type: 'boolean', value: false,
>  	description: 'enable fast path trace points.')  option('tests', type:
> 'boolean', value: true,
> --
> 2.25.1
  
Aman Kumar Feb. 9, 2021, 6:22 a.m. UTC | #2
Hi Slava,

Thank you for your reply.
This is currently supported (and tested) on 2nd gen AMD EPYC series
processors. We're currently trying to make it more generic across other
EPYC platforms. We too believe having these available at EAL may also help
applications and other PMDs. I'll move memory copy instructions to
lib/librte_eal/* and update this patchset.
Thanks.

*Regards*
Aman Kumar

On Thu, Feb 4, 2021 at 7:45 PM Slava Ovsiienko <viacheslavo@nvidia.com>
wrote:

> Hi,
>
> I'm sorry for asking the questions very late.
> Is still this patch set actual and should it be updated and considered?
>
> As I can understand this one optimizes the memory writes in some way using
> the instructions with the hints.
> Is this specific for some CPU families? Is this more common? I suppose it
> should we considered and discussed
> more widely, possible on EAL level. I would propose to introduce these
> special memory routines on EAL level
> to give advantage to all PMDs, not specifically to mlx5.
>
> With best regards,
> Slava
>
> > -----Original Message-----
> > From: Aman Kumar <aman.kumar@vvdntech.in>
> > Sent: Saturday, October 10, 2020 12:01
> > To: dev@dpdk.org
> > Cc: Raslan Darawsheh <rasland@nvidia.com>; keesang.song@amd.com;
> > Asaf Penso <asafp@nvidia.com>; Shy Shyman <shys@nvidia.com>; Slava
> > Ovsiienko <viacheslavo@nvidia.com>; Alexander Kozyrev
> > <akozyrev@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> > aman.kumar@vvdntech.in
> > Subject: [PATCH v3 1/2] net/mlx5: optimize mprq memcpy
> >
> > add non temporal load and temporal store for mprq memcpy.
> > define RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY in build
> > configuration to enable this optimization.
> >
> > Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
> > ---
> >  drivers/net/mlx5/meson.build |   1 +
> >  drivers/net/mlx5/mlx5.c      |  12 ++++
> >  drivers/net/mlx5/mlx5.h      |   3 +
> >  drivers/net/mlx5/mlx5_rxq.c  |   3 +
> >  drivers/net/mlx5/mlx5_rxtx.c | 116
> > ++++++++++++++++++++++++++++++++++-
> >  drivers/net/mlx5/mlx5_rxtx.h |   3 +
> >  meson_options.txt            |   2 +
> >  7 files changed, 138 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
> > index 9a97bb9c8..38e93fdc1 100644
> > --- a/drivers/net/mlx5/meson.build
> > +++ b/drivers/net/mlx5/meson.build
> > @@ -47,6 +47,7 @@ foreach option:cflags_options
> >               cflags += option
> >       endif
> >  endforeach
> > +dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY',
> > +get_option('mlx5_ntload_tstore'))
> >  if get_option('buildtype').contains('debug')
> >       cflags += [ '-pedantic', '-DPEDANTIC' ]  else diff --git
> > a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 01ead6e6a..a2796eaa5 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -160,6 +160,11 @@
> >  /* Configure timeout of LRO session (in microseconds). */  #define
> > MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
> >
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +/* mprq_tstore_memcpy */
> > +#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
> > +#endif
> > +
> >  /*
> >   * Device parameter to configure the total data buffer size for a single
> >   * hairpin queue (logarithm value).
> > @@ -1623,6 +1628,10 @@ mlx5_args_check(const char *key, const char
> > *val, void *opaque)
> >               config->sys_mem_en = !!tmp;
> >       } else if (strcmp(MLX5_DECAP_EN, key) == 0) {
> >               config->decap_en = !!tmp;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     } else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
> > +             config->mprq_tstore_memcpy = tmp;
> > +#endif
> >       } else {
> >               DRV_LOG(WARNING, "%s: unknown parameter", key);
> >               rte_errno = EINVAL;
> > @@ -1683,6 +1692,9 @@ mlx5_args(struct mlx5_dev_config *config, struct
> > rte_devargs *devargs)
> >               MLX5_RECLAIM_MEM,
> >               MLX5_SYS_MEM_EN,
> >               MLX5_DECAP_EN,
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +             MLX5_MPRQ_TSTORE_MEMCPY,
> > +#endif
> >               NULL,
> >       };
> >       struct rte_kvargs *kvlist;
> > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 43da9a1fb..1eb305650 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -234,6 +234,9 @@ struct mlx5_dev_config {
> >       int tx_skew; /* Tx scheduling skew between WQE and data on wire.
> > */
> >       struct mlx5_hca_attr hca_attr; /* HCA attributes. */
> >       struct mlx5_lro_config lro; /* LRO configuration. */
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     unsigned int mprq_tstore_memcpy:1;
> > +#endif
> >  };
> >
> >
> > diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> index
> > c059e216d..c8db59a12 100644
> > --- a/drivers/net/mlx5/mlx5_rxq.c
> > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > @@ -1380,6 +1380,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t
> > idx, uint16_t desc,
> >       tmpl->socket = socket;
> >       if (dev->data->dev_conf.intr_conf.rxq)
> >               tmpl->irq = 1;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
> > #endif
> >       mprq_stride_nums = config->mprq.stride_num_n ?
> >               config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
> >       mprq_stride_size = non_scatter_min_mbuf_size <= diff --git
> > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> > 0b87be15b..f59e30d82 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > @@ -123,6 +123,97 @@ uint8_t mlx5_swp_types_table[1 << 10]
> > __rte_cache_aligned;  uint64_t rte_net_mlx5_dynf_inline_mask;  #define
> > PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
> >
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +static void copy16B_ts(void *dst, void *src) {
> > +     __m128i var128;
> > +
> > +     var128 = _mm_stream_load_si128((__m128i *)src);
> > +     _mm_storeu_si128((__m128i *)dst, var128); }
> > +
> > +static void copy32B_ts(void *dst, void *src) {
> > +     __m256i ymm0;
> > +
> > +     ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +     _mm256_storeu_si256((__m256i *)dst, ymm0); }
> > +
> > +static void copy64B_ts(void *dst, void *src) {
> > +     __m256i ymm0, ymm1;
> > +
> > +     ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +     ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 32));
> > +     _mm256_storeu_si256((__m256i *)dst, ymm0);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1); }
> > +
> > +static void copy128B_ts(void *dst, void *src) {
> > +     __m256i ymm0, ymm1, ymm2, ymm3;
> > +
> > +     ymm0 = _mm256_stream_load_si256((const __m256i *)src);
> > +     ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 32));
> > +     ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 64));
> > +     ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t
> > *)src + 96));
> > +     _mm256_storeu_si256((__m256i *)dst, ymm0);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
> > +     _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3); }
> > +
> > +static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int
> > +len) {
> > +     void *dest = dst;
> > +
> > +     while (len >= 128) {
> > +             copy128B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 128;
> > +             src = (uint8_t *)src + 128;
> > +             len -= 128;
> > +     }
> > +     while (len >= 64) {
> > +             copy64B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 64;
> > +             src = (uint8_t *)src + 64;
> > +             len -= 64;
> > +     }
> > +     while (len >= 32) {
> > +             copy32B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 32;
> > +             src = (uint8_t *)src + 32;
> > +             len -= 32;
> > +     }
> > +     if (len >= 16) {
> > +             copy16B_ts(dst, src);
> > +             dst = (uint8_t *)dst + 16;
> > +             src = (uint8_t *)src + 16;
> > +             len -= 16;
> > +     }
> > +     if (len >= 8) {
> > +             *(uint64_t *)dst = *(const uint64_t *)src;
> > +             dst = (uint8_t *)dst + 8;
> > +             src = (uint8_t *)src + 8;
> > +             len -= 8;
> > +     }
> > +     if (len >= 4) {
> > +             *(uint32_t *)dst = *(const uint32_t *)src;
> > +             dst = (uint8_t *)dst + 4;
> > +             src = (uint8_t *)src + 4;
> > +             len -= 4;
> > +     }
> > +     if (len != 0) {
> > +             dst = (uint8_t *)dst - (4 - len);
> > +             src = (uint8_t *)src - (4 - len);
> > +             *(uint32_t *)dst = *(const uint32_t *)src;
> > +     }
> > +
> > +     return dest;
> > +}
> > +#endif
> > +
> >  /**
> >   * Build a table to translate Rx completion flags to packet type.
> >   *
> > @@ -1707,6 +1798,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > rte_mbuf **pkts, uint16_t pkts_n)
> >               int32_t hdrm_overlap;
> >               volatile struct mlx5_mini_cqe8 *mcqe = NULL;
> >               uint32_t rss_hash_res = 0;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +             uintptr_t data_addr;
> > +#endif
> >
> >               if (consumed_strd == strd_n) {
> >                       /* Replace WQE only if the buffer is still in use.
> */
> > @@ -1772,12 +1866,30 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > rte_mbuf **pkts, uint16_t pkts_n)
> >                * - Out of buffer in the Mempool for Multi-Packet RQ.
> >                * - The packet's stride overlaps a headroom and scatter is
> > off.
> >                */
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +             if (unlikely(!rxq->mprq_tstore_memcpy) &&
> > +                     len <= rxq->mprq_max_memcpy_len) {
> > +                     rte_prefetch1(addr);
> > +                     if (len > RTE_CACHE_LINE_SIZE)
> > +                             rte_prefetch2((void *)((uintptr_t)addr +
> > RTE_CACHE_LINE_SIZE));
> > +             }
> > +#endif
> >               if (len <= rxq->mprq_max_memcpy_len ||
> >                   rxq->mprq_repl == NULL ||
> >                   (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
> >                       if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
> > -                             rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
> > -                                        addr, len);
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +                             data_addr =
> > (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
> > +                             if (!(rxq->mprq_tstore_memcpy))
> > +                                     rte_memcpy((void *)data_addr,
> > addr, len);
> > +                             else if ((rxq->mprq_tstore_memcpy) &&
> > +                                        !((data_addr | (uintptr_t)addr)
> &
> > ALIGNMENT_MASK))
> > +
> >       memcpy_aligned_rx_tstore_16B((void *)data_addr,
> > +                                                     addr, len);
> > +                             else
> > +#endif
> > +                                     rte_memcpy(rte_pktmbuf_mtod(pkt,
> > void *),
> > +                                                     addr, len);
> >                               DATA_LEN(pkt) = len;
> >                       } else if (rxq->strd_scatter_en) {
> >                               struct rte_mbuf *prev = pkt;
> > diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > index 9ffa028d2..a8ea1a795 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > @@ -153,6 +153,9 @@ struct mlx5_rxq_data {
> >       uint32_t tunnel; /* Tunnel information. */
> >       uint64_t flow_meta_mask;
> >       int32_t flow_meta_offset;
> > +#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
> > +     unsigned int mprq_tstore_memcpy:1;
> > +#endif
> >  } __rte_cache_aligned;
> >
> >  enum mlx5_rxq_type {
> > diff --git a/meson_options.txt b/meson_options.txt index
> > 9bf18ab6b..a4bc565d2 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -30,6 +30,8 @@ option('max_lcores', type: 'integer', value: 128,
> >       description: 'maximum number of cores/threads supported by EAL')
> > option('max_numa_nodes', type: 'integer', value: 4,
> >       description: 'maximum number of NUMA nodes supported by EAL')
> > +option('mlx5_ntload_tstore', type: 'boolean', value: false,
> > +     description: 'to enable optimized MPRQ in RX datapath')
> >  option('enable_trace_fp', type: 'boolean', value: false,
> >       description: 'enable fast path trace points.')  option('tests',
> type:
> > 'boolean', value: true,
> > --
> > 2.25.1
>
>
  

Patch

diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 9a97bb9c8..38e93fdc1 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -47,6 +47,7 @@  foreach option:cflags_options
 		cflags += option
 	endif
 endforeach
+dpdk_conf.set('RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY', get_option('mlx5_ntload_tstore'))
 if get_option('buildtype').contains('debug')
 	cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 01ead6e6a..a2796eaa5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -160,6 +160,11 @@ 
 /* Configure timeout of LRO session (in microseconds). */
 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+/* mprq_tstore_memcpy */
+#define MLX5_MPRQ_TSTORE_MEMCPY "mprq_tstore_memcpy"
+#endif
+
 /*
  * Device parameter to configure the total data buffer size for a single
  * hairpin queue (logarithm value).
@@ -1623,6 +1628,10 @@  mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	} else if (strcmp(MLX5_MPRQ_TSTORE_MEMCPY, key) == 0) {
+		config->mprq_tstore_memcpy = tmp;
+#endif
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1683,6 +1692,9 @@  mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		MLX5_MPRQ_TSTORE_MEMCPY,
+#endif
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 43da9a1fb..1eb305650 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -234,6 +234,9 @@  struct mlx5_dev_config {
 	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 };
 
 
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index c059e216d..c8db59a12 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1380,6 +1380,9 @@  mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->socket = socket;
 	if (dev->data->dev_conf.intr_conf.rxq)
 		tmpl->irq = 1;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	tmpl->rxq.mprq_tstore_memcpy = config->mprq_tstore_memcpy;
+#endif
 	mprq_stride_nums = config->mprq.stride_num_n ?
 		config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
 	mprq_stride_size = non_scatter_min_mbuf_size <=
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0b87be15b..f59e30d82 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -123,6 +123,97 @@  uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
 uint64_t rte_net_mlx5_dynf_inline_mask;
 #define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
 
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+static void copy16B_ts(void *dst, void *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+static void copy32B_ts(void *dst, void *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static void copy64B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+}
+
+static void copy128B_ts(void *dst, void *src)
+{
+	__m256i ymm0, ymm1, ymm2, ymm3;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	ymm1 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 32));
+	ymm2 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 64));
+	ymm3 = _mm256_stream_load_si256((const __m256i *)((uint8_t *)src + 96));
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 32), ymm1);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 64), ymm2);
+	_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 96), ymm3);
+}
+
+static void *memcpy_aligned_rx_tstore_16B(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		copy128B_ts(dst, src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		copy64B_ts(dst, src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		copy32B_ts(dst, src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		copy16B_ts(dst, src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+#endif
+
 /**
  * Build a table to translate Rx completion flags to packet type.
  *
@@ -1707,6 +1798,9 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		uintptr_t data_addr;
+#endif
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
@@ -1772,12 +1866,30 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * - Out of buffer in the Mempool for Multi-Packet RQ.
 		 * - The packet's stride overlaps a headroom and scatter is off.
 		 */
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+		if (unlikely(!rxq->mprq_tstore_memcpy) &&
+			len <= rxq->mprq_max_memcpy_len) {
+			rte_prefetch1(addr);
+			if (len > RTE_CACHE_LINE_SIZE)
+				rte_prefetch2((void *)((uintptr_t)addr + RTE_CACHE_LINE_SIZE));
+		}
+#endif
 		if (len <= rxq->mprq_max_memcpy_len ||
 		    rxq->mprq_repl == NULL ||
 		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
 			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
-				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
-					   addr, len);
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+				data_addr = (uintptr_t)rte_pktmbuf_mtod(pkt, void *);
+				if (!(rxq->mprq_tstore_memcpy))
+					rte_memcpy((void *)data_addr, addr, len);
+				else if ((rxq->mprq_tstore_memcpy) &&
+					   !((data_addr | (uintptr_t)addr) & ALIGNMENT_MASK))
+					memcpy_aligned_rx_tstore_16B((void *)data_addr,
+							addr, len);
+				else
+#endif
+					rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+							addr, len);
 				DATA_LEN(pkt) = len;
 			} else if (rxq->strd_scatter_en) {
 				struct rte_mbuf *prev = pkt;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 9ffa028d2..a8ea1a795 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -153,6 +153,9 @@  struct mlx5_rxq_data {
 	uint32_t tunnel; /* Tunnel information. */
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
+#ifdef RTE_LIBRTE_MLX5_NTLOAD_TSTORE_ALIGN_COPY
+	unsigned int mprq_tstore_memcpy:1;
+#endif
 } __rte_cache_aligned;
 
 enum mlx5_rxq_type {
diff --git a/meson_options.txt b/meson_options.txt
index 9bf18ab6b..a4bc565d2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -30,6 +30,8 @@  option('max_lcores', type: 'integer', value: 128,
 	description: 'maximum number of cores/threads supported by EAL')
 option('max_numa_nodes', type: 'integer', value: 4,
 	description: 'maximum number of NUMA nodes supported by EAL')
+option('mlx5_ntload_tstore', type: 'boolean', value: false,
+	description: 'to enable optimized MPRQ in RX datapath')
 option('enable_trace_fp', type: 'boolean', value: false,
 	description: 'enable fast path trace points.')
 option('tests', type: 'boolean', value: true,