[v2] eal/x86: improve rte_memcpy const size 16 performance

Message ID: 20240303094621.16404-1-mb@smartsharesystems.com (mailing list archive)
State: Changes Requested
Delegated to: Thomas Monjalon
Series: [v2] eal/x86: improve rte_memcpy const size 16 performance

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-abi-testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/github-robot: build success github build: passed
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS

Commit Message

Morten Brørup March 3, 2024, 9:46 a.m. UTC
  When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
 1 file changed, 56 insertions(+), 175 deletions(-)
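
The optimization relies on __builtin_constant_p(): when the size argument is a compile-time constant, the check folds away and the compiler can drop the second, overlapping rte_mov16() entirely. A minimal standalone sketch of the pattern (mov16 and copy_upto32 are illustrative stand-ins, not names from the patch):

#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for rte_mov16(): copy exactly 16 bytes. */
static inline void mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

/* Same shape as the patched "n <= 32" branch (valid for 16 <= n <= 32):
 * for a literal n == 16 the check folds at compile time and only a
 * single 16-byte copy is emitted. */
static inline void *copy_upto32(void *dst, const void *src, size_t n)
{
	mov16((uint8_t *)dst, (const uint8_t *)src);
	if (__builtin_constant_p(n) && n == 16)
		return dst; /* avoid the (harmless) duplicate copy */
	mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
	return dst;
}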
  

Comments

Morten Brørup April 4, 2024, 9:18 a.m. UTC | #1
PING Intel x86 maintainers for review.

> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Sunday, 3 March 2024 10.46
> 
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
>  1 file changed, 56 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..d1df841f5e 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  	return ret;
>  }
> 
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ ||
> \
> +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> < 110000)))
>  	__m256i ymm0;
> 
>  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
>  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
> 
>  /**
> @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
> 
>  	zmm0 = _mm512_loadu_si512((const void *)src);
>  	_mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
> 
>  /**
> @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> +	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
> 
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
> +	if (__builtin_constant_p(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> @@ -313,80 +332,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_128_BACK63;
>  }
> 
> -#elif defined __AVX2__
> -
> -#define ALIGNMENT_MASK 0x1F
> +#elif defined __AVX2__ || \
> +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> < 110000)))
> 
>  /**
> - * AVX2 implementation below
> + * AVX2 (and AVX, unless too old GCC version) implementation below
>   */
> 
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	__m256i ymm0;
> -
> -	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
> -	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
> -}
> -
> -/**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -}
> -
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -}
> -
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
> -	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
> -	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
> -	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
> -}
> +#define ALIGNMENT_MASK 0x1F
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
> @@ -437,15 +390,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		return ret;
>  	}
> -	if (n <= 48) {
> +	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
> @@ -513,90 +465,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> 
>  #else /* __AVX512F__ */
> 
> -#define ALIGNMENT_MASK 0x0F
> -
> -/**
> - * SSE & AVX implementation below
> - */
> -
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -}
> -
>  /**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -}
> -
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> + * SSE (and AVX, with too old GCC version) implementation below
>   */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -}
> 
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> -}
> +#define ALIGNMENT_MASK 0x0F
> 
>  /**
>   * Macro for copying unaligned block from one location to another with
> constant load offset,
> @@ -712,17 +585,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
> -		return ret;
> -	}
> -	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
>  		return ret;
>  	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
>  		return ret;
>  	}
> @@ -828,8 +699,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
> 
>  	/* Copy 16 <= size <= 32 bytes */
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
> 
> @@ -837,6 +714,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
> 
>  	/* Copy 32 < size <= 64 bytes */
> +	if (__builtin_constant_p(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> --
> 2.17.1
  
Bruce Richardson April 4, 2024, 10:07 a.m. UTC | #2
On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>

Changes in general look good to me. Comments inline below.

/Bruce

> ---
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
>  1 file changed, 56 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..d1df841f5e 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  	return ret;
>  }
>  
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \
> +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))

I think we can drop the AVX512 checks here, since I'm not aware of any
system where we'd have AVX512 but not AVX2 available, so just checking for
AVX2 support should be sufficient.

On the final compiler-based check, I don't strongly object to it, but I
just wonder as to its real value. AVX2 was first introduced by Intel over 10
years ago, and (from what I find in wikipedia), it's been in AMD CPUs since
~2015. While we did have CPUs still being produced without AVX2 since that
time, they generally didn't have AVX1 either, only having SSE instructions.
Therefore the number of systems which require this additional check is
likely very small at this stage.
That said, I'm ok to either keep or omit it at your choice. If you do keep
it, how about putting the check once at the top of the file and using a
single short define instead for the multiple places it's used e.g.

#if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
#define RTE_MEMCPY_AVX2
#endif
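
With a define along those lines in place (extended so that it also covers every build where the 256-bit path should be used, e.g. native AVX2 builds), the in-function guard in rte_mov32() could shrink to something like this sketch:

static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
#ifdef RTE_MEMCPY_AVX2
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
#else /* SSE implementation */
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
#endif
}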


>  	__m256i ymm0;
>  
>  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
>  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
>  
>  /**
> @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
>  
>  	zmm0 = _mm512_loadu_si512((const void *)src);
>  	_mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
>  
>  /**
> @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> +	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
>  
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}

There's an outstanding patchset from Stephen to replace all use of
rte_memcpy with a constant parameter with an actual call to regular memcpy.
On a wider scale should we not look to do something similar in this file,
have calls to rte_memcpy with constant parameter always turn into a call to
regular memcpy? We used to have such a macro in older DPDK e.g.
from DPDK 1.8

http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?h=v1.8.0#n171

This would eliminate the need to put in constant_p checks all through the
code.
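
For reference, the DPDK 1.8 macro linked above was roughly of this shape, dispatching constant-size copies to the compiler's memcpy() and everything else to the runtime routine (a sketch of the idea, not the exact historical code; rte_memcpy_func stands for the non-constant path):

#define rte_memcpy(dst, src, n)              \
	((__builtin_constant_p(n)) ?         \
	memcpy((dst), (src), (n)) :          \
	rte_memcpy_func((dst), (src), (n)))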

>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
<snip>
  
Morten Brørup April 4, 2024, 11:19 a.m. UTC | #3
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Thursday, 4 April 2024 12.07
> 
> On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > In the case where the size is known to be 16 at build time, omit the
> > duplicate copy.
> >
> > Reduced the amount of effectively copy-pasted code by using #ifdef
> > inside functions instead of outside functions.
> >
> > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> 
> Changes in general look good to me. Comments inline below.
> 
> /Bruce
> 
> > ---
> > v2:
> > * For GCC, version 11 is required for proper AVX handling;
> >   if older GCC version, treat AVX as SSE.
> >   Clang does not have this issue.
> >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > * Do not add copyright. (Stephen Hemminger)
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
> >  1 file changed, 56 insertions(+), 175 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> > index 72a92290e0..d1df841f5e 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
> >  	return ret;
> >  }
> >
> > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > -
> > -#define ALIGNMENT_MASK 0x3F
> > -
> > -/**
> > - * AVX512 implementation below
> > - */
> > -
> >  /**
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >  static __rte_always_inline void
> >  rte_mov32(uint8_t *dst, const uint8_t *src)
> >  {
> > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__
> || \
> > +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> < 110000)))
> 
> I think we can drop the AVX512 checks here, since I'm not aware of any
> system where we'd have AVX512 but not AVX2 available, so just checking for
> AVX2 support should be sufficient.

RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512:
https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memcpy.h#L98

Without it, the AVX2 version will be used, regardless if the CPU has AVX512.

Also, there are some binutils bugs that might disable compilation for AVX512:
https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4
https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17

> 
> On the final compiler-based check, I don't strongly object to it, but I
> just wonder as to its real value. AVX2 was first introduced by Intel over 10
> years ago, and (from what I find in wikipedia), it's been in AMD CPUs since
> ~2015. While we did have CPUs still being produced without AVX2 since that
> time, they generally didn't have AVX1 either, only having SSE instructions.
> Therefore the number of systems which require this additional check is
> likely very small at this stage.
> That said, I'm ok to either keep or omit it at your choice.

I kept it for consistency, and to support older compilers still officially supported by DPDK.

I don't feel qualified to change support for CPU features; I'll leave that to the CPU vendors.
Also, I have no clue what has been produced by Intel and AMD. :-)

> If you do keep
> it, how about putting the check once at the top of the file and using a
> single short define instead for the multiple places it's used e.g.
> 
> #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> 110000)))
> #define RTE_MEMCPY_AVX2
> #endif

Much of the code reorganization in this patch was done with the intention to improve readability.

And I don't think this suggestion improves readability; especially considering that RTE_MEMCPY_AVX512 is something manually defined.

However, I get your point; and if the conditional was very long or very complex, I might agree to a "shadow" definition to keep it short.

> 
> 
> >  	__m256i ymm0;
> >
> >  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
> >  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +#endif
> >  }
> >
> >  /**
> > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >  static __rte_always_inline void
> >  rte_mov64(uint8_t *dst, const uint8_t *src)
> >  {
> > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> >  	__m512i zmm0;
> >
> >  	zmm0 = _mm512_loadu_si512((const void *)src);
> >  	_mm512_storeu_si512((void *)dst, zmm0);
> > +#else /* AVX2, AVX & SSE implementation */
> > +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > +#endif
> >  }
> >
> >  /**
> > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> >  static __rte_always_inline void
> >  rte_mov256(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> > +	rte_mov128(dst + 1 * 128, src + 1 * 128);
> >  }
> >
> > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > +
> > +/**
> > + * AVX512 implementation below
> > + */
> > +
> > +#define ALIGNMENT_MASK 0x3F
> > +
> >  /**
> >   * Copy 128-byte blocks from one location to another,
> >   * locations should not overlap.
> > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t
> n)
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > +	if (__builtin_constant_p(n) && n == 32) {
> > +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +		return ret;
> > +	}
> 
> There's an outstanding patchset from Stephen to replace all use of
> rte_memcpy with a constant parameter with an actual call to regular memcpy.
> On a wider scale should we not look to do something similar in this file,
> have calls to rte_memcpy with constant parameter always turn into a call to
> regular memcpy? We used to have such a macro in older DPDK e.g.
> from DPDK 1.8
> 
> http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp
> y.h?h=v1.8.0#n171
> 
> This would eliminate the need to put in constant_p checks all through the
> code.

The old macro in DPDK 1.8 was removed with the description "Remove slow glibc call for constant copies":
https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475

Stephen believes that the memcpy() built-ins provided by compilers are faster than rte_memcpy() for constant size.
I'm not convinced.
Such a change should be backed up by performance tests, preferably for all supported compilers - especially the old compilers that come with some of the supported distros might not be as good as we would hope.
  
Bruce Richardson April 4, 2024, 1:29 p.m. UTC | #4
On Thu, Apr 04, 2024 at 01:19:54PM +0200, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Thursday, 4 April 2024 12.07
> > 
> > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > > In the case where the size is known to be 16 at build tine, omit the
> > > duplicate copy.
> > >
> > > Reduced the amount of effectively copy-pasted code by using #ifdef
> > > inside functions instead of outside functions.
> > >
> > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > 
> > Changes in general look good to me. Comments inline below.
> > 
> > /Bruce
> > 
> > > ---
> > > v2:
> > > * For GCC, version 11 is required for proper AVX handling;
> > >   if older GCC version, treat AVX as SSE.
> > >   Clang does not have this issue.
> > >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > > * Do not add copyright. (Stephen Hemminger)
> > > ---
> > >  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
> > >  1 file changed, 56 insertions(+), 175 deletions(-)
> > >
> > > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > b/lib/eal/x86/include/rte_memcpy.h
> > > index 72a92290e0..d1df841f5e 100644
> > > --- a/lib/eal/x86/include/rte_memcpy.h
> > > +++ b/lib/eal/x86/include/rte_memcpy.h
> > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
> > >  	return ret;
> > >  }
> > >
> > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > -
> > > -#define ALIGNMENT_MASK 0x3F
> > > -
> > > -/**
> > > - * AVX512 implementation below
> > > - */
> > > -
> > >  /**
> > >   * Copy 16 bytes from one location to another,
> > >   * locations should not overlap.
> > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> > >  static __rte_always_inline void
> > >  rte_mov32(uint8_t *dst, const uint8_t *src)
> > >  {
> > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__
> > || \
> > > +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> > < 110000)))
> > 
> > I think we can drop the AVX512 checks here, since I'm not aware of any
> > system where we'd have AVX512 but not AVX2 available, so just checking for
> > AVX2 support should be sufficient.
> 
> RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512:
> https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memcpy.h#L98
> 
> Without it, the AVX2 version will be used, regardless if the CPU has AVX512.
> 
> Also, there are some binutils bugs that might disable compilation for AVX512:
> https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4
> https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17
> 

Yes, I realise that, but the guard here is for an AVX2 block only, so there
is no point in checking for AVX512 - it's AVX512 or AVX2.

> > 
> > On the final compiler-based check, I don't strongly object to it, but I
> > just wonder as to its real value. AVX2 was first introduced by Intel over 10
> > years ago, and (from what I find in wikipedia), it's been in AMD CPUs since
> > ~2015. While we did have CPUs still being produced without AVX2 since that
> > time, they generally didn't have AVX1 either, only having SSE instructions.
> > Therefore the number of systems which require this additional check is
> > likely very small at this stage.
> > That said, I'm ok to either keep or omit it at your choice.
> 
> I kept it for consistency, and to support older compilers still officially supported by DPDK.
> 
> I don't feel qualified to change support for CPU features; I'll leave that to the CPU vendors.
> Also, I have no clue what has been produced by Intel and AMD. :-)
> 
> > If you do keep
> > it, how about putting the check once at the top of the file and using a
> > single short define instead for the multiple places it's used e.g.
> > 
> > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> > 110000)))
> > #define RTE_MEMCPY_AVX2
> > #endif
> 
> Much of the code reorganization in this patch was done with the intention to improve readability.
> 
> And I don't think this suggestion improves readability; especially considering that RTE_MEMCPY_AVX512 is something manually defined.
> 
> However, I get your point; and if the conditional was very long or very complex, I might agree to a "shadow" definition to keep it short.
> 

I just find it long enough that duplication of it seems painful. :-) I'd
rather we check once at the top if we can use an AVX copy vs SSE, rather
than duplicate the compiler version checks multiple times.


> > 
> > 
> > >  	__m256i ymm0;
> > >
> > >  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
> > >  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> > > +#else /* SSE implementation */
> > > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > > +#endif
> > >  }
> > >
> > >  /**
> > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> > >  static __rte_always_inline void
> > >  rte_mov64(uint8_t *dst, const uint8_t *src)
> > >  {
> > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > >  	__m512i zmm0;
> > >
> > >  	zmm0 = _mm512_loadu_si512((const void *)src);
> > >  	_mm512_storeu_si512((void *)dst, zmm0);
> > > +#else /* AVX2, AVX & SSE implementation */
> > > +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > > +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > > +#endif
> > >  }
> > >
> > >  /**
> > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> > >  static __rte_always_inline void
> > >  rte_mov256(uint8_t *dst, const uint8_t *src)
> > >  {
> > > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > > +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> > > +	rte_mov128(dst + 1 * 128, src + 1 * 128);
> > >  }
> > >
> > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > +
> > > +/**
> > > + * AVX512 implementation below
> > > + */
> > > +
> > > +#define ALIGNMENT_MASK 0x3F
> > > +
> > >  /**
> > >   * Copy 128-byte blocks from one location to another,
> > >   * locations should not overlap.
> > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t
> > n)
> > >  	/**
> > >  	 * Fast way when copy size doesn't exceed 512 bytes
> > >  	 */
> > > +	if (__builtin_constant_p(n) && n == 32) {
> > > +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > > +		return ret;
> > > +	}
> > 
> > There's an outstanding patchset from Stephen to replace all use of
> > rte_memcpy with a constant parameter with an actual call to regular memcpy.
> > On a wider scale should we not look to do something similar in this file,
> > have calls to rte_memcpy with constant parameter always turn into a call to
> > regular memcpy? We used to have such a macro in older DPDK e.g.
> > from DPDK 1.8
> > 
> > http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp
> > y.h?h=v1.8.0#n171
> > 
> > This would eliminate the need to put in constant_p checks all through the
> > code.
> 
> The old macro in DPDK 1.8 was removed with the description "Remove slow glibc call for constant copies":
> https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475
> 
> Stephen believes that the memcpy() built-ins provided by compilers are faster than rte_memcpy() for constant size.
> I'm not convinced.
> Such a change should be backed up by performance tests, preferably for all supported compilers - especially the old compilers that come with some of the supported distros might not be as good as we would hope.
>

I would tend to agree with Stephen that wherever possible we should use
the built-in memcpy calls. Hence my suggestion of re-introducing the macro.
I'm not sure why it previously was seen as slower, it may be that the
compiler-expanded memcpy calls are not done beyond a certain size.
However, since we lack data, I'm ok with taking the changes in your patch
as-is.

With the above-flagged superfluous AVX512 check on AVX2 code removed:

Acked-by: Bruce Richardson <bruce.richardson@intel.com>
  
Morten Brørup April 4, 2024, 3:37 p.m. UTC | #5
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Thursday, 4 April 2024 15.29
> 
> On Thu, Apr 04, 2024 at 01:19:54PM +0200, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Thursday, 4 April 2024 12.07
> > >
> > > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > > > In the case where the size is known to be 16 at build time, omit the
> > > > duplicate copy.
> > > >
> > > > Reduced the amount of effectively copy-pasted code by using #ifdef
> > > > inside functions instead of outside functions.
> > > >
> > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > >
> > > Changes in general look good to me. Comments inline below.
> > >
> > > /Bruce
> > >
> > > > ---
> > > > v2:
> > > > * For GCC, version 11 is required for proper AVX handling;
> > > >   if older GCC version, treat AVX as SSE.
> > > >   Clang does not have this issue.
> > > >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > > > * Do not add copyright. (Stephen Hemminger)
> > > > ---
> > > >  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
> > > >  1 file changed, 56 insertions(+), 175 deletions(-)
> > > >
> > > > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > > b/lib/eal/x86/include/rte_memcpy.h
> > > > index 72a92290e0..d1df841f5e 100644
> > > > --- a/lib/eal/x86/include/rte_memcpy.h
> > > > +++ b/lib/eal/x86/include/rte_memcpy.h
> > > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t
> n)
> > > >  	return ret;
> > > >  }
> > > >
> > > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > > -
> > > > -#define ALIGNMENT_MASK 0x3F
> > > > -
> > > > -/**
> > > > - * AVX512 implementation below
> > > > - */
> > > > -
> > > >  /**
> > > >   * Copy 16 bytes from one location to another,
> > > >   * locations should not overlap.
> > > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> > > >  static __rte_always_inline void
> > > >  rte_mov32(uint8_t *dst, const uint8_t *src)
> > > >  {
> > > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined
> __AVX2__
> > > || \
> > > > +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) &&
> (GCC_VERSION
> > > < 110000)))
> > >
> > > I think we can drop the AVX512 checks here, since I'm not aware of any
> > > system where we'd have AVX512 but not AVX2 available, so just checking for
> > > AVX2 support should be sufficient.
> >
> > RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512:
> >
> https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memc
> py.h#L98
> >
> > Without it, the AVX2 version will be used, regardless if the CPU has AVX512.
> >
> > Also, there are some binutils bugs that might disable compilation for
> AVX512:
> > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4
> > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17
> >
> 
> Yes, I realise that, but the guard here is for an AVX2 block only, so there
> is no point in checking for AVX512 - it's AVX512 or AVX2.

Aha! Now I get your point:
Checking for AVX2 suffices for AVX2 code.

I didn't think of that when combining the copy-pasted code into one code block.

Well spotted! Thank you.

> 
> > >
> > > On the final compiler-based check, I don't strongly object to it, but I
> > > just wonder as to its real value. AVX2 was first introduced by Intel over
> 10
> > > years ago, and (from what I find in wikipedia), it's been in AMD CPUs
> since
> > > ~2015. While we did have CPUs still being produced without AVX2 since that
> > > time, they generally didn't have AVX1 either, only having SSE
> instructions.
> > > Therefore the number of systems which require this additional check is
> > > likely very small at this stage.
> > > That said, I'm ok to either keep or omit it at your choice.
> >
> > I kept it for consistency, and to support older compilers still officially
> supported by DPDK.
> >
> > I don't feel qualified to change support for CPU features; I'll leave that
> to the CPU vendors.
> > Also, I have no clue what has been produced by Intel and AMD. :-)
> >
> > > If you do keep
> > > it, how about putting the check once at the top of the file and using a
> > > single short define instead for the multiple places it's used e.g.
> > >
> > > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> > > 110000)))
> > > #define RTE_MEMCPY_AVX2
> > > #endif
> >
> > Much of the code reorganization in this patch was done with the intention to
> improve readability.
> >
> > And I don't think this suggestion improves readability; especially
> considering that RTE_MEMCPY_AVX512 is something manually defined.
> >
> > However, I get your point; and if the conditional was very long or very
> complex, I might agree to a "shadow" definition to keep it short.
> >
> 
> I just find it long enough that duplication of it seems painful. :-) I'd
> rather we check once at the top if we can use an AVX copy vs SSE, rather
> than duplicate the compiler version checks multiple times.

OK. And I suppose the same principle as above applies:
AVX2 implies AVX, so checking for AVX suffices.

I suppose your suggested name RTE_MEMCPY_AVX2 was a typo, and will define it as RTE_MEMCPY_AVX.
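
A sketch of what that could look like in v3, with the AVX512 part dropped from the condition as discussed above (an assumption about v3, not quoted from it):

#if defined __AVX2__ || \
		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
#define RTE_MEMCPY_AVX
#endif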

> 
> 
> > >
> > >
> > > >  	__m256i ymm0;
> > > >
> > > >  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
> > > >  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> > > > +#else /* SSE implementation */
> > > > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> 16);
> > > > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> 16);
> > > > +#endif
> > > >  }
> > > >
> > > >  /**
> > > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> > > >  static __rte_always_inline void
> > > >  rte_mov64(uint8_t *dst, const uint8_t *src)
> > > >  {
> > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > >  	__m512i zmm0;
> > > >
> > > >  	zmm0 = _mm512_loadu_si512((const void *)src);
> > > >  	_mm512_storeu_si512((void *)dst, zmm0);
> > > > +#else /* AVX2, AVX & SSE implementation */
> > > > +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 *
> 32);
> > > > +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 *
> 32);
> > > > +#endif
> > > >  }
> > > >
> > > >  /**
> > > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> > > >  static __rte_always_inline void
> > > >  rte_mov256(uint8_t *dst, const uint8_t *src)
> > > >  {
> > > > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > > > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > > > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > > > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > > > +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> > > > +	rte_mov128(dst + 1 * 128, src + 1 * 128);
> > > >  }
> > > >
> > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > > +
> > > > +/**
> > > > + * AVX512 implementation below
> > > > + */
> > > > +
> > > > +#define ALIGNMENT_MASK 0x3F
> > > > +
> > > >  /**
> > > >   * Copy 128-byte blocks from one location to another,
> > > >   * locations should not overlap.
> > > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t
> > > n)
> > > >  	/**
> > > >  	 * Fast way when copy size doesn't exceed 512 bytes
> > > >  	 */
> > > > +	if (__builtin_constant_p(n) && n == 32) {
> > > > +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > > > +		return ret;
> > > > +	}
> > >
> > > There's an outstanding patchset from Stephen to replace all use of
> > > rte_memcpy with a constant parameter with an actual call to regular
> memcpy.
> > > On a wider scale should we not look to do something similar in this file,
> > > have calls to rte_memcpy with constant parameter always turn into a call
> to
> > > regular memcpy? We used to have such a macro in older DPDK e.g.
> > > from DPDK 1.8
> > >
> > >
> http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp
> > > y.h?h=v1.8.0#n171
> > >
> > > This would eliminate the need to put in constant_p checks all through the
> > > code.
> >
> > The old macro in DPDK 1.8 was removed with the description "Remove slow
> glibc call for constant copies":
> >
> https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_me
> mcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475
> >
> > Stephen believes that the memcpy() built-ins provided by compilers are
> faster than rte_memcpy() for constant size.
> > I'm not convinced.
> > Such a change should be backed up by performance tests, preferably for all
> supported compilers - especially the old compilers that come with some of the
> supported distros might not be as good as we would hope.
> >
> 
> I would tend to agree with Stephen that wherever possible we should use
> the built-in memcpy calls. Hence my suggestion of re-introducing the macro.

I agree in principle, but strongly prefer data to back up such changes in the fast path.

> I'm not sure why it previously was seen as slower, it may be that the
> compiler-expanded memcpy calls are not done beyond a certain size.
> However, since we lack data, I'm ok with taking the changes in your patch
> as-is.
> 
> With the above-flagged superfluous AVX512 check on AVX2 code removed:
> 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Thanks. I'll provide a v3 patch.
  
Stephen Hemminger April 4, 2024, 3:55 p.m. UTC | #6
On Thu, 4 Apr 2024 17:37:53 +0200
Morten Brørup <mb@smartsharesystems.com> wrote:

> > I would tend to agree with Stephen that wherever possible we should use
> > the built-in memcpy calls. Hence my suggestion of re-introducing the macro.  
> 
> I agree in principle, but strongly prefer data to back up such changes in the fast path.


godbolt.org shows same instructions for the cases I looked at.
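
A minimal comparison that could be pasted into godbolt.org (or built locally with the DPDK headers on the include path) to repeat that check; the function names are only for illustration:

#include <string.h>
#include <rte_memcpy.h>

void copy16_rte(void *dst, const void *src)
{
	rte_memcpy(dst, src, 16); /* constant size: exercises the patched fast path */
}

void copy16_libc(void *dst, const void *src)
{
	memcpy(dst, src, 16); /* compiler built-in expansion, for comparison */
}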
  
Morten Brørup April 4, 2024, 4:10 p.m. UTC | #7
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Thursday, 4 April 2024 17.56
> 
> On Thu, 4 Apr 2024 17:37:53 +0200
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > I would tend to agree with Stephen that wherever possible we should use
> > > the built-in memcpy calls. Hence my suggestion of re-introducing the
> macro.
> >
> > I agree in principle, but strongly prefer data to back up such changes in
> the fast path.
> 
> 
> godbolt.org shows same instructions for the cases I looked at.

Such a fundamental change belongs in a separate patch, with a description of what has been confirmed to generate same instructions or otherwise tested.
On behalf of the distros, I'm mostly worried about older compilers.

Anyway, this patch also tidies up the code, removing a lot of copy-paste, so I think we should go ahead with this patch first.
  
Bruce Richardson April 4, 2024, 4:55 p.m. UTC | #8
On Thu, Apr 04, 2024 at 06:10:32PM +0200, Morten Brørup wrote:
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Thursday, 4 April 2024 17.56
> > 
> > On Thu, 4 Apr 2024 17:37:53 +0200
> > Morten Brørup <mb@smartsharesystems.com> wrote:
> > 
> > > > I would tend to agree with Stephen that wherever possible we should use
> > > > the built-in memcpy calls. Hence my suggestion of re-introducing the
> > macro.
> > >
> > > I agree in principle, but strongly prefer data to back up such changes in
> > the fast path.
> > 
> > 
> > godbolt.org shows same instructions for the cases I looked at.
> 
> Such a fundamental change belongs in a separate patch, with a description of what has been confirmed to generate same instructions or otherwise tested.
> On behalf of the distros, I'm mostly worried about older compilers.
> 
> Anyway, this patch also tidies up the code, removing a lot of copy-paste, so I think we should go ahead with this patch first.
> 
I agree. Best to keep such changes in separate patches.
  

Patch

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..d1df841f5e 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -91,14 +91,6 @@  rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +111,16 @@  rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \
+		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +130,15 @@  rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +159,18 @@  rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +240,22 @@  rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +332,14 @@  rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
+#elif defined __AVX2__ || \
+		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
 
 /**
- * AVX2 implementation below
+ * AVX2 (and AVX, unless too old GCC version) implementation below
  */
 
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +390,14 @@  rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +465,11 @@  rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * SSE (and AVX, with too old GCC version) implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +585,15 @@  rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +699,14 @@  rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +714,10 @@  rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,