diff mbox series

[v2,1/2] lib/eal: add amd epyc2 memcpy routine to eal

Message ID 20211019104724.19416-1-aman.kumar@vvdntech.in (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series [v2,1/2] lib/eal: add amd epyc2 memcpy routine to eal | expand

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Aman Kumar Oct. 19, 2021, 10:47 a.m. UTC
This patch provides rte_memcpy* calls optimized for
AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
as cross-file with meson to build dpdk for AMD EPYC platforms.

Signed-off-by: Aman Kumar <aman.kumar@vvdntech.in>
---
 config/x86/meson.build            |   7 +
 config/x86/x86_amd_epyc_linux_gcc |  16 +
 lib/eal/x86/include/rte_memcpy.h  | 502 ++++++++++++++++++++++++++++++
 3 files changed, 525 insertions(+)
 create mode 100644 config/x86/x86_amd_epyc_linux_gcc

Comments

Thomas Monjalon Oct. 19, 2021, 12:31 p.m. UTC | #1
19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for
> AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> as cross-file with meson to build dpdk for AMD EPYC platforms.

Please split in 2 patches: platform & memcpy.

What optimization is specific to EPYC?

I dislike the asm code below.
What is AMD specific inside?
Can it use compiler intrinsics as it is done elsewhere?

> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +					    const void *src,
> +					    size_t size)
> +{
> +	asm volatile goto("movq %0, %%rsi\n\t"
> +	"movq %1, %%rdi\n\t"
> +	"movq %2, %%rdx\n\t"
> +	"cmpq   $(128), %%rdx\n\t"
> +	"jb     202f\n\t"
> +	"201:\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +	"vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +	"vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +	"vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +	"addq   $128, %%rsi\n\t"
> +	"addq   $128, %%rdi\n\t"
> +	"subq   $128, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +	"jae    201b\n\t"
> +	"202:\n\t"
> +	"cmpq   $64, %%rdx\n\t"
> +	"jb     203f\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +	"addq   $64, %%rsi\n\t"
> +	"addq   $64, %%rdi\n\t"
> +	"subq   $64, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"203:\n\t"
> +	"cmpq   $32, %%rdx\n\t"
> +	"jb     204f\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"addq   $32, %%rsi\n\t"
> +	"addq   $32, %%rdi\n\t"
> +	"subq   $32, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"204:\n\t"
> +	"cmpb   $16, %%dl\n\t"
> +	"jb     205f\n\t"
> +	"vmovntdqa (%%rsi), %%xmm0\n\t"
> +	"vmovdqu  %%xmm0, (%%rdi)\n\t"
> +	"addq   $16, %%rsi\n\t"
> +	"addq   $16, %%rdi\n\t"
> +	"subq   $16, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"205:\n\t"
> +	"cmpb   $2, %%dl\n\t"
> +	"jb     208f\n\t"
> +	"cmpb   $4, %%dl\n\t"
> +	"jbe    207f\n\t"
> +	"cmpb   $8, %%dl\n\t"
> +	"jbe    206f\n\t"
> +	"movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +	"movq   (%%rsi), %%rsi\n\t"
> +	"movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +	"movq   %%rsi, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"206:\n\t"
> +	"movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +	"movl   (%%rsi), %%esi\n\t"
> +	"movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +	"movl   %%esi, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"207:\n\t"
> +	"movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +	"movzwl (%%rsi), %%esi\n\t"
> +	"movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +	"movw   %%si, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"208:\n\t"
> +	"movzbl (%%rsi), %%ecx\n\t"
> +	"movb   %%cl, (%%rdi)"
> +	:
> +	: "r"(src), "r"(dst), "r"(size)
> +	: "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +	: done
> +	);
> +done:
> +	return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len)
> +{
> +	asm goto("movq	%0, %%rsi\n\t"
> +	"movq	%1, %%rdi\n\t"
> +	"movq	%2, %%rdx\n\t"
> +	"movq    %%rdi, %%rax\n\t"
> +	"cmp     $32, %%rdx\n\t"
> +	"jb      101f\n\t"
> +	"cmp     $(32 * 2), %%rdx\n\t"
> +	"ja      108f\n\t"
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"101:\n\t"
> +	/* Less than 1 VEC.  */
> +	"cmpb    $32, %%dl\n\t"
> +	"jae     103f\n\t"
> +	"cmpb    $16, %%dl\n\t"
> +	"jae     104f\n\t"
> +	"cmpb    $8, %%dl\n\t"
> +	"jae     105f\n\t"
> +	"cmpb    $4, %%dl\n\t"
> +	"jae     106f\n\t"
> +	"cmpb    $1, %%dl\n\t"
> +	"ja      107f\n\t"
> +	"jb      102f\n\t"
> +	"movzbl  (%%rsi), %%ecx\n\t"
> +	"movb    %%cl, (%%rdi)\n\t"
> +	"102:\n\t"
> +	"jmp %l[done]\n\t"
> +	"103:\n\t"
> +	/* From 32 to 63.  No branch when size == 32.  */
> +	"vmovdqu (%%rsi), %%ymm0\n\t"
> +	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +	"vmovdqu %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	/* From 16 to 31.  No branch when size == 16.  */
> +	"104:\n\t"
> +	"vmovdqu (%%rsi), %%xmm0\n\t"
> +	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +	"vmovdqu %%xmm0, (%%rdi)\n\t"
> +	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +	"jmp %l[done]\n\t"
> +	"105:\n\t"
> +	/* From 8 to 15.  No branch when size == 8.  */
> +	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +	"movq    (%%rsi), %%rsi\n\t"
> +	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +	"movq    %%rsi, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"106:\n\t"
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +	"movl    (%%rsi), %%esi\n\t"
> +	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +	"movl    %%esi, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"107:\n\t"
> +	/* From 2 to 3.  No branch when size == 2.  */
> +	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +	"movzwl  (%%rsi), %%esi\n\t"
> +	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +	"movw    %%si, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"108:\n\t"
> +	/* More than 2 * VEC and there may be overlap between destination */
> +	/* and source.  */
> +	"cmpq    $(32 * 8), %%rdx\n\t"
> +	"ja      111f\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"jb      109f\n\t"
> +	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"109:\n\t"
> +	/* Copy from 2 * VEC to 4 * VEC. */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"110:\n\t"
> +	"jmp %l[done]\n\t"
> +	"111:\n\t"
> +	"cmpq    %%rsi, %%rdi\n\t"
> +	"ja      113f\n\t"
> +	/* Source == destination is less common.  */
> +	"je      110b\n\t"
> +	/* Load the first VEC and last 4 * VEC to
> +	 * support overlapping addresses.
> +	 */
> +	"vmovdqu   (%%rsi), %%ymm4\n\t"
> +	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +	/* Save start and stop of the destination buffer.  */
> +	"movq    %%rdi, %%r11\n\t"
> +	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +	/* Align destination for aligned stores in the loop.  Compute */
> +	/* how much destination is misaligned.  */
> +	"movq    %%rdi, %%r8\n\t"
> +	"andq    $(32 - 1), %%r8\n\t"
> +	/* Get the negative of offset for alignment.  */
> +	"subq    $32, %%r8\n\t"
> +	/* Adjust source.  */
> +	"subq    %%r8, %%rsi\n\t"
> +	/* Adjust destination which should be aligned now.  */
> +	"subq    %%r8, %%rdi\n\t"
> +	/* Adjust length.  */
> +	"addq    %%r8, %%rdx\n\t"
> +	/* Check non-temporal store threshold.  */
> +	"cmpq	 $(1024*1024), %%rdx\n\t"
> +	"ja      115f\n\t"
> +	"112:\n\t"
> +	/* Copy 4 * VEC a time forward.  */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"addq    $(32 * 4), %%rsi\n\t"
> +	"subq    $(32 * 4), %%rdx\n\t"
> +	"vmovdqa   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"addq    $(32 * 4), %%rdi\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      112b\n\t"
> +	/* Store the last 4 * VEC.  */
> +	"vmovdqu   %%ymm5, (%%rcx)\n\t"
> +	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +	/* Store the first VEC.  */
> +	"vmovdqu   %%ymm4, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"113:\n\t"
> +	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +	"vmovdqu   (%%rsi), %%ymm4\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm5\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +	/* Save stop of the destination buffer.  */
> +	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +	/* Align destination end for aligned stores in the loop.  Compute */
> +	/* how much destination end is misaligned.  */
> +	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +	"movq    %%r11, %%r9\n\t"
> +	"movq    %%r11, %%r8\n\t"
> +	"andq    $(32 - 1), %%r8\n\t"
> +	/* Adjust source.  */
> +	"subq    %%r8, %%rcx\n\t"
> +	/* Adjust the end of destination which should be aligned now.  */
> +	"subq    %%r8, %%r9\n\t"
> +	/* Adjust length.  */
> +	"subq    %%r8, %%rdx\n\t"
> +	 /* Check non-temporal store threshold.  */
> +	"cmpq	 $(1024*1024), %%rdx\n\t"
> +	"ja      117f\n\t"
> +	"114:\n\t"
> +	/* Copy 4 * VEC a time backward.  */
> +	"vmovdqu   (%%rcx), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +	"subq    $(32 * 4), %%rcx\n\t"
> +	"subq    $(32 * 4), %%rdx\n\t"
> +	"vmovdqa   %%ymm0, (%%r9)\n\t"
> +	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +	"subq    $(32 * 4), %%r9\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      114b\n\t"
> +	/* Store the first 4 * VEC. */
> +	"vmovdqu   %%ymm4, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +	/* Store the last VEC. */
> +	"vmovdqu   %%ymm8, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +
> +	"115:\n\t"
> +	/* Don't use non-temporal store if there is overlap between */
> +	/* destination and source since destination may be in cache */
> +	/* when source is loaded. */
> +	"leaq    (%%rdi, %%rdx), %%r10\n\t"
> +	"cmpq    %%r10, %%rsi\n\t"
> +	"jb      112b\n\t"
> +	"116:\n\t"
> +	/* Copy 4 * VEC a time forward with non-temporal stores.  */
> +	"prefetcht0 (32*4*2)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*3)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"addq    $(32*4), %%rsi\n\t"
> +	"subq    $(32*4), %%rdx\n\t"
> +	"vmovntdq  %%ymm0, (%%rdi)\n\t"
> +	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"addq    $(32*4), %%rdi\n\t"
> +	"cmpq    $(32*4), %%rdx\n\t"
> +	"ja      116b\n\t"
> +	"sfence\n\t"
> +	/* Store the last 4 * VEC.  */
> +	"vmovdqu   %%ymm5, (%%rcx)\n\t"
> +	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +	/* Store the first VEC.  */
> +	"vmovdqu   %%ymm4, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"117:\n\t"
> +	/* Don't use non-temporal store if there is overlap between */
> +	/* destination and source since destination may be in cache */
> +	/* when source is loaded.  */
> +	"leaq    (%%rcx, %%rdx), %%r10\n\t"
> +	"cmpq    %%r10, %%r9\n\t"
> +	"jb      114b\n\t"
> +	"118:\n\t"
> +	/* Copy 4 * VEC a time backward with non-temporal stores. */
> +	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +	"vmovdqu   (%%rcx), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +	"subq    $(32*4), %%rcx\n\t"
> +	"subq    $(32*4), %%rdx\n\t"
> +	"vmovntdq  %%ymm0, (%%r9)\n\t"
> +	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +	"subq    $(32 * 4), %%r9\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      118b\n\t"
> +	"sfence\n\t"
> +	/* Store the first 4 * VEC.  */
> +	"vmovdqu   %%ymm4, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +	/* Store the last VEC.  */
> +	"vmovdqu   %%ymm8, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]"
> +	:
> +	: "r"(src), "r"(dst), "r"(len)
> +	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> +	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +	: done
> +	);
> +done:
> +	return dst;
> +}
Stephen Hemminger Oct. 19, 2021, 3:35 p.m. UTC | #2
On Tue, 19 Oct 2021 14:31:01 +0200
Thomas Monjalon <thomas@monjalon.net> wrote:

> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for
> > AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> > as cross-file with meson to build dpdk for AMD EPYC platforms.  
> 
> Please split in 2 patches: platform & memcpy.
> 
> What optimization is specific to EPYC?
> 
> I dislike the asm code below.
> What is AMD specific inside?
> Can it use compiler intrinsics as it is done elsewhere?

And why is this not done by Gcc?
Song, Keesang Oct. 21, 2021, 5:10 p.m. UTC | #3
[AMD Official Use Only]

Hi Thomas,

I hope this can make some explanation to your question.
We(AMD Linux library support team) have implemented the custom tailored memcpy solution which is a close match with DPDK use case requirements like the below.
1)      Min 64B length data packet with cache aligned Source and Destination.
2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
3)      This solution works for all AVX2 supported AMD machines.

Internally we have completed the integrity testing and benchmarking of the solution and found gains of 8.4% to 14.5% specifically on Milan CPU(3rd Gen of EPYC Processor)

Thanks for your support,
Keesang

-----Original Message-----
From: Thomas Monjalon <thomas@monjalon.net>
Sent: Tuesday, October 19, 2021 5:31 AM
To: Aman Kumar <aman.kumar@vvdntech.in>
Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang <Keesang.Song@amd.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; bruce.richardson@intel.com; konstantin.ananyev@intel.com; david.marchand@redhat.com
Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal

[CAUTION: External Email]

19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for AMD EPYC
> platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> meson to build dpdk for AMD EPYC platforms.

Please split in 2 patches: platform & memcpy.

What optimization is specific to EPYC?

I dislike the asm code below.
What is AMD specific inside?
Can it use compiler intrinsics as it is done elsewhere?

> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +                                         const void *src,
> +                                         size_t size) {
> +     asm volatile goto("movq %0, %%rsi\n\t"
> +     "movq %1, %%rdi\n\t"
> +     "movq %2, %%rdx\n\t"
> +     "cmpq   $(128), %%rdx\n\t"
> +     "jb     202f\n\t"
> +     "201:\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +     "addq   $128, %%rsi\n\t"
> +     "addq   $128, %%rdi\n\t"
> +     "subq   $128, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +     "jae    201b\n\t"
> +     "202:\n\t"
> +     "cmpq   $64, %%rdx\n\t"
> +     "jb     203f\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +     "addq   $64, %%rsi\n\t"
> +     "addq   $64, %%rdi\n\t"
> +     "subq   $64, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "203:\n\t"
> +     "cmpq   $32, %%rdx\n\t"
> +     "jb     204f\n\t"
> +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> +     "addq   $32, %%rsi\n\t"
> +     "addq   $32, %%rdi\n\t"
> +     "subq   $32, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "204:\n\t"
> +     "cmpb   $16, %%dl\n\t"
> +     "jb     205f\n\t"
> +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> +     "addq   $16, %%rsi\n\t"
> +     "addq   $16, %%rdi\n\t"
> +     "subq   $16, %%rdx\n\t"
> +     "jz     %l[done]\n\t"
> +     "205:\n\t"
> +     "cmpb   $2, %%dl\n\t"
> +     "jb     208f\n\t"
> +     "cmpb   $4, %%dl\n\t"
> +     "jbe    207f\n\t"
> +     "cmpb   $8, %%dl\n\t"
> +     "jbe    206f\n\t"
> +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +     "movq   (%%rsi), %%rsi\n\t"
> +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +     "movq   %%rsi, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "206:\n\t"
> +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +     "movl   (%%rsi), %%esi\n\t"
> +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +     "movl   %%esi, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "207:\n\t"
> +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +     "movzwl (%%rsi), %%esi\n\t"
> +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +     "movw   %%si, (%%rdi)\n\t"
> +     "jmp    %l[done]\n\t"
> +     "208:\n\t"
> +     "movzbl (%%rsi), %%ecx\n\t"
> +     "movb   %%cl, (%%rdi)"
> +     :
> +     : "r"(src), "r"(dst), "r"(size)
> +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +     : done
> +     );
> +done:
> +     return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> +     asm goto("movq  %0, %%rsi\n\t"
> +     "movq   %1, %%rdi\n\t"
> +     "movq   %2, %%rdx\n\t"
> +     "movq    %%rdi, %%rax\n\t"
> +     "cmp     $32, %%rdx\n\t"
> +     "jb      101f\n\t"
> +     "cmp     $(32 * 2), %%rdx\n\t"
> +     "ja      108f\n\t"
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "101:\n\t"
> +     /* Less than 1 VEC.  */
> +     "cmpb    $32, %%dl\n\t"
> +     "jae     103f\n\t"
> +     "cmpb    $16, %%dl\n\t"
> +     "jae     104f\n\t"
> +     "cmpb    $8, %%dl\n\t"
> +     "jae     105f\n\t"
> +     "cmpb    $4, %%dl\n\t"
> +     "jae     106f\n\t"
> +     "cmpb    $1, %%dl\n\t"
> +     "ja      107f\n\t"
> +     "jb      102f\n\t"
> +     "movzbl  (%%rsi), %%ecx\n\t"
> +     "movb    %%cl, (%%rdi)\n\t"
> +     "102:\n\t"
> +     "jmp %l[done]\n\t"
> +     "103:\n\t"
> +     /* From 32 to 63.  No branch when size == 32.  */
> +     "vmovdqu (%%rsi), %%ymm0\n\t"
> +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     /* From 16 to 31.  No branch when size == 16.  */
> +     "104:\n\t"
> +     "vmovdqu (%%rsi), %%xmm0\n\t"
> +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +     "jmp %l[done]\n\t"
> +     "105:\n\t"
> +     /* From 8 to 15.  No branch when size == 8.  */
> +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +     "movq    (%%rsi), %%rsi\n\t"
> +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +     "movq    %%rsi, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "106:\n\t"
> +     /* From 4 to 7.  No branch when size == 4.  */
> +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +     "movl    (%%rsi), %%esi\n\t"
> +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +     "movl    %%esi, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "107:\n\t"
> +     /* From 2 to 3.  No branch when size == 2.  */
> +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +     "movzwl  (%%rsi), %%esi\n\t"
> +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +     "movw    %%si, (%%rdi)\n\t"
> +     "jmp %l[done]\n\t"
> +     "108:\n\t"
> +     /* More than 2 * VEC and there may be overlap between destination */
> +     /* and source.  */
> +     "cmpq    $(32 * 8), %%rdx\n\t"
> +     "ja      111f\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "jb      109f\n\t"
> +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "109:\n\t"
> +     /* Copy from 2 * VEC to 4 * VEC. */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +     "vzeroupper\n\t"
> +     "110:\n\t"
> +     "jmp %l[done]\n\t"
> +     "111:\n\t"
> +     "cmpq    %%rsi, %%rdi\n\t"
> +     "ja      113f\n\t"
> +     /* Source == destination is less common.  */
> +     "je      110b\n\t"
> +     /* Load the first VEC and last 4 * VEC to
> +      * support overlapping addresses.
> +      */
> +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +     /* Save start and stop of the destination buffer.  */
> +     "movq    %%rdi, %%r11\n\t"
> +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +     /* Align destination for aligned stores in the loop.  Compute */
> +     /* how much destination is misaligned.  */
> +     "movq    %%rdi, %%r8\n\t"
> +     "andq    $(32 - 1), %%r8\n\t"
> +     /* Get the negative of offset for alignment.  */
> +     "subq    $32, %%r8\n\t"
> +     /* Adjust source.  */
> +     "subq    %%r8, %%rsi\n\t"
> +     /* Adjust destination which should be aligned now.  */
> +     "subq    %%r8, %%rdi\n\t"
> +     /* Adjust length.  */
> +     "addq    %%r8, %%rdx\n\t"
> +     /* Check non-temporal store threshold.  */
> +     "cmpq    $(1024*1024), %%rdx\n\t"
> +     "ja      115f\n\t"
> +     "112:\n\t"
> +     /* Copy 4 * VEC a time forward.  */
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "addq    $(32 * 4), %%rsi\n\t"
> +     "subq    $(32 * 4), %%rdx\n\t"
> +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "addq    $(32 * 4), %%rdi\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      112b\n\t"
> +     /* Store the last 4 * VEC.  */
> +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +     /* Store the first VEC.  */
> +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "113:\n\t"
> +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +     /* Save stop of the destination buffer.  */
> +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +     /* Align destination end for aligned stores in the loop.  Compute */
> +     /* how much destination end is misaligned.  */
> +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +     "movq    %%r11, %%r9\n\t"
> +     "movq    %%r11, %%r8\n\t"
> +     "andq    $(32 - 1), %%r8\n\t"
> +     /* Adjust source.  */
> +     "subq    %%r8, %%rcx\n\t"
> +     /* Adjust the end of destination which should be aligned now.  */
> +     "subq    %%r8, %%r9\n\t"
> +     /* Adjust length.  */
> +     "subq    %%r8, %%rdx\n\t"
> +      /* Check non-temporal store threshold.  */
> +     "cmpq    $(1024*1024), %%rdx\n\t"
> +     "ja      117f\n\t"
> +     "114:\n\t"
> +     /* Copy 4 * VEC a time backward.  */
> +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +     "subq    $(32 * 4), %%rcx\n\t"
> +     "subq    $(32 * 4), %%rdx\n\t"
> +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +     "subq    $(32 * 4), %%r9\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      114b\n\t"
> +     /* Store the first 4 * VEC. */
> +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +     /* Store the last VEC. */
> +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +
> +     "115:\n\t"
> +     /* Don't use non-temporal store if there is overlap between */
> +     /* destination and source since destination may be in cache */
> +     /* when source is loaded. */
> +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> +     "cmpq    %%r10, %%rsi\n\t"
> +     "jb      112b\n\t"
> +     "116:\n\t"
> +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +     "addq    $(32*4), %%rsi\n\t"
> +     "subq    $(32*4), %%rdx\n\t"
> +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +     "addq    $(32*4), %%rdi\n\t"
> +     "cmpq    $(32*4), %%rdx\n\t"
> +     "ja      116b\n\t"
> +     "sfence\n\t"
> +     /* Store the last 4 * VEC.  */
> +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +     /* Store the first VEC.  */
> +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]\n\t"
> +     "117:\n\t"
> +     /* Don't use non-temporal store if there is overlap between */
> +     /* destination and source since destination may be in cache */
> +     /* when source is loaded.  */
> +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> +     "cmpq    %%r10, %%r9\n\t"
> +     "jb      114b\n\t"
> +     "118:\n\t"
> +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +     "subq    $(32*4), %%rcx\n\t"
> +     "subq    $(32*4), %%rdx\n\t"
> +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +     "subq    $(32 * 4), %%r9\n\t"
> +     "cmpq    $(32 * 4), %%rdx\n\t"
> +     "ja      118b\n\t"
> +     "sfence\n\t"
> +     /* Store the first 4 * VEC.  */
> +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +     /* Store the last VEC.  */
> +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> +     "vzeroupper\n\t"
> +     "jmp %l[done]"
> +     :
> +     : "r"(src), "r"(dst), "r"(len)
> +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +     : done
> +     );
> +done:
> +     return dst;
> +}
Ananyev, Konstantin Oct. 21, 2021, 5:40 p.m. UTC | #4
> 
> Hi Thomas,
> 
> I hope this can make some explanation to your question.
> We(AMD Linux library support team) have implemented the custom tailored memcpy solution which is a close match with DPDK use case
> requirements like the below.
> 1)      Min 64B length data packet with cache aligned Source and Destination.
> 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> 3)      This solution works for all AVX2 supported AMD machines.
> 
> Internally we have completed the integrity testing and benchmarking of the solution and found gains of 8.4% to 14.5% specifically on Milan
> CPU(3rd Gen of EPYC Processor)

It still not clear to me why it has to be written in assembler.
Why similar stuff can't be written in C with instincts, as rest of rte_memcpy.h does?

> 
> Thanks for your support,
> Keesang
> 
> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Tuesday, October 19, 2021 5:31 AM
> To: Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang <Keesang.Song@amd.com>; aman.kumar@vvdntech.in;
> jerinjacobk@gmail.com; bruce.richardson@intel.com; konstantin.ananyev@intel.com; david.marchand@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
> 
> [CAUTION: External Email]
> 
> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> > meson to build dpdk for AMD EPYC platforms.
> 
> Please split in 2 patches: platform & memcpy.
> 
> What optimization is specific to EPYC?
> 
> I dislike the asm code below.
> What is AMD specific inside?
> Can it use compiler intrinsics as it is done elsewhere?
> 
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > +                                         const void *src,
> > +                                         size_t size) {
> > +     asm volatile goto("movq %0, %%rsi\n\t"
> > +     "movq %1, %%rdi\n\t"
> > +     "movq %2, %%rdx\n\t"
> > +     "cmpq   $(128), %%rdx\n\t"
> > +     "jb     202f\n\t"
> > +     "201:\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > +     "addq   $128, %%rsi\n\t"
> > +     "addq   $128, %%rdi\n\t"
> > +     "subq   $128, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > +     "jae    201b\n\t"
> > +     "202:\n\t"
> > +     "cmpq   $64, %%rdx\n\t"
> > +     "jb     203f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "addq   $64, %%rsi\n\t"
> > +     "addq   $64, %%rdi\n\t"
> > +     "subq   $64, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "203:\n\t"
> > +     "cmpq   $32, %%rdx\n\t"
> > +     "jb     204f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "addq   $32, %%rsi\n\t"
> > +     "addq   $32, %%rdi\n\t"
> > +     "subq   $32, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "204:\n\t"
> > +     "cmpb   $16, %%dl\n\t"
> > +     "jb     205f\n\t"
> > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > +     "addq   $16, %%rsi\n\t"
> > +     "addq   $16, %%rdi\n\t"
> > +     "subq   $16, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "205:\n\t"
> > +     "cmpb   $2, %%dl\n\t"
> > +     "jb     208f\n\t"
> > +     "cmpb   $4, %%dl\n\t"
> > +     "jbe    207f\n\t"
> > +     "cmpb   $8, %%dl\n\t"
> > +     "jbe    206f\n\t"
> > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq   (%%rsi), %%rsi\n\t"
> > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq   %%rsi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "206:\n\t"
> > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl   (%%rsi), %%esi\n\t"
> > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl   %%esi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "207:\n\t"
> > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl (%%rsi), %%esi\n\t"
> > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw   %%si, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "208:\n\t"
> > +     "movzbl (%%rsi), %%ecx\n\t"
> > +     "movb   %%cl, (%%rdi)"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(size)
> > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> > +     asm goto("movq  %0, %%rsi\n\t"
> > +     "movq   %1, %%rdi\n\t"
> > +     "movq   %2, %%rdx\n\t"
> > +     "movq    %%rdi, %%rax\n\t"
> > +     "cmp     $32, %%rdx\n\t"
> > +     "jb      101f\n\t"
> > +     "cmp     $(32 * 2), %%rdx\n\t"
> > +     "ja      108f\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "101:\n\t"
> > +     /* Less than 1 VEC.  */
> > +     "cmpb    $32, %%dl\n\t"
> > +     "jae     103f\n\t"
> > +     "cmpb    $16, %%dl\n\t"
> > +     "jae     104f\n\t"
> > +     "cmpb    $8, %%dl\n\t"
> > +     "jae     105f\n\t"
> > +     "cmpb    $4, %%dl\n\t"
> > +     "jae     106f\n\t"
> > +     "cmpb    $1, %%dl\n\t"
> > +     "ja      107f\n\t"
> > +     "jb      102f\n\t"
> > +     "movzbl  (%%rsi), %%ecx\n\t"
> > +     "movb    %%cl, (%%rdi)\n\t"
> > +     "102:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "103:\n\t"
> > +     /* From 32 to 63.  No branch when size == 32.  */
> > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     /* From 16 to 31.  No branch when size == 16.  */
> > +     "104:\n\t"
> > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "105:\n\t"
> > +     /* From 8 to 15.  No branch when size == 8.  */
> > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq    (%%rsi), %%rsi\n\t"
> > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq    %%rsi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "106:\n\t"
> > +     /* From 4 to 7.  No branch when size == 4.  */
> > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl    (%%rsi), %%esi\n\t"
> > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl    %%esi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "107:\n\t"
> > +     /* From 2 to 3.  No branch when size == 2.  */
> > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl  (%%rsi), %%esi\n\t"
> > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw    %%si, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "108:\n\t"
> > +     /* More than 2 * VEC and there may be overlap between destination */
> > +     /* and source.  */
> > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > +     "ja      111f\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "jb      109f\n\t"
> > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "109:\n\t"
> > +     /* Copy from 2 * VEC to 4 * VEC. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "110:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "111:\n\t"
> > +     "cmpq    %%rsi, %%rdi\n\t"
> > +     "ja      113f\n\t"
> > +     /* Source == destination is less common.  */
> > +     "je      110b\n\t"
> > +     /* Load the first VEC and last 4 * VEC to
> > +      * support overlapping addresses.
> > +      */
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > +     /* Save start and stop of the destination buffer.  */
> > +     "movq    %%rdi, %%r11\n\t"
> > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > +     /* Align destination for aligned stores in the loop.  Compute */
> > +     /* how much destination is misaligned.  */
> > +     "movq    %%rdi, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Get the negative of offset for alignment.  */
> > +     "subq    $32, %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rsi\n\t"
> > +     /* Adjust destination which should be aligned now.  */
> > +     "subq    %%r8, %%rdi\n\t"
> > +     /* Adjust length.  */
> > +     "addq    %%r8, %%rdx\n\t"
> > +     /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      115f\n\t"
> > +     "112:\n\t"
> > +     /* Copy 4 * VEC a time forward.  */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32 * 4), %%rsi\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32 * 4), %%rdi\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      112b\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "113:\n\t"
> > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > +     /* Save stop of the destination buffer.  */
> > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > +     /* Align destination end for aligned stores in the loop.  Compute */
> > +     /* how much destination end is misaligned.  */
> > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > +     "movq    %%r11, %%r9\n\t"
> > +     "movq    %%r11, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rcx\n\t"
> > +     /* Adjust the end of destination which should be aligned now.  */
> > +     "subq    %%r8, %%r9\n\t"
> > +     /* Adjust length.  */
> > +     "subq    %%r8, %%rdx\n\t"
> > +      /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      117f\n\t"
> > +     "114:\n\t"
> > +     /* Copy 4 * VEC a time backward.  */
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32 * 4), %%rcx\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      114b\n\t"
> > +     /* Store the first 4 * VEC. */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC. */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +
> > +     "115:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded. */
> > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%rsi\n\t"
> > +     "jb      112b\n\t"
> > +     "116:\n\t"
> > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32*4), %%rsi\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32*4), %%rdi\n\t"
> > +     "cmpq    $(32*4), %%rdx\n\t"
> > +     "ja      116b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "117:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded.  */
> > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%r9\n\t"
> > +     "jb      114b\n\t"
> > +     "118:\n\t"
> > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32*4), %%rcx\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      118b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the first 4 * VEC.  */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC.  */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(len)
> > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
> 
>
Song, Keesang Oct. 21, 2021, 6:12 p.m. UTC | #5
[AMD Official Use Only]

Hi Ananyev,

The current memcpy implementation in Glibc is based out of assembly coding.
Although memcpy could have been implemented with intrinsic, but since our AMD library developers are working on the Glibc functions, they have provided a tailored implementation based out of inline assembly coding.

Thanks for your support,
Keesang

-----Original Message-----
From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
Sent: Thursday, October 21, 2021 10:40 AM
To: Song, Keesang <Keesang.Song@amd.com>; Thomas Monjalon <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; david.marchand@redhat.com
Subject: RE: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal

[AMD Official Use Only]

[CAUTION: External Email]

>
> Hi Thomas,
>
> I hope this can make some explanation to your question.
> We(AMD Linux library support team) have implemented the custom
> tailored memcpy solution which is a close match with DPDK use case requirements like the below.
> 1)      Min 64B length data packet with cache aligned Source and Destination.
> 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> 3)      This solution works for all AVX2 supported AMD machines.
>
> Internally we have completed the integrity testing and benchmarking of
> the solution and found gains of 8.4% to 14.5% specifically on Milan
> CPU(3rd Gen of EPYC Processor)

It still not clear to me why it has to be written in assembler.
Why similar stuff can't be written in C with instincts, as rest of rte_memcpy.h does?

>
> Thanks for your support,
> Keesang
>
> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Tuesday, October 19, 2021 5:31 AM
> To: Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang
> <Keesang.Song@amd.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com;
> bruce.richardson@intel.com; konstantin.ananyev@intel.com;
> david.marchand@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> routine to eal
>
> [CAUTION: External Email]
>
> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> > meson to build dpdk for AMD EPYC platforms.
>
> Please split in 2 patches: platform & memcpy.
>
> What optimization is specific to EPYC?
>
> I dislike the asm code below.
> What is AMD specific inside?
> Can it use compiler intrinsics as it is done elsewhere?
>
> > +static __rte_always_inline void *
> > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > +                                         const void *src,
> > +                                         size_t size) {
> > +     asm volatile goto("movq %0, %%rsi\n\t"
> > +     "movq %1, %%rdi\n\t"
> > +     "movq %2, %%rdx\n\t"
> > +     "cmpq   $(128), %%rdx\n\t"
> > +     "jb     202f\n\t"
> > +     "201:\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > +     "addq   $128, %%rsi\n\t"
> > +     "addq   $128, %%rdi\n\t"
> > +     "subq   $128, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > +     "jae    201b\n\t"
> > +     "202:\n\t"
> > +     "cmpq   $64, %%rdx\n\t"
> > +     "jb     203f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > +     "addq   $64, %%rsi\n\t"
> > +     "addq   $64, %%rdi\n\t"
> > +     "subq   $64, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "203:\n\t"
> > +     "cmpq   $32, %%rdx\n\t"
> > +     "jb     204f\n\t"
> > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > +     "addq   $32, %%rsi\n\t"
> > +     "addq   $32, %%rdi\n\t"
> > +     "subq   $32, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "204:\n\t"
> > +     "cmpb   $16, %%dl\n\t"
> > +     "jb     205f\n\t"
> > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > +     "addq   $16, %%rsi\n\t"
> > +     "addq   $16, %%rdi\n\t"
> > +     "subq   $16, %%rdx\n\t"
> > +     "jz     %l[done]\n\t"
> > +     "205:\n\t"
> > +     "cmpb   $2, %%dl\n\t"
> > +     "jb     208f\n\t"
> > +     "cmpb   $4, %%dl\n\t"
> > +     "jbe    207f\n\t"
> > +     "cmpb   $8, %%dl\n\t"
> > +     "jbe    206f\n\t"
> > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq   (%%rsi), %%rsi\n\t"
> > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq   %%rsi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "206:\n\t"
> > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl   (%%rsi), %%esi\n\t"
> > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl   %%esi, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "207:\n\t"
> > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl (%%rsi), %%esi\n\t"
> > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw   %%si, (%%rdi)\n\t"
> > +     "jmp    %l[done]\n\t"
> > +     "208:\n\t"
> > +     "movzbl (%%rsi), %%ecx\n\t"
> > +     "movb   %%cl, (%%rdi)"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(size)
> > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> > +     asm goto("movq  %0, %%rsi\n\t"
> > +     "movq   %1, %%rdi\n\t"
> > +     "movq   %2, %%rdx\n\t"
> > +     "movq    %%rdi, %%rax\n\t"
> > +     "cmp     $32, %%rdx\n\t"
> > +     "jb      101f\n\t"
> > +     "cmp     $(32 * 2), %%rdx\n\t"
> > +     "ja      108f\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "101:\n\t"
> > +     /* Less than 1 VEC.  */
> > +     "cmpb    $32, %%dl\n\t"
> > +     "jae     103f\n\t"
> > +     "cmpb    $16, %%dl\n\t"
> > +     "jae     104f\n\t"
> > +     "cmpb    $8, %%dl\n\t"
> > +     "jae     105f\n\t"
> > +     "cmpb    $4, %%dl\n\t"
> > +     "jae     106f\n\t"
> > +     "cmpb    $1, %%dl\n\t"
> > +     "ja      107f\n\t"
> > +     "jb      102f\n\t"
> > +     "movzbl  (%%rsi), %%ecx\n\t"
> > +     "movb    %%cl, (%%rdi)\n\t"
> > +     "102:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "103:\n\t"
> > +     /* From 32 to 63.  No branch when size == 32.  */
> > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     /* From 16 to 31.  No branch when size == 16.  */
> > +     "104:\n\t"
> > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "105:\n\t"
> > +     /* From 8 to 15.  No branch when size == 8.  */
> > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > +     "movq    (%%rsi), %%rsi\n\t"
> > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > +     "movq    %%rsi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "106:\n\t"
> > +     /* From 4 to 7.  No branch when size == 4.  */
> > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movl    (%%rsi), %%esi\n\t"
> > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > +     "movl    %%esi, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "107:\n\t"
> > +     /* From 2 to 3.  No branch when size == 2.  */
> > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > +     "movzwl  (%%rsi), %%esi\n\t"
> > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > +     "movw    %%si, (%%rdi)\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "108:\n\t"
> > +     /* More than 2 * VEC and there may be overlap between destination */
> > +     /* and source.  */
> > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > +     "ja      111f\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "jb      109f\n\t"
> > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "109:\n\t"
> > +     /* Copy from 2 * VEC to 4 * VEC. */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > +     "vzeroupper\n\t"
> > +     "110:\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "111:\n\t"
> > +     "cmpq    %%rsi, %%rdi\n\t"
> > +     "ja      113f\n\t"
> > +     /* Source == destination is less common.  */
> > +     "je      110b\n\t"
> > +     /* Load the first VEC and last 4 * VEC to
> > +      * support overlapping addresses.
> > +      */
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > +     /* Save start and stop of the destination buffer.  */
> > +     "movq    %%rdi, %%r11\n\t"
> > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > +     /* Align destination for aligned stores in the loop.  Compute */
> > +     /* how much destination is misaligned.  */
> > +     "movq    %%rdi, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Get the negative of offset for alignment.  */
> > +     "subq    $32, %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rsi\n\t"
> > +     /* Adjust destination which should be aligned now.  */
> > +     "subq    %%r8, %%rdi\n\t"
> > +     /* Adjust length.  */
> > +     "addq    %%r8, %%rdx\n\t"
> > +     /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      115f\n\t"
> > +     "112:\n\t"
> > +     /* Copy 4 * VEC a time forward.  */
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32 * 4), %%rsi\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32 * 4), %%rdi\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      112b\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "113:\n\t"
> > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > +     /* Save stop of the destination buffer.  */
> > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > +     /* Align destination end for aligned stores in the loop.  Compute */
> > +     /* how much destination end is misaligned.  */
> > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > +     "movq    %%r11, %%r9\n\t"
> > +     "movq    %%r11, %%r8\n\t"
> > +     "andq    $(32 - 1), %%r8\n\t"
> > +     /* Adjust source.  */
> > +     "subq    %%r8, %%rcx\n\t"
> > +     /* Adjust the end of destination which should be aligned now.  */
> > +     "subq    %%r8, %%r9\n\t"
> > +     /* Adjust length.  */
> > +     "subq    %%r8, %%rdx\n\t"
> > +      /* Check non-temporal store threshold.  */
> > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > +     "ja      117f\n\t"
> > +     "114:\n\t"
> > +     /* Copy 4 * VEC a time backward.  */
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32 * 4), %%rcx\n\t"
> > +     "subq    $(32 * 4), %%rdx\n\t"
> > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      114b\n\t"
> > +     /* Store the first 4 * VEC. */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC. */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +
> > +     "115:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded. */
> > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%rsi\n\t"
> > +     "jb      112b\n\t"
> > +     "116:\n\t"
> > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > +     "addq    $(32*4), %%rsi\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > +     "addq    $(32*4), %%rdi\n\t"
> > +     "cmpq    $(32*4), %%rdx\n\t"
> > +     "ja      116b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the last 4 * VEC.  */
> > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > +     /* Store the first VEC.  */
> > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]\n\t"
> > +     "117:\n\t"
> > +     /* Don't use non-temporal store if there is overlap between */
> > +     /* destination and source since destination may be in cache */
> > +     /* when source is loaded.  */
> > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > +     "cmpq    %%r10, %%r9\n\t"
> > +     "jb      114b\n\t"
> > +     "118:\n\t"
> > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > +     "subq    $(32*4), %%rcx\n\t"
> > +     "subq    $(32*4), %%rdx\n\t"
> > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > +     "subq    $(32 * 4), %%r9\n\t"
> > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > +     "ja      118b\n\t"
> > +     "sfence\n\t"
> > +     /* Store the first 4 * VEC.  */
> > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > +     /* Store the last VEC.  */
> > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > +     "vzeroupper\n\t"
> > +     "jmp %l[done]"
> > +     :
> > +     : "r"(src), "r"(dst), "r"(len)
> > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > +     : done
> > +     );
> > +done:
> > +     return dst;
> > +}
>
>
Thomas Monjalon Oct. 21, 2021, 6:41 p.m. UTC | #6
Please convert it to C code, thanks.


21/10/2021 20:12, Song, Keesang:
> [AMD Official Use Only]
> 
> Hi Ananyev,
> 
> The current memcpy implementation in Glibc is based out of assembly coding.
> Although memcpy could have been implemented with intrinsic, but since our AMD library developers are working on the Glibc functions, they have provided a tailored implementation based out of inline assembly coding.
> 
> Thanks for your support,
> Keesang
> 
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, October 21, 2021 10:40 AM
> To: Song, Keesang <Keesang.Song@amd.com>; Thomas Monjalon <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; david.marchand@redhat.com
> Subject: RE: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
> 
> [AMD Official Use Only]
> 
> [CAUTION: External Email]
> 
> >
> > Hi Thomas,
> >
> > I hope this can make some explanation to your question.
> > We(AMD Linux library support team) have implemented the custom
> > tailored memcpy solution which is a close match with DPDK use case requirements like the below.
> > 1)      Min 64B length data packet with cache aligned Source and Destination.
> > 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> > store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> > 3)      This solution works for all AVX2 supported AMD machines.
> >
> > Internally we have completed the integrity testing and benchmarking of
> > the solution and found gains of 8.4% to 14.5% specifically on Milan
> > CPU(3rd Gen of EPYC Processor)
> 
> It still not clear to me why it has to be written in assembler.
> Why similar stuff can't be written in C with instincts, as rest of rte_memcpy.h does?
> 
> >
> > Thanks for your support,
> > Keesang
> >
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Tuesday, October 19, 2021 5:31 AM
> > To: Aman Kumar <aman.kumar@vvdntech.in>
> > Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> > shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> > matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang
> > <Keesang.Song@amd.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com;
> > bruce.richardson@intel.com; konstantin.ananyev@intel.com;
> > david.marchand@redhat.com
> > Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> > routine to eal
> >
> > [CAUTION: External Email]
> >
> > 19/10/2021 12:47, Aman Kumar:
> > > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file with
> > > meson to build dpdk for AMD EPYC platforms.
> >
> > Please split in 2 patches: platform & memcpy.
> >
> > What optimization is specific to EPYC?
> >
> > I dislike the asm code below.
> > What is AMD specific inside?
> > Can it use compiler intrinsics as it is done elsewhere?
> >
> > > +static __rte_always_inline void *
> > > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > > +                                         const void *src,
> > > +                                         size_t size) {
> > > +     asm volatile goto("movq %0, %%rsi\n\t"
> > > +     "movq %1, %%rdi\n\t"
> > > +     "movq %2, %%rdx\n\t"
> > > +     "cmpq   $(128), %%rdx\n\t"
> > > +     "jb     202f\n\t"
> > > +     "201:\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > > +     "addq   $128, %%rsi\n\t"
> > > +     "addq   $128, %%rdi\n\t"
> > > +     "subq   $128, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > > +     "jae    201b\n\t"
> > > +     "202:\n\t"
> > > +     "cmpq   $64, %%rdx\n\t"
> > > +     "jb     203f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "addq   $64, %%rsi\n\t"
> > > +     "addq   $64, %%rdi\n\t"
> > > +     "subq   $64, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "203:\n\t"
> > > +     "cmpq   $32, %%rdx\n\t"
> > > +     "jb     204f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "addq   $32, %%rsi\n\t"
> > > +     "addq   $32, %%rdi\n\t"
> > > +     "subq   $32, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "204:\n\t"
> > > +     "cmpb   $16, %%dl\n\t"
> > > +     "jb     205f\n\t"
> > > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > > +     "addq   $16, %%rsi\n\t"
> > > +     "addq   $16, %%rdi\n\t"
> > > +     "subq   $16, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "205:\n\t"
> > > +     "cmpb   $2, %%dl\n\t"
> > > +     "jb     208f\n\t"
> > > +     "cmpb   $4, %%dl\n\t"
> > > +     "jbe    207f\n\t"
> > > +     "cmpb   $8, %%dl\n\t"
> > > +     "jbe    206f\n\t"
> > > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq   (%%rsi), %%rsi\n\t"
> > > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq   %%rsi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "206:\n\t"
> > > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl   (%%rsi), %%esi\n\t"
> > > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl   %%esi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "207:\n\t"
> > > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl (%%rsi), %%esi\n\t"
> > > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw   %%si, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "208:\n\t"
> > > +     "movzbl (%%rsi), %%ecx\n\t"
> > > +     "movb   %%cl, (%%rdi)"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(size)
> > > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> > > +
> > > +static __rte_always_inline void *
> > > +rte_memcpy_generic(void *dst, const void *src, size_t len) {
> > > +     asm goto("movq  %0, %%rsi\n\t"
> > > +     "movq   %1, %%rdi\n\t"
> > > +     "movq   %2, %%rdx\n\t"
> > > +     "movq    %%rdi, %%rax\n\t"
> > > +     "cmp     $32, %%rdx\n\t"
> > > +     "jb      101f\n\t"
> > > +     "cmp     $(32 * 2), %%rdx\n\t"
> > > +     "ja      108f\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "101:\n\t"
> > > +     /* Less than 1 VEC.  */
> > > +     "cmpb    $32, %%dl\n\t"
> > > +     "jae     103f\n\t"
> > > +     "cmpb    $16, %%dl\n\t"
> > > +     "jae     104f\n\t"
> > > +     "cmpb    $8, %%dl\n\t"
> > > +     "jae     105f\n\t"
> > > +     "cmpb    $4, %%dl\n\t"
> > > +     "jae     106f\n\t"
> > > +     "cmpb    $1, %%dl\n\t"
> > > +     "ja      107f\n\t"
> > > +     "jb      102f\n\t"
> > > +     "movzbl  (%%rsi), %%ecx\n\t"
> > > +     "movb    %%cl, (%%rdi)\n\t"
> > > +     "102:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "103:\n\t"
> > > +     /* From 32 to 63.  No branch when size == 32.  */
> > > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     /* From 16 to 31.  No branch when size == 16.  */
> > > +     "104:\n\t"
> > > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "105:\n\t"
> > > +     /* From 8 to 15.  No branch when size == 8.  */
> > > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq    (%%rsi), %%rsi\n\t"
> > > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq    %%rsi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "106:\n\t"
> > > +     /* From 4 to 7.  No branch when size == 4.  */
> > > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl    (%%rsi), %%esi\n\t"
> > > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl    %%esi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "107:\n\t"
> > > +     /* From 2 to 3.  No branch when size == 2.  */
> > > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl  (%%rsi), %%esi\n\t"
> > > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw    %%si, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "108:\n\t"
> > > +     /* More than 2 * VEC and there may be overlap between destination */
> > > +     /* and source.  */
> > > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > > +     "ja      111f\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "jb      109f\n\t"
> > > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "109:\n\t"
> > > +     /* Copy from 2 * VEC to 4 * VEC. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "110:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "111:\n\t"
> > > +     "cmpq    %%rsi, %%rdi\n\t"
> > > +     "ja      113f\n\t"
> > > +     /* Source == destination is less common.  */
> > > +     "je      110b\n\t"
> > > +     /* Load the first VEC and last 4 * VEC to
> > > +      * support overlapping addresses.
> > > +      */
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > > +     /* Save start and stop of the destination buffer.  */
> > > +     "movq    %%rdi, %%r11\n\t"
> > > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > > +     /* Align destination for aligned stores in the loop.  Compute */
> > > +     /* how much destination is misaligned.  */
> > > +     "movq    %%rdi, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Get the negative of offset for alignment.  */
> > > +     "subq    $32, %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rsi\n\t"
> > > +     /* Adjust destination which should be aligned now.  */
> > > +     "subq    %%r8, %%rdi\n\t"
> > > +     /* Adjust length.  */
> > > +     "addq    %%r8, %%rdx\n\t"
> > > +     /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      115f\n\t"
> > > +     "112:\n\t"
> > > +     /* Copy 4 * VEC a time forward.  */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32 * 4), %%rsi\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32 * 4), %%rdi\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      112b\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "113:\n\t"
> > > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > > +     /* Save stop of the destination buffer.  */
> > > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > > +     /* Align destination end for aligned stores in the loop.  Compute */
> > > +     /* how much destination end is misaligned.  */
> > > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > > +     "movq    %%r11, %%r9\n\t"
> > > +     "movq    %%r11, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rcx\n\t"
> > > +     /* Adjust the end of destination which should be aligned now.  */
> > > +     "subq    %%r8, %%r9\n\t"
> > > +     /* Adjust length.  */
> > > +     "subq    %%r8, %%rdx\n\t"
> > > +      /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      117f\n\t"
> > > +     "114:\n\t"
> > > +     /* Copy 4 * VEC a time backward.  */
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32 * 4), %%rcx\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      114b\n\t"
> > > +     /* Store the first 4 * VEC. */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC. */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +
> > > +     "115:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded. */
> > > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%rsi\n\t"
> > > +     "jb      112b\n\t"
> > > +     "116:\n\t"
> > > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32*4), %%rsi\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32*4), %%rdi\n\t"
> > > +     "cmpq    $(32*4), %%rdx\n\t"
> > > +     "ja      116b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "117:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded.  */
> > > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%r9\n\t"
> > > +     "jb      114b\n\t"
> > > +     "118:\n\t"
> > > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32*4), %%rcx\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      118b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the first 4 * VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC.  */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(len)
> > > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> >
> >
> 
>
Song, Keesang Oct. 21, 2021, 7:03 p.m. UTC | #7
[AMD Official Use Only]

Hi Thomas,

I've already asked our AMD tools team, but they're saying they are not really familiar with C code implementation. We need your approval for now since we really need to get this patch submitted to 21.11 LTS.

Thanks,
Keesang

-----Original Message-----
From: Thomas Monjalon <thomas@monjalon.net>
Sent: Thursday, October 21, 2021 11:42 AM
To: Aman Kumar <aman.kumar@vvdntech.in>; Song, Keesang <Keesang.Song@amd.com>
Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com; shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com; matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>; aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce <bruce.richardson@intel.com>; david.marchand@redhat.com
Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal

[CAUTION: External Email]

Please convert it to C code, thanks.


21/10/2021 20:12, Song, Keesang:
> [AMD Official Use Only]
>
> Hi Ananyev,
>
> The current memcpy implementation in Glibc is based out of assembly coding.
> Although memcpy could have been implemented with intrinsic, but since our AMD library developers are working on the Glibc functions, they have provided a tailored implementation based out of inline assembly coding.
>
> Thanks for your support,
> Keesang
>
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, October 21, 2021 10:40 AM
> To: Song, Keesang <Keesang.Song@amd.com>; Thomas Monjalon
> <thomas@monjalon.net>; Aman Kumar <aman.kumar@vvdntech.in>
> Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> matan@nvidia.com; Burakov, Anatoly <anatoly.burakov@intel.com>;
> aman.kumar@vvdntech.in; jerinjacobk@gmail.com; Richardson, Bruce
> <bruce.richardson@intel.com>; david.marchand@redhat.com
> Subject: RE: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> routine to eal
>
> [AMD Official Use Only]
>
> [CAUTION: External Email]
>
> >
> > Hi Thomas,
> >
> > I hope this can make some explanation to your question.
> > We(AMD Linux library support team) have implemented the custom
> > tailored memcpy solution which is a close match with DPDK use case requirements like the below.
> > 1)      Min 64B length data packet with cache aligned Source and Destination.
> > 2)      Non-Temporal load and temporal store for cache aligned source for both RX and TX paths. Could not implement the non-temporal
> > store for TX_PATH, as non-Temporal load/stores works only with 32B aligned addresses for AVX2
> > 3)      This solution works for all AVX2 supported AMD machines.
> >
> > Internally we have completed the integrity testing and benchmarking
> > of the solution and found gains of 8.4% to 14.5% specifically on
> > Milan CPU(3rd Gen of EPYC Processor)
>
> It still not clear to me why it has to be written in assembler.
> Why similar stuff can't be written in C with instincts, as rest of rte_memcpy.h does?
>
> >
> > Thanks for your support,
> > Keesang
> >
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Tuesday, October 19, 2021 5:31 AM
> > To: Aman Kumar <aman.kumar@vvdntech.in>
> > Cc: dev@dpdk.org; rasland@nvidia.com; asafp@nvidia.com;
> > shys@nvidia.com; viacheslavo@nvidia.com; akozyrev@nvidia.com;
> > matan@nvidia.com; anatoly.burakov@intel.com; Song, Keesang
> > <Keesang.Song@amd.com>; aman.kumar@vvdntech.in;
> > jerinjacobk@gmail.com; bruce.richardson@intel.com;
> > konstantin.ananyev@intel.com; david.marchand@redhat.com
> > Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy
> > routine to eal
> >
> > [CAUTION: External Email]
> >
> > 19/10/2021 12:47, Aman Kumar:
> > > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file
> > > with meson to build dpdk for AMD EPYC platforms.
> >
> > Please split in 2 patches: platform & memcpy.
> >
> > What optimization is specific to EPYC?
> >
> > I dislike the asm code below.
> > What is AMD specific inside?
> > Can it use compiler intrinsics as it is done elsewhere?
> >
> > > +static __rte_always_inline void *
> > > +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> > > +                                         const void *src,
> > > +                                         size_t size) {
> > > +     asm volatile goto("movq %0, %%rsi\n\t"
> > > +     "movq %1, %%rdi\n\t"
> > > +     "movq %2, %%rdx\n\t"
> > > +     "cmpq   $(128), %%rdx\n\t"
> > > +     "jb     202f\n\t"
> > > +     "201:\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovntdqa 64(%%rsi), %%ymm2\n\t"
> > > +     "vmovntdqa 96(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> > > +     "addq   $128, %%rsi\n\t"
> > > +     "addq   $128, %%rdi\n\t"
> > > +     "subq   $128, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> > > +     "jae    201b\n\t"
> > > +     "202:\n\t"
> > > +     "cmpq   $64, %%rdx\n\t"
> > > +     "jb     203f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovntdqa 32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> > > +     "addq   $64, %%rsi\n\t"
> > > +     "addq   $64, %%rdi\n\t"
> > > +     "subq   $64, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "203:\n\t"
> > > +     "cmpq   $32, %%rdx\n\t"
> > > +     "jb     204f\n\t"
> > > +     "vmovntdqa (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu  %%ymm0, (%%rdi)\n\t"
> > > +     "addq   $32, %%rsi\n\t"
> > > +     "addq   $32, %%rdi\n\t"
> > > +     "subq   $32, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "204:\n\t"
> > > +     "cmpb   $16, %%dl\n\t"
> > > +     "jb     205f\n\t"
> > > +     "vmovntdqa (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu  %%xmm0, (%%rdi)\n\t"
> > > +     "addq   $16, %%rsi\n\t"
> > > +     "addq   $16, %%rdi\n\t"
> > > +     "subq   $16, %%rdx\n\t"
> > > +     "jz     %l[done]\n\t"
> > > +     "205:\n\t"
> > > +     "cmpb   $2, %%dl\n\t"
> > > +     "jb     208f\n\t"
> > > +     "cmpb   $4, %%dl\n\t"
> > > +     "jbe    207f\n\t"
> > > +     "cmpb   $8, %%dl\n\t"
> > > +     "jbe    206f\n\t"
> > > +     "movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq   (%%rsi), %%rsi\n\t"
> > > +     "movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq   %%rsi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "206:\n\t"
> > > +     "movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl   (%%rsi), %%esi\n\t"
> > > +     "movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl   %%esi, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "207:\n\t"
> > > +     "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl (%%rsi), %%esi\n\t"
> > > +     "movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw   %%si, (%%rdi)\n\t"
> > > +     "jmp    %l[done]\n\t"
> > > +     "208:\n\t"
> > > +     "movzbl (%%rsi), %%ecx\n\t"
> > > +     "movb   %%cl, (%%rdi)"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(size)
> > > +     : "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> > > +
> > > +static __rte_always_inline void * rte_memcpy_generic(void *dst,
> > > +const void *src, size_t len) {
> > > +     asm goto("movq  %0, %%rsi\n\t"
> > > +     "movq   %1, %%rdi\n\t"
> > > +     "movq   %2, %%rdx\n\t"
> > > +     "movq    %%rdi, %%rax\n\t"
> > > +     "cmp     $32, %%rdx\n\t"
> > > +     "jb      101f\n\t"
> > > +     "cmp     $(32 * 2), %%rdx\n\t"
> > > +     "ja      108f\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "101:\n\t"
> > > +     /* Less than 1 VEC.  */
> > > +     "cmpb    $32, %%dl\n\t"
> > > +     "jae     103f\n\t"
> > > +     "cmpb    $16, %%dl\n\t"
> > > +     "jae     104f\n\t"
> > > +     "cmpb    $8, %%dl\n\t"
> > > +     "jae     105f\n\t"
> > > +     "cmpb    $4, %%dl\n\t"
> > > +     "jae     106f\n\t"
> > > +     "cmpb    $1, %%dl\n\t"
> > > +     "ja      107f\n\t"
> > > +     "jb      102f\n\t"
> > > +     "movzbl  (%%rsi), %%ecx\n\t"
> > > +     "movb    %%cl, (%%rdi)\n\t"
> > > +     "102:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "103:\n\t"
> > > +     /* From 32 to 63.  No branch when size == 32.  */
> > > +     "vmovdqu (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> > > +     "vmovdqu %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     /* From 16 to 31.  No branch when size == 16.  */
> > > +     "104:\n\t"
> > > +     "vmovdqu (%%rsi), %%xmm0\n\t"
> > > +     "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> > > +     "vmovdqu %%xmm0, (%%rdi)\n\t"
> > > +     "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "105:\n\t"
> > > +     /* From 8 to 15.  No branch when size == 8.  */
> > > +     "movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> > > +     "movq    (%%rsi), %%rsi\n\t"
> > > +     "movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> > > +     "movq    %%rsi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "106:\n\t"
> > > +     /* From 4 to 7.  No branch when size == 4.  */
> > > +     "movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movl    (%%rsi), %%esi\n\t"
> > > +     "movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> > > +     "movl    %%esi, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "107:\n\t"
> > > +     /* From 2 to 3.  No branch when size == 2.  */
> > > +     "movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> > > +     "movzwl  (%%rsi), %%esi\n\t"
> > > +     "movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> > > +     "movw    %%si, (%%rdi)\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "108:\n\t"
> > > +     /* More than 2 * VEC and there may be overlap between destination */
> > > +     /* and source.  */
> > > +     "cmpq    $(32 * 8), %%rdx\n\t"
> > > +     "ja      111f\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "jb      109f\n\t"
> > > +     /* Copy from 4 * VEC to 8 * VEC, inclusively. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "109:\n\t"
> > > +     /* Copy from 2 * VEC to 4 * VEC. */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> > > +     "vmovdqu   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> > > +     "vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "110:\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "111:\n\t"
> > > +     "cmpq    %%rsi, %%rdi\n\t"
> > > +     "ja      113f\n\t"
> > > +     /* Source == destination is less common.  */
> > > +     "je      110b\n\t"
> > > +     /* Load the first VEC and last 4 * VEC to
> > > +      * support overlapping addresses.
> > > +      */
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> > > +     "vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> > > +     /* Save start and stop of the destination buffer.  */
> > > +     "movq    %%rdi, %%r11\n\t"
> > > +     "leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> > > +     /* Align destination for aligned stores in the loop.  Compute */
> > > +     /* how much destination is misaligned.  */
> > > +     "movq    %%rdi, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Get the negative of offset for alignment.  */
> > > +     "subq    $32, %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rsi\n\t"
> > > +     /* Adjust destination which should be aligned now.  */
> > > +     "subq    %%r8, %%rdi\n\t"
> > > +     /* Adjust length.  */
> > > +     "addq    %%r8, %%rdx\n\t"
> > > +     /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      115f\n\t"
> > > +     "112:\n\t"
> > > +     /* Copy 4 * VEC a time forward.  */
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32 * 4), %%rsi\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32 * 4), %%rdi\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      112b\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "113:\n\t"
> > > +     /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> > > +     "vmovdqu   (%%rsi), %%ymm4\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm5\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> > > +     "vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> > > +     /* Save stop of the destination buffer.  */
> > > +     "leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> > > +     /* Align destination end for aligned stores in the loop.  Compute */
> > > +     /* how much destination end is misaligned.  */
> > > +     "leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> > > +     "movq    %%r11, %%r9\n\t"
> > > +     "movq    %%r11, %%r8\n\t"
> > > +     "andq    $(32 - 1), %%r8\n\t"
> > > +     /* Adjust source.  */
> > > +     "subq    %%r8, %%rcx\n\t"
> > > +     /* Adjust the end of destination which should be aligned now.  */
> > > +     "subq    %%r8, %%r9\n\t"
> > > +     /* Adjust length.  */
> > > +     "subq    %%r8, %%rdx\n\t"
> > > +      /* Check non-temporal store threshold.  */
> > > +     "cmpq    $(1024*1024), %%rdx\n\t"
> > > +     "ja      117f\n\t"
> > > +     "114:\n\t"
> > > +     /* Copy 4 * VEC a time backward.  */
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32 * 4), %%rcx\n\t"
> > > +     "subq    $(32 * 4), %%rdx\n\t"
> > > +     "vmovdqa   %%ymm0, (%%r9)\n\t"
> > > +     "vmovdqa   %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      114b\n\t"
> > > +     /* Store the first 4 * VEC. */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC. */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +
> > > +     "115:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded. */
> > > +     "leaq    (%%rdi, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%rsi\n\t"
> > > +     "jb      112b\n\t"
> > > +     "116:\n\t"
> > > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > +     "prefetcht0 (32*4*2)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3)(%%rsi)\n\t"
> > > +     "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> > > +     "vmovdqu   (%%rsi), %%ymm0\n\t"
> > > +     "vmovdqu   32(%%rsi), %%ymm1\n\t"
> > > +     "vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> > > +     "vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> > > +     "addq    $(32*4), %%rsi\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> > > +     "addq    $(32*4), %%rdi\n\t"
> > > +     "cmpq    $(32*4), %%rdx\n\t"
> > > +     "ja      116b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the last 4 * VEC.  */
> > > +     "vmovdqu   %%ymm5, (%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> > > +     "vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> > > +     /* Store the first VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]\n\t"
> > > +     "117:\n\t"
> > > +     /* Don't use non-temporal store if there is overlap between */
> > > +     /* destination and source since destination may be in cache */
> > > +     /* when source is loaded.  */
> > > +     "leaq    (%%rcx, %%rdx), %%r10\n\t"
> > > +     "cmpq    %%r10, %%r9\n\t"
> > > +     "jb      114b\n\t"
> > > +     "118:\n\t"
> > > +     /* Copy 4 * VEC a time backward with non-temporal stores. */
> > > +     "prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> > > +     "prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> > > +     "vmovdqu   (%%rcx), %%ymm0\n\t"
> > > +     "vmovdqu   -32(%%rcx), %%ymm1\n\t"
> > > +     "vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> > > +     "vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> > > +     "subq    $(32*4), %%rcx\n\t"
> > > +     "subq    $(32*4), %%rdx\n\t"
> > > +     "vmovntdq  %%ymm0, (%%r9)\n\t"
> > > +     "vmovntdq  %%ymm1, -32(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> > > +     "vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> > > +     "subq    $(32 * 4), %%r9\n\t"
> > > +     "cmpq    $(32 * 4), %%rdx\n\t"
> > > +     "ja      118b\n\t"
> > > +     "sfence\n\t"
> > > +     /* Store the first 4 * VEC.  */
> > > +     "vmovdqu   %%ymm4, (%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> > > +     "vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> > > +     /* Store the last VEC.  */
> > > +     "vmovdqu   %%ymm8, (%%r11)\n\t"
> > > +     "vzeroupper\n\t"
> > > +     "jmp %l[done]"
> > > +     :
> > > +     : "r"(src), "r"(dst), "r"(len)
> > > +     : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> > > +     "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> > > +     : done
> > > +     );
> > > +done:
> > > +     return dst;
> > > +}
> >
> >
>
>
Thomas Monjalon Oct. 21, 2021, 7:50 p.m. UTC | #8
21/10/2021 21:03, Song, Keesang:
> From: Thomas Monjalon <thomas@monjalon.net>
> > 21/10/2021 20:12, Song, Keesang:
> > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > 21/10/2021 19:10, Song, Keesang:
> > > > > 19/10/2021 17:35, Stephen Hemminger:
> > > > > > From: Thomas Monjalon <thomas@monjalon.net>
> > > > > > > 19/10/2021 12:47, Aman Kumar:
> > > > > > > > This patch provides rte_memcpy* calls optimized for AMD EPYC
> > > > > > > > platforms. Use config/x86/x86_amd_epyc_linux_gcc as cross-file
> > > > > > > > with meson to build dpdk for AMD EPYC platforms.
> > > > > > > 
> > > > > > > Please split in 2 patches: platform & memcpy.
> > > > > > > 
> > > > > > > What optimization is specific to EPYC?
> > > > > > > 
> > > > > > > I dislike the asm code below.
> > > > > > > What is AMD specific inside?
> > > > > > > Can it use compiler intrinsics as it is done elsewhere?
> > > > > > 
> > > > > > And why is this not done by Gcc?
> > > > >
> > > > > I hope this can make some explanation to your question.
> > > > > We(AMD Linux library support team) have implemented the custom
> > > > > tailored memcpy solution which is a close match with DPDK use case
> > > > > requirements like the below.
> > > > > 1)      Min 64B length data packet with cache aligned
> > > > > Source and Destination.
> > > > > 2)      Non-Temporal load and temporal store for cache aligned
> > > > > source for both RX and TX paths.
> > > > > Could not implement the non-temporal store for TX_PATH,
> > > > > as non-Temporal load/stores works only with 32B aligned addresses
> > > > > for AVX2
> > > > > 3)      This solution works for all AVX2 supported AMD machines.
> > > > > 
> > > > > Internally we have completed the integrity testing and benchmarking
> > > > > of the solution and found gains of 8.4% to 14.5% specifically on
> > > > > Milan CPU(3rd Gen of EPYC Processor)
> > > > 
> > > > It still not clear to me why it has to be written in assembler.
> > > > Why similar stuff can't be written in C with instincts, as rest of
> > > > rte_memcpy.h does?
> > > 
> > > The current memcpy implementation in Glibc is based out of assembly
> > > coding.
> > > Although memcpy could have been implemented with intrinsic,
> > > but since our AMD library developers are working on the Glibc
> > > functions, they have provided a tailored implementation based
> > > out of inline assembly coding.
> > 
> > Please convert it to C code, thanks.
> 
> I've already asked our AMD tools team, but they're saying
> they are not really familiar with C code implementation.
> We need your approval for now since we really need to get
> this patch submitted to 21.11 LTS.

Not sure it is urgent given that v2 came after the planned -rc1 date,
after 6 weeks of silence.
About the approval, there are already 3 technical board members
(Konstantin, Stephen and me) objecting against this patch.
Not being familiar with C code when working on CPU optimization
in 2021 is a strange argument.

In general, I don't really understand why we should maintain memcpy
functions in DPDK instead of relying on libc optimizations.
Having big asm code to maintain and debug is not helping.

I think this case shows that AMD needs to become more familiar
with DPDK schedule and expectations.
I would encourage you to contribute more in the project,
so such misunderstanding won't happen in future.

Hope that's all understandable


PS: discussion is more readable with replies below
Thomas Monjalon Oct. 21, 2021, 8:14 p.m. UTC | #9
19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for
> AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> as cross-file with meson to build dpdk for AMD EPYC platforms.
[...]
> --- a/config/x86/meson.build
> +++ b/config/x86/meson.build
> @@ -72,3 +72,10 @@ endif
>  dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
>  dpdk_conf.set('RTE_MAX_LCORE', 128)
>  dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
> +
> +if meson.is_cross_build()
> +	if meson.get_cross_property('platform') == 'amd-epyc'
> +	    dpdk_conf.set('RTE_MAX_LCORE', 512)
> +	    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
> +	endif
> +endif

Thinking again about the cross file.
Why not using the meson option "cpu_instruction_set"
to define RTE_MACHINE as "epyc" and tune other compilation options
without using artificial cross build?

Reminder, the default in config/meson.build is:
if cpu_instruction_set == 'generic'
    if host_machine.cpu_family().startswith('x86')
        cpu_instruction_set = 'corei7'

Cc Bruce who maintains this meson code.
Bruce Richardson Oct. 22, 2021, 8:45 a.m. UTC | #10
On Thu, Oct 21, 2021 at 10:14:47PM +0200, Thomas Monjalon wrote:
> 19/10/2021 12:47, Aman Kumar:
> > This patch provides rte_memcpy* calls optimized for
> > AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> > as cross-file with meson to build dpdk for AMD EPYC platforms.
> [...]
> > --- a/config/x86/meson.build
> > +++ b/config/x86/meson.build
> > @@ -72,3 +72,10 @@ endif
> >  dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
> >  dpdk_conf.set('RTE_MAX_LCORE', 128)
> >  dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
> > +
> > +if meson.is_cross_build()
> > +	if meson.get_cross_property('platform') == 'amd-epyc'
> > +	    dpdk_conf.set('RTE_MAX_LCORE', 512)
> > +	    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
> > +	endif
> > +endif
> 
> Thinking again about the cross file.
> Why not using the meson option "cpu_instruction_set"
> to define RTE_MACHINE as "epyc" and tune other compilation options
> without using artificial cross build?
> 
> Reminder, the default in config/meson.build is:
> if cpu_instruction_set == 'generic'
>     if host_machine.cpu_family().startswith('x86')
>         cpu_instruction_set = 'corei7'
> 
> Cc Bruce who maintains this meson code.
>
Yes, that is a good suggestion. You could detect a particular instruction
set value and set additional defines based on it.

/Bruce
diff mbox series

Patch

diff --git a/config/x86/meson.build b/config/x86/meson.build
index 29f3dea181..598b0e62ce 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -72,3 +72,10 @@  endif
 dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
 dpdk_conf.set('RTE_MAX_LCORE', 128)
 dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
+
+if meson.is_cross_build()
+	if meson.get_cross_property('platform') == 'amd-epyc'
+	    dpdk_conf.set('RTE_MAX_LCORE', 512)
+	    dpdk_conf.set('RTE_MEMCPY_AMDEPYC', 1)
+	endif
+endif
diff --git a/config/x86/x86_amd_epyc_linux_gcc b/config/x86/x86_amd_epyc_linux_gcc
new file mode 100644
index 0000000000..0b2453135f
--- /dev/null
+++ b/config/x86/x86_amd_epyc_linux_gcc
@@ -0,0 +1,16 @@ 
+[binaries]
+c = 'x86_64-linux-gnu-gcc'
+cpp = 'x86_64-linux-gnu-g++'
+ld = 'x86_64-linux-gnu-ld'
+ar = 'x86_64-linux-gnu-ar'
+strip = 'x86_64-linux-gnu-strip'
+pkgconfig = 'x86_64-linux-gnu-pkg-config'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'x86_64'
+cpu = 'native'
+endian = 'little'
+
+[properties]
+platform = 'amd-epyc'
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 79f381dd9b..8f66c115e3 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -368,6 +368,498 @@  rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 	}
 }
 
+#if defined RTE_MEMCPY_AMDEPYC
+
+/**
+ * Copy 16 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy16_ts(uint8_t *dst, uint8_t *src)
+{
+	__m128i var128;
+
+	var128 = _mm_stream_load_si128((__m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, var128);
+}
+
+/**
+ * Copy 32 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy32_ts(uint8_t *dst, uint8_t *src)
+{
+	__m256i ymm0;
+
+	ymm0 = _mm256_stream_load_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+/**
+ * Copy 64 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy64_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+}
+
+/**
+ * Copy 128 bytes from one location to another,
+ * with temporal stores
+ */
+static __rte_always_inline void
+rte_copy128_ts(uint8_t *dst, uint8_t *src)
+{
+	rte_copy32_ts(dst + 0 * 32, src + 0 * 32);
+	rte_copy32_ts(dst + 1 * 32, src + 1 * 32);
+	rte_copy32_ts(dst + 2 * 32, src + 2 * 32);
+	rte_copy32_ts(dst + 3 * 32, src + 3 * 32);
+}
+
+/**
+ * Copy len bytes from one location to another,
+ * with temporal stores 16B aligned
+ */
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16_generic(void *dst, void *src, int len)
+{
+	void *dest = dst;
+
+	while (len >= 128) {
+		rte_copy128_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 128;
+		src = (uint8_t *)src + 128;
+		len -= 128;
+	}
+	while (len >= 64) {
+		rte_copy64_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 64;
+		src = (uint8_t *)src + 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		rte_copy32_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 32;
+		src = (uint8_t *)src + 32;
+		len -= 32;
+	}
+	if (len >= 16) {
+		rte_copy16_ts((uint8_t *)dst, (uint8_t *)src);
+		dst = (uint8_t *)dst + 16;
+		src = (uint8_t *)src + 16;
+		len -= 16;
+	}
+	if (len >= 8) {
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		dst = (uint8_t *)dst + 8;
+		src = (uint8_t *)src + 8;
+		len -= 8;
+	}
+	if (len >= 4) {
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		dst = (uint8_t *)dst + 4;
+		src = (uint8_t *)src + 4;
+		len -= 4;
+	}
+	if (len != 0) {
+		dst = (uint8_t *)dst - (4 - len);
+		src = (uint8_t *)src - (4 - len);
+		*(uint32_t *)dst = *(const uint32_t *)src;
+	}
+
+	return dest;
+}
+
+static __rte_always_inline void *
+rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
+					    const void *src,
+					    size_t size)
+{
+	asm volatile goto("movq %0, %%rsi\n\t"
+	"movq %1, %%rdi\n\t"
+	"movq %2, %%rdx\n\t"
+	"cmpq   $(128), %%rdx\n\t"
+	"jb     202f\n\t"
+	"201:\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
+	"vmovntdqa 64(%%rsi), %%ymm2\n\t"
+	"vmovntdqa 96(%%rsi), %%ymm3\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu  %%ymm2, 64(%%rdi)\n\t"
+	"vmovdqu  %%ymm3, 96(%%rdi)\n\t"
+	"addq   $128, %%rsi\n\t"
+	"addq   $128, %%rdi\n\t"
+	"subq   $128, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
+	"jae    201b\n\t"
+	"202:\n\t"
+	"cmpq   $64, %%rdx\n\t"
+	"jb     203f\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
+	"addq   $64, %%rsi\n\t"
+	"addq   $64, %%rdi\n\t"
+	"subq   $64, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"203:\n\t"
+	"cmpq   $32, %%rdx\n\t"
+	"jb     204f\n\t"
+	"vmovntdqa (%%rsi), %%ymm0\n\t"
+	"vmovdqu  %%ymm0, (%%rdi)\n\t"
+	"addq   $32, %%rsi\n\t"
+	"addq   $32, %%rdi\n\t"
+	"subq   $32, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"204:\n\t"
+	"cmpb   $16, %%dl\n\t"
+	"jb     205f\n\t"
+	"vmovntdqa (%%rsi), %%xmm0\n\t"
+	"vmovdqu  %%xmm0, (%%rdi)\n\t"
+	"addq   $16, %%rsi\n\t"
+	"addq   $16, %%rdi\n\t"
+	"subq   $16, %%rdx\n\t"
+	"jz     %l[done]\n\t"
+	"205:\n\t"
+	"cmpb   $2, %%dl\n\t"
+	"jb     208f\n\t"
+	"cmpb   $4, %%dl\n\t"
+	"jbe    207f\n\t"
+	"cmpb   $8, %%dl\n\t"
+	"jbe    206f\n\t"
+	"movq   -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq   (%%rsi), %%rsi\n\t"
+	"movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq   %%rsi, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"206:\n\t"
+	"movl   -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl   (%%rsi), %%esi\n\t"
+	"movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl   %%esi, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"207:\n\t"
+	"movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl (%%rsi), %%esi\n\t"
+	"movw   %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw   %%si, (%%rdi)\n\t"
+	"jmp    %l[done]\n\t"
+	"208:\n\t"
+	"movzbl (%%rsi), %%ecx\n\t"
+	"movb   %%cl, (%%rdi)"
+	:
+	: "r"(src), "r"(dst), "r"(size)
+	: "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
+	: done
+	);
+done:
+	return dst;
+}
+
+static __rte_always_inline void *
+rte_memcpy_generic(void *dst, const void *src, size_t len)
+{
+	asm goto("movq	%0, %%rsi\n\t"
+	"movq	%1, %%rdi\n\t"
+	"movq	%2, %%rdx\n\t"
+	"movq    %%rdi, %%rax\n\t"
+	"cmp     $32, %%rdx\n\t"
+	"jb      101f\n\t"
+	"cmp     $(32 * 2), %%rdx\n\t"
+	"ja      108f\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"101:\n\t"
+	/* Less than 1 VEC.  */
+	"cmpb    $32, %%dl\n\t"
+	"jae     103f\n\t"
+	"cmpb    $16, %%dl\n\t"
+	"jae     104f\n\t"
+	"cmpb    $8, %%dl\n\t"
+	"jae     105f\n\t"
+	"cmpb    $4, %%dl\n\t"
+	"jae     106f\n\t"
+	"cmpb    $1, %%dl\n\t"
+	"ja      107f\n\t"
+	"jb      102f\n\t"
+	"movzbl  (%%rsi), %%ecx\n\t"
+	"movb    %%cl, (%%rdi)\n\t"
+	"102:\n\t"
+	"jmp %l[done]\n\t"
+	"103:\n\t"
+	/* From 32 to 63.  No branch when size == 32.  */
+	"vmovdqu (%%rsi), %%ymm0\n\t"
+	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
+	"vmovdqu %%ymm0, (%%rdi)\n\t"
+	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	/* From 16 to 31.  No branch when size == 16.  */
+	"104:\n\t"
+	"vmovdqu (%%rsi), %%xmm0\n\t"
+	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
+	"vmovdqu %%xmm0, (%%rdi)\n\t"
+	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
+	"jmp %l[done]\n\t"
+	"105:\n\t"
+	/* From 8 to 15.  No branch when size == 8.  */
+	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
+	"movq    (%%rsi), %%rsi\n\t"
+	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
+	"movq    %%rsi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"106:\n\t"
+	/* From 4 to 7.  No branch when size == 4.  */
+	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
+	"movl    (%%rsi), %%esi\n\t"
+	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
+	"movl    %%esi, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"107:\n\t"
+	/* From 2 to 3.  No branch when size == 2.  */
+	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
+	"movzwl  (%%rsi), %%esi\n\t"
+	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
+	"movw    %%si, (%%rdi)\n\t"
+	"jmp %l[done]\n\t"
+	"108:\n\t"
+	/* More than 2 * VEC and there may be overlap between destination */
+	/* and source.  */
+	"cmpq    $(32 * 8), %%rdx\n\t"
+	"ja      111f\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"jb      109f\n\t"
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"109:\n\t"
+	/* Copy from 2 * VEC to 4 * VEC. */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
+	"vmovdqu   %%ymm0, (%%rdi)\n\t"
+	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
+	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
+	"vzeroupper\n\t"
+	"110:\n\t"
+	"jmp %l[done]\n\t"
+	"111:\n\t"
+	"cmpq    %%rsi, %%rdi\n\t"
+	"ja      113f\n\t"
+	/* Source == destination is less common.  */
+	"je      110b\n\t"
+	/* Load the first VEC and last 4 * VEC to
+	 * support overlapping addresses.
+	 */
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
+	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
+	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
+	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
+	/* Save start and stop of the destination buffer.  */
+	"movq    %%rdi, %%r11\n\t"
+	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
+	/* Align destination for aligned stores in the loop.  Compute */
+	/* how much destination is misaligned.  */
+	"movq    %%rdi, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Get the negative of offset for alignment.  */
+	"subq    $32, %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rsi\n\t"
+	/* Adjust destination which should be aligned now.  */
+	"subq    %%r8, %%rdi\n\t"
+	/* Adjust length.  */
+	"addq    %%r8, %%rdx\n\t"
+	/* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      115f\n\t"
+	"112:\n\t"
+	/* Copy 4 * VEC a time forward.  */
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32 * 4), %%rsi\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%rdi)\n\t"
+	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
+	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32 * 4), %%rdi\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      112b\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"113:\n\t"
+	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
+	"vmovdqu   (%%rsi), %%ymm4\n\t"
+	"vmovdqu   32(%%rsi), %%ymm5\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
+	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
+	/* Save stop of the destination buffer.  */
+	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
+	/* Align destination end for aligned stores in the loop.  Compute */
+	/* how much destination end is misaligned.  */
+	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
+	"movq    %%r11, %%r9\n\t"
+	"movq    %%r11, %%r8\n\t"
+	"andq    $(32 - 1), %%r8\n\t"
+	/* Adjust source.  */
+	"subq    %%r8, %%rcx\n\t"
+	/* Adjust the end of destination which should be aligned now.  */
+	"subq    %%r8, %%r9\n\t"
+	/* Adjust length.  */
+	"subq    %%r8, %%rdx\n\t"
+	 /* Check non-temporal store threshold.  */
+	"cmpq	 $(1024*1024), %%rdx\n\t"
+	"ja      117f\n\t"
+	"114:\n\t"
+	/* Copy 4 * VEC a time backward.  */
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32 * 4), %%rcx\n\t"
+	"subq    $(32 * 4), %%rdx\n\t"
+	"vmovdqa   %%ymm0, (%%r9)\n\t"
+	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
+	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      114b\n\t"
+	/* Store the first 4 * VEC. */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC. */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+
+	"115:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded. */
+	"leaq    (%%rdi, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%rsi\n\t"
+	"jb      112b\n\t"
+	"116:\n\t"
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	"prefetcht0 (32*4*2)(%%rsi)\n\t"
+	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3)(%%rsi)\n\t"
+	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
+	"vmovdqu   (%%rsi), %%ymm0\n\t"
+	"vmovdqu   32(%%rsi), %%ymm1\n\t"
+	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
+	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
+	"addq    $(32*4), %%rsi\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%rdi)\n\t"
+	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
+	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
+	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
+	"addq    $(32*4), %%rdi\n\t"
+	"cmpq    $(32*4), %%rdx\n\t"
+	"ja      116b\n\t"
+	"sfence\n\t"
+	/* Store the last 4 * VEC.  */
+	"vmovdqu   %%ymm5, (%%rcx)\n\t"
+	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
+	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
+	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
+	/* Store the first VEC.  */
+	"vmovdqu   %%ymm4, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]\n\t"
+	"117:\n\t"
+	/* Don't use non-temporal store if there is overlap between */
+	/* destination and source since destination may be in cache */
+	/* when source is loaded.  */
+	"leaq    (%%rcx, %%rdx), %%r10\n\t"
+	"cmpq    %%r10, %%r9\n\t"
+	"jb      114b\n\t"
+	"118:\n\t"
+	/* Copy 4 * VEC a time backward with non-temporal stores. */
+	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
+	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
+	"vmovdqu   (%%rcx), %%ymm0\n\t"
+	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
+	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
+	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
+	"subq    $(32*4), %%rcx\n\t"
+	"subq    $(32*4), %%rdx\n\t"
+	"vmovntdq  %%ymm0, (%%r9)\n\t"
+	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
+	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
+	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
+	"subq    $(32 * 4), %%r9\n\t"
+	"cmpq    $(32 * 4), %%rdx\n\t"
+	"ja      118b\n\t"
+	"sfence\n\t"
+	/* Store the first 4 * VEC.  */
+	"vmovdqu   %%ymm4, (%%rdi)\n\t"
+	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
+	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
+	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
+	/* Store the last VEC.  */
+	"vmovdqu   %%ymm8, (%%r11)\n\t"
+	"vzeroupper\n\t"
+	"jmp %l[done]"
+	:
+	: "r"(src), "r"(dst), "r"(len)
+	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
+	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
+	: done
+	);
+done:
+	return dst;
+}
+
+#else
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
@@ -479,6 +971,8 @@  rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK31;
 }
 
+#endif /* RTE_MEMCPY_AMDEPYC */
+
 #else /* __AVX512F__ */
 
 #define ALIGNMENT_MASK 0x0F
@@ -874,6 +1368,14 @@  rte_memcpy(void *dst, const void *src, size_t n)
 		return rte_memcpy_generic(dst, src, n);
 }
 
+#if defined __AVX2__ && defined(RTE_MEMCPY_AMDEPYC)
+static __rte_always_inline void *
+rte_memcpy_aligned_tstore16(void *dst, void *src, int len)
+{
+	return rte_memcpy_aligned_ntload_tstore16_amdepyc2(dst, src, len);
+}
+#endif
+
 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 #pragma GCC diagnostic pop
 #endif