[dpdk-dev,v3,1/6] eal/arm: add 64-bit armv8 version of rte_memcpy.h

Message ID 1446212959-19832-2-git-send-email-david.hunt@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Hunt, David Oct. 30, 2015, 1:49 p.m. UTC
Signed-off-by: David Hunt <david.hunt@intel.com>
---
 .../common/include/arch/arm/rte_memcpy.h           |   4 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 308 +++++++++++++++++++++
 2 files changed, 312 insertions(+)
 create mode 100644 lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
  

Comments

Jerin Jacob Nov. 2, 2015, 4:57 a.m. UTC | #1
On Fri, Oct 30, 2015 at 01:49:14PM +0000, David Hunt wrote:
> Signed-off-by: David Hunt <david.hunt@intel.com>
> ---
>  .../common/include/arch/arm/rte_memcpy.h           |   4 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 308 +++++++++++++++++++++
>  2 files changed, 312 insertions(+)
>  create mode 100644 lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> 
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h
> index d9f5bf1..1d562c3 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h
> @@ -33,6 +33,10 @@
>  #ifndef _RTE_MEMCPY_ARM_H_
>  #define _RTE_MEMCPY_ARM_H_
>  
> +#ifdef RTE_ARCH_64
> +#include <rte_memcpy_64.h>
> +#else
>  #include <rte_memcpy_32.h>
> +#endif
>  
>  #endif /* _RTE_MEMCPY_ARM_H_ */
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> new file mode 100644
> index 0000000..6d85113
> --- /dev/null
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -0,0 +1,308 @@
> +/*
> + *   BSD LICENSE
> + *
> + *   Copyright (C) IBM Corporation 2014.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of IBM Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifndef _RTE_MEMCPY_ARM_64_H_
> +#define _RTE_MEMCPY_ARM_64_H_
> +
> +#include <stdint.h>
> +#include <string.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include "generic/rte_memcpy.h"
> +
> +#ifdef __ARM_NEON_FP

SIMD is not optional in armv8 spec.So every armv8 machine will have
SIMD instruction unlike armv7.More over LDP/STP instruction is
not part of SIMD.So this check is not required or it can
be replaced with a check that select memcpy from either libc or this specific
implementation

> +
> +/* ARM NEON Intrinsics are used to copy data */
> +#include <arm_neon.h>
> +
> +static inline void
> +rte_mov16(uint8_t *dst, const uint8_t *src)
> +{
> +	asm volatile("LDP d0, d1, [%0]\n\t"
> +		     "STP d0, d1, [%1]\n\t"
> +		     : : "r" (src), "r" (dst) :
> +	);
> +}

IMO, no need to hardcode registers used for the mem move(d0, d1).
Let compiler schedule the registers for better performance.


> +
> +static inline void
> +rte_mov32(uint8_t *dst, const uint8_t *src)
> +{
> +	asm volatile("LDP q0, q1, [%0]\n\t"
> +		     "STP q0, q1, [%1]\n\t"
> +		     : : "r" (src), "r" (dst) :
> +	);
> +}
> +
> +static inline void
> +rte_mov48(uint8_t *dst, const uint8_t *src)
> +{
> +	asm volatile("LDP q0, q1, [%0]\n\t"
> +		     "STP q0, q1, [%1]\n\t"
> +		     "LDP d0, d1, [%0 , #32]\n\t"
> +		     "STP d0, d1, [%1 , #32]\n\t"
> +		     : : "r" (src), "r" (dst) :
> +	);
> +}
> +
> +static inline void
> +rte_mov64(uint8_t *dst, const uint8_t *src)
> +{
> +	asm volatile("LDP q0, q1, [%0]\n\t"
> +		     "STP q0, q1, [%1]\n\t"
> +		     "LDP q0, q1, [%0 , #32]\n\t"
> +		     "STP q0, q1, [%1 , #32]\n\t"
> +		     : : "r" (src), "r" (dst) :
> +	);
> +}
> +
> +static inline void
> +rte_mov128(uint8_t *dst, const uint8_t *src)
> +{
> +	asm volatile("LDP q0, q1, [%0]\n\t"
> +		     "STP q0, q1, [%1]\n\t"
> +		     "LDP q0, q1, [%0 , #32]\n\t"
> +		     "STP q0, q1, [%1 , #32]\n\t"
> +		     "LDP q0, q1, [%0 , #64]\n\t"
> +		     "STP q0, q1, [%1 , #64]\n\t"
> +		     "LDP q0, q1, [%0 , #96]\n\t"
> +		     "STP q0, q1, [%1 , #96]\n\t"
> +		     : : "r" (src), "r" (dst) :
> +	);
> +}
> +
> +static inline void
> +rte_mov256(uint8_t *dst, const uint8_t *src)
> +{
> +	asm volatile("LDP q0, q1, [%0]\n\t"
> +		     "STP q0, q1, [%1]\n\t"
> +		     "LDP q0, q1, [%0 , #32]\n\t"
> +		     "STP q0, q1, [%1 , #32]\n\t"
> +		     "LDP q0, q1, [%0 , #64]\n\t"
> +		     "STP q0, q1, [%1 , #64]\n\t"
> +		     "LDP q0, q1, [%0 , #96]\n\t"
> +		     "STP q0, q1, [%1 , #96]\n\t"
> +		     "LDP q0, q1, [%0 , #128]\n\t"
> +		     "STP q0, q1, [%1 , #128]\n\t"
> +		     "LDP q0, q1, [%0 , #160]\n\t"
> +		     "STP q0, q1, [%1 , #160]\n\t"
> +		     "LDP q0, q1, [%0 , #192]\n\t"
> +		     "STP q0, q1, [%1 , #192]\n\t"
> +		     "LDP q0, q1, [%0 , #224]\n\t"
> +		     "STP q0, q1, [%1 , #224]\n\t"
> +		     : : "r" (src), "r" (dst) :
> +	);
> +}
> +
> +#define rte_memcpy(dst, src, n)              \
> +	({ (__builtin_constant_p(n)) ?       \
> +	memcpy((dst), (src), (n)) :          \
> +	rte_memcpy_func((dst), (src), (n)); })
> +
> +static inline void *
> +rte_memcpy_func(void *dst, const void *src, size_t n)
> +{
> +	void *ret = dst;
> +
> +	/* We can't copy < 16 bytes using XMM registers so do it manually. */
> +	if (n < 16) {
> +		if (n & 0x01) {
> +			*(uint8_t *)dst = *(const uint8_t *)src;
> +			dst = (uint8_t *)dst + 1;
> +			src = (const uint8_t *)src + 1;
> +		}
> +		if (n & 0x02) {
> +			*(uint16_t *)dst = *(const uint16_t *)src;
> +			dst = (uint16_t *)dst + 1;
> +			src = (const uint16_t *)src + 1;
> +		}
> +		if (n & 0x04) {
> +			*(uint32_t *)dst = *(const uint32_t *)src;
> +			dst = (uint32_t *)dst + 1;
> +			src = (const uint32_t *)src + 1;
> +		}
> +		if (n & 0x08)
> +			*(uint64_t *)dst = *(const uint64_t *)src;
> +		return ret;
> +	}
> +
> +	/* Special fast cases for <= 128 bytes */
> +	if (n <= 32) {
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov16((uint8_t *)dst - 16 + n,
> +			(const uint8_t *)src - 16 + n);
> +		return ret;
> +	}
> +
> +	if (n <= 64) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov32((uint8_t *)dst - 32 + n,
> +			(const uint8_t *)src - 32 + n);
> +		return ret;
> +	}
> +
> +	if (n <= 128) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		rte_mov64((uint8_t *)dst - 64 + n,
> +			(const uint8_t *)src - 64 + n);
> +		return ret;
> +	}
> +
> +	/*
> +	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
> +	 * copies was found to be faster than doing 128 and 32 byte copies as
> +	 * well.
> +	 */
> +	for ( ; n >= 256; n -= 256) {

There is room for prefetching the next cacheline based on the cache line
size.

> +		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> +		dst = (uint8_t *)dst + 256;
> +		src = (const uint8_t *)src + 256;
> +	}
> +
> +	/*
> +	 * We split the remaining bytes (which will be less than 256) into
> +	 * 64byte (2^6) chunks.
> +	 * Using incrementing integers in the case labels of a switch statement
> +	 * enourages the compiler to use a jump table. To get incrementing
> +	 * integers, we shift the 2 relevant bits to the LSB position to first
> +	 * get decrementing integers, and then subtract.
> +	 */
> +	switch (3 - (n >> 6)) {
> +	case 0x00:
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		n -= 64;
> +		dst = (uint8_t *)dst + 64;
> +		src = (const uint8_t *)src + 64;      /* fallthrough */
> +	case 0x01:
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		n -= 64;
> +		dst = (uint8_t *)dst + 64;
> +		src = (const uint8_t *)src + 64;      /* fallthrough */
> +	case 0x02:
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		n -= 64;
> +		dst = (uint8_t *)dst + 64;
> +		src = (const uint8_t *)src + 64;      /* fallthrough */
> +	default:
> +		break;
> +	}
> +
> +	/*
> +	 * We split the remaining bytes (which will be less than 64) into
> +	 * 16byte (2^4) chunks, using the same switch structure as above.
> +	 */
> +	switch (3 - (n >> 4)) {
> +	case 0x00:
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		n -= 16;
> +		dst = (uint8_t *)dst + 16;
> +		src = (const uint8_t *)src + 16;      /* fallthrough */
> +	case 0x01:
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		n -= 16;
> +		dst = (uint8_t *)dst + 16;
> +		src = (const uint8_t *)src + 16;      /* fallthrough */
> +	case 0x02:
> +		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		n -= 16;
> +		dst = (uint8_t *)dst + 16;
> +		src = (const uint8_t *)src + 16;      /* fallthrough */
> +	default:
> +		break;
> +	}
> +
> +	/* Copy any remaining bytes, without going beyond end of buffers */
> +	if (n != 0)
> +		rte_mov16((uint8_t *)dst - 16 + n,
> +			(const uint8_t *)src - 16 + n);
> +	return ret;
> +}
> +
> +#else
> +
> +static inline void
> +rte_mov16(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 16);
> +}
> +
> +static inline void
> +rte_mov32(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 32);
> +}
> +
> +static inline void
> +rte_mov48(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 48);
> +}
> +
> +static inline void
> +rte_mov64(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 64);
> +}
> +
> +static inline void
> +rte_mov128(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 128);
> +}
> +
> +static inline void
> +rte_mov256(uint8_t *dst, const uint8_t *src)
> +{
> +	memcpy(dst, src, 256);
> +}
> +
> +static inline void *
> +rte_memcpy(void *dst, const void *src, size_t n)
> +{
> +	return memcpy(dst, src, n);
> +}
> +
> +static inline void *
> +rte_memcpy_func(void *dst, const void *src, size_t n)
> +{
> +	return memcpy(dst, src, n);
> +}
> +
> +#endif /* __ARM_NEON_FP */
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_MEMCPY_ARM_64_H_ */
> -- 
> 1.9.1
>
  
Hunt, David Nov. 2, 2015, 12:22 p.m. UTC | #2
On 02/11/2015 04:57, Jerin Jacob wrote:
> On Fri, Oct 30, 2015 at 01:49:14PM +0000, David Hunt wrote:
>> Signed-off-by: David Hunt <david.hunt@intel.com>
--snip--
>> +#ifndef _RTE_MEMCPY_ARM_64_H_
>> +#define _RTE_MEMCPY_ARM_64_H_
>> +
>> +#include <stdint.h>
>> +#include <string.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#include "generic/rte_memcpy.h"
>> +
>> +#ifdef __ARM_NEON_FP
>
> SIMD is not optional in armv8 spec.So every armv8 machine will have
> SIMD instruction unlike armv7.More over LDP/STP instruction is
> not part of SIMD.So this check is not required or it can
> be replaced with a check that select memcpy from either libc or this specific
> implementation

Jerin,
    I've just benchmarked the libc version against the hand-coded 
version of the memcpy routines, and the libc wins in most cases. This 
code was just an initial attempt at optimising the memccpy's, so I feel 
that with the current benchmark results, it would better just to remove 
the assembly versions, and use the libc version for the initial release 
on ARMv8.
Then, in the future, the ARMv8 experts are free to submit an optimised 
version as a patch in the future. Does that sound reasonable to you?
Rgds,
Dave.


--snip--
  
Jan Viktorin Nov. 2, 2015, 12:45 p.m. UTC | #3
On Mon, 2 Nov 2015 12:22:47 +0000
"Hunt, David" <david.hunt@intel.com> wrote:

> On 02/11/2015 04:57, Jerin Jacob wrote:
> > On Fri, Oct 30, 2015 at 01:49:14PM +0000, David Hunt wrote:  
> >> Signed-off-by: David Hunt <david.hunt@intel.com>  
> --snip--
> >> +#ifndef _RTE_MEMCPY_ARM_64_H_
> >> +#define _RTE_MEMCPY_ARM_64_H_
> >> +
> >> +#include <stdint.h>
> >> +#include <string.h>
> >> +
> >> +#ifdef __cplusplus
> >> +extern "C" {
> >> +#endif
> >> +
> >> +#include "generic/rte_memcpy.h"
> >> +
> >> +#ifdef __ARM_NEON_FP  
> >
> > SIMD is not optional in armv8 spec.So every armv8 machine will have
> > SIMD instruction unlike armv7.More over LDP/STP instruction is
> > not part of SIMD.So this check is not required or it can
> > be replaced with a check that select memcpy from either libc or this specific
> > implementation  
> 
> Jerin,
>     I've just benchmarked the libc version against the hand-coded 
> version of the memcpy routines, and the libc wins in most cases. This 
> code was just an initial attempt at optimising the memccpy's, so I feel 
> that with the current benchmark results, it would better just to remove 
> the assembly versions, and use the libc version for the initial release 
> on ARMv8.
> Then, in the future, the ARMv8 experts are free to submit an optimised 
> version as a patch in the future. Does that sound reasonable to you?
> Rgds,
> Dave.

As there is no use of NEON in the code, this optimization seems to be
useless to me...

Jan

> 
> 
> --snip--
> 
> 
>
  
Jerin Jacob Nov. 2, 2015, 12:57 p.m. UTC | #4
On Mon, Nov 02, 2015 at 12:22:47PM +0000, Hunt, David wrote:
> On 02/11/2015 04:57, Jerin Jacob wrote:
> >On Fri, Oct 30, 2015 at 01:49:14PM +0000, David Hunt wrote:
> >>Signed-off-by: David Hunt <david.hunt@intel.com>
> --snip--
> >>+#ifndef _RTE_MEMCPY_ARM_64_H_
> >>+#define _RTE_MEMCPY_ARM_64_H_
> >>+
> >>+#include <stdint.h>
> >>+#include <string.h>
> >>+
> >>+#ifdef __cplusplus
> >>+extern "C" {
> >>+#endif
> >>+
> >>+#include "generic/rte_memcpy.h"
> >>+
> >>+#ifdef __ARM_NEON_FP
> >
> >SIMD is not optional in armv8 spec.So every armv8 machine will have
> >SIMD instruction unlike armv7.More over LDP/STP instruction is
> >not part of SIMD.So this check is not required or it can
> >be replaced with a check that select memcpy from either libc or this specific
> >implementation
> 
> Jerin,
>    I've just benchmarked the libc version against the hand-coded version of
> the memcpy routines, and the libc wins in most cases. This code was just an
> initial attempt at optimising the memccpy's, so I feel that with the current
> benchmark results, it would better just to remove the assembly versions, and
> use the libc version for the initial release on ARMv8.
> Then, in the future, the ARMv8 experts are free to submit an optimised
> version as a patch in the future. Does that sound reasonable to you?

Make sense. Based on my understanding, other blocks are also not optimized 
for arm64.
So better to revert back to CONFIG_RTE_FORCE_INTRINSICS and
libc for initial version.

BTW: I just tested ./arm64-armv8a-linuxapp-gcc/app/test and
"byteorder_autotest" is broken. I think existing arm64 code is not optimized
beyond CONFIG_RTE_FORCE_INTRINSICS. So better to use verified
CONFIG_RTE_FORCE_INTRINSICS scheme.

if you guys are OK with arm and arm64 as two different platform then
I can summit the complete working patch for arm64.(as in my current source
code "arm64" is a different platform(lib/librte_eal/common/include/arch/arm64/)


> Rgds,
> Dave.
> 
> 
> --snip--
> 
> 
>
  
Hunt, David Nov. 2, 2015, 3:26 p.m. UTC | #5
On 02/11/2015 12:57, Jerin Jacob wrote:
> On Mon, Nov 02, 2015 at 12:22:47PM +0000, Hunt, David wrote:
>> Jerin,
>>     I've just benchmarked the libc version against the hand-coded version of
>> the memcpy routines, and the libc wins in most cases. This code was just an
>> initial attempt at optimising the memccpy's, so I feel that with the current
>> benchmark results, it would better just to remove the assembly versions, and
>> use the libc version for the initial release on ARMv8.
>> Then, in the future, the ARMv8 experts are free to submit an optimised
>> version as a patch in the future. Does that sound reasonable to you?
>
> Make sense. Based on my understanding, other blocks are also not optimized
> for arm64.
> So better to revert back to CONFIG_RTE_FORCE_INTRINSICS and
> libc for initial version.
>
> BTW: I just tested ./arm64-armv8a-linuxapp-gcc/app/test and
> "byteorder_autotest" is broken. I think existing arm64 code is not optimized
> beyond CONFIG_RTE_FORCE_INTRINSICS. So better to use verified
> CONFIG_RTE_FORCE_INTRINSICS scheme.

Agreed.

> if you guys are OK with arm and arm64 as two different platform then
> I can summit the complete working patch for arm64.(as in my current source
> code "arm64" is a different platform(lib/librte_eal/common/include/arch/arm64/)

Sure. That would be great. We initially started with two ARMv7 
patch-sets, and Jan merged into one. Something similar could happen for 
the ARMv8 patch set. We just want to end up with the best implementation 
possible. :)

Dave.
  
Jan Viktorin Nov. 2, 2015, 3:36 p.m. UTC | #6
On Mon, 2 Nov 2015 15:26:19 +0000
"Hunt, David" <david.hunt@intel.com> wrote:

> On 02/11/2015 12:57, Jerin Jacob wrote:
> > On Mon, Nov 02, 2015 at 12:22:47PM +0000, Hunt, David wrote:  
> >> Jerin,
> >>     I've just benchmarked the libc version against the hand-coded version of
> >> the memcpy routines, and the libc wins in most cases. This code was just an
> >> initial attempt at optimising the memccpy's, so I feel that with the current
> >> benchmark results, it would better just to remove the assembly versions, and
> >> use the libc version for the initial release on ARMv8.
> >> Then, in the future, the ARMv8 experts are free to submit an optimised
> >> version as a patch in the future. Does that sound reasonable to you?  
> >
> > Make sense. Based on my understanding, other blocks are also not optimized
> > for arm64.
> > So better to revert back to CONFIG_RTE_FORCE_INTRINSICS and
> > libc for initial version.
> >
> > BTW: I just tested ./arm64-armv8a-linuxapp-gcc/app/test and
> > "byteorder_autotest" is broken. I think existing arm64 code is not optimized
> > beyond CONFIG_RTE_FORCE_INTRINSICS. So better to use verified
> > CONFIG_RTE_FORCE_INTRINSICS scheme.  
> 
> Agreed.
> 
> > if you guys are OK with arm and arm64 as two different platform then
> > I can summit the complete working patch for arm64.(as in my current source
> > code "arm64" is a different platform(lib/librte_eal/common/include/arch/arm64/)  
> 
> Sure. That would be great. We initially started with two ARMv7 
> patch-sets, and Jan merged into one. Something similar could happen for 
> the ARMv8 patch set. We just want to end up with the best implementation 
> possible. :)
> 

It was looking like we can share a lot of common code for both
architectures. I didn't know how much different are the cpuflags.

IMHO, it'd be better to have two directories arm and arm64. I thought
to refer from arm64 to arm where possible. But I don't know whether is
this possible with the DPDK build system.

Jan

> Dave.
> 
> 
> 
>
  
Hunt, David Nov. 2, 2015, 3:49 p.m. UTC | #7
On 02/11/2015 15:36, Jan Viktorin wrote:
> On Mon, 2 Nov 2015 15:26:19 +0000
--snip--
> It was looking like we can share a lot of common code for both
> architectures. I didn't know how much different are the cpuflags.

CPU flags for ARMv8 are looking like this now. Quite different to the 
ARMv7 ones.

static const struct feature_entry cpu_feature_table[] = {
         FEAT_DEF(FP,        0x00000001, 0, REG_HWCAP,  0)
         FEAT_DEF(ASIMD,     0x00000001, 0, REG_HWCAP,  1)
         FEAT_DEF(EVTSTRM,   0x00000001, 0, REG_HWCAP,  2)
         FEAT_DEF(AES,       0x00000001, 0, REG_HWCAP,  3)
         FEAT_DEF(PMULL,     0x00000001, 0, REG_HWCAP,  4)
         FEAT_DEF(SHA1,      0x00000001, 0, REG_HWCAP,  5)
         FEAT_DEF(SHA2,      0x00000001, 0, REG_HWCAP,  6)
         FEAT_DEF(CRC32,     0x00000001, 0, REG_HWCAP,  7)
         FEAT_DEF(AARCH32,   0x00000001, 0, REG_PLATFORM, 0)
         FEAT_DEF(AARCH64,   0x00000001, 0, REG_PLATFORM, 1)
};

> IMHO, it'd be better to have two directories arm and arm64. I thought
> to refer from arm64 to arm where possible. But I don't know whether is
> this possible with the DPDK build system.

I think both methodologies have their pros and cons. However, I'd lean 
towards the common directory with the "filename_32/64.h" scheme, as that 
similar to the x86 methodology, and we don't need to tweak the include 
paths to pull files from multiple directories.

Dave
  
Jerin Jacob Nov. 2, 2015, 4:29 p.m. UTC | #8
On Mon, Nov 02, 2015 at 03:49:17PM +0000, Hunt, David wrote:
> On 02/11/2015 15:36, Jan Viktorin wrote:
> >On Mon, 2 Nov 2015 15:26:19 +0000
> --snip--
> >It was looking like we can share a lot of common code for both
> >architectures. I didn't know how much different are the cpuflags.
> 
> CPU flags for ARMv8 are looking like this now. Quite different to the ARMv7
> ones.
> 
> static const struct feature_entry cpu_feature_table[] = {
>         FEAT_DEF(FP,        0x00000001, 0, REG_HWCAP,  0)
>         FEAT_DEF(ASIMD,     0x00000001, 0, REG_HWCAP,  1)
>         FEAT_DEF(EVTSTRM,   0x00000001, 0, REG_HWCAP,  2)
>         FEAT_DEF(AES,       0x00000001, 0, REG_HWCAP,  3)
>         FEAT_DEF(PMULL,     0x00000001, 0, REG_HWCAP,  4)
>         FEAT_DEF(SHA1,      0x00000001, 0, REG_HWCAP,  5)
>         FEAT_DEF(SHA2,      0x00000001, 0, REG_HWCAP,  6)
>         FEAT_DEF(CRC32,     0x00000001, 0, REG_HWCAP,  7)
>         FEAT_DEF(AARCH32,   0x00000001, 0, REG_PLATFORM, 0)
>         FEAT_DEF(AARCH64,   0x00000001, 0, REG_PLATFORM, 1)
> };
> 
> >IMHO, it'd be better to have two directories arm and arm64. I thought
> >to refer from arm64 to arm where possible. But I don't know whether is
> >this possible with the DPDK build system.
> 
> I think both methodologies have their pros and cons. However, I'd lean
> towards the common directory with the "filename_32/64.h" scheme, as that
> similar to the x86 methodology, and we don't need to tweak the include paths
> to pull files from multiple directories.
> 

I agree. Jan, could you please send the next version with
filename_32/64.h for atomic and cpuflags(ie for all header files).
I can re-base and send the complete arm64 patch based on your version.

Thanks,
Jerin



> Dave
>
  
Jan Viktorin Nov. 2, 2015, 5:29 p.m. UTC | #9
On Mon, 2 Nov 2015 21:59:12 +0530
Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:

> On Mon, Nov 02, 2015 at 03:49:17PM +0000, Hunt, David wrote:
> > On 02/11/2015 15:36, Jan Viktorin wrote:  
> > >On Mon, 2 Nov 2015 15:26:19 +0000  
> > --snip--  
> > >It was looking like we can share a lot of common code for both
> > >architectures. I didn't know how much different are the cpuflags.  
> > 
> > CPU flags for ARMv8 are looking like this now. Quite different to the ARMv7
> > ones.
> > 
> > static const struct feature_entry cpu_feature_table[] = {
> >         FEAT_DEF(FP,        0x00000001, 0, REG_HWCAP,  0)
> >         FEAT_DEF(ASIMD,     0x00000001, 0, REG_HWCAP,  1)
> >         FEAT_DEF(EVTSTRM,   0x00000001, 0, REG_HWCAP,  2)
> >         FEAT_DEF(AES,       0x00000001, 0, REG_HWCAP,  3)
> >         FEAT_DEF(PMULL,     0x00000001, 0, REG_HWCAP,  4)
> >         FEAT_DEF(SHA1,      0x00000001, 0, REG_HWCAP,  5)
> >         FEAT_DEF(SHA2,      0x00000001, 0, REG_HWCAP,  6)
> >         FEAT_DEF(CRC32,     0x00000001, 0, REG_HWCAP,  7)
> >         FEAT_DEF(AARCH32,   0x00000001, 0, REG_PLATFORM, 0)
> >         FEAT_DEF(AARCH64,   0x00000001, 0, REG_PLATFORM, 1)
> > };
> >   
> > >IMHO, it'd be better to have two directories arm and arm64. I thought
> > >to refer from arm64 to arm where possible. But I don't know whether is
> > >this possible with the DPDK build system.  
> > 
> > I think both methodologies have their pros and cons. However, I'd lean
> > towards the common directory with the "filename_32/64.h" scheme, as that
> > similar to the x86 methodology, and we don't need to tweak the include paths
> > to pull files from multiple directories.
> >   
> 
> I agree. Jan, could you please send the next version with
> filename_32/64.h for atomic and cpuflags(ie for all header files).
> I can re-base and send the complete arm64 patch based on your version.
> 

I am working on it, however, after I've removed the unnecessary
intrinsics code and set the RTE_FORCE_INTRINSICS=y, it doesn't
build... So I'm figuring out what is wrong.

Jan

> Thanks,
> Jerin
> 
> 
> 
> > Dave
> >
  

Patch

diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h
index d9f5bf1..1d562c3 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h
@@ -33,6 +33,10 @@ 
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#ifdef RTE_ARCH_64
+#include <rte_memcpy_64.h>
+#else
 #include <rte_memcpy_32.h>
+#endif
 
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
new file mode 100644
index 0000000..6d85113
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -0,0 +1,308 @@ 
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) IBM Corporation 2014.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IBM Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _RTE_MEMCPY_ARM_64_H_
+#define _RTE_MEMCPY_ARM_64_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+#ifdef __ARM_NEON_FP
+
+/* ARM NEON Intrinsics are used to copy data */
+#include <arm_neon.h>
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	asm volatile("LDP d0, d1, [%0]\n\t"
+		     "STP d0, d1, [%1]\n\t"
+		     : : "r" (src), "r" (dst) :
+	);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	asm volatile("LDP q0, q1, [%0]\n\t"
+		     "STP q0, q1, [%1]\n\t"
+		     : : "r" (src), "r" (dst) :
+	);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	asm volatile("LDP q0, q1, [%0]\n\t"
+		     "STP q0, q1, [%1]\n\t"
+		     "LDP d0, d1, [%0 , #32]\n\t"
+		     "STP d0, d1, [%1 , #32]\n\t"
+		     : : "r" (src), "r" (dst) :
+	);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	asm volatile("LDP q0, q1, [%0]\n\t"
+		     "STP q0, q1, [%1]\n\t"
+		     "LDP q0, q1, [%0 , #32]\n\t"
+		     "STP q0, q1, [%1 , #32]\n\t"
+		     : : "r" (src), "r" (dst) :
+	);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	asm volatile("LDP q0, q1, [%0]\n\t"
+		     "STP q0, q1, [%1]\n\t"
+		     "LDP q0, q1, [%0 , #32]\n\t"
+		     "STP q0, q1, [%1 , #32]\n\t"
+		     "LDP q0, q1, [%0 , #64]\n\t"
+		     "STP q0, q1, [%1 , #64]\n\t"
+		     "LDP q0, q1, [%0 , #96]\n\t"
+		     "STP q0, q1, [%1 , #96]\n\t"
+		     : : "r" (src), "r" (dst) :
+	);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	asm volatile("LDP q0, q1, [%0]\n\t"
+		     "STP q0, q1, [%1]\n\t"
+		     "LDP q0, q1, [%0 , #32]\n\t"
+		     "STP q0, q1, [%1 , #32]\n\t"
+		     "LDP q0, q1, [%0 , #64]\n\t"
+		     "STP q0, q1, [%1 , #64]\n\t"
+		     "LDP q0, q1, [%0 , #96]\n\t"
+		     "STP q0, q1, [%1 , #96]\n\t"
+		     "LDP q0, q1, [%0 , #128]\n\t"
+		     "STP q0, q1, [%1 , #128]\n\t"
+		     "LDP q0, q1, [%0 , #160]\n\t"
+		     "STP q0, q1, [%1 , #160]\n\t"
+		     "LDP q0, q1, [%0 , #192]\n\t"
+		     "STP q0, q1, [%1 , #192]\n\t"
+		     "LDP q0, q1, [%0 , #224]\n\t"
+		     "STP q0, q1, [%1 , #224]\n\t"
+		     : : "r" (src), "r" (dst) :
+	);
+}
+
+#define rte_memcpy(dst, src, n)              \
+	({ (__builtin_constant_p(n)) ?       \
+	memcpy((dst), (src), (n)) :          \
+	rte_memcpy_func((dst), (src), (n)); })
+
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+
+	/* We can't copy < 16 bytes using XMM registers so do it manually. */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dst = *(const uint8_t *)src;
+			dst = (uint8_t *)dst + 1;
+			src = (const uint8_t *)src + 1;
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			dst = (uint16_t *)dst + 1;
+			src = (const uint16_t *)src + 1;
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			dst = (uint32_t *)dst + 1;
+			src = (const uint32_t *)src + 1;
+		}
+		if (n & 0x08)
+			*(uint64_t *)dst = *(const uint64_t *)src;
+		return ret;
+	}
+
+	/* Special fast cases for <= 128 bytes */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+			(const uint8_t *)src - 32 + n);
+		return ret;
+	}
+
+	if (n <= 128) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov64((uint8_t *)dst - 64 + n,
+			(const uint8_t *)src - 64 + n);
+		return ret;
+	}
+
+	/*
+	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
+	 * copies was found to be faster than doing 128 and 32 byte copies as
+	 * well.
+	 */
+	for ( ; n >= 256; n -= 256) {
+		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+		dst = (uint8_t *)dst + 256;
+		src = (const uint8_t *)src + 256;
+	}
+
+	/*
+	 * We split the remaining bytes (which will be less than 256) into
+	 * 64byte (2^6) chunks.
+	 * Using incrementing integers in the case labels of a switch statement
+	 * enourages the compiler to use a jump table. To get incrementing
+	 * integers, we shift the 2 relevant bits to the LSB position to first
+	 * get decrementing integers, and then subtract.
+	 */
+	switch (3 - (n >> 6)) {
+	case 0x00:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;      /* fallthrough */
+	case 0x01:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;      /* fallthrough */
+	case 0x02:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;      /* fallthrough */
+	default:
+		break;
+	}
+
+	/*
+	 * We split the remaining bytes (which will be less than 64) into
+	 * 16byte (2^4) chunks, using the same switch structure as above.
+	 */
+	switch (3 - (n >> 4)) {
+	case 0x00:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;      /* fallthrough */
+	case 0x01:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;      /* fallthrough */
+	case 0x02:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;      /* fallthrough */
+	default:
+		break;
+	}
+
+	/* Copy any remaining bytes, without going beyond end of buffers */
+	if (n != 0)
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+	return ret;
+}
+
+#else
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
+{
+	return memcpy(dst, src, n);
+}
+
+#endif /* __ARM_NEON_FP */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_ARM_64_H_ */