From patchwork Fri Oct 30 11:51:21 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Hunt, David" X-Patchwork-Id: 8364 Return-Path: X-Original-To: patchwork@dpdk.org Delivered-To: patchwork@dpdk.org Received: from [92.243.14.124] (localhost [IPv6:::1]) by dpdk.org (Postfix) with ESMTP id 5B5288E7B; Fri, 30 Oct 2015 12:51:54 +0100 (CET) Received: from mga11.intel.com (mga11.intel.com [192.55.52.93]) by dpdk.org (Postfix) with ESMTP id 99C7D8E6F for ; Fri, 30 Oct 2015 12:51:48 +0100 (CET) Received: from orsmga003.jf.intel.com ([10.7.209.27]) by fmsmga102.fm.intel.com with ESMTP; 30 Oct 2015 04:51:47 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.20,218,1444719600"; d="scan'208";a="675000173" Received: from irvmail001.ir.intel.com ([163.33.26.43]) by orsmga003.jf.intel.com with ESMTP; 30 Oct 2015 04:51:46 -0700 Received: from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com [10.237.217.46]) by irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id t9UBpj3O024772; Fri, 30 Oct 2015 11:51:45 GMT Received: from sivswdev02.ir.intel.com (localhost [127.0.0.1]) by sivswdev02.ir.intel.com with ESMTP id t9UBpjIP023884; Fri, 30 Oct 2015 11:51:45 GMT Received: (from dhunt5@localhost) by sivswdev02.ir.intel.com with id t9UBpjKJ023879; Fri, 30 Oct 2015 11:51:45 GMT From: David Hunt To: dev@dpdk.org Date: Fri, 30 Oct 2015 11:51:21 +0000 Message-Id: <1446205886-23823-2-git-send-email-david.hunt@intel.com> X-Mailer: git-send-email 1.7.4.1 In-Reply-To: <1446205886-23823-1-git-send-email-david.hunt@intel.com> References: <1446205886-23823-1-git-send-email-david.hunt@intel.com> Subject: [dpdk-dev] [PATCH v2 1/6] eal/arm: add 64-bit armv8 version of rte_memcpy.h X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Signed-off-by: David Hunt --- .../common/include/arch/arm/rte_memcpy.h | 4 + .../common/include/arch/arm/rte_memcpy_64.h | 308 +++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h index d9f5bf1..1d562c3 100644 --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy.h @@ -33,6 +33,10 @@ #ifndef _RTE_MEMCPY_ARM_H_ #define _RTE_MEMCPY_ARM_H_ +#ifdef RTE_ARCH_64 +#include +#else #include +#endif #endif /* _RTE_MEMCPY_ARM_H_ */ diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h new file mode 100644 index 0000000..6d85113 --- /dev/null +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h @@ -0,0 +1,308 @@ +/* + * BSD LICENSE + * + * Copyright (C) IBM Corporation 2014. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of IBM Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef _RTE_MEMCPY_ARM_64_H_ +#define _RTE_MEMCPY_ARM_64_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include "generic/rte_memcpy.h" + +#ifdef __ARM_NEON_FP + +/* ARM NEON Intrinsics are used to copy data */ +#include + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + asm volatile("LDP d0, d1, [%0]\n\t" + "STP d0, d1, [%1]\n\t" + : : "r" (src), "r" (dst) : + ); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + asm volatile("LDP q0, q1, [%0]\n\t" + "STP q0, q1, [%1]\n\t" + : : "r" (src), "r" (dst) : + ); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + asm volatile("LDP q0, q1, [%0]\n\t" + "STP q0, q1, [%1]\n\t" + "LDP d0, d1, [%0 , #32]\n\t" + "STP d0, d1, [%1 , #32]\n\t" + : : "r" (src), "r" (dst) : + ); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + asm volatile("LDP q0, q1, [%0]\n\t" + "STP q0, q1, [%1]\n\t" + "LDP q0, q1, [%0 , #32]\n\t" + "STP q0, q1, [%1 , #32]\n\t" + : : "r" (src), "r" (dst) : + ); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + asm volatile("LDP q0, q1, [%0]\n\t" + "STP q0, q1, [%1]\n\t" + "LDP q0, q1, [%0 , #32]\n\t" + "STP q0, q1, [%1 , #32]\n\t" + "LDP q0, q1, [%0 , #64]\n\t" + "STP q0, q1, [%1 , #64]\n\t" + "LDP q0, q1, [%0 , #96]\n\t" + "STP q0, q1, [%1 , #96]\n\t" + : : "r" (src), "r" (dst) : + ); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + asm volatile("LDP q0, q1, [%0]\n\t" + "STP q0, q1, [%1]\n\t" + "LDP q0, q1, [%0 , #32]\n\t" + "STP q0, q1, [%1 , #32]\n\t" + "LDP q0, q1, [%0 , #64]\n\t" + "STP q0, q1, [%1 , #64]\n\t" + "LDP q0, q1, [%0 , #96]\n\t" + "STP q0, q1, [%1 , #96]\n\t" + "LDP q0, q1, [%0 , #128]\n\t" + "STP q0, q1, [%1 , #128]\n\t" + "LDP q0, q1, [%0 , #160]\n\t" + "STP q0, q1, [%1 , #160]\n\t" + "LDP q0, q1, [%0 , #192]\n\t" + "STP q0, q1, [%1 , #192]\n\t" + "LDP q0, q1, [%0 , #224]\n\t" + "STP q0, q1, [%1 , #224]\n\t" + : : "r" (src), "r" (dst) : + ); +} + +#define rte_memcpy(dst, src, n) \ + ({ (__builtin_constant_p(n)) ? \ + memcpy((dst), (src), (n)) : \ + rte_memcpy_func((dst), (src), (n)); }) + +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* We can't copy < 16 bytes using XMM registers so do it manually. */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + dst = (uint8_t *)dst + 1; + src = (const uint8_t *)src + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + dst = (uint16_t *)dst + 1; + src = (const uint16_t *)src + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + dst = (uint32_t *)dst + 1; + src = (const uint32_t *)src + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + return ret; + } + + /* Special fast cases for <= 128 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + return ret; + } + + if (n <= 128) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + return ret; + } + + /* + * For large copies > 128 bytes. This combination of 256, 64 and 16 byte + * copies was found to be faster than doing 128 and 32 byte copies as + * well. + */ + for ( ; n >= 256; n -= 256) { + rte_mov256((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 256; + src = (const uint8_t *)src + 256; + } + + /* + * We split the remaining bytes (which will be less than 256) into + * 64byte (2^6) chunks. + * Using incrementing integers in the case labels of a switch statement + * enourages the compiler to use a jump table. To get incrementing + * integers, we shift the 2 relevant bits to the LSB position to first + * get decrementing integers, and then subtract. + */ + switch (3 - (n >> 6)) { + case 0x00: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x01: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + case 0x02: + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + n -= 64; + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; /* fallthrough */ + default: + break; + } + + /* + * We split the remaining bytes (which will be less than 64) into + * 16byte (2^4) chunks, using the same switch structure as above. + */ + switch (3 - (n >> 4)) { + case 0x00: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x01: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + case 0x02: + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + n -= 16; + dst = (uint8_t *)dst + 16; + src = (const uint8_t *)src + 16; /* fallthrough */ + default: + break; + } + + /* Copy any remaining bytes, without going beyond end of buffers */ + if (n != 0) + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; +} + +#else + +static inline void +rte_mov16(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 16); +} + +static inline void +rte_mov32(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 32); +} + +static inline void +rte_mov48(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 48); +} + +static inline void +rte_mov64(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 64); +} + +static inline void +rte_mov128(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 128); +} + +static inline void +rte_mov256(uint8_t *dst, const uint8_t *src) +{ + memcpy(dst, src, 256); +} + +static inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + return memcpy(dst, src, n); +} + +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) +{ + return memcpy(dst, src, n); +} + +#endif /* __ARM_NEON_FP */ + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_MEMCPY_ARM_64_H_ */