get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.
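
As a quick usage sketch of the read path (assuming Python's requests library — any HTTP client works the same way; write access via PUT/PATCH additionally needs an authenticated account with maintainer rights, typically via an API token):

    import requests

    # Fetch the patch record shown below.
    resp = requests.get("http://patches.dpdk.org/api/patches/27911/")
    resp.raise_for_status()
    patch = resp.json()

    print(patch["name"])   # "[dpdk-dev,1/3] eal/x86: run-time dispatch over memcpy"
    print(patch["state"])  # "superseded"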

GET /api/patches/27911/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 27911,
    "url": "http://patches.dpdk.org/api/patches/27911/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/1503626773-184682-2-git-send-email-xiaoyun.li@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1503626773-184682-2-git-send-email-xiaoyun.li@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1503626773-184682-2-git-send-email-xiaoyun.li@intel.com",
    "date": "2017-08-25T02:06:11",
    "name": "[dpdk-dev,1/3] eal/x86: run-time dispatch over memcpy",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "569a6e27472248b801b940c3a2d197886b6e4c2c",
    "submitter": {
        "id": 798,
        "url": "http://patches.dpdk.org/api/people/798/?format=api",
        "name": "Li, Xiaoyun",
        "email": "xiaoyun.li@intel.com"
    },
    "delegate": null,
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/1503626773-184682-2-git-send-email-xiaoyun.li@intel.com/mbox/",
    "series": [],
    "comments": "http://patches.dpdk.org/api/patches/27911/comments/",
    "check": "fail",
    "checks": "http://patches.dpdk.org/api/patches/27911/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [IPv6:::1])\n\tby dpdk.org (Postfix) with ESMTP id 81F7A7D56;\n\tFri, 25 Aug 2017 04:07:05 +0200 (CEST)",
            "from mga09.intel.com (mga09.intel.com [134.134.136.24])\n\tby dpdk.org (Postfix) with ESMTP id DC50C7D22\n\tfor <dev@dpdk.org>; Fri, 25 Aug 2017 04:07:03 +0200 (CEST)",
            "from fmsmga001.fm.intel.com ([10.253.24.23])\n\tby orsmga102.jf.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384;\n\t24 Aug 2017 19:07:02 -0700",
            "from dpdk-lixiaoyun.sh.intel.com ([10.67.111.119])\n\tby fmsmga001.fm.intel.com with ESMTP; 24 Aug 2017 19:07:01 -0700"
        ],
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos; i=\"5.41,423,1498546800\"; d=\"scan'208\";\n\ta=\"1187946310\"",
        "From": "Xiaoyun Li <xiaoyun.li@intel.com>",
        "To": "bruce.richardson@intel.com",
        "Cc": "dev@dpdk.org, wenzhuo.lu@intel.com, zhihong.wang@intel.com,\n\tqi.z.zhang@intel.com, Xiaoyun Li <xiaoyun.li@intel.com>",
        "Date": "Fri, 25 Aug 2017 10:06:11 +0800",
        "Message-Id": "<1503626773-184682-2-git-send-email-xiaoyun.li@intel.com>",
        "X-Mailer": "git-send-email 2.7.4",
        "In-Reply-To": "<1503626773-184682-1-git-send-email-xiaoyun.li@intel.com>",
        "References": "<1503626773-184682-1-git-send-email-xiaoyun.li@intel.com>",
        "Subject": "[dpdk-dev] [PATCH 1/3] eal/x86: run-time dispatch over memcpy",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "This patch dynamically selects functions of memcpy at run-time based\non CPU flags that current machine supports. This patch uses function\npointers which are bind to the relative functions at constrctor time.\nTo make AVX512 instructions pass compilation, enable the switch in\nmakefile.\n\nSigned-off-by: Xiaoyun Li <xiaoyun.li@intel.com>\n---\n .../common/include/arch/x86/rte_memcpy.h           | 305 ++++++++++++---------\n mk/machine/native/rte.vars.mk                      |   2 +\n 2 files changed, 181 insertions(+), 126 deletions(-)",
    "diff": "diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h\nindex 74c280c..f68ebd2 100644\n--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h\n+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h\n@@ -45,11 +45,37 @@\n #include <string.h>\n #include <rte_vect.h>\n #include <rte_common.h>\n+#include <rte_cpuflags.h>\n+#include <rte_log.h>\n \n #ifdef __cplusplus\n extern \"C\" {\n #endif\n \n+/*\n+ * Select SSE/AVX memory copy method as default one.\n+ */\n+\n+static uint16_t alignment_mask = 0x0F;\n+\n+typedef void (*rte_mov16_t)(uint8_t *dst, const uint8_t *src);\n+typedef void (*rte_mov32_t)(uint8_t *dst, const uint8_t *src);\n+typedef void (*rte_mov64_t)(uint8_t *dst, const uint8_t *src);\n+typedef void (*rte_mov128_t)(uint8_t *dst, const uint8_t *src);\n+typedef void (*rte_mov256_t)(uint8_t *dst, const uint8_t *src);\n+typedef void (*rte_mov128blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);\n+typedef void (*rte_mov512blocks_t)(uint8_t *dst, const uint8_t *src, size_t n);\n+typedef void * (*rte_memcpy_generic_t)(void *dst, const void *src, size_t n);\n+\n+static rte_mov16_t rte_mov16;\n+static rte_mov32_t rte_mov32;\n+static rte_mov64_t rte_mov64;\n+static rte_mov128_t rte_mov128;\n+static rte_mov256_t rte_mov256;\n+static rte_mov128blocks_t rte_mov128blocks;\n+static rte_mov512blocks_t rte_mov512blocks;\n+static rte_memcpy_generic_t rte_memcpy_generic;\n+\n /**\n  * Copy bytes from one location to another. The locations must not overlap.\n  *\n@@ -68,10 +94,6 @@ extern \"C\" {\n static __rte_always_inline void *\n rte_memcpy(void *dst, const void *src, size_t n);\n \n-#ifdef RTE_MACHINE_CPUFLAG_AVX512F\n-\n-#define ALIGNMENT_MASK 0x3F\n-\n /**\n  * AVX512 implementation below\n  */\n@@ -81,7 +103,7 @@ rte_memcpy(void *dst, const void *src, size_t n);\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov16(uint8_t *dst, const uint8_t *src)\n+rte_mov16_AVX512F(uint8_t *dst, const uint8_t *src)\n {\n \t__m128i xmm0;\n \n@@ -94,7 +116,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov32(uint8_t *dst, const uint8_t *src)\n+rte_mov32_AVX512F(uint8_t *dst, const uint8_t *src)\n {\n \t__m256i ymm0;\n \n@@ -107,7 +129,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov64(uint8_t *dst, const uint8_t *src)\n+rte_mov64_AVX512F(uint8_t *dst, const uint8_t *src)\n {\n \t__m512i zmm0;\n \n@@ -120,10 +142,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov128(uint8_t *dst, const uint8_t *src)\n+rte_mov128_AVX512F(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov64(dst + 0 * 64, src + 0 * 64);\n-\trte_mov64(dst + 1 * 64, src + 1 * 64);\n+\t(*rte_mov64)(dst + 0 * 64, src + 0 * 64);\n+\t(*rte_mov64)(dst + 1 * 64, src + 1 * 64);\n }\n \n /**\n@@ -131,12 +153,12 @@ rte_mov128(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov256(uint8_t *dst, const uint8_t *src)\n+rte_mov256_AVX512F(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov64(dst + 0 * 64, src + 0 * 64);\n-\trte_mov64(dst + 1 * 64, src + 1 * 64);\n-\trte_mov64(dst + 2 * 64, src + 2 * 64);\n-\trte_mov64(dst + 3 * 64, src + 3 * 64);\n+\t(*rte_mov64)(dst + 0 * 64, src + 0 * 64);\n+\t(*rte_mov64)(dst + 1 * 64, src + 1 * 64);\n+\t(*rte_mov64)(dst + 2 * 64, src + 2 * 
64);\n+\t(*rte_mov64)(dst + 3 * 64, src + 3 * 64);\n }\n \n /**\n@@ -144,7 +166,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)\n+rte_mov128blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)\n {\n \t__m512i zmm0, zmm1;\n \n@@ -164,7 +186,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)\n+rte_mov512blocks_AVX512F(uint8_t *dst, const uint8_t *src, size_t n)\n {\n \t__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;\n \n@@ -192,7 +214,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)\n }\n \n static inline void *\n-rte_memcpy_generic(void *dst, const void *src, size_t n)\n+rte_memcpy_generic_AVX512F(void *dst, const void *src, size_t n)\n {\n \tuintptr_t dstu = (uintptr_t)dst;\n \tuintptr_t srcu = (uintptr_t)src;\n@@ -228,39 +250,39 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \t * Fast way when copy size doesn't exceed 512 bytes\n \t */\n \tif (n <= 32) {\n-\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst - 16 + n,\n+\t\t(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n \t\t\t\t  (const uint8_t *)src - 16 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 64) {\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov32((uint8_t *)dst - 32 + n,\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov32)((uint8_t *)dst - 32 + n,\n \t\t\t\t  (const uint8_t *)src - 32 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 512) {\n \t\tif (n >= 256) {\n \t\t\tn -= 256;\n-\t\t\trte_mov256((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc = (const uint8_t *)src + 256;\n \t\t\tdst = (uint8_t *)dst + 256;\n \t\t}\n \t\tif (n >= 128) {\n \t\t\tn -= 128;\n-\t\t\trte_mov128((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc = (const uint8_t *)src + 128;\n \t\t\tdst = (uint8_t *)dst + 128;\n \t\t}\n COPY_BLOCK_128_BACK63:\n \t\tif (n > 64) {\n-\t\t\trte_mov64((uint8_t *)dst, (const uint8_t *)src);\n-\t\t\trte_mov64((uint8_t *)dst - 64 + n,\n+\t\t\t(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov64)((uint8_t *)dst - 64 + n,\n \t\t\t\t\t  (const uint8_t *)src - 64 + n);\n \t\t\treturn ret;\n \t\t}\n \t\tif (n > 0)\n-\t\t\trte_mov64((uint8_t *)dst - 64 + n,\n+\t\t\t(*rte_mov64)((uint8_t *)dst - 64 + n,\n \t\t\t\t\t  (const uint8_t *)src - 64 + n);\n \t\treturn ret;\n \t}\n@@ -272,7 +294,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tif (dstofss > 0) {\n \t\tdstofss = 64 - dstofss;\n \t\tn -= dstofss;\n-\t\trte_mov64((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);\n \t\tsrc = (const uint8_t *)src + dstofss;\n \t\tdst = (uint8_t *)dst + dstofss;\n \t}\n@@ -282,7 +304,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \t * Use copy block function for better instruction order control,\n \t * which is important when load is unaligned.\n \t */\n-\trte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);\n+\t(*rte_mov512blocks)((uint8_t *)dst, (const uint8_t *)src, n);\n \tbits = n;\n \tn = n & 511;\n \tbits -= n;\n@@ -295,7 +317,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n 
\t * which is important when load is unaligned.\n \t */\n \tif (n >= 128) {\n-\t\trte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);\n+\t\t(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);\n \t\tbits = n;\n \t\tn = n & 127;\n \t\tbits -= n;\n@@ -309,10 +331,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tgoto COPY_BLOCK_128_BACK63;\n }\n \n-#elif defined RTE_MACHINE_CPUFLAG_AVX2\n-\n-#define ALIGNMENT_MASK 0x1F\n-\n /**\n  * AVX2 implementation below\n  */\n@@ -322,7 +340,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov16(uint8_t *dst, const uint8_t *src)\n+rte_mov16_AVX2(uint8_t *dst, const uint8_t *src)\n {\n \t__m128i xmm0;\n \n@@ -335,7 +353,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov32(uint8_t *dst, const uint8_t *src)\n+rte_mov32_AVX2(uint8_t *dst, const uint8_t *src)\n {\n \t__m256i ymm0;\n \n@@ -348,10 +366,10 @@ rte_mov32(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov64(uint8_t *dst, const uint8_t *src)\n+rte_mov64_AVX2(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);\n-\trte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);\n+\t(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);\n+\t(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);\n }\n \n /**\n@@ -359,12 +377,12 @@ rte_mov64(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov128(uint8_t *dst, const uint8_t *src)\n+rte_mov128_AVX2(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);\n-\trte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);\n-\trte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);\n-\trte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);\n+\t(*rte_mov32)((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);\n+\t(*rte_mov32)((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);\n+\t(*rte_mov32)((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);\n+\t(*rte_mov32)((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);\n }\n \n /**\n@@ -372,7 +390,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)\n+rte_mov128blocks_AVX2(uint8_t *dst, const uint8_t *src, size_t n)\n {\n \t__m256i ymm0, ymm1, ymm2, ymm3;\n \n@@ -392,7 +410,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)\n }\n \n static inline void *\n-rte_memcpy_generic(void *dst, const void *src, size_t n)\n+rte_memcpy_generic_AVX2(void *dst, const void *src, size_t n)\n {\n \tuintptr_t dstu = (uintptr_t)dst;\n \tuintptr_t srcu = (uintptr_t)src;\n@@ -429,46 +447,46 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \t * Fast way when copy size doesn't exceed 256 bytes\n \t */\n \tif (n <= 32) {\n-\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst - 16 + n,\n+\t\t(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n \t\t\t\t(const uint8_t *)src - 16 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 48) {\n-\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 
16);\n-\t\trte_mov16((uint8_t *)dst - 16 + n,\n+\t\t(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst + 16, (const uint8_t *)src + 16);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n \t\t\t\t(const uint8_t *)src - 16 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 64) {\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov32((uint8_t *)dst - 32 + n,\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov32)((uint8_t *)dst - 32 + n,\n \t\t\t\t(const uint8_t *)src - 32 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 256) {\n \t\tif (n >= 128) {\n \t\t\tn -= 128;\n-\t\t\trte_mov128((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc = (const uint8_t *)src + 128;\n \t\t\tdst = (uint8_t *)dst + 128;\n \t\t}\n COPY_BLOCK_128_BACK31:\n \t\tif (n >= 64) {\n \t\t\tn -= 64;\n-\t\t\trte_mov64((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc = (const uint8_t *)src + 64;\n \t\t\tdst = (uint8_t *)dst + 64;\n \t\t}\n \t\tif (n > 32) {\n-\t\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n-\t\t\trte_mov32((uint8_t *)dst - 32 + n,\n+\t\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov32)((uint8_t *)dst - 32 + n,\n \t\t\t\t\t(const uint8_t *)src - 32 + n);\n \t\t\treturn ret;\n \t\t}\n \t\tif (n > 0) {\n-\t\t\trte_mov32((uint8_t *)dst - 32 + n,\n+\t\t\t(*rte_mov32)((uint8_t *)dst - 32 + n,\n \t\t\t\t\t(const uint8_t *)src - 32 + n);\n \t\t}\n \t\treturn ret;\n@@ -481,7 +499,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tif (dstofss > 0) {\n \t\tdstofss = 32 - dstofss;\n \t\tn -= dstofss;\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n \t\tsrc = (const uint8_t *)src + dstofss;\n \t\tdst = (uint8_t *)dst + dstofss;\n \t}\n@@ -489,7 +507,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \t/**\n \t * Copy 128-byte blocks\n \t */\n-\trte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);\n+\t(*rte_mov128blocks)((uint8_t *)dst, (const uint8_t *)src, n);\n \tbits = n;\n \tn = n & 127;\n \tbits -= n;\n@@ -502,10 +520,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tgoto COPY_BLOCK_128_BACK31;\n }\n \n-#else /* RTE_MACHINE_CPUFLAG */\n-\n-#define ALIGNMENT_MASK 0x0F\n-\n /**\n  * SSE & AVX implementation below\n  */\n@@ -515,7 +529,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov16(uint8_t *dst, const uint8_t *src)\n+rte_mov16_DEFAULT(uint8_t *dst, const uint8_t *src)\n {\n \t__m128i xmm0;\n \n@@ -528,10 +542,10 @@ rte_mov16(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov32(uint8_t *dst, const uint8_t *src)\n+rte_mov32_DEFAULT(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n-\trte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n }\n \n /**\n@@ -539,12 +553,12 @@ rte_mov32(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov64(uint8_t *dst, const uint8_t *src)\n+rte_mov64_DEFAULT(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 
16);\n-\trte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n-\trte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);\n-\trte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);\n }\n \n /**\n@@ -552,16 +566,16 @@ rte_mov64(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov128(uint8_t *dst, const uint8_t *src)\n+rte_mov128_DEFAULT(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n-\trte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n-\trte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);\n-\trte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);\n-\trte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);\n-\trte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);\n-\trte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);\n-\trte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);\n }\n \n /**\n@@ -569,24 +583,24 @@ rte_mov128(uint8_t *dst, const uint8_t *src)\n  * locations should not overlap.\n  */\n static inline void\n-rte_mov256(uint8_t *dst, const uint8_t *src)\n+rte_mov256_DEFAULT(uint8_t *dst, const uint8_t *src)\n {\n-\trte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n-\trte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n-\trte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);\n-\trte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);\n-\trte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);\n-\trte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);\n-\trte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);\n-\trte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);\n-\trte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);\n-\trte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);\n-\trte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);\n-\trte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);\n-\trte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);\n-\trte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);\n-\trte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);\n-\trte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 
16);\n+\t(*rte_mov16)((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);\n+\t(*rte_mov16)((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);\n }\n \n /**\n@@ -684,7 +698,7 @@ __extension__ ({                                                      \\\n })\n \n static inline void *\n-rte_memcpy_generic(void *dst, const void *src, size_t n)\n+rte_memcpy_generic_DEFAULT(void *dst, const void *src, size_t n)\n {\n \t__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;\n \tuintptr_t dstu = (uintptr_t)dst;\n@@ -722,19 +736,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \t * Fast way when copy size doesn't exceed 512 bytes\n \t */\n \tif (n <= 32) {\n-\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);\n+\t\t(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n+\t\t\t\t(const uint8_t *)src - 16 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 48) {\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n+\t\t\t\t(const uint8_t *)src - 16 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 64) {\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);\n-\t\trte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst + 32, (const uint8_t *)src + 32);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n+\t\t\t\t(const uint8_t *)src - 16 + n);\n \t\treturn ret;\n \t}\n \tif (n <= 128) {\n@@ -743,39 +760,42 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tif (n <= 512) {\n \t\tif (n >= 256) {\n \t\t\tn -= 256;\n-\t\t\trte_mov128((uint8_t *)dst, (const uint8_t *)src);\n-\t\t\trte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);\n+\t\t\t(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov128)((uint8_t *)dst + 128,\n+\t\t\t\t\t(const uint8_t *)src + 128);\n \t\t\tsrc = (const uint8_t *)src + 256;\n \t\t\tdst = (uint8_t *)dst + 256;\n \t\t}\n COPY_BLOCK_255_BACK15:\n \t\tif (n >= 128) {\n \t\t\tn -= 128;\n-\t\t\trte_mov128((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov128)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc = (const uint8_t *)src + 128;\n \t\t\tdst = (uint8_t *)dst + 128;\n \t\t}\n COPY_BLOCK_128_BACK15:\n \t\tif (n >= 64) {\n \t\t\tn -= 64;\n-\t\t\trte_mov64((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc 
= (const uint8_t *)src + 64;\n \t\t\tdst = (uint8_t *)dst + 64;\n \t\t}\n COPY_BLOCK_64_BACK15:\n \t\tif (n >= 32) {\n \t\t\tn -= 32;\n-\t\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tsrc = (const uint8_t *)src + 32;\n \t\t\tdst = (uint8_t *)dst + 32;\n \t\t}\n \t\tif (n > 16) {\n-\t\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n-\t\t\trte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);\n+\t\t\t(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n+\t\t\t\t\t(const uint8_t *)src - 16 + n);\n \t\t\treturn ret;\n \t\t}\n \t\tif (n > 0) {\n-\t\t\trte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);\n+\t\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n+\t\t\t\t\t(const uint8_t *)src - 16 + n);\n \t\t}\n \t\treturn ret;\n \t}\n@@ -790,7 +810,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tif (dstofss > 0) {\n \t\tdstofss = 16 - dstofss + 16;\n \t\tn -= dstofss;\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n \t\tsrc = (const uint8_t *)src + dstofss;\n \t\tdst = (uint8_t *)dst + dstofss;\n \t}\n@@ -804,7 +824,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \t\t * Copy 256-byte blocks\n \t\t */\n \t\tfor (; n >= 256; n -= 256) {\n-\t\t\trte_mov256((uint8_t *)dst, (const uint8_t *)src);\n+\t\t\t(*rte_mov256)((uint8_t *)dst, (const uint8_t *)src);\n \t\t\tdst = (uint8_t *)dst + 256;\n \t\t\tsrc = (const uint8_t *)src + 256;\n \t\t}\n@@ -826,7 +846,40 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)\n \tgoto COPY_BLOCK_64_BACK15;\n }\n \n-#endif /* RTE_MACHINE_CPUFLAG */\n+static void __attribute__((constructor))\n+rte_memcpy_init(void)\n+{\n+\tif (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {\n+\t\talignment_mask = 0x3F;\n+\t\trte_mov16 = rte_mov16_AVX512F;\n+\t\trte_mov32 = rte_mov32_AVX512F;\n+\t\trte_mov64 = rte_mov64_AVX512F;\n+\t\trte_mov128 = rte_mov128_AVX512F;\n+\t\trte_mov256 = rte_mov256_AVX512F;\n+\t\trte_mov128blocks = rte_mov128blocks_AVX512F;\n+\t\trte_mov512blocks = rte_mov512blocks_AVX512F;\n+\t\trte_memcpy_generic = rte_memcpy_generic_AVX512F;\n+\t\tRTE_LOG(INFO, EAL, \"AVX512 implementation of memcpy() is using!\\n\");\n+\t} else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {\n+\t\talignment_mask = 0x1F;\n+\t\trte_mov16 = rte_mov16_AVX2;\n+\t\trte_mov32 = rte_mov32_AVX2;\n+\t\trte_mov64 = rte_mov64_AVX2;\n+\t\trte_mov128 = rte_mov128_AVX2;\n+\t\trte_mov128blocks = rte_mov128blocks_AVX2;\n+\t\trte_memcpy_generic = rte_memcpy_generic_AVX2;\n+\t\tRTE_LOG(INFO, EAL, \"AVX2 implementation of memcpy() is using!\\n\");\n+\t} else {\n+\t\talignment_mask = 0x0F;\n+\t\trte_mov16 = rte_mov16_DEFAULT;\n+\t\trte_mov32 = rte_mov32_DEFAULT;\n+\t\trte_mov64 = rte_mov64_DEFAULT;\n+\t\trte_mov128 = rte_mov128_DEFAULT;\n+\t\trte_mov256 = rte_mov256_DEFAULT;\n+\t\trte_memcpy_generic = rte_memcpy_generic_DEFAULT;\n+\t\tRTE_LOG(INFO, EAL, \"Default SSE/AVX implementation of memcpy() is using!\\n\");\n+\t}\n+}\n \n static inline void *\n rte_memcpy_aligned(void *dst, const void *src, size_t n)\n@@ -858,8 +911,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)\n \n \t/* Copy 16 <= size <= 32 bytes */\n \tif (n <= 32) {\n-\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov16((uint8_t *)dst - 16 + n,\n+\t\t(*rte_mov16)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov16)((uint8_t *)dst - 16 + n,\n \t\t\t\t(const 
uint8_t *)src - 16 + n);\n \n \t\treturn ret;\n@@ -867,8 +920,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)\n \n \t/* Copy 32 < size <= 64 bytes */\n \tif (n <= 64) {\n-\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n-\t\trte_mov32((uint8_t *)dst - 32 + n,\n+\t\t(*rte_mov32)((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov32)((uint8_t *)dst - 32 + n,\n \t\t\t\t(const uint8_t *)src - 32 + n);\n \n \t\treturn ret;\n@@ -876,13 +929,13 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)\n \n \t/* Copy 64 bytes blocks */\n \tfor (; n >= 64; n -= 64) {\n-\t\trte_mov64((uint8_t *)dst, (const uint8_t *)src);\n+\t\t(*rte_mov64)((uint8_t *)dst, (const uint8_t *)src);\n \t\tdst = (uint8_t *)dst + 64;\n \t\tsrc = (const uint8_t *)src + 64;\n \t}\n \n \t/* Copy whatever left */\n-\trte_mov64((uint8_t *)dst - 64 + n,\n+\t(*rte_mov64)((uint8_t *)dst - 64 + n,\n \t\t\t(const uint8_t *)src - 64 + n);\n \n \treturn ret;\n@@ -891,10 +944,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)\n static inline void *\n rte_memcpy(void *dst, const void *src, size_t n)\n {\n-\tif (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))\n+\tif (!(((uintptr_t)dst | (uintptr_t)src) & alignment_mask))\n \t\treturn rte_memcpy_aligned(dst, src, n);\n \telse\n-\t\treturn rte_memcpy_generic(dst, src, n);\n+\t\treturn (*rte_memcpy_generic)(dst, src, n);\n }\n \n #ifdef __cplusplus\ndiff --git a/mk/machine/native/rte.vars.mk b/mk/machine/native/rte.vars.mk\nindex f7d98d0..cdcf6c6 100644\n--- a/mk/machine/native/rte.vars.mk\n+++ b/mk/machine/native/rte.vars.mk\n@@ -65,3 +65,5 @@ SSE42_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null | grep SSE4_2)\n ifeq ($(SSE42_SUPPORT),)\n     MACHINE_CFLAGS = -march=corei7\n endif\n+\n+MACHINE_CFLAGS += -mavx512f\n",
    "prefixes": [
        "dpdk-dev",
        "1/3"
    ]
}
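
The record above carries links to related resources. A sketch of following them (again assuming Python's requests; "mbox", "checks", and "check" are the exact field names in the JSON above, while the per-check fields "context" and "state" follow Patchwork's usual check schema and are worth verifying against the checks URL):

    import requests

    patch = requests.get("http://patches.dpdk.org/api/patches/27911/").json()

    # "mbox" is the raw patch email; `git am` can apply it in a DPDK checkout.
    with open("27911.mbox", "w") as f:
        f.write(requests.get(patch["mbox"]).text)
    # subprocess.run(["git", "am", "27911.mbox"], check=True)  # illustrative only

    # The aggregate "check" above is "fail"; per-check results live behind
    # the "checks" URL.
    for check in requests.get(patch["checks"]).json():
        print(check.get("context"), check.get("state"))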