get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/17753/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 17753,
    "url": "http://patches.dpdk.org/api/patches/17753/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/1481074266-4461-1-git-send-email-zhihong.wang@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1481074266-4461-1-git-send-email-zhihong.wang@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1481074266-4461-1-git-send-email-zhihong.wang@intel.com",
    "date": "2016-12-07T01:31:06",
    "name": "[dpdk-dev,v2] eal: optimize aligned rte_memcpy",
    "commit_ref": null,
    "pull_url": null,
    "state": "accepted",
    "archived": true,
    "hash": "99be54b782d42458a71aeed40dbd9a297a4c6584",
    "submitter": {
        "id": 156,
        "url": "http://patches.dpdk.org/api/people/156/?format=api",
        "name": "Zhihong Wang",
        "email": "zhihong.wang@intel.com"
    },
    "delegate": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/users/1/?format=api",
        "username": "tmonjalo",
        "first_name": "Thomas",
        "last_name": "Monjalon",
        "email": "thomas@monjalon.net"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/1481074266-4461-1-git-send-email-zhihong.wang@intel.com/mbox/",
    "series": [],
    "comments": "http://patches.dpdk.org/api/patches/17753/comments/",
    "check": "success",
    "checks": "http://patches.dpdk.org/api/patches/17753/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [IPv6:::1])\n\tby dpdk.org (Postfix) with ESMTP id 9577037B8;\n\tWed,  7 Dec 2016 09:40:54 +0100 (CET)",
            "from mga03.intel.com (mga03.intel.com [134.134.136.65])\n\tby dpdk.org (Postfix) with ESMTP id C3ED532A5\n\tfor <dev@dpdk.org>; Wed,  7 Dec 2016 09:40:52 +0100 (CET)",
            "from orsmga004.jf.intel.com ([10.7.209.38])\n\tby orsmga103.jf.intel.com with ESMTP; 07 Dec 2016 00:40:50 -0800",
            "from unknown (HELO dpdk5.sh.intel.com) ([10.239.128.211])\n\tby orsmga004.jf.intel.com with ESMTP; 07 Dec 2016 00:40:43 -0800"
        ],
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos;i=\"5.33,310,1477983600\"; d=\"scan'208\";a=\"37819902\"",
        "From": "Zhihong Wang <zhihong.wang@intel.com>",
        "To": "dev@dpdk.org",
        "Cc": "yuanhan.liu@linux.intel.com, thomas.monjalon@6wind.com,\n\tlei.a.yao@intel.com, Zhihong Wang <zhihong.wang@intel.com>",
        "Date": "Tue,  6 Dec 2016 20:31:06 -0500",
        "Message-Id": "<1481074266-4461-1-git-send-email-zhihong.wang@intel.com>",
        "X-Mailer": "git-send-email 2.7.4",
        "In-Reply-To": "<1480641582-56186-1-git-send-email-zhihong.wang@intel.com>",
        "References": "<1480641582-56186-1-git-send-email-zhihong.wang@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v2] eal: optimize aligned rte_memcpy",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "This patch optimizes rte_memcpy for well aligned cases, where both\ndst and src addr are aligned to maximum MOV width. It introduces a\ndedicated function called rte_memcpy_aligned to handle the aligned\ncases with simplified instruction stream. The existing rte_memcpy\nis renamed as rte_memcpy_generic. The selection between them 2 is\ndone at the entry of rte_memcpy.\n\nThe existing rte_memcpy is for generic cases, it handles unaligned\ncopies and make store aligned, it even makes load aligned for micro\narchitectures like Ivy Bridge. However alignment handling comes at\na price: It adds extra load/store instructions, which can cause\ncomplications sometime.\n\nDPDK Vhost memcpy with Mergeable Rx Buffer feature as an example:\nThe copy is aligned, and remote, and there is header write along\nwhich is also remote. In this case the memcpy instruction stream\nshould be simplified, to reduce extra load/store, therefore reduce\nthe probability of load/store buffer full caused pipeline stall, to\nlet the actual memcpy instructions be issued and let H/W prefetcher\ngoes to work as early as possible.\n\nThis patch is tested on Ivy Bridge, Haswell and Skylake, it provides\nup to 20% gain for Virtio Vhost PVP traffic, with packet size ranging\nfrom 64 to 1500 bytes.\n\nThe test can also be conducted without NIC, by setting loopback\ntraffic between Virtio and Vhost. For example, modify the macro\nTXONLY_DEF_PACKET_LEN to the requested packet size in testpmd.h,\nrebuild and start testpmd in both host and guest, then \"start\" on\none side and \"start tx_first 32\" on the other.\n\n\nSigned-off-by: Zhihong Wang <zhihong.wang@intel.com>\n---\n .../common/include/arch/x86/rte_memcpy.h           | 81 +++++++++++++++++++++-\n 1 file changed, 78 insertions(+), 3 deletions(-)",
    "diff": "diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h\nindex b3bfc23..b9785e8 100644\n--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h\n+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h\n@@ -69,6 +69,8 @@ rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));\n \n #ifdef RTE_MACHINE_CPUFLAG_AVX512F\n \n+#define ALIGNMENT_MASK 0x3F\n+\n /**\n  * AVX512 implementation below\n  */\n@@ -189,7 +191,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)\n }\n \n static inline void *\n-rte_memcpy(void *dst, const void *src, size_t n)\n+rte_memcpy_generic(void *dst, const void *src, size_t n)\n {\n \tuintptr_t dstu = (uintptr_t)dst;\n \tuintptr_t srcu = (uintptr_t)src;\n@@ -308,6 +310,8 @@ COPY_BLOCK_128_BACK63:\n \n #elif defined RTE_MACHINE_CPUFLAG_AVX2\n \n+#define ALIGNMENT_MASK 0x1F\n+\n /**\n  * AVX2 implementation below\n  */\n@@ -387,7 +391,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)\n }\n \n static inline void *\n-rte_memcpy(void *dst, const void *src, size_t n)\n+rte_memcpy_generic(void *dst, const void *src, size_t n)\n {\n \tuintptr_t dstu = (uintptr_t)dst;\n \tuintptr_t srcu = (uintptr_t)src;\n@@ -499,6 +503,8 @@ COPY_BLOCK_128_BACK31:\n \n #else /* RTE_MACHINE_CPUFLAG */\n \n+#define ALIGNMENT_MASK 0x0F\n+\n /**\n  * SSE & AVX implementation below\n  */\n@@ -677,7 +683,7 @@ __extension__ ({                                                      \\\n })\n \n static inline void *\n-rte_memcpy(void *dst, const void *src, size_t n)\n+rte_memcpy_generic(void *dst, const void *src, size_t n)\n {\n \t__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;\n \tuintptr_t dstu = (uintptr_t)dst;\n@@ -821,6 +827,75 @@ COPY_BLOCK_64_BACK15:\n \n #endif /* RTE_MACHINE_CPUFLAG */\n \n+static inline void *\n+rte_memcpy_aligned(void *dst, const void *src, size_t n)\n+{\n+\tvoid *ret = dst;\n+\n+\t/* Copy size <= 16 bytes */\n+\tif (n < 16) {\n+\t\tif (n & 0x01) {\n+\t\t\t*(uint8_t *)dst = *(const uint8_t *)src;\n+\t\t\tsrc = (const uint8_t *)src + 1;\n+\t\t\tdst = (uint8_t *)dst + 1;\n+\t\t}\n+\t\tif (n & 0x02) {\n+\t\t\t*(uint16_t *)dst = *(const uint16_t *)src;\n+\t\t\tsrc = (const uint16_t *)src + 1;\n+\t\t\tdst = (uint16_t *)dst + 1;\n+\t\t}\n+\t\tif (n & 0x04) {\n+\t\t\t*(uint32_t *)dst = *(const uint32_t *)src;\n+\t\t\tsrc = (const uint32_t *)src + 1;\n+\t\t\tdst = (uint32_t *)dst + 1;\n+\t\t}\n+\t\tif (n & 0x08)\n+\t\t\t*(uint64_t *)dst = *(const uint64_t *)src;\n+\n+\t\treturn ret;\n+\t}\n+\n+\t/* Copy 16 <= size <= 32 bytes */\n+\tif (n <= 32) {\n+\t\trte_mov16((uint8_t *)dst, (const uint8_t *)src);\n+\t\trte_mov16((uint8_t *)dst - 16 + n,\n+\t\t\t\t(const uint8_t *)src - 16 + n);\n+\n+\t\treturn ret;\n+\t}\n+\n+\t/* Copy 32 < size <= 64 bytes */\n+\tif (n <= 64) {\n+\t\trte_mov32((uint8_t *)dst, (const uint8_t *)src);\n+\t\trte_mov32((uint8_t *)dst - 32 + n,\n+\t\t\t\t(const uint8_t *)src - 32 + n);\n+\n+\t\treturn ret;\n+\t}\n+\n+\t/* Copy 64 bytes blocks */\n+\tfor (; n >= 64; n -= 64) {\n+\t\trte_mov64((uint8_t *)dst, (const uint8_t *)src);\n+\t\tdst = (uint8_t *)dst + 64;\n+\t\tsrc = (const uint8_t *)src + 64;\n+\t}\n+\n+\t/* Copy whatever left */\n+\trte_mov64((uint8_t *)dst - 64 + n,\n+\t\t\t(const uint8_t *)src - 64 + n);\n+\n+\treturn ret;\n+}\n+\n+static inline void *\n+rte_memcpy(void *dst, const void *src, size_t n)\n+{\n+\tif (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))\n+\t\treturn rte_memcpy_aligned(dst, src, n);\n+\telse\n+\t\treturn rte_memcpy_generic(dst, src, n);\n+}\n+\n #ifdef __cplusplus\n }\n #endif\n",
    "prefixes": [
        "dpdk-dev",
        "v2"
    ]
}