get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/77385/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 77385,
    "url": "http://patches.dpdk.org/api/patches/77385/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20200911120906.45995-3-joyce.kong@arm.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20200911120906.45995-3-joyce.kong@arm.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20200911120906.45995-3-joyce.kong@arm.com",
    "date": "2020-09-11T12:09:05",
    "name": "[RFC,2/3] net/virtio: add vectorized packed ring Rx NEON path",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "b1c33055eda4c419fbfb76a1f57dffbbaabb7f4e",
    "submitter": {
        "id": 970,
        "url": "http://patches.dpdk.org/api/people/970/?format=api",
        "name": "Joyce Kong",
        "email": "joyce.kong@arm.com"
    },
    "delegate": {
        "id": 2642,
        "url": "http://patches.dpdk.org/api/users/2642/?format=api",
        "username": "mcoquelin",
        "first_name": "Maxime",
        "last_name": "Coquelin",
        "email": "maxime.coquelin@redhat.com"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20200911120906.45995-3-joyce.kong@arm.com/mbox/",
    "series": [
        {
            "id": 12147,
            "url": "http://patches.dpdk.org/api/series/12147/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=12147",
            "date": "2020-09-11T12:09:03",
            "name": "Vectorize packed ring RX path with NEON",
            "version": 1,
            "mbox": "http://patches.dpdk.org/series/12147/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/77385/comments/",
    "check": "success",
    "checks": "http://patches.dpdk.org/api/patches/77385/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id C8BD4A04B5;\n\tFri, 11 Sep 2020 14:09:35 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 506571C112;\n\tFri, 11 Sep 2020 14:09:25 +0200 (CEST)",
            "from foss.arm.com (foss.arm.com [217.140.110.172])\n by dpdk.org (Postfix) with ESMTP id 6D2351C0DC\n for <dev@dpdk.org>; Fri, 11 Sep 2020 14:09:23 +0200 (CEST)",
            "from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14])\n by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id ECF991396;\n Fri, 11 Sep 2020 05:09:22 -0700 (PDT)",
            "from net-arm-thunderx2-03.shanghai.arm.com\n (net-arm-thunderx2-03.shanghai.arm.com [10.169.210.123])\n by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 785673F68F;\n Fri, 11 Sep 2020 05:09:20 -0700 (PDT)"
        ],
        "From": "Joyce Kong <joyce.kong@arm.com>",
        "To": "maxime.coquelin@redhat.com",
        "Cc": "jerinj@marvell.com, dev@dpdk.org, nd@arm.com,\n honnappa.nagarahalli@arm.com,\n ruifeng.wang@arm.com, phil.yang@arm.com",
        "Date": "Fri, 11 Sep 2020 20:09:05 +0800",
        "Message-Id": "<20200911120906.45995-3-joyce.kong@arm.com>",
        "X-Mailer": "git-send-email 2.28.0",
        "In-Reply-To": "<20200911120906.45995-1-joyce.kong@arm.com>",
        "References": "<20200911120906.45995-1-joyce.kong@arm.com>",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "Subject": "[dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON\n\tpath",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Optimize packed ring Rx batch path with NEON instructions.\n\nSigned-off-by: Joyce Kong <joyce.kong@arm.com>\n---\n drivers/net/virtio/meson.build               |   1 +\n drivers/net/virtio/virtio_rxtx.c             |   7 +-\n drivers/net/virtio/virtio_rxtx_packed.h      |  16 ++\n drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++++++++\n 4 files changed, 224 insertions(+), 2 deletions(-)\n create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c",
    "diff": "diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build\nindex e1851b0a6..5af633686 100644\n--- a/drivers/net/virtio/meson.build\n+++ b/drivers/net/virtio/meson.build\n@@ -34,6 +34,7 @@ elif arch_subdir == 'ppc'\n \tsources += files('virtio_rxtx_simple_altivec.c')\n elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64')\n \tsources += files('virtio_rxtx_simple_neon.c')\n+\tsources += files('virtio_rxtx_packed_neon.c')\n endif\n \n if is_linux\ndiff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c\nindex f915b8a2c..1deb77569 100644\n--- a/drivers/net/virtio/virtio_rxtx.c\n+++ b/drivers/net/virtio/virtio_rxtx.c\n@@ -2020,7 +2020,8 @@ virtio_xmit_pkts_inorder(void *tx_queue,\n \treturn nb_tx;\n }\n \n-#ifndef CC_AVX512_SUPPORT\n+#if !defined(CC_AVX512_SUPPORT) && !defined(RTE_ARCH_ARM) && \\\n+\t!defined(RTE_ARCH_ARM64)\n uint16_t\n virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,\n \t\t\t    struct rte_mbuf **rx_pkts __rte_unused,\n@@ -2028,7 +2029,9 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,\n {\n \treturn 0;\n }\n+#endif\n \n+#if !defined(CC_AVX512_SUPPORT)\n uint16_t\n virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,\n \t\t\t    struct rte_mbuf **tx_pkts __rte_unused,\n@@ -2036,4 +2039,4 @@ virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,\n {\n \treturn 0;\n }\n-#endif /* ifndef CC_AVX512_SUPPORT */\n+#endif\ndiff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h\nindex b2447843b..fd2d6baa5 100644\n--- a/drivers/net/virtio/virtio_rxtx_packed.h\n+++ b/drivers/net/virtio/virtio_rxtx_packed.h\n@@ -19,9 +19,16 @@\n #include \"virtqueue.h\"\n \n #define BYTE_SIZE 8\n+\n+#if defined(AVX512_SUPPORT)\n /* flag bits offset in packed ring desc higher 64bits */\n #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \\\n \toffsetof(struct vring_packed_desc, len)) * BYTE_SIZE)\n+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)\n+/* flag bits offset in packed ring desc 32bits */\n+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \\\n+\toffsetof(struct vring_packed_desc, id)) * BYTE_SIZE)\n+#endif\n \n #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \\\n \tFLAGS_BITS_OFFSET)\n@@ -44,8 +51,17 @@\n /* net hdr short size mask */\n #define NET_HDR_MASK 0x3F\n \n+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)\n+/* The cache line size on different aarh64 platforms are\n+ * different, so put a four batch size here to match with\n+ * the minimum cache line size.\n+ */\n+#define PACKED_BATCH_SIZE 4\n+#else\n #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \\\n \tsizeof(struct vring_packed_desc))\n+#endif\n+\n #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)\n \n #ifdef VIRTIO_GCC_UNROLL_PRAGMA\ndiff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.c b/drivers/net/virtio/virtio_rxtx_packed_neon.c\nnew file mode 100644\nindex 000000000..182afe5c6\n--- /dev/null\n+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.c\n@@ -0,0 +1,202 @@\n+#include <stdlib.h>\n+#include <stdint.h>\n+#include <stdio.h>\n+#include <string.h>\n+#include <errno.h>\n+\n+#include <rte_net.h>\n+#include <rte_vect.h>\n+\n+#include \"virtio_logs.h\"\n+#include \"virtio_ethdev.h\"\n+#include \"virtio_pci.h\"\n+#include \"virtio_rxtx_packed.h\"\n+#include \"virtqueue.h\"\n+\n+static inline uint16_t\n+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,\n+\t\t\t\t   struct rte_mbuf **rx_pkts)\n+{\n+\tstruct virtqueue *vq = rxvq->vq;\n+\tstruct virtio_hw *hw = vq->hw;\n+\tuint16_t head_size = hw->vtnet_hdr_size;\n+\tuint16_t id = vq->vq_used_cons_idx;\n+\tstruct vring_packed_desc *p_desc;\n+\tuint16_t i;\n+\n+\tif (id & PACKED_BATCH_MASK)\n+\t\treturn -1;\n+\n+\tif (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))\n+\t\treturn -1;\n+\n+\t/* Map packed descriptor to mbuf fields. */\n+\tuint8x16_t shuf_msk1 = {\n+\t\t0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */\n+\t\t0, 1,\t\t\t/* octet 1~0, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,\t\t/* skip high 16 bits of pkt_len, zero out */\n+\t\t0, 1,\t\t\t/* octet 1~0, 16 bits data_len */\n+\t\t0xFF, 0xFF,\t\t/* vlan tci set as unknown */\n+\t\t0xFF, 0xFF, 0xFF, 0xFF\n+\t};\n+\n+\tuint8x16_t shuf_msk2 = {\n+\t\t0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */\n+\t\t8, 9,\t\t\t/* octet 9~8, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,\t\t/* skip high 16 bits of pkt_len, zero out */\n+\t\t8, 9,\t\t\t/* octet 9~8, 16 bits data_len */\n+\t\t0xFF, 0xFF,\t\t/* vlan tci set as unknown */\n+\t\t0xFF, 0xFF, 0xFF, 0xFF\n+\t};\n+\n+\t/* Subtract the header length. */\n+\tuint16x8_t len_adjust = {\n+\t\t0, 0,\t\t/* ignore pkt_type field */\n+\t\thead_size,\t/* sub head_size on pkt_len */\n+\t\t0,\t\t/* ignore high 16 bits of pkt_len */\n+\t\thead_size,\t/* sub head_size on data_len */\n+\t\t0, 0, 0\t\t/* ignore non-length fields */\n+\t};\n+\n+\tuint64x2_t desc[PACKED_BATCH_SIZE / 2];\n+\tuint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];\n+\tuint64x2_t pkt_mb[PACKED_BATCH_SIZE];\n+\n+\tp_desc = &vq->vq_packed.ring.desc[id];\n+\t/* Load packed descriptor 0,1. */\n+\tdesc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];\n+\t/* Load packed descriptor 2,3. */\n+\tdesc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];\n+\n+\t/* Only care avail/used bits. */\n+\tuint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);\n+\tuint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),\n+\t\t\t\tvreinterpretq_u32_u64(desc[1]));\n+\tuint32x4_t v_flag = vandq_u32(v_desc, v_mask);\n+\n+\tuint32x4_t v_used_flag = vdupq_n_u32(0);\n+\tif (vq->vq_packed.used_wrap_counter)\n+\t\tv_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);\n+\n+\tpoly128_t desc_stats = vreinterpretq_p128_u32(vceqq_u32(v_flag,\n+\t\t\t\t\tv_used_flag));\n+\n+\t/* Check all descs are used. */\n+\tif (!desc_stats)\n+\t\treturn -1;\n+\n+\t/* Load 2 mbuf pointers per time. */\n+\tmbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);\n+\tvst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);\n+\n+\tmbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);\n+\tvst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);\n+\n+\t/**\n+\t *  Update data length and packet length for descriptor.\n+\t *  structure of pkt_mb:\n+\t *  --------------------------------------------------------------------\n+\t *  |4 octet pkt_type|4 octet pkt_len|2 octet data_len|2 octet vlan_tci|\n+\t *  --------------------------------------------------------------------\n+\t */\n+\tpkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk1));\n+\tpkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk2));\n+\tpkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk1));\n+\tpkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk2));\n+\n+\tpkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));\n+\tpkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));\n+\tpkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));\n+\tpkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));\n+\n+\tvst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);\n+\tvst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);\n+\tvst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);\n+\tvst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);\n+\n+\tif (hw->has_rx_offload) {\n+\t\tvirtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {\n+\t\t\tchar *addr = (char *)rx_pkts[i]->buf_addr +\n+\t\t\t\tRTE_PKTMBUF_HEADROOM - head_size;\n+\t\t\tvirtio_vec_rx_offload(rx_pkts[i],\n+\t\t\t\t\t(struct virtio_net_hdr *)addr);\n+\t\t}\n+\t}\n+\n+\tvirtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,\n+\t\t\trx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,\n+\t\t\trx_pkts[3]->pkt_len);\n+\n+\tvq->vq_free_cnt += PACKED_BATCH_SIZE;\n+\n+\tvq->vq_used_cons_idx += PACKED_BATCH_SIZE;\n+\tif (vq->vq_used_cons_idx >= vq->vq_nentries) {\n+\t\tvq->vq_used_cons_idx -= vq->vq_nentries;\n+\t\tvq->vq_packed.used_wrap_counter ^= 1;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+uint16_t\n+virtio_recv_pkts_packed_vec(void *rx_queue,\n+\t\t\t    struct rte_mbuf **rx_pkts,\n+\t\t\t    uint16_t nb_pkts)\n+{\n+\tstruct virtnet_rx *rxvq = rx_queue;\n+\tstruct virtqueue *vq = rxvq->vq;\n+\tstruct virtio_hw *hw = vq->hw;\n+\tuint16_t num, nb_rx = 0;\n+\tuint32_t nb_enqueued = 0;\n+\tuint16_t free_cnt = vq->vq_free_thresh;\n+\n+\tif (unlikely(hw->started == 0))\n+\t\treturn nb_rx;\n+\n+\tnum = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);\n+\tif (likely(num > PACKED_BATCH_SIZE))\n+\t\tnum = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);\n+\n+\twhile (num) {\n+\t\tif (!virtqueue_dequeue_batch_packed_vec(rxvq,\n+\t\t\t\t\t&rx_pkts[nb_rx])) {\n+\t\t\tnb_rx += PACKED_BATCH_SIZE;\n+\t\t\tnum -= PACKED_BATCH_SIZE;\n+\t\t\tcontinue;\n+\t\t}\n+\t\tif (!virtqueue_dequeue_single_packed_vec(rxvq,\n+\t\t\t\t\t&rx_pkts[nb_rx])) {\n+\t\t\tnb_rx++;\n+\t\t\tnum--;\n+\t\t\tcontinue;\n+\t\t}\n+\t\tbreak;\n+\t};\n+\n+\tPMD_RX_LOG(DEBUG, \"dequeue:%d\", num);\n+\n+\trxvq->stats.packets += nb_rx;\n+\n+\tif (likely(vq->vq_free_cnt >= free_cnt)) {\n+\t\tstruct rte_mbuf *new_pkts[free_cnt];\n+\t\tif (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,\n+\t\t\t\t\t\tfree_cnt) == 0)) {\n+\t\t\tvirtio_recv_refill_packed_vec(rxvq, new_pkts,\n+\t\t\t\t\tfree_cnt);\n+\t\t\tnb_enqueued += free_cnt;\n+\t\t} else {\n+\t\t\tstruct rte_eth_dev *dev =\n+\t\t\t\t&rte_eth_devices[rxvq->port_id];\n+\t\t\tdev->data->rx_mbuf_alloc_failed += free_cnt;\n+\t\t}\n+\t}\n+\n+\tif (likely(nb_enqueued)) {\n+\t\tif (unlikely(virtqueue_kick_prepare_packed(vq))) {\n+\t\t\tvirtqueue_notify(vq);\n+\t\t\tPMD_RX_LOG(DEBUG, \"Notified\");\n+\t\t}\n+\t}\n+\n+\treturn nb_rx;\n+}\n",
    "prefixes": [
        "RFC",
        "2/3"
    ]
}