Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/patches/77385/?format=api
http://patches.dpdk.org/api/patches/77385/?format=api", "web_url": "http://patches.dpdk.org/project/dpdk/patch/20200911120906.45995-3-joyce.kong@arm.com/", "project": { "id": 1, "url": "http://patches.dpdk.org/api/projects/1/?format=api", "name": "DPDK", "link_name": "dpdk", "list_id": "dev.dpdk.org", "list_email": "dev@dpdk.org", "web_url": "http://core.dpdk.org", "scm_url": "git://dpdk.org/dpdk", "webscm_url": "http://git.dpdk.org/dpdk", "list_archive_url": "https://inbox.dpdk.org/dev", "list_archive_url_format": "https://inbox.dpdk.org/dev/{}", "commit_url_format": "" }, "msgid": "<20200911120906.45995-3-joyce.kong@arm.com>", "list_archive_url": "https://inbox.dpdk.org/dev/20200911120906.45995-3-joyce.kong@arm.com", "date": "2020-09-11T12:09:05", "name": "[RFC,2/3] net/virtio: add vectorized packed ring Rx NEON path", "commit_ref": null, "pull_url": null, "state": "superseded", "archived": true, "hash": "b1c33055eda4c419fbfb76a1f57dffbbaabb7f4e", "submitter": { "id": 970, "url": "http://patches.dpdk.org/api/people/970/?format=api", "name": "Joyce Kong", "email": "joyce.kong@arm.com" }, "delegate": { "id": 2642, "url": "http://patches.dpdk.org/api/users/2642/?format=api", "username": "mcoquelin", "first_name": "Maxime", "last_name": "Coquelin", "email": "maxime.coquelin@redhat.com" }, "mbox": "http://patches.dpdk.org/project/dpdk/patch/20200911120906.45995-3-joyce.kong@arm.com/mbox/", "series": [ { "id": 12147, "url": "http://patches.dpdk.org/api/series/12147/?format=api", "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=12147", "date": "2020-09-11T12:09:03", "name": "Vectorize packed ring RX path with NEON", "version": 1, "mbox": "http://patches.dpdk.org/series/12147/mbox/" } ], "comments": "http://patches.dpdk.org/api/patches/77385/comments/", "check": "success", "checks": "http://patches.dpdk.org/api/patches/77385/checks/", "tags": {}, "related": [], "headers": { "Return-Path": "<dev-bounces@dpdk.org>", "X-Original-To": "patchwork@inbox.dpdk.org", "Delivered-To": "patchwork@inbox.dpdk.org", "Received": [ "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id C8BD4A04B5;\n\tFri, 11 Sep 2020 14:09:35 +0200 (CEST)", "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 506571C112;\n\tFri, 11 Sep 2020 14:09:25 +0200 (CEST)", "from foss.arm.com (foss.arm.com [217.140.110.172])\n by dpdk.org (Postfix) with ESMTP id 6D2351C0DC\n for <dev@dpdk.org>; Fri, 11 Sep 2020 14:09:23 +0200 (CEST)", "from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14])\n by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id ECF991396;\n Fri, 11 Sep 2020 05:09:22 -0700 (PDT)", "from net-arm-thunderx2-03.shanghai.arm.com\n (net-arm-thunderx2-03.shanghai.arm.com [10.169.210.123])\n by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 785673F68F;\n Fri, 11 Sep 2020 05:09:20 -0700 (PDT)" ], "From": "Joyce Kong <joyce.kong@arm.com>", "To": "maxime.coquelin@redhat.com", "Cc": "jerinj@marvell.com, dev@dpdk.org, nd@arm.com,\n honnappa.nagarahalli@arm.com,\n ruifeng.wang@arm.com, phil.yang@arm.com", "Date": "Fri, 11 Sep 2020 20:09:05 +0800", "Message-Id": "<20200911120906.45995-3-joyce.kong@arm.com>", "X-Mailer": "git-send-email 2.28.0", "In-Reply-To": "<20200911120906.45995-1-joyce.kong@arm.com>", "References": "<20200911120906.45995-1-joyce.kong@arm.com>", "MIME-Version": "1.0", "Content-Transfer-Encoding": "8bit", "Subject": "[dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON\n\tpath", "X-BeenThere": "dev@dpdk.org", "X-Mailman-Version": "2.1.15", "Precedence": "list", "List-Id": "DPDK patches and discussions <dev.dpdk.org>", "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>", "List-Archive": "<http://mails.dpdk.org/archives/dev/>", "List-Post": "<mailto:dev@dpdk.org>", "List-Help": "<mailto:dev-request@dpdk.org?subject=help>", "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>", "Errors-To": "dev-bounces@dpdk.org", "Sender": "\"dev\" <dev-bounces@dpdk.org>" }, "content": "Optimize packed ring Rx batch path with NEON instructions.\n\nSigned-off-by: Joyce Kong <joyce.kong@arm.com>\n---\n drivers/net/virtio/meson.build | 1 +\n drivers/net/virtio/virtio_rxtx.c | 7 +-\n drivers/net/virtio/virtio_rxtx_packed.h | 16 ++\n drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++++++++\n 4 files changed, 224 insertions(+), 2 deletions(-)\n create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c", "diff": "diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build\nindex e1851b0a6..5af633686 100644\n--- a/drivers/net/virtio/meson.build\n+++ b/drivers/net/virtio/meson.build\n@@ -34,6 +34,7 @@ elif arch_subdir == 'ppc'\n \tsources += files('virtio_rxtx_simple_altivec.c')\n elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64')\n \tsources += files('virtio_rxtx_simple_neon.c')\n+\tsources += files('virtio_rxtx_packed_neon.c')\n endif\n \n if is_linux\ndiff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c\nindex f915b8a2c..1deb77569 100644\n--- a/drivers/net/virtio/virtio_rxtx.c\n+++ b/drivers/net/virtio/virtio_rxtx.c\n@@ -2020,7 +2020,8 @@ virtio_xmit_pkts_inorder(void *tx_queue,\n \treturn nb_tx;\n }\n \n-#ifndef CC_AVX512_SUPPORT\n+#if !defined(CC_AVX512_SUPPORT) && !defined(RTE_ARCH_ARM) && \\\n+\t!defined(RTE_ARCH_ARM64)\n uint16_t\n virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,\n \t\t\t struct rte_mbuf **rx_pkts __rte_unused,\n@@ -2028,7 +2029,9 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,\n {\n \treturn 0;\n }\n+#endif\n \n+#if !defined(CC_AVX512_SUPPORT)\n uint16_t\n virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,\n \t\t\t struct rte_mbuf **tx_pkts __rte_unused,\n@@ -2036,4 +2039,4 @@ virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,\n {\n \treturn 0;\n }\n-#endif /* ifndef CC_AVX512_SUPPORT */\n+#endif\ndiff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h\nindex b2447843b..fd2d6baa5 100644\n--- a/drivers/net/virtio/virtio_rxtx_packed.h\n+++ b/drivers/net/virtio/virtio_rxtx_packed.h\n@@ -19,9 +19,16 @@\n #include \"virtqueue.h\"\n \n #define BYTE_SIZE 8\n+\n+#if defined(AVX512_SUPPORT)\n /* flag bits offset in packed ring desc higher 64bits */\n #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \\\n \toffsetof(struct vring_packed_desc, len)) * BYTE_SIZE)\n+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)\n+/* flag bits offset in packed ring desc 32bits */\n+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \\\n+\toffsetof(struct vring_packed_desc, id)) * BYTE_SIZE)\n+#endif\n \n #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \\\n \tFLAGS_BITS_OFFSET)\n@@ -44,8 +51,17 @@\n /* net hdr short size mask */\n #define NET_HDR_MASK 0x3F\n \n+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)\n+/* The cache line size on different aarh64 platforms are\n+ * different, so put a four batch size here to match with\n+ * the minimum cache line size.\n+ */\n+#define PACKED_BATCH_SIZE 4\n+#else\n #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \\\n \tsizeof(struct vring_packed_desc))\n+#endif\n+\n #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)\n \n #ifdef VIRTIO_GCC_UNROLL_PRAGMA\ndiff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.c b/drivers/net/virtio/virtio_rxtx_packed_neon.c\nnew file mode 100644\nindex 000000000..182afe5c6\n--- /dev/null\n+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.c\n@@ -0,0 +1,202 @@\n+#include <stdlib.h>\n+#include <stdint.h>\n+#include <stdio.h>\n+#include <string.h>\n+#include <errno.h>\n+\n+#include <rte_net.h>\n+#include <rte_vect.h>\n+\n+#include \"virtio_logs.h\"\n+#include \"virtio_ethdev.h\"\n+#include \"virtio_pci.h\"\n+#include \"virtio_rxtx_packed.h\"\n+#include \"virtqueue.h\"\n+\n+static inline uint16_t\n+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,\n+\t\t\t\t struct rte_mbuf **rx_pkts)\n+{\n+\tstruct virtqueue *vq = rxvq->vq;\n+\tstruct virtio_hw *hw = vq->hw;\n+\tuint16_t head_size = hw->vtnet_hdr_size;\n+\tuint16_t id = vq->vq_used_cons_idx;\n+\tstruct vring_packed_desc *p_desc;\n+\tuint16_t i;\n+\n+\tif (id & PACKED_BATCH_MASK)\n+\t\treturn -1;\n+\n+\tif (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))\n+\t\treturn -1;\n+\n+\t/* Map packed descriptor to mbuf fields. */\n+\tuint8x16_t shuf_msk1 = {\n+\t\t0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */\n+\t\t0, 1,\t\t\t/* octet 1~0, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,\t\t/* skip high 16 bits of pkt_len, zero out */\n+\t\t0, 1,\t\t\t/* octet 1~0, 16 bits data_len */\n+\t\t0xFF, 0xFF,\t\t/* vlan tci set as unknown */\n+\t\t0xFF, 0xFF, 0xFF, 0xFF\n+\t};\n+\n+\tuint8x16_t shuf_msk2 = {\n+\t\t0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */\n+\t\t8, 9,\t\t\t/* octet 9~8, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,\t\t/* skip high 16 bits of pkt_len, zero out */\n+\t\t8, 9,\t\t\t/* octet 9~8, 16 bits data_len */\n+\t\t0xFF, 0xFF,\t\t/* vlan tci set as unknown */\n+\t\t0xFF, 0xFF, 0xFF, 0xFF\n+\t};\n+\n+\t/* Subtract the header length. */\n+\tuint16x8_t len_adjust = {\n+\t\t0, 0,\t\t/* ignore pkt_type field */\n+\t\thead_size,\t/* sub head_size on pkt_len */\n+\t\t0,\t\t/* ignore high 16 bits of pkt_len */\n+\t\thead_size,\t/* sub head_size on data_len */\n+\t\t0, 0, 0\t\t/* ignore non-length fields */\n+\t};\n+\n+\tuint64x2_t desc[PACKED_BATCH_SIZE / 2];\n+\tuint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];\n+\tuint64x2_t pkt_mb[PACKED_BATCH_SIZE];\n+\n+\tp_desc = &vq->vq_packed.ring.desc[id];\n+\t/* Load packed descriptor 0,1. */\n+\tdesc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];\n+\t/* Load packed descriptor 2,3. */\n+\tdesc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];\n+\n+\t/* Only care avail/used bits. */\n+\tuint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);\n+\tuint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),\n+\t\t\t\tvreinterpretq_u32_u64(desc[1]));\n+\tuint32x4_t v_flag = vandq_u32(v_desc, v_mask);\n+\n+\tuint32x4_t v_used_flag = vdupq_n_u32(0);\n+\tif (vq->vq_packed.used_wrap_counter)\n+\t\tv_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);\n+\n+\tpoly128_t desc_stats = vreinterpretq_p128_u32(vceqq_u32(v_flag,\n+\t\t\t\t\tv_used_flag));\n+\n+\t/* Check all descs are used. */\n+\tif (!desc_stats)\n+\t\treturn -1;\n+\n+\t/* Load 2 mbuf pointers per time. */\n+\tmbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);\n+\tvst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);\n+\n+\tmbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);\n+\tvst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);\n+\n+\t/**\n+\t * Update data length and packet length for descriptor.\n+\t * structure of pkt_mb:\n+\t * --------------------------------------------------------------------\n+\t * |4 octet pkt_type|4 octet pkt_len|2 octet data_len|2 octet vlan_tci|\n+\t * --------------------------------------------------------------------\n+\t */\n+\tpkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk1));\n+\tpkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk2));\n+\tpkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk1));\n+\tpkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk2));\n+\n+\tpkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));\n+\tpkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));\n+\tpkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));\n+\tpkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));\n+\n+\tvst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);\n+\tvst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);\n+\tvst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);\n+\tvst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);\n+\n+\tif (hw->has_rx_offload) {\n+\t\tvirtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {\n+\t\t\tchar *addr = (char *)rx_pkts[i]->buf_addr +\n+\t\t\t\tRTE_PKTMBUF_HEADROOM - head_size;\n+\t\t\tvirtio_vec_rx_offload(rx_pkts[i],\n+\t\t\t\t\t(struct virtio_net_hdr *)addr);\n+\t\t}\n+\t}\n+\n+\tvirtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,\n+\t\t\trx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,\n+\t\t\trx_pkts[3]->pkt_len);\n+\n+\tvq->vq_free_cnt += PACKED_BATCH_SIZE;\n+\n+\tvq->vq_used_cons_idx += PACKED_BATCH_SIZE;\n+\tif (vq->vq_used_cons_idx >= vq->vq_nentries) {\n+\t\tvq->vq_used_cons_idx -= vq->vq_nentries;\n+\t\tvq->vq_packed.used_wrap_counter ^= 1;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+uint16_t\n+virtio_recv_pkts_packed_vec(void *rx_queue,\n+\t\t\t struct rte_mbuf **rx_pkts,\n+\t\t\t uint16_t nb_pkts)\n+{\n+\tstruct virtnet_rx *rxvq = rx_queue;\n+\tstruct virtqueue *vq = rxvq->vq;\n+\tstruct virtio_hw *hw = vq->hw;\n+\tuint16_t num, nb_rx = 0;\n+\tuint32_t nb_enqueued = 0;\n+\tuint16_t free_cnt = vq->vq_free_thresh;\n+\n+\tif (unlikely(hw->started == 0))\n+\t\treturn nb_rx;\n+\n+\tnum = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);\n+\tif (likely(num > PACKED_BATCH_SIZE))\n+\t\tnum = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);\n+\n+\twhile (num) {\n+\t\tif (!virtqueue_dequeue_batch_packed_vec(rxvq,\n+\t\t\t\t\t&rx_pkts[nb_rx])) {\n+\t\t\tnb_rx += PACKED_BATCH_SIZE;\n+\t\t\tnum -= PACKED_BATCH_SIZE;\n+\t\t\tcontinue;\n+\t\t}\n+\t\tif (!virtqueue_dequeue_single_packed_vec(rxvq,\n+\t\t\t\t\t&rx_pkts[nb_rx])) {\n+\t\t\tnb_rx++;\n+\t\t\tnum--;\n+\t\t\tcontinue;\n+\t\t}\n+\t\tbreak;\n+\t};\n+\n+\tPMD_RX_LOG(DEBUG, \"dequeue:%d\", num);\n+\n+\trxvq->stats.packets += nb_rx;\n+\n+\tif (likely(vq->vq_free_cnt >= free_cnt)) {\n+\t\tstruct rte_mbuf *new_pkts[free_cnt];\n+\t\tif (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,\n+\t\t\t\t\t\tfree_cnt) == 0)) {\n+\t\t\tvirtio_recv_refill_packed_vec(rxvq, new_pkts,\n+\t\t\t\t\tfree_cnt);\n+\t\t\tnb_enqueued += free_cnt;\n+\t\t} else {\n+\t\t\tstruct rte_eth_dev *dev =\n+\t\t\t\t&rte_eth_devices[rxvq->port_id];\n+\t\t\tdev->data->rx_mbuf_alloc_failed += free_cnt;\n+\t\t}\n+\t}\n+\n+\tif (likely(nb_enqueued)) {\n+\t\tif (unlikely(virtqueue_kick_prepare_packed(vq))) {\n+\t\t\tvirtqueue_notify(vq);\n+\t\t\tPMD_RX_LOG(DEBUG, \"Notified\");\n+\t\t}\n+\t}\n+\n+\treturn nb_rx;\n+}\n", "prefixes": [ "RFC", "2/3" ] }{ "id": 77385, "url": "