Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/patches/134888/?format=api
https://patches.dpdk.org/api/patches/134888/?format=api", "web_url": "https://patches.dpdk.org/project/dpdk/patch/20231206172419.878-2-pbhagavatula@marvell.com/", "project": { "id": 1, "url": "https://patches.dpdk.org/api/projects/1/?format=api", "name": "DPDK", "link_name": "dpdk", "list_id": "dev.dpdk.org", "list_email": "dev@dpdk.org", "web_url": "http://core.dpdk.org", "scm_url": "git://dpdk.org/dpdk", "webscm_url": "http://git.dpdk.org/dpdk", "list_archive_url": "https://inbox.dpdk.org/dev", "list_archive_url_format": "https://inbox.dpdk.org/dev/{}", "commit_url_format": "" }, "msgid": "<20231206172419.878-2-pbhagavatula@marvell.com>", "list_archive_url": "https://inbox.dpdk.org/dev/20231206172419.878-2-pbhagavatula@marvell.com", "date": "2023-12-06T17:24:18", "name": "[v3,2/3] net/octeon_ep: use SSE instructions for Rx routine", "commit_ref": null, "pull_url": null, "state": "changes-requested", "archived": true, "hash": "8cb6384814fe16a0821849196fa0ad0913886ed7", "submitter": { "id": 1183, "url": "https://patches.dpdk.org/api/people/1183/?format=api", "name": "Pavan Nikhilesh Bhagavatula", "email": "pbhagavatula@marvell.com" }, "delegate": { "id": 310, "url": "https://patches.dpdk.org/api/users/310/?format=api", "username": "jerin", "first_name": "Jerin", "last_name": "Jacob", "email": "jerinj@marvell.com" }, "mbox": "https://patches.dpdk.org/project/dpdk/patch/20231206172419.878-2-pbhagavatula@marvell.com/mbox/", "series": [ { "id": 30463, "url": "https://patches.dpdk.org/api/series/30463/?format=api", "web_url": "https://patches.dpdk.org/project/dpdk/list/?series=30463", "date": "2023-12-06T17:24:17", "name": "[v3,1/3] net/octeon_ep: optimize Rx and Tx routines", "version": 3, "mbox": "https://patches.dpdk.org/series/30463/mbox/" } ], "comments": "https://patches.dpdk.org/api/patches/134888/comments/", "check": "warning", "checks": "https://patches.dpdk.org/api/patches/134888/checks/", "tags": {}, "related": [], "headers": { "Return-Path": "<dev-bounces@dpdk.org>", "X-Original-To": "patchwork@inbox.dpdk.org", "Delivered-To": "patchwork@inbox.dpdk.org", "Received": [ "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 6F8AE4368C;\n\tWed, 6 Dec 2023 18:24:37 +0100 (CET)", "from mails.dpdk.org (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id DB46342E92;\n\tWed, 6 Dec 2023 18:24:31 +0100 (CET)", "from mx0b-0016f401.pphosted.com (mx0a-0016f401.pphosted.com\n [67.231.148.174])\n by mails.dpdk.org (Postfix) with ESMTP id 901D74021E\n for <dev@dpdk.org>; Wed, 6 Dec 2023 18:24:29 +0100 (CET)", "from pps.filterd (m0045849.ppops.net [127.0.0.1])\n by mx0a-0016f401.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id\n 3B6CCCgi032691; Wed, 6 Dec 2023 09:24:28 -0800", "from dc5-exch01.marvell.com ([199.233.59.181])\n by mx0a-0016f401.pphosted.com (PPS) with ESMTPS id 3utd0ruc6r-1\n (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384 bits=256 verify=NOT);\n Wed, 06 Dec 2023 09:24:28 -0800", "from DC5-EXCH01.marvell.com (10.69.176.38) by DC5-EXCH01.marvell.com\n (10.69.176.38) with Microsoft SMTP Server (TLS) id 15.0.1497.48;\n Wed, 6 Dec 2023 09:24:26 -0800", "from maili.marvell.com (10.69.176.80) by DC5-EXCH01.marvell.com\n (10.69.176.38) with Microsoft SMTP Server id 15.0.1497.48 via Frontend\n Transport; Wed, 6 Dec 2023 09:24:26 -0800", "from MININT-80QBFE8.corp.innovium.com (MININT-80QBFE8.marvell.com\n [10.28.164.106])\n by maili.marvell.com (Postfix) with ESMTP id D67023F70A0;\n Wed, 6 Dec 2023 09:24:23 -0800 (PST)" ], "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=marvell.com;\n h=from : to : cc :\n subject : date : message-id : in-reply-to : references : mime-version :\n content-transfer-encoding : content-type; s=pfpt0220;\n bh=QmEAKUdMzZjSwoDEOX6t2b3IRiEDS9HdtOQj4cgJ6OI=;\n b=VJZ+lVZmbjwAeP4nCkrzkthnPA+TWWfbEqter9/koyaDd0/qdmY4zEupzwpXDp9XR926\n Kf/hk8aZPdMaAG8WGBgy9umnTSdhjNYe6WB9pk9gMid7Q6Yv5gx82rzsztk5PNtNJHRC\n c3yI7DJRGVcbu9SS7vc1vcPFXzjW2G05gSdFUI2KjL9eeHfxB8nYcrzms/oJRdJ8KrBh\n uilo8V/AiXFrtXk4PHfXsuIdv8smCCCHrLPBbbUrALaVIgSR3jlb2kI99AY+l0w4njuY\n g8D2ldFWctQPU6BOeEo3f0+TKH4cM4zcFzYvvo+OWdr/58LNPWhvN5sHWIxg83ud6vU9 vQ==", "From": "<pbhagavatula@marvell.com>", "To": "<jerinj@marvell.com>, Vamsi Attunuru <vattunuru@marvell.com>, \"Bruce\n Richardson\" <bruce.richardson@intel.com>, Konstantin Ananyev\n <konstantin.v.ananyev@yandex.ru>", "CC": "<dev@dpdk.org>, Pavan Nikhilesh <pbhagavatula@marvell.com>", "Subject": "[PATCH v3 2/3] net/octeon_ep: use SSE instructions for Rx routine", "Date": "Wed, 6 Dec 2023 22:54:18 +0530", "Message-ID": "<20231206172419.878-2-pbhagavatula@marvell.com>", "X-Mailer": "git-send-email 2.25.1", "In-Reply-To": "<20231206172419.878-1-pbhagavatula@marvell.com>", "References": "<20231125160349.2021-1-pbhagavatula@marvell.com>\n <20231206172419.878-1-pbhagavatula@marvell.com>", "MIME-Version": "1.0", "Content-Transfer-Encoding": "8bit", "Content-Type": "text/plain", "X-Proofpoint-ORIG-GUID": "pAqk4C88vrXPDPtpnu6H9rroGsJrdRkE", "X-Proofpoint-GUID": "pAqk4C88vrXPDPtpnu6H9rroGsJrdRkE", "X-Proofpoint-Virus-Version": "vendor=baseguard\n engine=ICAP:2.0.272,Aquarius:18.0.997,Hydra:6.0.619,FMLib:17.11.176.26\n definitions=2023-12-06_15,2023-12-06_01,2023-05-22_02", "X-BeenThere": "dev@dpdk.org", "X-Mailman-Version": "2.1.29", "Precedence": "list", "List-Id": "DPDK patches and discussions <dev.dpdk.org>", "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>", "List-Archive": "<http://mails.dpdk.org/archives/dev/>", "List-Post": "<mailto:dev@dpdk.org>", "List-Help": "<mailto:dev-request@dpdk.org?subject=help>", "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>", "Errors-To": "dev-bounces@dpdk.org" }, "content": "From: Pavan Nikhilesh <pbhagavatula@marvell.com>\n\nOptimize Rx routine to use SSE instructions.\n\nSigned-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>\n---\n drivers/net/octeon_ep/cnxk_ep_rx.c | 159 +----------------------\n drivers/net/octeon_ep/cnxk_ep_rx.h | 167 +++++++++++++++++++++++++\n drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 130 +++++++++++++++++++\n drivers/net/octeon_ep/meson.build | 11 ++\n drivers/net/octeon_ep/otx_ep_ethdev.c | 7 ++\n drivers/net/octeon_ep/otx_ep_rxtx.h | 6 +\n 6 files changed, 322 insertions(+), 158 deletions(-)\n create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.h\n create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_sse.c", "diff": "diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c\nindex 75bb7225d2..f3e4fb27d1 100644\n--- a/drivers/net/octeon_ep/cnxk_ep_rx.c\n+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c\n@@ -2,164 +2,7 @@\n * Copyright(C) 2023 Marvell.\n */\n \n-#include \"otx_ep_common.h\"\n-#include \"otx2_ep_vf.h\"\n-#include \"otx_ep_rxtx.h\"\n-\n-static inline int\n-cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)\n-{\n-\tstruct otx_ep_droq_desc *desc_ring = droq->desc_ring;\n-\tstruct rte_mbuf **recv_buf_list = droq->recv_buf_list;\n-\tuint32_t refill_idx = droq->refill_idx;\n-\tstruct rte_mbuf *buf;\n-\tuint32_t i;\n-\tint rc;\n-\n-\trc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);\n-\tif (unlikely(rc)) {\n-\t\tdroq->stats.rx_alloc_failure++;\n-\t\treturn rc;\n-\t}\n-\n-\tfor (i = 0; i < count; i++) {\n-\t\tbuf = recv_buf_list[refill_idx];\n-\t\tdesc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);\n-\t\trefill_idx++;\n-\t}\n-\n-\tdroq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);\n-\tdroq->refill_count -= count;\n-\n-\treturn 0;\n-}\n-\n-static inline void\n-cnxk_ep_rx_refill(struct otx_ep_droq *droq)\n-{\n-\tuint32_t desc_refilled = 0, count;\n-\tuint32_t nb_desc = droq->nb_desc;\n-\tuint32_t refill_idx = droq->refill_idx;\n-\tint rc;\n-\n-\tif (unlikely(droq->read_idx == refill_idx))\n-\t\treturn;\n-\n-\tif (refill_idx < droq->read_idx) {\n-\t\tcount = droq->read_idx - refill_idx;\n-\t\trc = cnxk_ep_rx_refill_mbuf(droq, count);\n-\t\tif (unlikely(rc)) {\n-\t\t\tdroq->stats.rx_alloc_failure++;\n-\t\t\treturn;\n-\t\t}\n-\t\tdesc_refilled = count;\n-\t} else {\n-\t\tcount = nb_desc - refill_idx;\n-\t\trc = cnxk_ep_rx_refill_mbuf(droq, count);\n-\t\tif (unlikely(rc)) {\n-\t\t\tdroq->stats.rx_alloc_failure++;\n-\t\t\treturn;\n-\t\t}\n-\n-\t\tdesc_refilled = count;\n-\t\tcount = droq->read_idx;\n-\t\trc = cnxk_ep_rx_refill_mbuf(droq, count);\n-\t\tif (unlikely(rc)) {\n-\t\t\tdroq->stats.rx_alloc_failure++;\n-\t\t\treturn;\n-\t\t}\n-\t\tdesc_refilled += count;\n-\t}\n-\n-\t/* Flush the droq descriptor data to memory to be sure\n-\t * that when we update the credits the data in memory is\n-\t * accurate.\n-\t */\n-\trte_io_wmb();\n-\trte_write32(desc_refilled, droq->pkts_credit_reg);\n-}\n-\n-static inline uint32_t\n-cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)\n-{\n-\tuint32_t new_pkts;\n-\tuint32_t val;\n-\n-\t/* Batch subtractions from the HW counter to reduce PCIe traffic\n-\t * This adds an extra local variable, but almost halves the\n-\t * number of PCIe writes.\n-\t */\n-\tval = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);\n-\tnew_pkts = val - droq->pkts_sent_ism_prev;\n-\tdroq->pkts_sent_ism_prev = val;\n-\n-\tif (val > RTE_BIT32(31)) {\n-\t\t/* Only subtract the packet count in the HW counter\n-\t\t * when count above halfway to saturation.\n-\t\t */\n-\t\trte_write64((uint64_t)val, droq->pkts_sent_reg);\n-\t\trte_mb();\n-\n-\t\trte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);\n-\t\twhile (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {\n-\t\t\trte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);\n-\t\t\trte_mb();\n-\t\t}\n-\n-\t\tdroq->pkts_sent_ism_prev = 0;\n-\t}\n-\trte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);\n-\tdroq->pkts_pending += new_pkts;\n-\n-\treturn new_pkts;\n-}\n-\n-static inline int16_t __rte_hot\n-cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)\n-{\n-\tif (droq->pkts_pending < nb_pkts)\n-\t\tcnxk_ep_check_rx_pkts(droq);\n-\n-\treturn RTE_MIN(nb_pkts, droq->pkts_pending);\n-}\n-\n-static __rte_always_inline void\n-cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)\n-{\n-\tstruct rte_mbuf **recv_buf_list = droq->recv_buf_list;\n-\tuint32_t bytes_rsvd = 0, read_idx = droq->read_idx;\n-\tuint16_t nb_desc = droq->nb_desc;\n-\tuint16_t pkts;\n-\n-\tfor (pkts = 0; pkts < new_pkts; pkts++) {\n-\t\tstruct otx_ep_droq_info *info;\n-\t\tstruct rte_mbuf *mbuf;\n-\t\tuint16_t pkt_len;\n-\n-\t\trte_prefetch0(recv_buf_list[otx_ep_incr_index(read_idx, 2, nb_desc)]);\n-\t\trte_prefetch0(rte_pktmbuf_mtod(recv_buf_list[otx_ep_incr_index(read_idx,\n-\t\t\t\t\t\t\t\t\t 2, nb_desc)],\n-\t\t\t void *));\n-\n-\t\tmbuf = recv_buf_list[read_idx];\n-\t\tinfo = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);\n-\t\tread_idx = otx_ep_incr_index(read_idx, 1, nb_desc);\n-\t\tpkt_len = rte_bswap16(info->length >> 48);\n-\t\tmbuf->pkt_len = pkt_len;\n-\t\tmbuf->data_len = pkt_len;\n-\n-\t\t*(uint64_t *)&mbuf->rearm_data = droq->rearm_data;\n-\t\trx_pkts[pkts] = mbuf;\n-\t\tbytes_rsvd += pkt_len;\n-\t}\n-\tdroq->read_idx = read_idx;\n-\n-\tdroq->refill_count += new_pkts;\n-\tdroq->pkts_pending -= new_pkts;\n-\t/* Stats */\n-\tdroq->stats.pkts_received += new_pkts;\n-\tdroq->stats.bytes_received += bytes_rsvd;\n-}\n+#include \"cnxk_ep_rx.h\"\n \n static __rte_always_inline void\n cnxk_ep_process_pkts_scalar_mseg(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,\ndiff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h\nnew file mode 100644\nindex 0000000000..e71fc0de5c\n--- /dev/null\n+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h\n@@ -0,0 +1,167 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(C) 2023 Marvell.\n+ */\n+\n+#include <rte_vect.h>\n+\n+#include \"otx_ep_common.h\"\n+#include \"otx2_ep_vf.h\"\n+#include \"otx_ep_rxtx.h\"\n+\n+#define CNXK_EP_OQ_DESC_PER_LOOP_SSE 4\n+#define CNXK_EP_OQ_DESC_PER_LOOP_AVX 8\n+\n+static inline int\n+cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)\n+{\n+\tstruct otx_ep_droq_desc *desc_ring = droq->desc_ring;\n+\tstruct rte_mbuf **recv_buf_list = droq->recv_buf_list;\n+\tuint32_t refill_idx = droq->refill_idx;\n+\tstruct rte_mbuf *buf;\n+\tuint32_t i;\n+\tint rc;\n+\n+\trc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);\n+\tif (unlikely(rc)) {\n+\t\tdroq->stats.rx_alloc_failure++;\n+\t\treturn rc;\n+\t}\n+\n+\tfor (i = 0; i < count; i++) {\n+\t\tbuf = recv_buf_list[refill_idx];\n+\t\tdesc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);\n+\t\trefill_idx++;\n+\t}\n+\n+\tdroq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc);\n+\tdroq->refill_count -= count;\n+\n+\treturn 0;\n+}\n+\n+static inline void\n+cnxk_ep_rx_refill(struct otx_ep_droq *droq)\n+{\n+\tuint32_t desc_refilled = 0, count;\n+\tuint32_t nb_desc = droq->nb_desc;\n+\tuint32_t refill_idx = droq->refill_idx;\n+\tint rc;\n+\n+\tif (unlikely(droq->read_idx == refill_idx))\n+\t\treturn;\n+\n+\tif (refill_idx < droq->read_idx) {\n+\t\tcount = droq->read_idx - refill_idx;\n+\t\trc = cnxk_ep_rx_refill_mbuf(droq, count);\n+\t\tif (unlikely(rc)) {\n+\t\t\tdroq->stats.rx_alloc_failure++;\n+\t\t\treturn;\n+\t\t}\n+\t\tdesc_refilled = count;\n+\t} else {\n+\t\tcount = nb_desc - refill_idx;\n+\t\trc = cnxk_ep_rx_refill_mbuf(droq, count);\n+\t\tif (unlikely(rc)) {\n+\t\t\tdroq->stats.rx_alloc_failure++;\n+\t\t\treturn;\n+\t\t}\n+\n+\t\tdesc_refilled = count;\n+\t\tcount = droq->read_idx;\n+\t\trc = cnxk_ep_rx_refill_mbuf(droq, count);\n+\t\tif (unlikely(rc)) {\n+\t\t\tdroq->stats.rx_alloc_failure++;\n+\t\t\treturn;\n+\t\t}\n+\t\tdesc_refilled += count;\n+\t}\n+\n+\t/* Flush the droq descriptor data to memory to be sure\n+\t * that when we update the credits the data in memory is\n+\t * accurate.\n+\t */\n+\trte_io_wmb();\n+\trte_write32(desc_refilled, droq->pkts_credit_reg);\n+}\n+\n+static inline uint32_t\n+cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq)\n+{\n+\tuint32_t new_pkts;\n+\tuint32_t val;\n+\n+\t/* Batch subtractions from the HW counter to reduce PCIe traffic\n+\t * This adds an extra local variable, but almost halves the\n+\t * number of PCIe writes.\n+\t */\n+\tval = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED);\n+\tnew_pkts = val - droq->pkts_sent_ism_prev;\n+\tdroq->pkts_sent_ism_prev = val;\n+\n+\tif (val > RTE_BIT32(31)) {\n+\t\t/* Only subtract the packet count in the HW counter\n+\t\t * when count above halfway to saturation.\n+\t\t */\n+\t\trte_write64((uint64_t)val, droq->pkts_sent_reg);\n+\t\trte_mb();\n+\n+\t\trte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);\n+\t\twhile (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) {\n+\t\t\trte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);\n+\t\t\trte_mb();\n+\t\t}\n+\n+\t\tdroq->pkts_sent_ism_prev = 0;\n+\t}\n+\trte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg);\n+\tdroq->pkts_pending += new_pkts;\n+\n+\treturn new_pkts;\n+}\n+\n+static inline int16_t __rte_hot\n+cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)\n+{\n+\tif (droq->pkts_pending < nb_pkts)\n+\t\tcnxk_ep_check_rx_pkts(droq);\n+\n+\treturn RTE_MIN(nb_pkts, droq->pkts_pending);\n+}\n+\n+static __rte_always_inline void\n+cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)\n+{\n+\tstruct rte_mbuf **recv_buf_list = droq->recv_buf_list;\n+\tuint32_t bytes_rsvd = 0, read_idx = droq->read_idx;\n+\tuint16_t nb_desc = droq->nb_desc;\n+\tuint16_t pkts;\n+\n+\tfor (pkts = 0; pkts < new_pkts; pkts++) {\n+\t\tstruct otx_ep_droq_info *info;\n+\t\tstruct rte_mbuf *mbuf;\n+\t\tuint16_t pkt_len;\n+\n+\t\trte_prefetch0(recv_buf_list[otx_ep_incr_index(read_idx, 2, nb_desc)]);\n+\t\trte_prefetch0(rte_pktmbuf_mtod(recv_buf_list[otx_ep_incr_index(read_idx,\n+\t\t\t\t\t\t\t\t\t 2, nb_desc)],\n+\t\t\t void *));\n+\n+\t\tmbuf = recv_buf_list[read_idx];\n+\t\tinfo = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);\n+\t\tread_idx = otx_ep_incr_index(read_idx, 1, nb_desc);\n+\t\tpkt_len = rte_bswap16(info->length >> 48);\n+\t\tmbuf->pkt_len = pkt_len;\n+\t\tmbuf->data_len = pkt_len;\n+\n+\t\t*(uint64_t *)&mbuf->rearm_data = droq->rearm_data;\n+\t\trx_pkts[pkts] = mbuf;\n+\t\tbytes_rsvd += pkt_len;\n+\t}\n+\tdroq->read_idx = read_idx;\n+\n+\tdroq->refill_count += new_pkts;\n+\tdroq->pkts_pending -= new_pkts;\n+\t/* Stats */\n+\tdroq->stats.pkts_received += new_pkts;\n+\tdroq->stats.bytes_received += bytes_rsvd;\n+}\ndiff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c\nnew file mode 100644\nindex 0000000000..afa3616caa\n--- /dev/null\n+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c\n@@ -0,0 +1,130 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(C) 2023 Marvell.\n+ */\n+\n+#include \"cnxk_ep_rx.h\"\n+\n+static __rte_always_inline uint32_t\n+hadd(__m128i x)\n+{\n+\t__m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));\n+\t__m128i sum64 = _mm_add_epi32(hi64, x);\n+\t__m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));\n+\t__m128i sum32 = _mm_add_epi32(sum64, hi32);\n+\treturn _mm_cvtsi128_si32(sum32);\n+}\n+\n+static __rte_always_inline void\n+cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)\n+{\n+\tstruct rte_mbuf **recv_buf_list = droq->recv_buf_list;\n+\tuint32_t bytes_rsvd = 0, read_idx = droq->read_idx;\n+\tuint32_t idx0, idx1, idx2, idx3;\n+\tstruct rte_mbuf *m0, *m1, *m2, *m3;\n+\tuint16_t nb_desc = droq->nb_desc;\n+\tuint16_t pkts = 0;\n+\n+\tidx0 = read_idx;\n+\twhile (pkts < new_pkts) {\n+\t\tconst __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,\n+\t\t\t\t\t\t\t0xFF, 4, 5, 0xFF, 0xFF, 0, 1);\n+\t\tconst __m128i cpy_mask = _mm_set_epi8(0xFF, 0xFF, 9, 8, 0xFF, 0xFF, 9, 8, 0xFF,\n+\t\t\t\t\t\t 0xFF, 1, 0, 0xFF, 0xFF, 1, 0);\n+\t\t__m128i s01, s23;\n+\n+\t\tidx1 = otx_ep_incr_index(idx0, 1, nb_desc);\n+\t\tidx2 = otx_ep_incr_index(idx1, 1, nb_desc);\n+\t\tidx3 = otx_ep_incr_index(idx2, 1, nb_desc);\n+\n+\t\tm0 = recv_buf_list[idx0];\n+\t\tm1 = recv_buf_list[idx1];\n+\t\tm2 = recv_buf_list[idx2];\n+\t\tm3 = recv_buf_list[idx3];\n+\n+\t\t/* Load packet size big-endian. */\n+\t\ts01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,\n+\t\t\t\t rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,\n+\t\t\t\t rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,\n+\t\t\t\t rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);\n+\t\t/* Convert to littel-endian. */\n+\t\ts01 = _mm_shuffle_epi8(s01, bswap_mask);\n+\t\t/* Horizontal add. */\n+\t\tbytes_rsvd += hadd(s01);\n+\t\t/* Segregate to packet length and data length. */\n+\t\ts23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));\n+\t\ts01 = _mm_shuffle_epi8(s01, cpy_mask);\n+\t\ts23 = _mm_shuffle_epi8(s23, cpy_mask);\n+\n+\t\t/* Store packet length and data length to mbuf. */\n+\t\t*(uint64_t *)&m0->pkt_len = ((rte_xmm_t)s01).u64[0];\n+\t\t*(uint64_t *)&m1->pkt_len = ((rte_xmm_t)s01).u64[1];\n+\t\t*(uint64_t *)&m2->pkt_len = ((rte_xmm_t)s23).u64[0];\n+\t\t*(uint64_t *)&m3->pkt_len = ((rte_xmm_t)s23).u64[1];\n+\n+\t\t/* Reset rearm data. */\n+\t\t*(uint64_t *)&m0->rearm_data = droq->rearm_data;\n+\t\t*(uint64_t *)&m1->rearm_data = droq->rearm_data;\n+\t\t*(uint64_t *)&m2->rearm_data = droq->rearm_data;\n+\t\t*(uint64_t *)&m3->rearm_data = droq->rearm_data;\n+\n+\t\trx_pkts[pkts++] = m0;\n+\t\trx_pkts[pkts++] = m1;\n+\t\trx_pkts[pkts++] = m2;\n+\t\trx_pkts[pkts++] = m3;\n+\t\tidx0 = otx_ep_incr_index(idx3, 1, nb_desc);\n+\t}\n+\tdroq->read_idx = idx0;\n+\n+\tdroq->refill_count += new_pkts;\n+\tdroq->pkts_pending -= new_pkts;\n+\t/* Stats */\n+\tdroq->stats.pkts_received += new_pkts;\n+\tdroq->stats.bytes_received += bytes_rsvd;\n+}\n+\n+uint16_t __rte_noinline __rte_hot\n+cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)\n+{\n+\tstruct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;\n+\tuint16_t new_pkts, vpkts;\n+\n+\tnew_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);\n+\tvpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);\n+\tcnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);\n+\tcnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);\n+\n+\t/* Refill RX buffers */\n+\tif (droq->refill_count >= DROQ_REFILL_THRESHOLD)\n+\t\tcnxk_ep_rx_refill(droq);\n+\n+\treturn new_pkts;\n+}\n+\n+uint16_t __rte_noinline __rte_hot\n+cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)\n+{\n+\tstruct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;\n+\tuint16_t new_pkts, vpkts;\n+\n+\tnew_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);\n+\tvpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);\n+\tcnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);\n+\tcnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);\n+\n+\t/* Refill RX buffers */\n+\tif (droq->refill_count >= DROQ_REFILL_THRESHOLD) {\n+\t\tcnxk_ep_rx_refill(droq);\n+\t} else {\n+\t\t/* SDP output goes into DROP state when output doorbell count\n+\t\t * goes below drop count. When door bell count is written with\n+\t\t * a value greater than drop count SDP output should come out\n+\t\t * of DROP state. Due to a race condition this is not happening.\n+\t\t * Writing doorbell register with 0 again may make SDP output\n+\t\t * come out of this state.\n+\t\t */\n+\n+\t\trte_write32(0, droq->pkts_credit_reg);\n+\t}\n+\n+\treturn new_pkts;\n+}\ndiff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build\nindex 749776d70c..feba1fdf25 100644\n--- a/drivers/net/octeon_ep/meson.build\n+++ b/drivers/net/octeon_ep/meson.build\n@@ -12,3 +12,14 @@ sources = files(\n 'cnxk_ep_rx.c',\n 'cnxk_ep_tx.c',\n )\n+\n+if arch_subdir == 'x86'\n+ sources += files('cnxk_ep_rx_sse.c')\n+endif\n+\n+extra_flags = ['-Wno-strict-aliasing']\n+foreach flag: extra_flags\n+ if cc.has_argument(flag)\n+ cflags += flag\n+ endif\n+endforeach\ndiff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c\nindex 615cbbb648..51b34cdaa0 100644\n--- a/drivers/net/octeon_ep/otx_ep_ethdev.c\n+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c\n@@ -52,10 +52,17 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)\n \n \tif (otx_epvf->chip_gen == OTX_EP_CN10XX) {\n \t\teth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;\n+#ifdef RTE_ARCH_X86\n+\t\teth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_sse;\n+#endif\n \t\tif (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)\n \t\t\teth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;\n \t} else if (otx_epvf->chip_gen == OTX_EP_CN9XX) {\n \t\teth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;\n+#ifdef RTE_ARCH_X86\n+\t\teth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_sse;\n+#endif\n+\n \t\tif (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)\n \t\t\teth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_mseg;\n \t} else {\ndiff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h\nindex b159c32cae..efc41a8275 100644\n--- a/drivers/net/octeon_ep/otx_ep_rxtx.h\n+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h\n@@ -48,12 +48,18 @@ cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)\n uint16_t\n cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);\n \n+uint16_t\n+cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);\n+\n uint16_t\n cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);\n \n uint16_t\n cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);\n \n+uint16_t\n+cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);\n+\n uint16_t\n cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);\n #endif /* _OTX_EP_RXTX_H_ */\n", "prefixes": [ "v3", "2/3" ] }{ "id": 134888, "url": "