get:
Show a patch.

patch:
Update a patch (partial update: only the fields supplied in the request are changed).

put:
Update a patch (full update: the supplied representation replaces all writable fields).
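
A read like the one shown below needs no authentication. As a minimal sketch in Python with the third-party requests library (host and patch ID are taken from the sample response below; without ?format=api the endpoint typically returns plain JSON):

    import requests

    # Fetch a single patch as JSON; GET requires no authentication.
    # The host and patch ID come from the sample response below.
    resp = requests.get("http://patches.dpdk.org/api/patches/8387/")
    resp.raise_for_status()
    patch = resp.json()

    print(patch["name"])   # "[dpdk-dev,1/4,v3] add vector PMD RX for FVL"
    print(patch["state"])  # "changes-requested"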

GET /api/patches/8387/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 8387,
    "url": "http://patches.dpdk.org/api/patches/8387/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/1446210115-13927-2-git-send-email-zhe.tao@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1446210115-13927-2-git-send-email-zhe.tao@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1446210115-13927-2-git-send-email-zhe.tao@intel.com",
    "date": "2015-10-30T13:01:52",
    "name": "[dpdk-dev,1/4,v3] add vector PMD RX for FVL",
    "commit_ref": null,
    "pull_url": null,
    "state": "changes-requested",
    "archived": true,
    "hash": "04f68e4f37393175163170b2094c4c4f18cc2ab8",
    "submitter": {
        "id": 276,
        "url": "http://patches.dpdk.org/api/people/276/?format=api",
        "name": "Zhe Tao",
        "email": "zhe.tao@intel.com"
    },
    "delegate": null,
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/1446210115-13927-2-git-send-email-zhe.tao@intel.com/mbox/",
    "series": [],
    "comments": "http://patches.dpdk.org/api/patches/8387/comments/",
    "check": "pending",
    "checks": "http://patches.dpdk.org/api/patches/8387/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [IPv6:::1])\n\tby dpdk.org (Postfix) with ESMTP id 355A791A9;\n\tFri, 30 Oct 2015 14:02:19 +0100 (CET)",
            "from mga01.intel.com (mga01.intel.com [192.55.52.88])\n\tby dpdk.org (Postfix) with ESMTP id 3D3DF8E97\n\tfor <dev@dpdk.org>; Fri, 30 Oct 2015 14:02:15 +0100 (CET)",
            "from orsmga002.jf.intel.com ([10.7.209.21])\n\tby fmsmga101.fm.intel.com with ESMTP; 30 Oct 2015 06:02:15 -0700",
            "from shilc102.sh.intel.com ([10.239.39.44])\n\tby orsmga002.jf.intel.com with ESMTP; 30 Oct 2015 06:02:14 -0700",
            "from shecgisg004.sh.intel.com (shecgisg004.sh.intel.com\n\t[10.239.29.89])\n\tby shilc102.sh.intel.com with ESMTP id t9UD29Fe000457;\n\tFri, 30 Oct 2015 21:02:09 +0800",
            "from shecgisg004.sh.intel.com (localhost [127.0.0.1])\n\tby shecgisg004.sh.intel.com (8.13.6/8.13.6/SuSE Linux 0.8) with ESMTP\n\tid t9UD270j013977; Fri, 30 Oct 2015 21:02:09 +0800",
            "(from zhetao@localhost)\n\tby shecgisg004.sh.intel.com (8.13.6/8.13.6/Submit) id t9UD26fE013973; \n\tFri, 30 Oct 2015 21:02:06 +0800"
        ],
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos;i=\"5.20,218,1444719600\"; d=\"scan'208\";a=\"838825339\"",
        "From": "Zhe Tao <zhe.tao@intel.com>",
        "To": "dev@dpdk.org",
        "Date": "Fri, 30 Oct 2015 21:01:52 +0800",
        "Message-Id": "<1446210115-13927-2-git-send-email-zhe.tao@intel.com>",
        "X-Mailer": "git-send-email 1.7.4.1",
        "In-Reply-To": "<1446210115-13927-1-git-send-email-zhe.tao@intel.com>",
        "References": "<1446202336-8723-1-git-send-email-zhe.tao@intel.com>\n\t<1446210115-13927-1-git-send-email-zhe.tao@intel.com>",
        "Subject": "[dpdk-dev] [PATCH 1/4 v3] add vector PMD RX for FVL",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "patches and discussions about DPDK <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "The vPMD RX function uses the multi-buffer and SSE instructions to\naccelerate the RX speed, but now the pktype cannot be supported by the vPMD RX,\nbecause it will decrease the performance heavily.\n\nSigned-off-by: Zhe Tao <zhe.tao@intel.com>\n---\n config/common_bsdapp             |   2 +\n config/common_linuxapp           |   2 +\n drivers/net/i40e/Makefile        |   1 +\n drivers/net/i40e/i40e_rxtx.c     |  28 ++-\n drivers/net/i40e/i40e_rxtx.h     |  28 ++-\n drivers/net/i40e/i40e_rxtx_vec.c | 484 +++++++++++++++++++++++++++++++++++++++\n 6 files changed, 540 insertions(+), 5 deletions(-)\n create mode 100644 drivers/net/i40e/i40e_rxtx_vec.c",
    "diff": "diff --git a/config/common_bsdapp b/config/common_bsdapp\nindex b37dcf4..3003da5 100644\n--- a/config/common_bsdapp\n+++ b/config/common_bsdapp\n@@ -186,6 +186,8 @@ CONFIG_RTE_LIBRTE_I40E_DEBUG_TX=n\n CONFIG_RTE_LIBRTE_I40E_DEBUG_TX_FREE=n\n CONFIG_RTE_LIBRTE_I40E_DEBUG_DRIVER=n\n CONFIG_RTE_LIBRTE_I40E_RX_ALLOW_BULK_ALLOC=y\n+CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y\n+CONFIG_RTE_LIBRTE_I40E_RX_OLFLAGS_ENABLE=y\n CONFIG_RTE_LIBRTE_I40E_16BYTE_RX_DESC=n\n CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VF=4\n CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM=4\ndiff --git a/config/common_linuxapp b/config/common_linuxapp\nindex 0de43d5..dadba4d 100644\n--- a/config/common_linuxapp\n+++ b/config/common_linuxapp\n@@ -184,6 +184,8 @@ CONFIG_RTE_LIBRTE_I40E_DEBUG_TX=n\n CONFIG_RTE_LIBRTE_I40E_DEBUG_TX_FREE=n\n CONFIG_RTE_LIBRTE_I40E_DEBUG_DRIVER=n\n CONFIG_RTE_LIBRTE_I40E_RX_ALLOW_BULK_ALLOC=y\n+CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y\n+CONFIG_RTE_LIBRTE_I40E_RX_OLFLAGS_ENABLE=y\n CONFIG_RTE_LIBRTE_I40E_16BYTE_RX_DESC=n\n CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VF=4\n CONFIG_RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM=4\ndiff --git a/drivers/net/i40e/Makefile b/drivers/net/i40e/Makefile\nindex 55b7d31..d4695cb 100644\n--- a/drivers/net/i40e/Makefile\n+++ b/drivers/net/i40e/Makefile\n@@ -95,6 +95,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += i40e_dcb.c\n \n SRCS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += i40e_ethdev.c\n SRCS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += i40e_rxtx.c\n+SRCS-$(CONFIG_RTE_LIBRTE_I40E_INC_VECTOR) += i40e_rxtx_vec.c\n SRCS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += i40e_ethdev_vf.c\n SRCS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += i40e_pf.c\n SRCS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += i40e_fdir.c\ndiff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c\nindex fd656d5..dfdc7d5 100644\n--- a/drivers/net/i40e/i40e_rxtx.c\n+++ b/drivers/net/i40e/i40e_rxtx.c\n@@ -1788,9 +1788,6 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)\n \treturn txq->tx_rs_thresh;\n }\n \n-#define I40E_TD_CMD (I40E_TX_DESC_CMD_ICRC |\\\n-\t\t     I40E_TX_DESC_CMD_EOP)\n-\n /* Populate 4 descriptors with data from 4 mbufs */\n static inline void\n tx4(volatile struct i40e_tx_desc *txdp, struct rte_mbuf **pkts)\n@@ -2625,6 +2622,9 @@ i40e_reset_rx_queue(struct i40e_rx_queue *rxq)\n \trxq->nb_rx_hold = 0;\n \trxq->pkt_first_seg = NULL;\n \trxq->pkt_last_seg = NULL;\n+\n+\trxq->rxrearm_start = 0;\n+\trxq->rxrearm_nb = 0;\n }\n \n void\n@@ -3063,3 +3063,25 @@ i40e_fdir_setup_rx_resources(struct i40e_pf *pf)\n \n \treturn I40E_SUCCESS;\n }\n+\n+/* Stubs needed for linkage when CONFIG_RTE_I40E_INC_VECTOR is set to 'n' */\n+uint16_t __attribute__((weak))\n+i40e_recv_pkts_vec(\n+\tvoid __rte_unused *rx_queue,\n+\tstruct rte_mbuf __rte_unused **rx_pkts,\n+\tuint16_t __rte_unused nb_pkts)\n+{\n+\treturn 0;\n+}\n+\n+int __attribute__((weak))\n+i40e_rxq_vec_setup(struct i40e_rx_queue __rte_unused *rxq)\n+{\n+\treturn -1;\n+}\n+\n+void __attribute__((weak))\n+i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue __rte_unused*rxq)\n+{\n+\treturn;\n+}\ndiff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h\nindex 4385142..961a415 100644\n--- a/drivers/net/i40e/i40e_rxtx.h\n+++ b/drivers/net/i40e/i40e_rxtx.h\n@@ -44,13 +44,27 @@\n #define I40E_TX_FLAG_INSERT_VLAN  ((uint32_t)(1 << 1))\n #define I40E_TX_FLAG_TSYN         ((uint32_t)(1 << 2))\n \n-#ifdef RTE_LIBRTE_I40E_RX_ALLOW_BULK_ALLOC\n #define RTE_PMD_I40E_RX_MAX_BURST 32\n-#endif\n+#define RTE_PMD_I40E_TX_MAX_BURST 32\n+\n+#define RTE_I40E_VPMD_RX_BURST        32\n+#define RTE_I40E_VPMD_TX_BURST        
32\n+#define RTE_I40E_RXQ_REARM_THRESH      32\n+#define RTE_I40E_MAX_RX_BURST          RTE_I40E_RXQ_REARM_THRESH\n+#define RTE_I40E_TX_MAX_FREE_BUF_SZ    64\n+#define RTE_I40E_DESCS_PER_LOOP    4\n \n #define I40E_RXBUF_SZ_1024 1024\n #define I40E_RXBUF_SZ_2048 2048\n \n+#undef container_of\n+#define container_of(ptr, type, member) ({ \\\n+\t\ttypeof(((type *)0)->member)(*__mptr) = (ptr); \\\n+\t\t(type *)((char *)__mptr - offsetof(type, member)); })\n+\n+#define I40E_TD_CMD (I40E_TX_DESC_CMD_ICRC |\\\n+\t\t     I40E_TX_DESC_CMD_EOP)\n+\n enum i40e_header_split_mode {\n \ti40e_header_split_none = 0,\n \ti40e_header_split_enabled = 1,\n@@ -100,6 +114,11 @@ struct i40e_rx_queue {\n \tstruct rte_mbuf fake_mbuf; /**< dummy mbuf */\n \tstruct rte_mbuf *rx_stage[RTE_PMD_I40E_RX_MAX_BURST * 2];\n #endif\n+\n+\tuint16_t rxrearm_nb;\t/**< number of remaining to be re-armed */\n+\tuint16_t rxrearm_start;\t/**< the idx we start the re-arming from */\n+\tuint64_t mbuf_initializer; /**< value to init mbufs */\n+\n \tuint8_t port_id; /**< device port ID */\n \tuint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise */\n \tuint16_t queue_id; /**< RX queue index */\n@@ -210,4 +229,9 @@ uint32_t i40e_dev_rx_queue_count(struct rte_eth_dev *dev,\n \t\t\t\t uint16_t rx_queue_id);\n int i40e_dev_rx_descriptor_done(void *rx_queue, uint16_t offset);\n \n+uint16_t i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t    uint16_t nb_pkts);\n+int i40e_rxq_vec_setup(struct i40e_rx_queue *rxq);\n+void i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq);\n+\n #endif /* _I40E_RXTX_H_ */\ndiff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c\nnew file mode 100644\nindex 0000000..a95916b\n--- /dev/null\n+++ b/drivers/net/i40e/i40e_rxtx_vec.c\n@@ -0,0 +1,484 @@\n+/*-\n+ *   BSD LICENSE\n+ *\n+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.\n+ *   All rights reserved.\n+ *\n+ *   Redistribution and use in source and binary forms, with or without\n+ *   modification, are permitted provided that the following conditions\n+ *   are met:\n+ *\n+ *     * Redistributions of source code must retain the above copyright\n+ *       notice, this list of conditions and the following disclaimer.\n+ *     * Redistributions in binary form must reproduce the above copyright\n+ *       notice, this list of conditions and the following disclaimer in\n+ *       the documentation and/or other materials provided with the\n+ *       distribution.\n+ *     * Neither the name of Intel Corporation nor the names of its\n+ *       contributors may be used to endorse or promote products derived\n+ *       from this software without specific prior written permission.\n+ *\n+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n+ *   \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n+ */\n+\n+#include <stdint.h>\n+#include <rte_ethdev.h>\n+#include <rte_malloc.h>\n+\n+#include \"base/i40e_prototype.h\"\n+#include \"base/i40e_type.h\"\n+#include \"i40e_ethdev.h\"\n+#include \"i40e_rxtx.h\"\n+\n+#include <tmmintrin.h>\n+\n+#ifndef __INTEL_COMPILER\n+#pragma GCC diagnostic ignored \"-Wcast-qual\"\n+#endif\n+\n+static inline void\n+i40e_rxq_rearm(struct i40e_rx_queue *rxq)\n+{\n+\tint i;\n+\tuint16_t rx_id;\n+\n+\tvolatile union i40e_rx_desc *rxdp;\n+\tstruct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\tstruct rte_mbuf *mb0, *mb1;\n+\t__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,\n+\t\t\tRTE_PKTMBUF_HEADROOM);\n+\t__m128i dma_addr0, dma_addr1;\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\n+\t/* Pull 'n' more MBUFs into the software ring */\n+\tif (rte_mempool_get_bulk(rxq->mp,\n+\t\t\t\t (void *)rxep,\n+\t\t\t\t RTE_I40E_RXQ_REARM_THRESH) < 0) {\n+\t\tif (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=\n+\t\t    rxq->nb_rx_desc) {\n+\t\t\tdma_addr0 = _mm_setzero_si128();\n+\t\t\tfor (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {\n+\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\t_mm_store_si128((__m128i *)&rxdp[i].read,\n+\t\t\t\t\t\tdma_addr0);\n+\t\t\t}\n+\t\t}\n+\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=\n+\t\t\tRTE_I40E_RXQ_REARM_THRESH;\n+\t\treturn;\n+\t}\n+\n+\t/* Initialize the mbufs in vector, process 2 mbufs in one loop */\n+\tfor (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {\n+\t\t__m128i vaddr0, vaddr1;\n+\t\tuintptr_t p0, p1;\n+\n+\t\tmb0 = rxep[0].mbuf;\n+\t\tmb1 = rxep[1].mbuf;\n+\n+\t\t /* Flush mbuf with pkt template.\n+\t\t * Data to be rearmed is 6 bytes long.\n+\t\t * Though, RX will overwrite ol_flags that are coming next\n+\t\t * anyway. 
So overwrite whole 8 bytes with one load:\n+\t\t * 6 bytes of rearm_data plus first 2 bytes of ol_flags.\n+\t\t */\n+\t\tp0 = (uintptr_t)&mb0->rearm_data;\n+\t\t*(uint64_t *)p0 = rxq->mbuf_initializer;\n+\t\tp1 = (uintptr_t)&mb1->rearm_data;\n+\t\t*(uint64_t *)p1 = rxq->mbuf_initializer;\n+\n+\t\t/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */\n+\t\tvaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);\n+\t\tvaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);\n+\n+\t\t/* convert pa to dma_addr hdr/data */\n+\t\tdma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);\n+\t\tdma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);\n+\n+\t\t/* add headroom to pa values */\n+\t\tdma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);\n+\t\tdma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);\n+\n+\t\t/* flush desc with pa dma_addr */\n+\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);\n+\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);\n+\t}\n+\n+\trxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;\n+\tif (rxq->rxrearm_start >= rxq->nb_rx_desc)\n+\t\trxq->rxrearm_start = 0;\n+\n+\trxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;\n+\n+\trx_id = (uint16_t)((rxq->rxrearm_start == 0) ?\n+\t\t\t     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));\n+\n+\t/* Update the tail pointer on the NIC */\n+\tI40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);\n+}\n+\n+/* Handling the offload flags (olflags) field takes computation\n+ * time when receiving packets. Therefore we provide a flag to disable\n+ * the processing of the olflags field when they are not needed. This\n+ * gives improved performance, at the cost of losing the offload info\n+ * in the received packet\n+ */\n+#ifdef RTE_LIBRTE_I40E_RX_OLFLAGS_ENABLE\n+\n+static inline void\n+desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)\n+{\n+\t__m128i vlan0, vlan1, rss;\n+\tunion {\n+\t\tuint16_t e[4];\n+\t\tuint64_t dword;\n+\t} vol;\n+\n+\t/* mask everything except rss and vlan flags\n+\t*bit2 is for vlan tag, bits 13:12 for rss\n+\t*/\n+\tconst __m128i rss_vlan_msk = _mm_set_epi16(\n+\t\t\t0x0000, 0x0000, 0x0000, 0x0000,\n+\t\t\t0x3004, 0x3004, 0x3004, 0x3004);\n+\n+\t/* map rss and vlan type to rss hash and vlan flag */\n+\tconst __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,\n+\t\t\t0, 0, 0, 0,\n+\t\t\t0, 0, 0, PKT_RX_VLAN_PKT,\n+\t\t\t0, 0, 0, 0);\n+\n+\tconst __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,\n+\t\t\t0, 0, 0, 0,\n+\t\t\t0, 0, 0, 0,\n+\t\t\tPKT_RX_FDIR, 0, PKT_RX_RSS_HASH, 0);\n+\n+\tvlan0 = _mm_unpackhi_epi16(descs[0], descs[1]);\n+\tvlan1 = _mm_unpackhi_epi16(descs[2], descs[3]);\n+\tvlan0 = _mm_unpacklo_epi32(vlan0, vlan1);\n+\n+\tvlan1 = _mm_and_si128(vlan0, rss_vlan_msk);\n+\tvlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);\n+\n+\trss = _mm_srli_epi16(vlan1, 12);\n+\trss = _mm_shuffle_epi8(rss_flags, rss);\n+\n+\tvlan0 = _mm_or_si128(vlan0, rss);\n+\tvol.dword = _mm_cvtsi128_si64(vlan0);\n+\n+\trx_pkts[0]->ol_flags = vol.e[0];\n+\trx_pkts[1]->ol_flags = vol.e[1];\n+\trx_pkts[2]->ol_flags = vol.e[2];\n+\trx_pkts[3]->ol_flags = vol.e[3];\n+}\n+#else\n+#define desc_to_olflags_v(desc, rx_pkts) do {} while (0)\n+#endif\n+\n+#define PKTLEN_SHIFT     (6)\n+#define PKTLEN_MASK      (0x3FFF)\n+/* Handling the pkt len field is not aligned with 1byte, so shift is\n+ * needed to let it align\n+ */\n+static inline void\n+desc_pktlen_align(__m128i descs[4])\n+{\n+\t__m128i pktlen0, pktlen1, zero;\n+\tunion {\n+\t\tuint16_t e[4];\n+\t\tuint64_t dword;\n+\t} vol;\n+\n+\t/* mask everything except pktlen field*/\n+\tconst __m128i pktlen_msk = 
_mm_set_epi32(PKTLEN_MASK, PKTLEN_MASK,\n+\t\t\t\t\t\tPKTLEN_MASK, PKTLEN_MASK);\n+\n+\tpktlen0 = _mm_unpackhi_epi32(descs[0], descs[2]);\n+\tpktlen1 = _mm_unpackhi_epi32(descs[1], descs[3]);\n+\tpktlen0 = _mm_unpackhi_epi32(pktlen0, pktlen1);\n+\n+\tzero = _mm_xor_si128(pktlen0, pktlen0);\n+\n+\tpktlen0 = _mm_srli_epi32(pktlen0, PKTLEN_SHIFT);\n+\tpktlen0 = _mm_and_si128(pktlen0, pktlen_msk);\n+\n+\tpktlen0 = _mm_packs_epi32(pktlen0, zero);\n+\tvol.dword = _mm_cvtsi128_si64(pktlen0);\n+\t/* let the descriptor byte 15-14 store the pkt len */\n+\t*((uint16_t *)&descs[0]+7) = vol.e[0];\n+\t*((uint16_t *)&descs[1]+7) = vol.e[1];\n+\t*((uint16_t *)&descs[2]+7) = vol.e[2];\n+\t*((uint16_t *)&descs[3]+7) = vol.e[3];\n+}\n+\n+ /* vPMD receive routine, now only accept (nb_pkts == RTE_I40E_VPMD_RX_BURST)\n+ * in one loop\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet\n+ * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST\n+ *   numbers of DD bits\n+\n+ */\n+static inline uint16_t\n+_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n+\t\t   uint16_t nb_pkts, uint8_t *split_packet)\n+{\n+\tvolatile union i40e_rx_desc *rxdp;\n+\tstruct i40e_rx_entry *sw_ring;\n+\tuint16_t nb_pkts_recd;\n+\tint pos;\n+\tuint64_t var;\n+\t__m128i shuf_msk;\n+\n+\t__m128i crc_adjust = _mm_set_epi16(\n+\t\t\t\t0, 0, 0,    /* ignore non-length fields */\n+\t\t\t\t-rxq->crc_len, /* sub crc on data_len */\n+\t\t\t\t0,          /* ignore high-16bits of pkt_len */\n+\t\t\t\t-rxq->crc_len, /* sub crc on pkt_len */\n+\t\t\t\t0, 0            /* ignore pkt_type field */\n+\t\t\t);\n+\t__m128i dd_check, eop_check;\n+\n+\t/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */\n+\tnb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);\n+\n+\t/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */\n+\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);\n+\n+\t/* Just the act of getting into the function from the application is\n+\t * going to cost about 7 cycles\n+\t */\n+\trxdp = rxq->rx_ring + rxq->rx_tail;\n+\n+\t_mm_prefetch((const void *)rxdp, _MM_HINT_T0);\n+\n+\t/* See if we need to rearm the RX queue - gives the prefetch a bit\n+\t * of time to act\n+\t */\n+\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)\n+\t\ti40e_rxq_rearm(rxq);\n+\n+\t/* Before we start moving massive data around, check to see if\n+\t * there is actually a packet available\n+\t */\n+\tif (!(rxdp->wb.qword1.status_error_len &\n+\t\t\trte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))\n+\t\treturn 0;\n+\n+\t/* 4 packets DD mask */\n+\tdd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);\n+\n+\t/* 4 packets EOP mask */\n+\teop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);\n+\n+\t/* mask to shuffle from desc. to mbuf */\n+\tshuf_msk = _mm_set_epi8(\n+\t\t7, 6, 5, 4,  /* octet 4~7, 32bits rss */\n+\t\t3, 2,        /* octet 2~3, low 16 bits vlan_macip */\n+\t\t15, 14,      /* octet 15~14, 16 bits data_len */\n+\t\t0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */\n+\t\t15, 14,      /* octet 15~14, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,  /* pkt_type set as unknown */\n+\t\t0xFF, 0xFF  /*pkt_type set as unknown */\n+\t\t);\n+\n+\t/* Cache is empty -> need to scan the buffer rings, but first move\n+\t * the next 'n' mbufs into the cache\n+\t */\n+\tsw_ring = &rxq->sw_ring[rxq->rx_tail];\n+\n+\t/* A. load 4 packet in one loop\n+\t * [A*. mask out 4 unused dirty field in desc]\n+\t * B. 
copy 4 mbuf point from swring to rx_pkts\n+\t * C. calc the number of DD bits among the 4 packets\n+\t * [C*. extract the end-of-packet bit, if requested]\n+\t * D. fill info. from desc to mbuf\n+\t */\n+\n+\tfor (pos = 0, nb_pkts_recd = 0; pos < RTE_I40E_VPMD_RX_BURST;\n+\t\t\tpos += RTE_I40E_DESCS_PER_LOOP,\n+\t\t\trxdp += RTE_I40E_DESCS_PER_LOOP) {\n+\t\t__m128i descs[RTE_I40E_DESCS_PER_LOOP];\n+\t\t__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;\n+\t\t__m128i zero, staterr, sterr_tmp1, sterr_tmp2;\n+\t\t__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */\n+\n+\t\t/* B.1 load 1 mbuf point */\n+\t\tmbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);\n+\t\t/* Read desc statuses backwards to avoid race condition */\n+\t\t/* A.1 load 4 pkts desc */\n+\t\tdescs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));\n+\n+\t\t/* B.2 copy 2 mbuf point into rx_pkts  */\n+\t\t_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);\n+\n+\t\t/* B.1 load 1 mbuf point */\n+\t\tmbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);\n+\n+\t\tdescs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));\n+\t\t/* B.1 load 2 mbuf point */\n+\t\tdescs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));\n+\t\tdescs[0] = _mm_loadu_si128((__m128i *)(rxdp));\n+\n+\t\t/* B.2 copy 2 mbuf point into rx_pkts  */\n+\t\t_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);\n+\n+\t\tif (split_packet) {\n+\t\t\trte_prefetch0(&rx_pkts[pos]->cacheline1);\n+\t\t\trte_prefetch0(&rx_pkts[pos + 1]->cacheline1);\n+\t\t\trte_prefetch0(&rx_pkts[pos + 2]->cacheline1);\n+\t\t\trte_prefetch0(&rx_pkts[pos + 3]->cacheline1);\n+\t\t}\n+\n+\t\t/*shift the pktlen field*/\n+\t\tdesc_pktlen_align(descs);\n+\n+\t\t/* avoid compiler reorder optimization */\n+\t\trte_compiler_barrier();\n+\n+\t\t/* D.1 pkt 3,4 convert format from desc to pktmbuf */\n+\t\tpkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);\n+\t\tpkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);\n+\n+\t\t/* C.1 4=>2 filter staterr info only */\n+\t\tsterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);\n+\t\t/* C.1 4=>2 filter staterr info only */\n+\t\tsterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);\n+\n+\t\tdesc_to_olflags_v(descs, &rx_pkts[pos]);\n+\n+\t\t/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */\n+\t\tpkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);\n+\t\tpkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);\n+\n+\t\t/* D.1 pkt 1,2 convert format from desc to pktmbuf */\n+\t\tpkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);\n+\t\tpkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);\n+\n+\t\t/* C.2 get 4 pkts staterr value  */\n+\t\tzero = _mm_xor_si128(dd_check, dd_check);\n+\t\tstaterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);\n+\n+\t\t/* D.3 copy final 3,4 data to rx_pkts */\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,\n+\t\t\t\t pkt_mb4);\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,\n+\t\t\t\t pkt_mb3);\n+\n+\t\t/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */\n+\t\tpkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);\n+\t\tpkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);\n+\n+\t\t/* C* extract and record EOP bit */\n+\t\tif (split_packet) {\n+\t\t\t__m128i eop_shuf_mask = _mm_set_epi8(\n+\t\t\t\t\t0xFF, 0xFF, 0xFF, 0xFF,\n+\t\t\t\t\t0xFF, 0xFF, 0xFF, 0xFF,\n+\t\t\t\t\t0xFF, 0xFF, 0xFF, 0xFF,\n+\t\t\t\t\t0x04, 0x0C, 0x00, 0x08\n+\t\t\t\t\t);\n+\n+\t\t\t/* and with mask to extract bits, flipping 1-0 */\n+\t\t\t__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);\n+\t\t\t/* the staterr values are not in order, as the count\n+\t\t\t * count of dd bits doesn't care. 
However, for end of\n+\t\t\t * packet tracking, we do care, so shuffle. This also\n+\t\t\t * compresses the 32-bit values to 8-bit\n+\t\t\t */\n+\t\t\teop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);\n+\t\t\t/* store the resulting 32-bit value */\n+\t\t\t*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);\n+\t\t\tsplit_packet += RTE_I40E_DESCS_PER_LOOP;\n+\n+\t\t\t/* zero-out next pointers */\n+\t\t\trx_pkts[pos]->next = NULL;\n+\t\t\trx_pkts[pos + 1]->next = NULL;\n+\t\t\trx_pkts[pos + 2]->next = NULL;\n+\t\t\trx_pkts[pos + 3]->next = NULL;\n+\t\t}\n+\n+\t\t/* C.3 calc available number of desc */\n+\t\tstaterr = _mm_and_si128(staterr, dd_check);\n+\t\tstaterr = _mm_packs_epi32(staterr, zero);\n+\n+\t\t/* D.3 copy final 1,2 data to rx_pkts */\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,\n+\t\t\t\t pkt_mb2);\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,\n+\t\t\t\t pkt_mb1);\n+\t\t/* C.4 calc avaialbe number of desc */\n+\t\tvar = __builtin_popcountll(_mm_cvtsi128_si64(staterr));\n+\t\tnb_pkts_recd += var;\n+\t\tif (likely(var != RTE_I40E_DESCS_PER_LOOP))\n+\t\t\tbreak;\n+\t}\n+\n+\t/* Update our internal tail pointer */\n+\trxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);\n+\trxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));\n+\trxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);\n+\n+\treturn nb_pkts_recd;\n+}\n+\n+ /* vPMD receive routine, now only accept (nb_pkts == RTE_IXGBE_VPMD_RX_BURST)\n+ * in one loop\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_I40E_VPMD_RX_BURST, just return no packet\n+ * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_IXGBE_VPMD_RX_BURST\n+ *   numbers of DD bit\n+ */\n+uint16_t\n+i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t   uint16_t nb_pkts)\n+{\n+\treturn _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);\n+}\n+\n+void __attribute__((cold))\n+i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq)\n+{\n+\tconst unsigned mask = rxq->nb_rx_desc - 1;\n+\tunsigned i;\n+\n+\tif (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc)\n+\t\treturn;\n+\n+\t/* free all mbufs that are valid in the ring */\n+\tfor (i = rxq->rx_tail; i != rxq->rxrearm_start; i = (i + 1) & mask)\n+\t\trte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);\n+\trxq->rxrearm_nb = rxq->nb_rx_desc;\n+\n+\t/* set all entries to NULL */\n+\tmemset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);\n+}\n+\n+int __attribute__((cold))\n+i40e_rxq_vec_setup(struct i40e_rx_queue *rxq)\n+{\n+\tuintptr_t p;\n+\tstruct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */\n+\n+\tmb_def.nb_segs = 1;\n+\tmb_def.data_off = RTE_PKTMBUF_HEADROOM;\n+\tmb_def.port = rxq->port_id;\n+\trte_mbuf_refcnt_set(&mb_def, 1);\n+\n+\t/* prevent compiler reordering: rearm_data covers previous fields */\n+\trte_compiler_barrier();\n+\tp = (uintptr_t)&mb_def.rearm_data;\n+\trxq->mbuf_initializer = *(uint64_t *)p;\n+\treturn 0;\n+}\n",
    "prefixes": [
        "dpdk-dev",
        "1/4",
        "v3"
    ]
}
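
Updating a patch, by contrast, requires authentication and suitable permissions on the project. A minimal sketch of a partial update via PATCH, assuming Patchwork's token authentication; the token value and the new state are placeholders, not values from the response above:

    import requests

    # Partially update the patch; with PATCH only the supplied fields change.
    # The token is a placeholder: writes need an API token for an account
    # with maintainer rights on the project.
    headers = {"Authorization": "Token 0123456789abcdef0123456789abcdef"}
    resp = requests.patch(
        "http://patches.dpdk.org/api/patches/8387/",
        headers=headers,
        json={"state": "accepted", "archived": False},
    )
    resp.raise_for_status()
    print(resp.json()["state"])  # "accepted" if the update was permitted

A PUT would instead replace every writable field at once, so PATCH is the safer verb when changing only one or two fields such as state or archived.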