get:
Show a patch.

patch:
Partially update one or more fields of a patch.

put:
Update a patch.

GET /api/patches/77075/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 77075,
    "url": "http://patches.dpdk.org/api/patches/77075/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20200909155717.29099-1-lance.richardson@broadcom.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20200909155717.29099-1-lance.richardson@broadcom.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20200909155717.29099-1-lance.richardson@broadcom.com",
    "date": "2020-09-09T15:57:17",
    "name": "[11/12] net/bnxt: handle multiple packets per loop in vector PMD",
    "commit_ref": null,
    "pull_url": null,
    "state": "accepted",
    "archived": true,
    "hash": "1a71f9a0669d027a4c354134b4b678249e0b0fbe",
    "submitter": {
        "id": 1323,
        "url": "http://patches.dpdk.org/api/people/1323/?format=api",
        "name": "Lance Richardson",
        "email": "lance.richardson@broadcom.com"
    },
    "delegate": {
        "id": 1766,
        "url": "http://patches.dpdk.org/api/users/1766/?format=api",
        "username": "ajitkhaparde",
        "first_name": "Ajit",
        "last_name": "Khaparde",
        "email": "ajit.khaparde@broadcom.com"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20200909155717.29099-1-lance.richardson@broadcom.com/mbox/",
    "series": [
        {
            "id": 12060,
            "url": "http://patches.dpdk.org/api/series/12060/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=12060",
            "date": "2020-09-09T15:52:53",
            "name": "net/bnxt: vector PMD improvements",
            "version": 1,
            "mbox": "http://patches.dpdk.org/series/12060/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/77075/comments/",
    "check": "warning",
    "checks": "http://patches.dpdk.org/api/patches/77075/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id E15CBA04B5;\n\tWed,  9 Sep 2020 17:57:25 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id CE95F1C0DC;\n\tWed,  9 Sep 2020 17:57:25 +0200 (CEST)",
            "from mail-pg1-f196.google.com (mail-pg1-f196.google.com\n [209.85.215.196]) by dpdk.org (Postfix) with ESMTP id 15B901C0DA\n for <dev@dpdk.org>; Wed,  9 Sep 2020 17:57:24 +0200 (CEST)",
            "by mail-pg1-f196.google.com with SMTP id 7so2359491pgm.11\n for <dev@dpdk.org>; Wed, 09 Sep 2020 08:57:24 -0700 (PDT)",
            "from localhost.localdomain ([192.19.231.250])\n by smtp.gmail.com with ESMTPSA id l7sm2320480pjz.56.2020.09.09.08.57.21\n (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);\n Wed, 09 Sep 2020 08:57:22 -0700 (PDT)"
        ],
        "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=broadcom.com;\n s=google;\n h=from:to:cc:subject:date:message-id:mime-version\n :content-transfer-encoding;\n bh=iEa4pKUyi4nAEsrKZiz6m8MkzSEuBTQZLK7Kr9pTa8c=;\n b=aZX06WtOtk59hEmniiB2/tu8fqqyad6tTQH74ld/KCQRj7CNolpHCl/eNOFIKw+4cn\n eAwMJpHhCE6XPk4TCAm6LU7p60k3gjVgucxWNs5H0S+eKVeZ4lRjxRJ3oNu959o4rAFI\n nIiU8+pDe0Cr2bZRLxsbG/lR7ZyrXodthAcPo=",
        "X-Google-DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=1e100.net; s=20161025;\n h=x-gm-message-state:from:to:cc:subject:date:message-id:mime-version\n :content-transfer-encoding;\n bh=iEa4pKUyi4nAEsrKZiz6m8MkzSEuBTQZLK7Kr9pTa8c=;\n b=a/hnegoXFocK1AjRJhNVyU1Tyu3xXZWq0SwtoNbsfY2+i2OHtg5iDull4X186n0Y3c\n KY23sRdhWKU5OxHbrqmi/T2r07mlMhGh0bJtjMdMNuzxaaEzRyaH0MdKjny25f7z8Vn9\n dok5vhtDCQmzvVrZuGMCzbD+8GzDkoWfB0YsRcEZH4BZ6mPjZmH0A9xx14hBiRsYq5lf\n ZWIFJMb3HbvdOJ3G+QJIYvPUIGbTMkMPxzmBmRdgEjR5j+F9lj0uuXUuoFozMHgeXPeX\n Gq4vTIotj2CSEX8lmQMbDLeExrWFl02NYKVS2dVtXN7+lMafOkN3+01jfoh8sxZP4NWL\n FA+g==",
        "X-Gm-Message-State": "AOAM5311cGyIsSCdP13okYgYWCwwguFhZEYzy16xYzefCqT2yHnfBGGW\n +YmJ95TSicNTKpur+BplN2mNzA==",
        "X-Google-Smtp-Source": "\n ABdhPJwJ1nZlWOqIeN1gvybJKm8I8970QXK34O7ZShpN2yEyuuC06KSQI8Ctfg9oqNcQza85brG/ew==",
        "X-Received": "by 2002:a17:902:aa8d:: with SMTP id\n d13mr1446483plr.124.1599667042811;\n Wed, 09 Sep 2020 08:57:22 -0700 (PDT)",
        "From": "Lance Richardson <lance.richardson@broadcom.com>",
        "To": "Ajit Khaparde <ajit.khaparde@broadcom.com>,\n Somnath Kotur <somnath.kotur@broadcom.com>",
        "Cc": "dev@dpdk.org",
        "Date": "Wed,  9 Sep 2020 11:57:17 -0400",
        "Message-Id": "<20200909155717.29099-1-lance.richardson@broadcom.com>",
        "X-Mailer": "git-send-email 2.25.1",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "Subject": "[dpdk-dev] [PATCH 11/12] net/bnxt: handle multiple packets per loop\n\tin vector PMD",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Process four receive descriptors per inner loop in vector mode\nburst receive functions.\n\nReviewed-by: Ajit Kumar Khaparde <ajit.khaparde@broadcom.com>\nSigned-off-by: Lance Richardson <lance.richardson@broadcom.com>\n---\n drivers/net/bnxt/bnxt_ethdev.c          |   2 +-\n drivers/net/bnxt/bnxt_rxq.c             |   3 +-\n drivers/net/bnxt/bnxt_rxq.h             |   1 +\n drivers/net/bnxt/bnxt_rxr.c             |  15 +-\n drivers/net/bnxt/bnxt_rxtx_vec_common.h |   2 +\n drivers/net/bnxt/bnxt_rxtx_vec_neon.c   | 365 +++++++++++++++++-------\n drivers/net/bnxt/bnxt_rxtx_vec_sse.c    | 316 ++++++++++++++------\n 7 files changed, 508 insertions(+), 196 deletions(-)",
    "diff": "diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c\nindex 27eba431b8..b658a44303 100644\n--- a/drivers/net/bnxt/bnxt_ethdev.c\n+++ b/drivers/net/bnxt/bnxt_ethdev.c\n@@ -2872,7 +2872,7 @@ bnxt_rx_descriptor_status_op(void *rx_queue, uint16_t offset)\n \t\t\treturn RTE_ETH_RX_DESC_DONE;\n \t}\n \trx_buf = rxr->rx_buf_ring[cons];\n-\tif (rx_buf == NULL)\n+\tif (rx_buf == NULL || rx_buf == &rxq->fake_mbuf)\n \t\treturn RTE_ETH_RX_DESC_UNAVAIL;\n \n \ndiff --git a/drivers/net/bnxt/bnxt_rxq.c b/drivers/net/bnxt/bnxt_rxq.c\nindex 4ef3b5cb5c..57ba9a1570 100644\n--- a/drivers/net/bnxt/bnxt_rxq.c\n+++ b/drivers/net/bnxt/bnxt_rxq.c\n@@ -212,7 +212,8 @@ void bnxt_rx_queue_release_mbufs(struct bnxt_rx_queue *rxq)\n \t\tfor (i = 0;\n \t\t     i < rxq->rx_ring->rx_ring_struct->ring_size; i++) {\n \t\t\tif (sw_ring[i]) {\n-\t\t\t\trte_pktmbuf_free_seg(sw_ring[i]);\n+\t\t\t\tif (sw_ring[i] != &rxq->fake_mbuf)\n+\t\t\t\t\trte_pktmbuf_free_seg(sw_ring[i]);\n \t\t\t\tsw_ring[i] = NULL;\n \t\t\t}\n \t\t}\ndiff --git a/drivers/net/bnxt/bnxt_rxq.h b/drivers/net/bnxt/bnxt_rxq.h\nindex d5ce3b6d58..96c6e06a52 100644\n--- a/drivers/net/bnxt/bnxt_rxq.h\n+++ b/drivers/net/bnxt/bnxt_rxq.h\n@@ -39,6 +39,7 @@ struct bnxt_rx_queue {\n \tuint32_t\t\t\trx_buf_size;\n \tstruct bnxt_rx_ring_info\t*rx_ring;\n \tstruct bnxt_cp_ring_info\t*cp_ring;\n+\tstruct rte_mbuf\t\t\tfake_mbuf;\n \trte_atomic64_t\t\trx_mbuf_alloc_fail;\n \tconst struct rte_memzone *mz;\n };\ndiff --git a/drivers/net/bnxt/bnxt_rxr.c b/drivers/net/bnxt/bnxt_rxr.c\nindex 33bd006530..89a964a49b 100644\n--- a/drivers/net/bnxt/bnxt_rxr.c\n+++ b/drivers/net/bnxt/bnxt_rxr.c\n@@ -20,6 +20,7 @@\n #ifdef RTE_LIBRTE_IEEE1588\n #include \"bnxt_hwrm.h\"\n #endif\n+#include \"bnxt_rxtx_vec_common.h\"\n \n #include <bnxt_tf_common.h>\n #include <ulp_mark_mgr.h>\n@@ -931,7 +932,7 @@ uint16_t bnxt_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t\t\tstruct rte_mbuf **rx_buf = &rxr->rx_buf_ring[i];\n \n 
\t\t\t/* Buffer already allocated for this index. */\n-\t\t\tif (*rx_buf != NULL)\n+\t\t\tif (*rx_buf != NULL && *rx_buf != &rxq->fake_mbuf)\n \t\t\t\tcontinue;\n \n \t\t\t/* This slot is empty. Alloc buffer for Rx */\n@@ -1025,7 +1026,11 @@ int bnxt_init_rx_ring_struct(struct bnxt_rx_queue *rxq, unsigned int socket_id)\n \tring->ring_mask = ring->ring_size - 1;\n \tring->bd = (void *)rxr->rx_desc_ring;\n \tring->bd_dma = rxr->rx_desc_mapping;\n-\tring->vmem_size = ring->ring_size * sizeof(struct rte_mbuf *);\n+\n+\t/* Allocate extra rx ring entries for vector rx. */\n+\tring->vmem_size = sizeof(struct rte_mbuf *) *\n+\t\t\t\t(ring->ring_size + RTE_BNXT_DESCS_PER_LOOP);\n+\n \tring->vmem = (void **)&rxr->rx_buf_ring;\n \tring->fw_ring_id = INVALID_HW_RING_ID;\n \n@@ -1136,6 +1141,12 @@ int bnxt_init_one_rx_ring(struct bnxt_rx_queue *rxq)\n \t\tprod = RING_NEXT(rxr->rx_ring_struct, prod);\n \t}\n \n+\t/* Initialize dummy mbuf pointers for vector mode rx. */\n+\tfor (i = ring->ring_size;\n+\t     i < ring->ring_size + RTE_BNXT_DESCS_PER_LOOP; i++) {\n+\t\trxr->rx_buf_ring[i] = &rxq->fake_mbuf;\n+\t}\n+\n \tring = rxr->ag_ring_struct;\n \ttype = RX_PROD_AGG_BD_TYPE_RX_PROD_AGG;\n \tbnxt_init_rxbds(ring, type, size);\ndiff --git a/drivers/net/bnxt/bnxt_rxtx_vec_common.h b/drivers/net/bnxt/bnxt_rxtx_vec_common.h\nindex 819b8290e4..8c10fdfa10 100644\n--- a/drivers/net/bnxt/bnxt_rxtx_vec_common.h\n+++ b/drivers/net/bnxt/bnxt_rxtx_vec_common.h\n@@ -75,6 +75,8 @@ bnxt_rxq_rearm(struct bnxt_rx_queue *rxq, struct bnxt_rx_ring_info *rxr)\n \tif (rte_mempool_get_bulk(rxq->mb_pool, (void *)rx_bufs, nb) < 0) {\n \t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += nb;\n \n+\t\tfor (i = 0; i < nb; i++)\n+\t\t\trx_bufs[i] = &rxq->fake_mbuf;\n \t\treturn;\n \t}\n \ndiff --git a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c\nindex 24f9fc3c39..e7fe9325ab 100644\n--- a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c\n+++ 
b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c\n@@ -22,52 +22,151 @@\n  * RX Ring handling\n  */\n \n-static uint32_t\n-bnxt_parse_pkt_type(uint32x4_t mm_rxcmp, uint32x4_t mm_rxcmp1)\n-{\n-\tuint32_t flags_type, flags2;\n-\tuint8_t index;\n-\n-\tflags_type = vgetq_lane_u32(mm_rxcmp, 0);\n-\tflags2 = (uint16_t)vgetq_lane_u32(mm_rxcmp1, 0);\n-\n-\t/*\n-\t * Index format:\n-\t *     bit 0: RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC\n-\t *     bit 1: RX_CMPL_FLAGS2_IP_TYPE\n-\t *     bit 2: RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN\n-\t *     bits 3-6: RX_PKT_CMPL_FLAGS_ITYPE\n-\t */\n-\tindex = ((flags_type & RX_PKT_CMPL_FLAGS_ITYPE_MASK) >> 9) |\n-\t\t((flags2 & (RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n-\t\t\t   RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC)) >> 2) |\n-\t\t((flags2 & RX_PKT_CMPL_FLAGS2_IP_TYPE) >> 7);\n+#define GET_OL_FLAGS(rss_flags, ol_idx, errors, pi, ol_flags)\t\t       \\\n+{\t\t\t\t\t\t\t\t\t       \\\n+\tuint32_t tmp, of;\t\t\t\t\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\tof = vgetq_lane_u32((rss_flags), (pi)) |\t\t\t       \\\n+\t\t   bnxt_ol_flags_table[vgetq_lane_u32((ol_idx), (pi))];\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\ttmp = vgetq_lane_u32((errors), (pi));\t\t\t\t       \\\n+\tif (tmp)\t\t\t\t\t\t\t       \\\n+\t\tof |= bnxt_ol_flags_err_table[tmp];\t\t\t       \\\n+\t(ol_flags) = of;\t\t\t\t\t\t       \\\n+}\n \n-\treturn bnxt_ptype_table[index];\n+#define GET_DESC_FIELDS(rxcmp, rxcmp1, shuf_msk, ptype_idx, pkt_idx, ret)      \\\n+{\t\t\t\t\t\t\t\t\t       \\\n+\tuint32_t ptype;\t\t\t\t\t\t\t       \\\n+\tuint16_t vlan_tci;\t\t\t\t\t\t       \\\n+\tuint32x4_t r;\t\t\t\t\t\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\t/* Set mbuf pkt_len, data_len, and rss_hash fields. */\t\t       \\\n+\tr = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(rxcmp),       \\\n+\t\t\t\t\t      (shuf_msk)));\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\t/* Set packet type. 
*/\t\t\t\t\t\t       \\\n+\tptype = bnxt_ptype_table[vgetq_lane_u32((ptype_idx), (pkt_idx))];      \\\n+\tr = vsetq_lane_u32(ptype, r, 0);\t\t\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\t/* Set vlan_tci. */\t\t\t\t\t\t       \\\n+\tvlan_tci = vgetq_lane_u32((rxcmp1), 1);\t\t\t\t       \\\n+\tr = vreinterpretq_u32_u16(vsetq_lane_u16(vlan_tci,\t\t       \\\n+\t\t\t\tvreinterpretq_u16_u32(r), 5));\t\t       \\\n+\t(ret) = r;\t\t\t\t\t\t\t       \\\n }\n \n-static uint32_t\n-bnxt_set_ol_flags(uint32x4_t mm_rxcmp, uint32x4_t mm_rxcmp1)\n+static void\n+descs_to_mbufs(uint32x4_t mm_rxcmp[4], uint32x4_t mm_rxcmp1[4],\n+\t       uint64x2_t mb_init, struct rte_mbuf **mbuf)\n {\n-\tuint16_t flags_type, errors, flags;\n+\tconst uint8x16_t shuf_msk = {\n+\t\t0xFF, 0xFF, 0xFF, 0xFF,    /* pkt_type (zeroes) */\n+\t\t2, 3, 0xFF, 0xFF,          /* pkt_len */\n+\t\t2, 3,                      /* data_len */\n+\t\t0xFF, 0xFF,                /* vlan_tci (zeroes) */\n+\t\t12, 13, 14, 15             /* rss hash */\n+\t};\n+\tconst uint32x4_t flags_type_mask = {\n+\t\tRX_PKT_CMPL_FLAGS_ITYPE_MASK,\n+\t\tRX_PKT_CMPL_FLAGS_ITYPE_MASK,\n+\t\tRX_PKT_CMPL_FLAGS_ITYPE_MASK,\n+\t\tRX_PKT_CMPL_FLAGS_ITYPE_MASK\n+\t};\n+\tconst uint32x4_t flags2_mask1 = {\n+\t\tRX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC,\n+\t\tRX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC,\n+\t\tRX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC,\n+\t\tRX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC\n+\t};\n+\tconst uint32x4_t flags2_mask2 = {\n+\t\tRX_PKT_CMPL_FLAGS2_IP_TYPE,\n+\t\tRX_PKT_CMPL_FLAGS2_IP_TYPE,\n+\t\tRX_PKT_CMPL_FLAGS2_IP_TYPE,\n+\t\tRX_PKT_CMPL_FLAGS2_IP_TYPE\n+\t};\n+\tconst uint32x4_t rss_mask = {\n+\t\tRX_PKT_CMPL_FLAGS_RSS_VALID,\n+\t\tRX_PKT_CMPL_FLAGS_RSS_VALID,\n+\t\tRX_PKT_CMPL_FLAGS_RSS_VALID,\n+\t\tRX_PKT_CMPL_FLAGS_RSS_VALID\n+\t};\n+\tconst uint32x4_t flags2_index_mask = 
{\n+\t\t0x1F, 0x1F, 0x1F, 0x1F\n+\t};\n+\tconst uint32x4_t flags2_error_mask = {\n+\t\t0xF, 0xF, 0xF, 0xF\n+\t};\n+\tuint32x4_t flags_type, flags2, index, errors, rss_flags;\n+\tuint32x4_t tmp, ptype_idx;\n+\tuint64x2_t t0, t1;\n \tuint32_t ol_flags;\n \n-\t/* Extract rxcmp1->flags2. */\n-\tflags = vgetq_lane_u32(mm_rxcmp1, 0) & 0x1F;\n-\t/* Extract rxcmp->flags_type. */\n-\tflags_type = vgetq_lane_u32(mm_rxcmp, 0);\n-\t/* Extract rxcmp1->errors_v2. */\n-\terrors = (vgetq_lane_u32(mm_rxcmp1, 2) >> 4) & flags & 0xF;\n-\n-\tol_flags = bnxt_ol_flags_table[flags & ~errors];\n-\n-\tif (flags_type & RX_PKT_CMPL_FLAGS_RSS_VALID)\n-\t\tol_flags |= PKT_RX_RSS_HASH;\n-\n-\tif (errors)\n-\t\tol_flags |= bnxt_ol_flags_err_table[errors];\n-\n-\treturn ol_flags;\n+\t/* Compute packet type table indexes for four packets */\n+\tt0 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp[0], mm_rxcmp[1]));\n+\tt1 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp[2], mm_rxcmp[3]));\n+\n+\tflags_type = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),\n+\t\t\t\t\t\t\tvget_low_u64(t1)));\n+\tptype_idx =\n+\t\tvshrq_n_u32(vandq_u32(flags_type, flags_type_mask), 9);\n+\n+\tt0 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp1[0], mm_rxcmp1[1]));\n+\tt1 = vreinterpretq_u64_u32(vzip1q_u32(mm_rxcmp1[2], mm_rxcmp1[3]));\n+\n+\tflags2 = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),\n+\t\t\t\t\t\t    vget_low_u64(t1)));\n+\n+\tptype_idx = vorrq_u32(ptype_idx,\n+\t\t\tvshrq_n_u32(vandq_u32(flags2, flags2_mask1), 2));\n+\tptype_idx = vorrq_u32(ptype_idx,\n+\t\t\tvshrq_n_u32(vandq_u32(flags2, flags2_mask2), 7));\n+\n+\t/* Extract RSS valid flags for four packets. */\n+\trss_flags = vshrq_n_u32(vandq_u32(flags_type, rss_mask), 9);\n+\n+\tflags2 = vandq_u32(flags2, flags2_index_mask);\n+\n+\t/* Extract errors_v2 fields for four packets. 
*/\n+\tt0 = vreinterpretq_u64_u32(vzip2q_u32(mm_rxcmp1[0], mm_rxcmp1[1]));\n+\tt1 = vreinterpretq_u64_u32(vzip2q_u32(mm_rxcmp1[2], mm_rxcmp1[3]));\n+\n+\terrors = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),\n+\t\t\t\t\t\t    vget_low_u64(t1)));\n+\n+\t/* Compute ol_flags and checksum error indexes for four packets. */\n+\terrors = vandq_u32(vshrq_n_u32(errors, 4), flags2_error_mask);\n+\terrors = vandq_u32(errors, flags2);\n+\n+\tindex = vbicq_u32(flags2, errors);\n+\n+\t/* Update mbuf rearm_data for four packets. */\n+\tGET_OL_FLAGS(rss_flags, index, errors, 0, ol_flags);\n+\tvst1q_u32((uint32_t *)&mbuf[0]->rearm_data,\n+\t\t  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));\n+\tGET_OL_FLAGS(rss_flags, index, errors, 1, ol_flags);\n+\tvst1q_u32((uint32_t *)&mbuf[1]->rearm_data,\n+\t\t  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));\n+\tGET_OL_FLAGS(rss_flags, index, errors, 2, ol_flags);\n+\tvst1q_u32((uint32_t *)&mbuf[2]->rearm_data,\n+\t\t  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));\n+\tGET_OL_FLAGS(rss_flags, index, errors, 3, ol_flags);\n+\tvst1q_u32((uint32_t *)&mbuf[3]->rearm_data,\n+\t\t  vsetq_lane_u32(ol_flags, vreinterpretq_u32_u64(mb_init), 2));\n+\n+\t/* Update mbuf rx_descriptor_fields1 for four packets. 
*/\n+\tGET_DESC_FIELDS(mm_rxcmp[0], mm_rxcmp1[0], shuf_msk, ptype_idx, 0, tmp);\n+\tvst1q_u32((uint32_t *)&mbuf[0]->rx_descriptor_fields1, tmp);\n+\tGET_DESC_FIELDS(mm_rxcmp[1], mm_rxcmp1[1], shuf_msk, ptype_idx, 1, tmp);\n+\tvst1q_u32((uint32_t *)&mbuf[1]->rx_descriptor_fields1, tmp);\n+\tGET_DESC_FIELDS(mm_rxcmp[2], mm_rxcmp1[2], shuf_msk, ptype_idx, 2, tmp);\n+\tvst1q_u32((uint32_t *)&mbuf[2]->rx_descriptor_fields1, tmp);\n+\tGET_DESC_FIELDS(mm_rxcmp[3], mm_rxcmp1[3], shuf_msk, ptype_idx, 3, tmp);\n+\tvst1q_u32((uint32_t *)&mbuf[3]->rx_descriptor_fields1, tmp);\n }\n \n uint16_t\n@@ -77,17 +176,23 @@ bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n \tstruct bnxt_rx_queue *rxq = rx_queue;\n \tstruct bnxt_cp_ring_info *cpr = rxq->cp_ring;\n \tstruct bnxt_rx_ring_info *rxr = rxq->rx_ring;\n+\tuint16_t cp_ring_size = cpr->cp_ring_struct->ring_size;\n+\tuint16_t rx_ring_size = rxr->rx_ring_struct->ring_size;\n+\tstruct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;\n+\tuint64_t valid, desc_valid_mask = ~0UL;\n+\tconst uint32x4_t info3_v_mask = {\n+\t\tCMPL_BASE_V, CMPL_BASE_V,\n+\t\tCMPL_BASE_V, CMPL_BASE_V\n+\t};\n \tuint32_t raw_cons = cpr->cp_raw_cons;\n-\tuint32_t cons;\n+\tuint32_t cons, mbcons;\n \tint nb_rx_pkts = 0;\n-\tstruct rx_pkt_cmpl *rxcmp;\n-\tconst uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};\n-\tconst uint8x16_t shuf_msk = {\n-\t\t0xFF, 0xFF, 0xFF, 0xFF,    /* pkt_type (zeroes) */\n-\t\t2, 3, 0xFF, 0xFF,          /* pkt_len */\n-\t\t2, 3,                      /* data_len */\n-\t\t0xFF, 0xFF,                /* vlan_tci (zeroes) */\n-\t\t12, 13, 14, 15             /* rss hash */\n+\tconst uint64x2_t mb_init = {rxq->mbuf_initializer, 0};\n+\tconst uint32x4_t valid_target = {\n+\t\t!!(raw_cons & cp_ring_size),\n+\t\t!!(raw_cons & cp_ring_size),\n+\t\t!!(raw_cons & cp_ring_size),\n+\t\t!!(raw_cons & cp_ring_size)\n \t};\n \tint i;\n \n@@ -101,72 +206,130 @@ bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t/* Return no 
more than RTE_BNXT_MAX_RX_BURST per call. */\n \tnb_pkts = RTE_MIN(nb_pkts, RTE_BNXT_MAX_RX_BURST);\n \n-\t/* Make nb_pkts an integer multiple of RTE_BNXT_DESCS_PER_LOOP. */\n-\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP);\n-\tif (!nb_pkts)\n-\t\treturn 0;\n+\tcons = raw_cons & (cp_ring_size - 1);\n+\tmbcons = (raw_cons / 2) & (rx_ring_size - 1);\n \n-\t/* Handle RX burst request */\n-\tfor (i = 0; i < nb_pkts; i++) {\n-\t\tuint32x4_t mm_rxcmp, mm_rxcmp1;\n-\t\tstruct rx_pkt_cmpl_hi *rxcmp1;\n-\t\tuint32x4_t pkt_mb, rearm;\n-\t\tuint32_t ptype, ol_flags;\n-\t\tstruct rte_mbuf *mbuf;\n-\t\tuint16_t vlan_tci;\n-\t\tuint16x8_t tmp16;\n-\t\tuint8x16_t tmp;\n+\t/* Prefetch first four descriptor pairs. */\n+\trte_prefetch0(&cp_desc_ring[cons]);\n+\trte_prefetch0(&cp_desc_ring[cons + 4]);\n \n-\t\tcons = RING_CMP(cpr->cp_ring_struct, raw_cons);\n+\t/* Ensure that we do not go past the ends of the rings. */\n+\tnb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons,\n+\t\t\t\t\t   (cp_ring_size - cons) / 2));\n+\t/*\n+\t * If we are at the end of the ring, ensure that descriptors after the\n+\t * last valid entry are not treated as valid. 
Otherwise, force the\n+\t * maximum number of packets to receive to be a multiple of the per-\n+\t * loop count.\n+\t */\n+\tif (nb_pkts < RTE_BNXT_DESCS_PER_LOOP)\n+\t\tdesc_valid_mask >>= 16 * (RTE_BNXT_DESCS_PER_LOOP - nb_pkts);\n+\telse\n+\t\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP);\n \n-\t\trxcmp = (struct rx_pkt_cmpl *)&cpr->cp_desc_ring[cons];\n-\t\trxcmp1 = (struct rx_pkt_cmpl_hi *)&cpr->cp_desc_ring[cons + 1];\n+\t/* Handle RX burst request */\n+\tfor (i = 0; i < nb_pkts; i += RTE_BNXT_DESCS_PER_LOOP,\n+\t\t\t\t  cons += RTE_BNXT_DESCS_PER_LOOP * 2,\n+\t\t\t\t  mbcons += RTE_BNXT_DESCS_PER_LOOP) {\n+\t\tuint32x4_t rxcmp1[RTE_BNXT_DESCS_PER_LOOP];\n+\t\tuint32x4_t rxcmp[RTE_BNXT_DESCS_PER_LOOP];\n+\t\tuint32x4_t info3_v;\n+\t\tuint64x2_t t0, t1;\n+\t\tuint32_t num_valid;\n+\n+\t\t/* Copy four mbuf pointers to output array. */\n+\t\tt0 = vld1q_u64((void *)&rxr->rx_buf_ring[mbcons]);\n+#ifdef RTE_ARCH_ARM64\n+\t\tt1 = vld1q_u64((void *)&rxr->rx_buf_ring[mbcons + 2]);\n+#endif\n+\t\tvst1q_u64((void *)&rx_pkts[i], t0);\n+#ifdef RTE_ARCH_ARM64\n+\t\tvst1q_u64((void *)&rx_pkts[i + 2], t1);\n+#endif\n+\n+\t\t/* Prefetch four descriptor pairs for next iteration. 
*/\n+\t\tif (i + RTE_BNXT_DESCS_PER_LOOP < nb_pkts) {\n+\t\t\trte_prefetch0(&cp_desc_ring[cons + 8]);\n+\t\t\trte_prefetch0(&cp_desc_ring[cons + 12]);\n+\t\t}\n \n-\t\tif (!CMP_VALID(rxcmp1, raw_cons + 1, cpr->cp_ring_struct))\n+\t\t/*\n+\t\t * Load the four curent descriptors into SSE registers in\n+\t\t * reverse order to ensure consistent state.\n+\t\t */\n+\t\trxcmp1[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 7]);\n+\t\trte_cio_rmb();\n+\t\trxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);\n+\n+\t\trxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);\n+\t\trte_cio_rmb();\n+\t\trxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);\n+\n+\t\tt1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));\n+\n+\t\trxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);\n+\t\trte_cio_rmb();\n+\t\trxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);\n+\n+\t\trxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);\n+\t\trte_cio_rmb();\n+\t\trxcmp[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 0]);\n+\n+\t\tt0 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[0], rxcmp1[1]));\n+\n+\t\t/* Isolate descriptor status flags. 
*/\n+\t\tinfo3_v = vreinterpretq_u32_u64(vcombine_u64(vget_low_u64(t0),\n+\t\t\t\t\t\t\t     vget_low_u64(t1)));\n+\t\tinfo3_v = vandq_u32(info3_v, info3_v_mask);\n+\t\tinfo3_v = veorq_u32(info3_v, valid_target);\n+\n+\t\t/*\n+\t\t * Pack the 128-bit array of valid descriptor flags into 64\n+\t\t * bits and count the number of set bits in order to determine\n+\t\t * the number of valid descriptors.\n+\t\t */\n+\t\tvalid = vget_lane_u64(vreinterpret_u64_u16(vqmovn_u32(info3_v)),\n+\t\t\t\t      0);\n+\t\t/*\n+\t\t * At this point, 'valid' is a 64-bit value containing four\n+\t\t * 16-bit fields, each of which is either 0x0001 or 0x0000.\n+\t\t * Compute number of valid descriptors from the index of\n+\t\t * the highest non-zero field.\n+\t\t */\n+\t\tnum_valid = (sizeof(uint64_t) / sizeof(uint16_t)) -\n+\t\t\t\t(__builtin_clzl(valid & desc_valid_mask) / 16);\n+\n+\t\tswitch (num_valid) {\n+\t\tcase 4:\n+\t\t\trxr->rx_buf_ring[mbcons + 3] = NULL;\n+\t\t\t/* FALLTHROUGH */\n+\t\tcase 3:\n+\t\t\trxr->rx_buf_ring[mbcons + 2] = NULL;\n+\t\t\t/* FALLTHROUGH */\n+\t\tcase 2:\n+\t\t\trxr->rx_buf_ring[mbcons + 1] = NULL;\n+\t\t\t/* FALLTHROUGH */\n+\t\tcase 1:\n+\t\t\trxr->rx_buf_ring[mbcons + 0] = NULL;\n \t\t\tbreak;\n+\t\tcase 0:\n+\t\t\tgoto out;\n+\t\t}\n \n-\t\tmm_rxcmp = vld1q_u32((uint32_t *)rxcmp);\n-\t\tmm_rxcmp1 = vld1q_u32((uint32_t *)rxcmp);\n-\t\traw_cons += 2;\n-\t\tcons = rxcmp->opaque;\n-\n-\t\tmbuf = rxr->rx_buf_ring[cons];\n-\t\trte_prefetch0(mbuf);\n-\t\trxr->rx_buf_ring[cons] = NULL;\n-\n-\t\t/* Set fields from mbuf initializer and ol_flags. */\n-\t\tol_flags = bnxt_set_ol_flags(mm_rxcmp, mm_rxcmp1);\n-\t\trearm = vsetq_lane_u32(ol_flags,\n-\t\t\t\t       vreinterpretq_u32_u64(mbuf_init), 2);\n-\t\tvst1q_u32((uint32_t *)&mbuf->rearm_data, rearm);\n-\n-\t\t/* Set mbuf pkt_len, data_len, and rss_hash fields. */\n-\t\ttmp = vqtbl1q_u8(vreinterpretq_u8_u32(mm_rxcmp), shuf_msk);\n-\t\tpkt_mb = vreinterpretq_u32_u8(tmp);\n-\n-\t\t/* Set packet type. 
*/\n-\t\tptype = bnxt_parse_pkt_type(mm_rxcmp, mm_rxcmp1);\n-\t\tpkt_mb = vsetq_lane_u32(ptype, pkt_mb, 0);\n-\n-\t\t/* Set vlan_tci. */\n-\t\tvlan_tci = vgetq_lane_u32(mm_rxcmp1, 1);\n-\t\ttmp16 = vsetq_lane_u16(vlan_tci,\n-\t\t\t\t       vreinterpretq_u16_u32(pkt_mb),\n-\t\t\t\t       5);\n-\t\tpkt_mb = vreinterpretq_u32_u16(tmp16);\n-\n-\t\t/* Store descriptor fields. */\n-\t\tvst1q_u32((uint32_t *)&mbuf->rx_descriptor_fields1, pkt_mb);\n+\t\tdescs_to_mbufs(rxcmp, rxcmp1, mb_init, &rx_pkts[nb_rx_pkts]);\n+\t\tnb_rx_pkts += num_valid;\n \n-\t\trx_pkts[nb_rx_pkts++] = mbuf;\n+\t\tif (num_valid < RTE_BNXT_DESCS_PER_LOOP)\n+\t\t\tbreak;\n \t}\n \n+out:\n \tif (nb_rx_pkts) {\n \t\trxr->rx_prod =\n \t\t\tRING_ADV(rxr->rx_ring_struct, rxr->rx_prod, nb_rx_pkts);\n \n \t\trxq->rxrearm_nb += nb_rx_pkts;\n-\t\tcpr->cp_raw_cons = raw_cons;\n+\t\tcpr->cp_raw_cons += 2 * nb_rx_pkts;\n \t\tcpr->valid =\n \t\t\t!!(cpr->cp_raw_cons & cpr->cp_ring_struct->ring_size);\n \t\tbnxt_db_cq(cpr);\ndiff --git a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c\nindex 7e87555408..362992ceb2 100644\n--- a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c\n+++ b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c\n@@ -1,5 +1,5 @@\n-// SPDX-License-Identifier: BSD-3-Clause\n-/* Copyright(c) 2019 Broadcom All rights reserved. */\n+/* SPDX-License-Identifier: BSD-3-Clause */\n+/* Copyright(c) 2019-2020 Broadcom All rights reserved. 
*/\n \n #include <inttypes.h>\n #include <stdbool.h>\n@@ -8,11 +8,7 @@\n #include <rte_byteorder.h>\n #include <rte_malloc.h>\n #include <rte_memory.h>\n-#if defined(RTE_ARCH_X86)\n-#include <tmmintrin.h>\n-#else\n-#error \"bnxt vector pmd: unsupported target.\"\n-#endif\n+#include <rte_vect.h>\n \n #include \"bnxt.h\"\n #include \"bnxt_cpr.h\"\n@@ -26,52 +22,135 @@\n  * RX Ring handling\n  */\n \n-static __m128i\n-bnxt_parse_pkt_type(__m128i mm_rxcmp, __m128i mm_rxcmp1)\n-{\n-\tuint32_t flags_type, flags2;\n-\tuint8_t index;\n-\n-\tflags_type = _mm_extract_epi16(mm_rxcmp, 0);\n-\tflags2 = _mm_extract_epi32(mm_rxcmp1, 0);\n-\n-\t/*\n-\t * Index format:\n-\t *     bit 0: RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC\n-\t *     bit 1: RX_CMPL_FLAGS2_IP_TYPE\n-\t *     bit 2: RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN\n-\t *     bits 3-6: RX_PKT_CMPL_FLAGS_ITYPE\n-\t */\n-\tindex = ((flags_type & RX_PKT_CMPL_FLAGS_ITYPE_MASK) >> 9) |\n-\t\t((flags2 & (RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n-\t\t\t   RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC)) >> 2) |\n-\t\t((flags2 & RX_PKT_CMPL_FLAGS2_IP_TYPE) >> 7);\n+#define GET_OL_FLAGS(rss_flags, ol_index, errors, pi, ol_flags)\t\t       \\\n+{\t\t\t\t\t\t\t\t\t       \\\n+\tuint32_t tmp, of;\t\t\t\t\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\tof = _mm_extract_epi32((rss_flags), (pi)) |\t\t\t       \\\n+\t\tbnxt_ol_flags_table[_mm_extract_epi32((ol_index), (pi))];      \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\ttmp = _mm_extract_epi32((errors), (pi));\t\t\t       \\\n+\tif (tmp)\t\t\t\t\t\t\t       \\\n+\t\tof |= bnxt_ol_flags_err_table[tmp];\t\t\t       \\\n+\t(ol_flags) = of;\t\t\t\t\t\t       \\\n+}\n \n-\treturn _mm_set_epi32(0, 0, 0, bnxt_ptype_table[index]);\n+#define GET_DESC_FIELDS(rxcmp, rxcmp1, shuf_msk, ptype_idx, pi, ret)\t       \\\n+{\t\t\t\t\t\t\t\t\t       \\\n+\tuint32_t ptype;\t\t\t\t\t\t\t       \\\n+\t__m128i r;\t\t\t\t\t\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\t/* Set mbuf pkt_len, data_len, and rss_hash fields. 
*/\t\t       \\\n+\tr = _mm_shuffle_epi8((rxcmp), (shuf_msk));\t\t\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\t/* Set packet type. */\t\t\t\t\t\t       \\\n+\tptype = bnxt_ptype_table[_mm_extract_epi32((ptype_idx), (pi))];\t       \\\n+\tr = _mm_blend_epi16(r, _mm_set_epi32(0, 0, 0, ptype), 0x3);\t       \\\n+\t\t\t\t\t\t\t\t\t       \\\n+\t/* Set vlan_tci. */\t\t\t\t\t\t       \\\n+\tr = _mm_blend_epi16(r, _mm_slli_si128((rxcmp1), 6), 0x20);\t       \\\n+\t(ret) = r;\t\t\t\t\t\t\t       \\\n }\n \n-static __m128i\n-bnxt_set_ol_flags(__m128i mm_rxcmp, __m128i mm_rxcmp1)\n+static inline void\n+descs_to_mbufs(__m128i mm_rxcmp[4], __m128i mm_rxcmp1[4],\n+\t       __m128i mbuf_init, struct rte_mbuf **mbuf)\n {\n-\tuint16_t flags_type, errors, flags;\n+\tconst __m128i shuf_msk =\n+\t\t_mm_set_epi8(15, 14, 13, 12,          /* rss */\n+\t\t\t     0xFF, 0xFF,              /* vlan_tci (zeroes) */\n+\t\t\t     3, 2,                    /* data_len */\n+\t\t\t     0xFF, 0xFF, 3, 2,        /* pkt_len */\n+\t\t\t     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */\n+\tconst __m128i flags_type_mask =\n+\t\t_mm_set_epi32(RX_PKT_CMPL_FLAGS_ITYPE_MASK,\n+\t\t\t      RX_PKT_CMPL_FLAGS_ITYPE_MASK,\n+\t\t\t      RX_PKT_CMPL_FLAGS_ITYPE_MASK,\n+\t\t\t      RX_PKT_CMPL_FLAGS_ITYPE_MASK);\n+\tconst __m128i flags2_mask1 =\n+\t\t_mm_set_epi32(RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC,\n+\t\t\t      RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC,\n+\t\t\t      RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC,\n+\t\t\t      RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN |\n+\t\t\t\tRX_PKT_CMPL_FLAGS2_T_IP_CS_CALC);\n+\tconst __m128i flags2_mask2 =\n+\t\t_mm_set_epi32(RX_PKT_CMPL_FLAGS2_IP_TYPE,\n+\t\t\t      RX_PKT_CMPL_FLAGS2_IP_TYPE,\n+\t\t\t      RX_PKT_CMPL_FLAGS2_IP_TYPE,\n+\t\t\t      RX_PKT_CMPL_FLAGS2_IP_TYPE);\n+\tconst __m128i rss_mask 
=\n+\t\t_mm_set_epi32(RX_PKT_CMPL_FLAGS_RSS_VALID,\n+\t\t\t      RX_PKT_CMPL_FLAGS_RSS_VALID,\n+\t\t\t      RX_PKT_CMPL_FLAGS_RSS_VALID,\n+\t\t\t      RX_PKT_CMPL_FLAGS_RSS_VALID);\n+\t__m128i t0, t1, flags_type, flags2, index, errors, rss_flags;\n+\t__m128i ptype_idx;\n \tuint32_t ol_flags;\n \n-\t/* Extract rxcmp1->flags2. */\n-\tflags = _mm_extract_epi32(mm_rxcmp1, 0) & 0x1F;\n-\t/* Extract rxcmp->flags_type. */\n-\tflags_type = _mm_extract_epi16(mm_rxcmp, 0);\n-\t/* Extract rxcmp1->errors_v2. */\n-\terrors = (_mm_extract_epi16(mm_rxcmp1, 4) >> 4) & flags & 0xF;\n+\t/* Compute packet type table indexes for four packets */\n+\tt0 = _mm_unpacklo_epi32(mm_rxcmp[0], mm_rxcmp[1]);\n+\tt1 = _mm_unpacklo_epi32(mm_rxcmp[2], mm_rxcmp[3]);\n+\tflags_type = _mm_unpacklo_epi64(t0, t1);\n+\tptype_idx =\n+\t\t_mm_srli_epi32(_mm_and_si128(flags_type, flags_type_mask), 9);\n \n-\tol_flags = bnxt_ol_flags_table[flags & ~errors];\n+\tt0 = _mm_unpacklo_epi32(mm_rxcmp1[0], mm_rxcmp1[1]);\n+\tt1 = _mm_unpacklo_epi32(mm_rxcmp1[2], mm_rxcmp1[3]);\n+\tflags2 = _mm_unpacklo_epi64(t0, t1);\n \n-\tif (flags_type & RX_PKT_CMPL_FLAGS_RSS_VALID)\n-\t\tol_flags |= PKT_RX_RSS_HASH;\n+\tptype_idx = _mm_or_si128(ptype_idx,\n+\t\t\t_mm_srli_epi32(_mm_and_si128(flags2, flags2_mask1), 2));\n+\tptype_idx = _mm_or_si128(ptype_idx,\n+\t\t\t_mm_srli_epi32(_mm_and_si128(flags2, flags2_mask2), 7));\n \n-\tif (errors)\n-\t\tol_flags |= bnxt_ol_flags_err_table[errors];\n+\t/* Extract RSS valid flags for four packets. */\n+\trss_flags = _mm_srli_epi32(_mm_and_si128(flags_type, rss_mask), 9);\n \n-\treturn _mm_set_epi64x(ol_flags, 0);\n+\t/* Extract errors_v2 fields for four packets. */\n+\tt0 = _mm_unpackhi_epi32(mm_rxcmp1[0], mm_rxcmp1[1]);\n+\tt1 = _mm_unpackhi_epi32(mm_rxcmp1[2], mm_rxcmp1[3]);\n+\n+\t/* Compute ol_flags and checksum error indexes for four packets. 
*/\n+\tflags2 = _mm_and_si128(flags2, _mm_set_epi32(0x1F, 0x1F, 0x1F, 0x1F));\n+\n+\terrors = _mm_srli_epi32(_mm_unpacklo_epi64(t0, t1), 4);\n+\terrors = _mm_and_si128(errors, _mm_set_epi32(0xF, 0xF, 0xF, 0xF));\n+\terrors = _mm_and_si128(errors, flags2);\n+\n+\tindex = _mm_andnot_si128(errors, flags2);\n+\n+\t/* Update mbuf rearm_data for four packets. */\n+\tGET_OL_FLAGS(rss_flags, index, errors, 0, ol_flags);\n+\t_mm_store_si128((void *)&mbuf[0]->rearm_data,\n+\t\t\t_mm_or_si128(mbuf_init, _mm_set_epi64x(ol_flags, 0)));\n+\n+\tGET_OL_FLAGS(rss_flags, index, errors, 1, ol_flags);\n+\t_mm_store_si128((void *)&mbuf[1]->rearm_data,\n+\t\t\t_mm_or_si128(mbuf_init, _mm_set_epi64x(ol_flags, 0)));\n+\n+\tGET_OL_FLAGS(rss_flags, index, errors, 2, ol_flags);\n+\t_mm_store_si128((void *)&mbuf[2]->rearm_data,\n+\t\t\t_mm_or_si128(mbuf_init, _mm_set_epi64x(ol_flags, 0)));\n+\n+\tGET_OL_FLAGS(rss_flags, index, errors, 3, ol_flags);\n+\t_mm_store_si128((void *)&mbuf[3]->rearm_data,\n+\t\t\t_mm_or_si128(mbuf_init, _mm_set_epi64x(ol_flags, 0)));\n+\n+\t/* Update mbuf rx_descriptor_fields1 for four packes. 
*/\n+\tGET_DESC_FIELDS(mm_rxcmp[0], mm_rxcmp1[0], shuf_msk, ptype_idx, 0, t0);\n+\t_mm_store_si128((void *)&mbuf[0]->rx_descriptor_fields1, t0);\n+\n+\tGET_DESC_FIELDS(mm_rxcmp[1], mm_rxcmp1[1], shuf_msk, ptype_idx, 1, t0);\n+\t_mm_store_si128((void *)&mbuf[1]->rx_descriptor_fields1, t0);\n+\n+\tGET_DESC_FIELDS(mm_rxcmp[2], mm_rxcmp1[2], shuf_msk, ptype_idx, 2, t0);\n+\t_mm_store_si128((void *)&mbuf[2]->rx_descriptor_fields1, t0);\n+\n+\tGET_DESC_FIELDS(mm_rxcmp[3], mm_rxcmp1[3], shuf_msk, ptype_idx, 3, t0);\n+\t_mm_store_si128((void *)&mbuf[3]->rx_descriptor_fields1, t0);\n }\n \n uint16_t\n@@ -79,19 +158,23 @@ bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t\t   uint16_t nb_pkts)\n {\n \tstruct bnxt_rx_queue *rxq = rx_queue;\n+\tconst __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);\n \tstruct bnxt_cp_ring_info *cpr = rxq->cp_ring;\n \tstruct bnxt_rx_ring_info *rxr = rxq->rx_ring;\n+\tuint16_t cp_ring_size = cpr->cp_ring_struct->ring_size;\n+\tuint16_t rx_ring_size = rxr->rx_ring_struct->ring_size;\n+\tstruct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;\n+\tuint64_t valid, desc_valid_mask = ~0UL;\n+\tconst __m128i info3_v_mask = _mm_set_epi32(CMPL_BASE_V, CMPL_BASE_V,\n+\t\t\t\t\t\t   CMPL_BASE_V, CMPL_BASE_V);\n \tuint32_t raw_cons = cpr->cp_raw_cons;\n-\tuint32_t cons;\n+\tuint32_t cons, mbcons;\n \tint nb_rx_pkts = 0;\n-\tstruct rx_pkt_cmpl *rxcmp;\n-\tconst __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);\n-\tconst __m128i shuf_msk =\n-\t\t_mm_set_epi8(15, 14, 13, 12,          /* rss */\n-\t\t\t     0xFF, 0xFF,              /* vlan_tci (zeroes) */\n-\t\t\t     3, 2,                    /* data_len */\n-\t\t\t     0xFF, 0xFF, 3, 2,        /* pkt_len */\n-\t\t\t     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */\n+\tconst __m128i valid_target =\n+\t\t_mm_set_epi32(!!(raw_cons & cp_ring_size),\n+\t\t\t      !!(raw_cons & cp_ring_size),\n+\t\t\t      !!(raw_cons & cp_ring_size),\n+\t\t\t      !!(raw_cons & 
cp_ring_size));\n \tint i;\n \n \t/* If Rx Q was stopped return */\n@@ -104,69 +187,120 @@ bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t/* Return no more than RTE_BNXT_MAX_RX_BURST per call. */\n \tnb_pkts = RTE_MIN(nb_pkts, RTE_BNXT_MAX_RX_BURST);\n \n+\tcons = raw_cons & (cp_ring_size - 1);\n+\tmbcons = (raw_cons / 2) & (rx_ring_size - 1);\n+\n+\t/* Prefetch first four descriptor pairs. */\n+\trte_prefetch0(&cp_desc_ring[cons]);\n+\trte_prefetch0(&cp_desc_ring[cons + 4]);\n+\n+\t/* Ensure that we do not go past the ends of the rings. */\n+\tnb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons,\n+\t\t\t\t\t   (cp_ring_size - cons) / 2));\n \t/*\n-\t * Make nb_pkts an integer multiple of RTE_BNXT_DESCS_PER_LOOP.\n-\t * nb_pkts < RTE_BNXT_DESCS_PER_LOOP, just return no packet\n+\t * If we are at the end of the ring, ensure that descriptors after the\n+\t * last valid entry are not treated as valid. Otherwise, force the\n+\t * maximum number of packets to receive to be a multiple of the per-\n+\t * loop count.\n \t */\n-\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP);\n-\tif (!nb_pkts)\n-\t\treturn 0;\n+\tif (nb_pkts < RTE_BNXT_DESCS_PER_LOOP)\n+\t\tdesc_valid_mask >>= 16 * (RTE_BNXT_DESCS_PER_LOOP - nb_pkts);\n+\telse\n+\t\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP);\n \n \t/* Handle RX burst request */\n-\tfor (i = 0; i < nb_pkts; i++) {\n-\t\tstruct rx_pkt_cmpl_hi *rxcmp1;\n-\t\tstruct rte_mbuf *mbuf;\n-\t\t__m128i mm_rxcmp, mm_rxcmp1, pkt_mb, ptype, rearm;\n-\n-\t\tcons = RING_CMP(cpr->cp_ring_struct, raw_cons);\n+\tfor (i = 0; i < nb_pkts; i += RTE_BNXT_DESCS_PER_LOOP,\n+\t\t\t\t  cons += RTE_BNXT_DESCS_PER_LOOP * 2,\n+\t\t\t\t  mbcons += RTE_BNXT_DESCS_PER_LOOP) {\n+\t\t__m128i rxcmp1[RTE_BNXT_DESCS_PER_LOOP];\n+\t\t__m128i rxcmp[RTE_BNXT_DESCS_PER_LOOP];\n+\t\t__m128i tmp0, tmp1, info3_v;\n+\t\tuint32_t num_valid;\n+\n+\t\t/* Copy four mbuf pointers to output array. 
*/\n+\t\ttmp0 = _mm_loadu_si128((void *)&rxr->rx_buf_ring[mbcons]);\n+#ifdef RTE_ARCH_X86_64\n+\t\ttmp1 = _mm_loadu_si128((void *)&rxr->rx_buf_ring[mbcons + 2]);\n+#endif\n+\t\t_mm_storeu_si128((void *)&rx_pkts[i], tmp0);\n+#ifdef RTE_ARCH_X86_64\n+\t\t_mm_storeu_si128((void *)&rx_pkts[i + 2], tmp1);\n+#endif\n \n-\t\trxcmp = (struct rx_pkt_cmpl *)&cpr->cp_desc_ring[cons];\n-\t\trxcmp1 = (struct rx_pkt_cmpl_hi *)&cpr->cp_desc_ring[cons + 1];\n+\t\t/* Prefetch four descriptor pairs for next iteration. */\n+\t\tif (i + RTE_BNXT_DESCS_PER_LOOP < nb_pkts) {\n+\t\t\trte_prefetch0(&cp_desc_ring[cons + 8]);\n+\t\t\trte_prefetch0(&cp_desc_ring[cons + 12]);\n+\t\t}\n \n-\t\tif (!CMP_VALID(rxcmp1, raw_cons + 1, cpr->cp_ring_struct))\n-\t\t\tbreak;\n+\t\t/*\n+\t\t * Load the four curent descriptors into SSE registers in\n+\t\t * reverse order to ensure consistent state.\n+\t\t */\n+\t\trxcmp1[3] = _mm_load_si128((void *)&cp_desc_ring[cons + 7]);\n+\t\trte_compiler_barrier();\n+\t\trxcmp[3] = _mm_load_si128((void *)&cp_desc_ring[cons + 6]);\n \n-\t\tmm_rxcmp = _mm_load_si128((__m128i *)rxcmp);\n-\t\tmm_rxcmp1 = _mm_load_si128((__m128i *)rxcmp1);\n+\t\trxcmp1[2] = _mm_load_si128((void *)&cp_desc_ring[cons + 5]);\n+\t\trte_compiler_barrier();\n+\t\trxcmp[2] = _mm_load_si128((void *)&cp_desc_ring[cons + 4]);\n \n-\t\traw_cons += 2;\n-\t\tcons = rxcmp->opaque;\n+\t\ttmp1 = _mm_unpackhi_epi32(rxcmp1[2], rxcmp1[3]);\n \n-\t\tmbuf = rxr->rx_buf_ring[cons];\n-\t\trxr->rx_buf_ring[cons] = NULL;\n+\t\trxcmp1[1] = _mm_load_si128((void *)&cp_desc_ring[cons + 3]);\n+\t\trte_compiler_barrier();\n+\t\trxcmp[1] = _mm_load_si128((void *)&cp_desc_ring[cons + 2]);\n \n-\t\t/* Set fields from mbuf initializer and ol_flags. 
*/\n-\t\trearm = _mm_or_si128(mbuf_init,\n-\t\t\t\t     bnxt_set_ol_flags(mm_rxcmp, mm_rxcmp1));\n-\t\t_mm_store_si128((__m128i *)&mbuf->rearm_data, rearm);\n+\t\trxcmp1[0] = _mm_load_si128((void *)&cp_desc_ring[cons + 1]);\n+\t\trte_compiler_barrier();\n+\t\trxcmp[0] = _mm_load_si128((void *)&cp_desc_ring[cons + 0]);\n \n-\t\t/* Set mbuf pkt_len, data_len, and rss_hash fields. */\n-\t\tpkt_mb = _mm_shuffle_epi8(mm_rxcmp, shuf_msk);\n+\t\ttmp0 = _mm_unpackhi_epi32(rxcmp1[0], rxcmp1[1]);\n \n-\t\t/* Set packet type. */\n-\t\tptype = bnxt_parse_pkt_type(mm_rxcmp, mm_rxcmp1);\n-\t\tpkt_mb = _mm_blend_epi16(pkt_mb, ptype, 0x3);\n+\t\t/* Isolate descriptor valid flags. */\n+\t\tinfo3_v = _mm_and_si128(_mm_unpacklo_epi64(tmp0, tmp1),\n+\t\t\t\t\tinfo3_v_mask);\n+\t\tinfo3_v = _mm_xor_si128(info3_v, valid_target);\n \n \t\t/*\n-\t\t * Shift vlan_tci from completion metadata field left six\n-\t\t * bytes and blend into mbuf->rx_descriptor_fields1 to set\n-\t\t * mbuf->vlan_tci.\n+\t\t * Pack the 128-bit array of valid descriptor flags into 64\n+\t\t * bits and count the number of set bits in order to determine\n+\t\t * the number of valid descriptors.\n \t\t */\n-\t\tpkt_mb = _mm_blend_epi16(pkt_mb,\n-\t\t\t\t\t _mm_slli_si128(mm_rxcmp1, 6), 0x20);\n+\t\tvalid = _mm_cvtsi128_si64(_mm_packs_epi32(info3_v, info3_v));\n+\t\tnum_valid = __builtin_popcountll(valid & desc_valid_mask);\n+\n+\t\tswitch (num_valid) {\n+\t\tcase 4:\n+\t\t\trxr->rx_buf_ring[mbcons + 3] = NULL;\n+\t\t\t/* FALLTHROUGH */\n+\t\tcase 3:\n+\t\t\trxr->rx_buf_ring[mbcons + 2] = NULL;\n+\t\t\t/* FALLTHROUGH */\n+\t\tcase 2:\n+\t\t\trxr->rx_buf_ring[mbcons + 1] = NULL;\n+\t\t\t/* FALLTHROUGH */\n+\t\tcase 1:\n+\t\t\trxr->rx_buf_ring[mbcons + 0] = NULL;\n+\t\t\tbreak;\n+\t\tcase 0:\n+\t\t\tgoto out;\n+\t\t}\n \n-\t\t/* Store descriptor fields. 
*/\n-\t\t_mm_storeu_si128((void *)&mbuf->rx_descriptor_fields1, pkt_mb);\n+\t\tdescs_to_mbufs(rxcmp, rxcmp1, mbuf_init, &rx_pkts[nb_rx_pkts]);\n+\t\tnb_rx_pkts += num_valid;\n \n-\t\trx_pkts[nb_rx_pkts++] = mbuf;\n+\t\tif (num_valid < RTE_BNXT_DESCS_PER_LOOP)\n+\t\t\tbreak;\n \t}\n \n+out:\n \tif (nb_rx_pkts) {\n \t\trxr->rx_prod =\n \t\t\tRING_ADV(rxr->rx_ring_struct, rxr->rx_prod, nb_rx_pkts);\n \n \t\trxq->rxrearm_nb += nb_rx_pkts;\n-\t\tcpr->cp_raw_cons = raw_cons;\n+\t\tcpr->cp_raw_cons += 2 * nb_rx_pkts;\n \t\tcpr->valid =\n \t\t\t!!(cpr->cp_raw_cons & cpr->cp_ring_struct->ring_size);\n \t\tbnxt_db_cq(cpr);\n",
    "prefixes": [
        "11/12"
    ]
}