get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.
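
A minimal sketch of driving this endpoint from a client, assuming Python with the `requests` package; the base URL, patch ID, and field names are taken from the response shown below, and a JSON Accept header is used instead of the browsable `?format=api` view.

import requests

BASE = "https://patches.dpdk.org/api"

# Fetch the patch shown below; read access needs no authentication.
resp = requests.get(f"{BASE}/patches/81711/",
                    headers={"Accept": "application/json"})
resp.raise_for_status()
patch = resp.json()
print(patch["name"])     # "[v2,2/2] net/mlx5: implement vectorized MPRQ burst"
print(patch["state"])    # "accepted"
print(patch["submitter"]["name"])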

GET /api/patches/81711/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 81711,
    "url": "https://patches.dpdk.org/api/patches/81711/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/20201021203030.19042-3-akozyrev@nvidia.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20201021203030.19042-3-akozyrev@nvidia.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20201021203030.19042-3-akozyrev@nvidia.com",
    "date": "2020-10-21T20:30:30",
    "name": "[v2,2/2] net/mlx5: implement vectorized MPRQ burst",
    "commit_ref": null,
    "pull_url": null,
    "state": "accepted",
    "archived": true,
    "hash": "a5f5b7a9033b8666606261b1b3749a77fd25a012",
    "submitter": {
        "id": 1873,
        "url": "https://patches.dpdk.org/api/people/1873/?format=api",
        "name": "Alexander Kozyrev",
        "email": "akozyrev@nvidia.com"
    },
    "delegate": {
        "id": 3268,
        "url": "https://patches.dpdk.org/api/users/3268/?format=api",
        "username": "rasland",
        "first_name": "Raslan",
        "last_name": "Darawsheh",
        "email": "rasland@nvidia.com"
    },
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/20201021203030.19042-3-akozyrev@nvidia.com/mbox/",
    "series": [
        {
            "id": 13181,
            "url": "https://patches.dpdk.org/api/series/13181/?format=api",
            "web_url": "https://patches.dpdk.org/project/dpdk/list/?series=13181",
            "date": "2020-10-21T20:30:28",
            "name": "net/mlx5: add vectorized mprq",
            "version": 2,
            "mbox": "https://patches.dpdk.org/series/13181/mbox/"
        }
    ],
    "comments": "https://patches.dpdk.org/api/patches/81711/comments/",
    "check": "success",
    "checks": "https://patches.dpdk.org/api/patches/81711/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 13335A04DD;\n\tWed, 21 Oct 2020 22:31:33 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 63F05A8EC;\n\tWed, 21 Oct 2020 22:30:57 +0200 (CEST)",
            "from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129])\n by dpdk.org (Postfix) with ESMTP id F1E14A575\n for <dev@dpdk.org>; Wed, 21 Oct 2020 22:30:53 +0200 (CEST)",
            "from Internal Mail-Server by MTLPINE1 (envelope-from\n akozyrev@nvidia.com) with SMTP; 21 Oct 2020 23:30:47 +0300",
            "from nvidia.com (pegasus02.mtr.labs.mlnx [10.210.16.122])\n by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id 09LKUl32019906;\n Wed, 21 Oct 2020 23:30:47 +0300"
        ],
        "From": "Alexander Kozyrev <akozyrev@nvidia.com>",
        "To": "dev@dpdk.org",
        "Cc": "rasland@nvidia.com, matan@nvidia.com, viacheslavo@nvidia.com",
        "Date": "Wed, 21 Oct 2020 20:30:30 +0000",
        "Message-Id": "<20201021203030.19042-3-akozyrev@nvidia.com>",
        "X-Mailer": "git-send-email 2.24.1",
        "In-Reply-To": "<20201021203030.19042-1-akozyrev@nvidia.com>",
        "References": "<20200719041142.14485-1-akozyrev@mellanox.com>\n <20201021203030.19042-1-akozyrev@nvidia.com>",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "Subject": "[dpdk-dev] [PATCH v2 2/2] net/mlx5: implement vectorized MPRQ burst",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "MPRQ (Multi-Packet Rx Queue) processes one packet at a time using\nsimple scalar instructions. MPRQ works by posting a single large buffer\n(consisted of multiple fixed-size strides) in order to receive multiple\npackets at once on this buffer. A Rx packet is then copied to a\nuser-provided mbuf or PMD attaches the Rx packet to the mbuf by the\npointer to an external buffer.\n\nThere is an opportunity to speed up the packet receiving by processing\n4 packets simultaneously using SIMD (single instruction, multiple data)\nextensions. Allocate mbufs in batches for every MPRQ buffer and process\nthe packets in groups of 4 until all the strides are exhausted. Then\nswitch to another MPRQ buffer and repeat the process over again.\n\nThe vectorized MPRQ burst routine is engaged automatically in case\nthe mprq_en=1 devarg is specified and the vectorization is not disabled\nexplicitly by providing rx_vec_en=0 devarg. There is a limitation:\nLRO is not supported and scalar MPRQ is selected if it is on.\n\nSigned-off-by: Alexander Kozyrev <akozyrev@nvidia.com>\nAcked-by: Slava Ovsiienko <viacheslavo@nvidia.com>\n---\n drivers/net/mlx5/mlx5_devx.c     |  15 +-\n drivers/net/mlx5/mlx5_ethdev.c   |  20 +-\n drivers/net/mlx5/mlx5_rxq.c      |  96 ++++++----\n drivers/net/mlx5/mlx5_rxtx.c     | 237 ++++++-----------------\n drivers/net/mlx5/mlx5_rxtx.h     | 200 +++++++++++++++++++-\n drivers/net/mlx5/mlx5_rxtx_vec.c | 312 ++++++++++++++++++++++++++++++-\n drivers/net/mlx5/mlx5_rxtx_vec.h |  56 ------\n 7 files changed, 644 insertions(+), 292 deletions(-)",
    "diff": "diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c\nindex 11bda32557..0c99fe7519 100644\n--- a/drivers/net/mlx5/mlx5_devx.c\n+++ b/drivers/net/mlx5/mlx5_devx.c\n@@ -437,10 +437,17 @@ mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)\n \tif (priv->config.cqe_comp && !rxq_data->hw_timestamp &&\n \t    !rxq_data->lro) {\n \t\tcq_attr.cqe_comp_en = 1u;\n-\t\tcq_attr.mini_cqe_res_format =\n-\t\t\t\tmlx5_rxq_mprq_enabled(rxq_data) ?\n-\t\t\t\t\tMLX5_CQE_RESP_FORMAT_CSUM_STRIDX :\n-\t\t\t\t\tMLX5_CQE_RESP_FORMAT_HASH;\n+\t\t/*\n+\t\t * Select CSUM miniCQE format only for non-vectorized MPRQ\n+\t\t * Rx burst, use HASH miniCQE format for everything else.\n+\t\t */\n+\t\tif (mlx5_rxq_check_vec_support(rxq_data) < 0 &&\n+\t\t\tmlx5_rxq_mprq_enabled(rxq_data))\n+\t\t\tcq_attr.mini_cqe_res_format =\n+\t\t\t\tMLX5_CQE_RESP_FORMAT_CSUM_STRIDX;\n+\t\telse\n+\t\t\tcq_attr.mini_cqe_res_format =\n+\t\t\t\tMLX5_CQE_RESP_FORMAT_HASH;\n \t\t/*\n \t\t * For vectorized Rx, it must not be doubled in order to\n \t\t * make cq_ci and rq_ci aligned.\ndiff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c\nindex 7631f644b2..c70cd301b5 100644\n--- a/drivers/net/mlx5/mlx5_ethdev.c\n+++ b/drivers/net/mlx5/mlx5_ethdev.c\n@@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)\n \n \tif (dev->rx_pkt_burst == mlx5_rx_burst ||\n \t    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||\n-\t    dev->rx_pkt_burst == mlx5_rx_burst_vec)\n+\t    dev->rx_pkt_burst == mlx5_rx_burst_vec ||\n+\t    dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)\n \t\treturn ptypes;\n \treturn NULL;\n }\n@@ -480,11 +481,22 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)\n \n \tMLX5_ASSERT(dev != NULL);\n \tif (mlx5_check_vec_rx_support(dev) > 0) {\n-\t\trx_pkt_burst = mlx5_rx_burst_vec;\n-\t\tDRV_LOG(DEBUG, \"port %u selected Rx vectorized function\",\n-\t\t\tdev->data->port_id);\n+\t\tif (mlx5_mprq_enabled(dev)) {\n+\t\t\trx_pkt_burst = mlx5_rx_burst_mprq_vec;\n+\t\t\tDRV_LOG(DEBUG, \"port %u selected vectorized\"\n+\t\t\t\t\" MPRQ Rx function\", dev->data->port_id);\n+\t\t} else {\n+\t\t\trx_pkt_burst = mlx5_rx_burst_vec;\n+\t\t\tDRV_LOG(DEBUG, \"port %u selected vectorized\"\n+\t\t\t\t\" SPRQ Rx function\", dev->data->port_id);\n+\t\t}\n \t} else if (mlx5_mprq_enabled(dev)) {\n \t\trx_pkt_burst = mlx5_rx_burst_mprq;\n+\t\tDRV_LOG(DEBUG, \"port %u selected MPRQ Rx function\",\n+\t\t\tdev->data->port_id);\n+\t} else {\n+\t\tDRV_LOG(DEBUG, \"port %u selected SPRQ Rx function\",\n+\t\t\tdev->data->port_id);\n \t}\n \treturn rx_pkt_burst;\n }\ndiff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c\nindex e1783ba397..ca1625eac6 100644\n--- a/drivers/net/mlx5/mlx5_rxq.c\n+++ b/drivers/net/mlx5/mlx5_rxq.c\n@@ -173,7 +173,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\trxq->mprq_repl = buf;\n \t}\n \tDRV_LOG(DEBUG,\n-\t\t\"port %u Rx queue %u allocated and configured %u segments\",\n+\t\t\"port %u MPRQ queue %u allocated and configured %u segments\",\n \t\trxq->port_id, rxq->idx, wqe_n);\n \treturn 0;\n error:\n@@ -185,7 +185,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\t\t\t(*rxq->mprq_bufs)[i]);\n \t\t(*rxq->mprq_bufs)[i] = NULL;\n \t}\n-\tDRV_LOG(DEBUG, \"port %u Rx queue %u failed, freed everything\",\n+\tDRV_LOG(DEBUG, \"port %u MPRQ queue %u failed, freed everything\",\n \t\trxq->port_id, rxq->idx);\n \trte_errno = err; /* Restore rte_errno. 
*/\n \treturn -rte_errno;\n@@ -204,7 +204,9 @@ static int\n rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n \tconst unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;\n-\tunsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;\n+\tunsigned int elts_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?\n+\t\t(1 << rxq_ctrl->rxq.elts_n) * (1 << rxq_ctrl->rxq.strd_num_n) :\n+\t\t(1 << rxq_ctrl->rxq.elts_n);\n \tunsigned int i;\n \tint err;\n \n@@ -262,7 +264,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\t(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;\n \t}\n \tDRV_LOG(DEBUG,\n-\t\t\"port %u Rx queue %u allocated and configured %u segments\"\n+\t\t\"port %u SPRQ queue %u allocated and configured %u segments\"\n \t\t\" (max %u packets)\",\n \t\tPORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx, elts_n,\n \t\telts_n / (1 << rxq_ctrl->rxq.sges_n));\n@@ -275,7 +277,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\trte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);\n \t\t(*rxq_ctrl->rxq.elts)[i] = NULL;\n \t}\n-\tDRV_LOG(DEBUG, \"port %u Rx queue %u failed, freed everything\",\n+\tDRV_LOG(DEBUG, \"port %u SPRQ queue %u failed, freed everything\",\n \t\tPORT_ID(rxq_ctrl->priv), rxq_ctrl->rxq.idx);\n \trte_errno = err; /* Restore rte_errno. */\n \treturn -rte_errno;\n@@ -293,8 +295,15 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n int\n rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n-\treturn mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?\n-\t       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);\n+\tint ret = 0;\n+\n+\t/**\n+\t * For MPRQ we need to allocate both MPRQ buffers\n+\t * for WQEs and simple mbufs for vector processing.\n+\t */\n+\tif (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))\n+\t\tret = rxq_alloc_elts_mprq(rxq_ctrl);\n+\treturn (ret || rxq_alloc_elts_sprq(rxq_ctrl));\n }\n \n /**\n@@ -309,11 +318,10 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \tstruct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;\n \tuint16_t i;\n \n-\tDRV_LOG(DEBUG, \"port %u Multi-Packet Rx queue %u freeing WRs\",\n-\t\trxq->port_id, rxq->idx);\n+\tDRV_LOG(DEBUG, \"port %u Multi-Packet Rx queue %u freeing %d WRs\",\n+\t\trxq->port_id, rxq->idx, (1u << rxq->elts_n));\n \tif (rxq->mprq_bufs == NULL)\n \t\treturn;\n-\tMLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);\n \tfor (i = 0; (i != (1u << rxq->elts_n)); ++i) {\n \t\tif ((*rxq->mprq_bufs)[i] != NULL)\n \t\t\tmlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);\n@@ -335,25 +343,27 @@ static void\n rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n \tstruct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;\n-\tconst uint16_t q_n = (1 << rxq->elts_n);\n+\tconst uint16_t q_n = mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?\n+\t\t(1 << rxq->elts_n) * (1 << rxq->strd_num_n) :\n+\t\t(1 << rxq->elts_n);\n \tconst uint16_t q_mask = q_n - 1;\n \tuint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);\n \tuint16_t i;\n \n-\tDRV_LOG(DEBUG, \"port %u Rx queue %u freeing WRs\",\n-\t\tPORT_ID(rxq_ctrl->priv), rxq->idx);\n+\tDRV_LOG(DEBUG, \"port %u Rx queue %u freeing %d WRs\",\n+\t\tPORT_ID(rxq_ctrl->priv), rxq->idx, q_n);\n \tif (rxq->elts == NULL)\n \t\treturn;\n \t/**\n-\t * Some mbuf in the Ring belongs to the application.  
They cannot be\n-\t * freed.\n+\t * Some mbuf in the Ring belongs to the application.\n+\t * They cannot be freed.\n \t */\n \tif (mlx5_rxq_check_vec_support(rxq) > 0) {\n \t\tfor (i = 0; i < used; ++i)\n \t\t\t(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;\n \t\trxq->rq_pi = rxq->rq_ci;\n \t}\n-\tfor (i = 0; (i != (1u << rxq->elts_n)); ++i) {\n+\tfor (i = 0; i != q_n; ++i) {\n \t\tif ((*rxq->elts)[i] != NULL)\n \t\t\trte_pktmbuf_free_seg((*rxq->elts)[i]);\n \t\t(*rxq->elts)[i] = NULL;\n@@ -369,10 +379,13 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n static void\n rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n+\t/*\n+\t * For MPRQ we need to allocate both MPRQ buffers\n+\t * for WQEs and simple mbufs for vector processing.\n+\t */\n \tif (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))\n \t\trxq_free_elts_mprq(rxq_ctrl);\n-\telse\n-\t\trxq_free_elts_sprq(rxq_ctrl);\n+\trxq_free_elts_sprq(rxq_ctrl);\n }\n \n /**\n@@ -1334,20 +1347,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \tstruct mlx5_priv *priv = dev->data->dev_private;\n \tstruct mlx5_rxq_ctrl *tmpl;\n \tunsigned int mb_len = rte_pktmbuf_data_room_size(mp);\n-\tunsigned int mprq_stride_nums;\n-\tunsigned int mprq_stride_size;\n-\tunsigned int mprq_stride_cap;\n \tstruct mlx5_dev_config *config = &priv->config;\n-\t/*\n-\t * Always allocate extra slots, even if eventually\n-\t * the vector Rx will not be used.\n-\t */\n-\tuint16_t desc_n =\n-\t\tdesc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;\n \tuint64_t offloads = conf->offloads |\n \t\t\t   dev->data->dev_conf.rxmode.offloads;\n \tunsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);\n-\tconst int mprq_en = mlx5_check_mprq_support(dev) > 0;\n \tunsigned int max_rx_pkt_len = lro_on_queue ?\n \t\t\tdev->data->dev_conf.rxmode.max_lro_pkt_size :\n \t\t\tdev->data->dev_conf.rxmode.max_rx_pkt_len;\n@@ -1355,6 +1358,21 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \t\t\t\t\t\t\tRTE_PKTMBUF_HEADROOM;\n \tunsigned int max_lro_size = 0;\n \tunsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;\n+\tconst int mprq_en = mlx5_check_mprq_support(dev) > 0;\n+\tunsigned int mprq_stride_nums = config->mprq.stride_num_n ?\n+\t\tconfig->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;\n+\tunsigned int mprq_stride_size = non_scatter_min_mbuf_size <=\n+\t\t(1U << config->mprq.max_stride_size_n) ?\n+\t\tlog2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;\n+\tunsigned int mprq_stride_cap = (config->mprq.stride_num_n ?\n+\t\t(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *\n+\t\t(config->mprq.stride_size_n ?\n+\t\t(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));\n+\t/*\n+\t * Always allocate extra slots, even if eventually\n+\t * the vector Rx will not be used.\n+\t */\n+\tuint16_t desc_n = desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;\n \n \tif (non_scatter_min_mbuf_size > mb_len && !(offloads &\n \t\t\t\t\t\t    DEV_RX_OFFLOAD_SCATTER)) {\n@@ -1366,8 +1384,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \t\trte_errno = ENOSPC;\n \t\treturn NULL;\n \t}\n-\ttmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +\n-\t\t\t   desc_n * sizeof(struct rte_mbuf *), 0, socket);\n+\ttmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,\n+\t\tsizeof(*tmpl) + desc_n * sizeof(struct rte_mbuf *) +\n+\t\t(desc >> mprq_stride_nums) * sizeof(struct mlx5_mprq_buf *),\n+\t\t0, socket);\n+\n \tif (!tmpl) {\n \t\trte_errno = ENOMEM;\n \t\treturn NULL;\n@@ 
-1381,15 +1402,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \ttmpl->socket = socket;\n \tif (dev->data->dev_conf.intr_conf.rxq)\n \t\ttmpl->irq = 1;\n-\tmprq_stride_nums = config->mprq.stride_num_n ?\n-\t\tconfig->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;\n-\tmprq_stride_size = non_scatter_min_mbuf_size <=\n-\t\t(1U << config->mprq.max_stride_size_n) ?\n-\t\tlog2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;\n-\tmprq_stride_cap = (config->mprq.stride_num_n ?\n-\t\t(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *\n-\t\t\t(config->mprq.stride_size_n ?\n-\t\t(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));\n \t/*\n \t * This Rx queue can be configured as a Multi-Packet RQ if all of the\n \t * following conditions are met:\n@@ -1535,9 +1547,11 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \ttmpl->rxq.mp = mp;\n \ttmpl->rxq.elts_n = log2above(desc);\n \ttmpl->rxq.rq_repl_thresh =\n-\t\tMLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);\n+\t\tMLX5_VPMD_RXQ_RPLNSH_THRESH(desc_n);\n \ttmpl->rxq.elts =\n-\t\t(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);\n+\t\t(struct rte_mbuf *(*)[desc_n])(tmpl + 1);\n+\ttmpl->rxq.mprq_bufs =\n+\t\t(struct mlx5_mprq_buf *(*)[desc])(*tmpl->rxq.elts + desc_n);\n #ifndef RTE_ARCH_64\n \ttmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;\n #endif\ndiff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c\nindex b530ff421f..dbb427b5a8 100644\n--- a/drivers/net/mlx5/mlx5_rxtx.c\n+++ b/drivers/net/mlx5/mlx5_rxtx.c\n@@ -19,12 +19,12 @@\n #include <mlx5_prm.h>\n #include <mlx5_common.h>\n \n+#include \"mlx5_autoconf.h\"\n #include \"mlx5_defs.h\"\n #include \"mlx5.h\"\n #include \"mlx5_mr.h\"\n #include \"mlx5_utils.h\"\n #include \"mlx5_rxtx.h\"\n-#include \"mlx5_autoconf.h\"\n \n /* TX burst subroutines return codes. 
*/\n enum mlx5_txcmp_code {\n@@ -93,10 +93,6 @@ static __rte_always_inline void\n rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,\n \t       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);\n \n-static __rte_always_inline void\n-mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,\n-\t\t const unsigned int strd_n);\n-\n static int\n mlx5_queue_state_modify(struct rte_eth_dev *dev,\n \t\t\tstruct mlx5_mp_arg_queue_state_modify *sm);\n@@ -584,7 +580,14 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,\n \t\t       struct rte_eth_burst_mode *mode)\n {\n \teth_rx_burst_t pkt_burst = dev->rx_pkt_burst;\n+\tstruct mlx5_priv *priv = dev->data->dev_private;\n+\tstruct mlx5_rxq_data *rxq;\n \n+\trxq = (*priv->rxqs)[rx_queue_id];\n+\tif (!rxq) {\n+\t\trte_errno = EINVAL;\n+\t\treturn -rte_errno;\n+\t}\n \tif (pkt_burst == mlx5_rx_burst) {\n \t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"Scalar\");\n \t} else if (pkt_burst == mlx5_rx_burst_mprq) {\n@@ -598,6 +601,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,\n \t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"Vector AltiVec\");\n #else\n \t\treturn -EINVAL;\n+#endif\n+\t} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {\n+#if defined RTE_ARCH_X86_64\n+\t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"MPRQ Vector SSE\");\n+#elif defined RTE_ARCH_ARM64\n+\t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"MPRQ Vector Neon\");\n+#elif defined RTE_ARCH_PPC_64\n+\t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"MPRQ Vector AltiVec\");\n+#else\n+\t\treturn -EINVAL;\n #endif\n \t} else {\n \t\treturn -EINVAL;\n@@ -866,6 +879,8 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)\n \trxq->zip = (struct rxq_zip){\n \t\t.ai = 0,\n \t};\n+\trxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?\n+\t\t(wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0;\n \t/* Update doorbell counter. 
*/\n \trxq->rq_ci = wqe_n >> rxq->sges_n;\n \trte_io_wmb();\n@@ -969,7 +984,8 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)\n {\n \tconst uint16_t cqe_n = 1 << rxq->cqe_n;\n \tconst uint16_t cqe_mask = cqe_n - 1;\n-\tconst unsigned int wqe_n = 1 << rxq->elts_n;\n+\tconst uint16_t wqe_n = 1 << rxq->elts_n;\n+\tconst uint16_t strd_n = 1 << rxq->strd_num_n;\n \tstruct mlx5_rxq_ctrl *rxq_ctrl =\n \t\t\tcontainer_of(rxq, struct mlx5_rxq_ctrl, rxq);\n \tunion {\n@@ -1033,21 +1049,27 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)\n \t\t\t\t\t\t    &sm))\n \t\t\t\treturn -1;\n \t\t\tif (vec) {\n-\t\t\t\tconst uint16_t q_mask = wqe_n - 1;\n-\t\t\t\tuint16_t elt_idx;\n+\t\t\t\tconst uint32_t elts_n =\n+\t\t\t\t\tmlx5_rxq_mprq_enabled(rxq) ?\n+\t\t\t\t\twqe_n * strd_n : wqe_n;\n+\t\t\t\tconst uint32_t e_mask = elts_n - 1;\n+\t\t\t\tuint32_t elts_ci =\n+\t\t\t\t\tmlx5_rxq_mprq_enabled(rxq) ?\n+\t\t\t\t\trxq->elts_ci : rxq->rq_ci;\n+\t\t\t\tuint32_t elt_idx;\n \t\t\t\tstruct rte_mbuf **elt;\n \t\t\t\tint i;\n-\t\t\t\tunsigned int n = wqe_n - (rxq->rq_ci -\n+\t\t\t\tunsigned int n = elts_n - (elts_ci -\n \t\t\t\t\t\t\t  rxq->rq_pi);\n \n \t\t\t\tfor (i = 0; i < (int)n; ++i) {\n-\t\t\t\t\telt_idx = (rxq->rq_ci + i) & q_mask;\n+\t\t\t\t\telt_idx = (elts_ci + i) & e_mask;\n \t\t\t\t\telt = &(*rxq->elts)[elt_idx];\n \t\t\t\t\t*elt = rte_mbuf_raw_alloc(rxq->mp);\n \t\t\t\t\tif (!*elt) {\n \t\t\t\t\t\tfor (i--; i >= 0; --i) {\n-\t\t\t\t\t\t\telt_idx = (rxq->rq_ci +\n-\t\t\t\t\t\t\t\t   i) & q_mask;\n+\t\t\t\t\t\t\telt_idx = (elts_ci +\n+\t\t\t\t\t\t\t\t   i) & elts_n;\n \t\t\t\t\t\t\telt = &(*rxq->elts)\n \t\t\t\t\t\t\t\t[elt_idx];\n \t\t\t\t\t\t\trte_pktmbuf_free_seg\n@@ -1056,7 +1078,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)\n \t\t\t\t\t\treturn -1;\n \t\t\t\t\t}\n \t\t\t\t}\n-\t\t\t\tfor (i = 0; i < (int)wqe_n; ++i) {\n+\t\t\t\tfor (i = 0; i < (int)elts_n; ++i) {\n \t\t\t\t\telt = &(*rxq->elts)[i];\n \t\t\t\t\tDATA_LEN(*elt) =\n \t\t\t\t\t\t(uint16_t)((*elt)->buf_len -\n@@ -1064,7 +1086,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)\n \t\t\t\t}\n \t\t\t\t/* Padding with a fake mbuf for vec Rx. */\n \t\t\t\tfor (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)\n-\t\t\t\t\t(*rxq->elts)[wqe_n + i] =\n+\t\t\t\t\t(*rxq->elts)[elts_n + i] =\n \t\t\t\t\t\t\t\t&rxq->fake_mbuf;\n \t\t\t}\n \t\t\tmlx5_rxq_initialize(rxq);\n@@ -1545,31 +1567,6 @@ mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)\n \tmlx5_mprq_buf_free_cb(NULL, buf);\n }\n \n-static inline void\n-mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,\n-\t\t const unsigned int strd_n)\n-{\n-\tstruct mlx5_mprq_buf *rep = rxq->mprq_repl;\n-\tvolatile struct mlx5_wqe_data_seg *wqe =\n-\t\t&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;\n-\tvoid *addr;\n-\n-\tMLX5_ASSERT(rep != NULL);\n-\t/* Replace MPRQ buf. */\n-\t(*rxq->mprq_bufs)[rq_idx] = rep;\n-\t/* Replace WQE. */\n-\taddr = mlx5_mprq_buf_addr(rep, strd_n);\n-\twqe->addr = rte_cpu_to_be_64((uintptr_t)addr);\n-\t/* If there's only one MR, no need to replace LKey in WQE. */\n-\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))\n-\t\twqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);\n-\t/* Stash a mbuf for next replacement. 
*/\n-\tif (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))\n-\t\trxq->mprq_repl = rep;\n-\telse\n-\t\trxq->mprq_repl = NULL;\n-}\n-\n /**\n  * DPDK callback for RX with Multi-Packet RQ support.\n  *\n@@ -1587,12 +1584,9 @@ uint16_t\n mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n {\n \tstruct mlx5_rxq_data *rxq = dpdk_rxq;\n-\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n-\tconst unsigned int strd_sz = 1 << rxq->strd_sz_n;\n-\tconst unsigned int strd_shift =\n-\t\tMLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;\n-\tconst unsigned int cq_mask = (1 << rxq->cqe_n) - 1;\n-\tconst unsigned int wq_mask = (1 << rxq->elts_n) - 1;\n+\tconst uint32_t strd_n = 1 << rxq->strd_num_n;\n+\tconst uint32_t cq_mask = (1 << rxq->cqe_n) - 1;\n+\tconst uint32_t wq_mask = (1 << rxq->elts_n) - 1;\n \tvolatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];\n \tunsigned int i = 0;\n \tuint32_t rq_ci = rxq->rq_ci;\n@@ -1601,37 +1595,18 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n \n \twhile (i < pkts_n) {\n \t\tstruct rte_mbuf *pkt;\n-\t\tvoid *addr;\n \t\tint ret;\n \t\tuint32_t len;\n \t\tuint16_t strd_cnt;\n \t\tuint16_t strd_idx;\n-\t\tuint32_t offset;\n \t\tuint32_t byte_cnt;\n-\t\tint32_t hdrm_overlap;\n \t\tvolatile struct mlx5_mini_cqe8 *mcqe = NULL;\n \t\tuint32_t rss_hash_res = 0;\n+\t\tenum mlx5_rqx_code rxq_code;\n \n \t\tif (consumed_strd == strd_n) {\n-\t\t\t/* Replace WQE only if the buffer is still in use. */\n-\t\t\tif (__atomic_load_n(&buf->refcnt,\n-\t\t\t\t\t    __ATOMIC_RELAXED) > 1) {\n-\t\t\t\tmprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);\n-\t\t\t\t/* Release the old buffer. */\n-\t\t\t\tmlx5_mprq_buf_free(buf);\n-\t\t\t} else if (unlikely(rxq->mprq_repl == NULL)) {\n-\t\t\t\tstruct mlx5_mprq_buf *rep;\n-\n-\t\t\t\t/*\n-\t\t\t\t * Currently, the MPRQ mempool is out of buffer\n-\t\t\t\t * and doing memcpy regardless of the size of Rx\n-\t\t\t\t * packet. Retry allocation to get back to\n-\t\t\t\t * normal.\n-\t\t\t\t */\n-\t\t\t\tif (!rte_mempool_get(rxq->mprq_mp,\n-\t\t\t\t\t\t     (void **)&rep))\n-\t\t\t\t\trxq->mprq_repl = rep;\n-\t\t\t}\n+\t\t\t/* Replace WQE if the buffer is still in use. */\n+\t\t\tmprq_buf_replace(rxq, rq_ci & wq_mask);\n \t\t\t/* Advance to the next WQE. 
*/\n \t\t\tconsumed_strd = 0;\n \t\t\t++rq_ci;\n@@ -1667,122 +1642,23 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n \t\tMLX5_ASSERT((int)len >= (rxq->crc_present << 2));\n \t\tif (rxq->crc_present)\n \t\t\tlen -= RTE_ETHER_CRC_LEN;\n-\t\toffset = strd_idx * strd_sz + strd_shift;\n-\t\taddr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);\n-\t\thdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;\n-\t\t/*\n-\t\t * Memcpy packets to the target mbuf if:\n-\t\t * - The size of packet is smaller than mprq_max_memcpy_len.\n-\t\t * - Out of buffer in the Mempool for Multi-Packet RQ.\n-\t\t * - The packet's stride overlaps a headroom and scatter is off.\n-\t\t */\n-\t\tif (len <= rxq->mprq_max_memcpy_len ||\n-\t\t    rxq->mprq_repl == NULL ||\n-\t\t    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {\n-\t\t\tif (likely(rte_pktmbuf_tailroom(pkt) >= len)) {\n-\t\t\t\trte_memcpy(rte_pktmbuf_mtod(pkt, void *),\n-\t\t\t\t\t   addr, len);\n-\t\t\t\tDATA_LEN(pkt) = len;\n-\t\t\t} else if (rxq->strd_scatter_en) {\n-\t\t\t\tstruct rte_mbuf *prev = pkt;\n-\t\t\t\tuint32_t seg_len =\n-\t\t\t\t\tRTE_MIN(rte_pktmbuf_tailroom(pkt), len);\n-\t\t\t\tuint32_t rem_len = len - seg_len;\n-\n-\t\t\t\trte_memcpy(rte_pktmbuf_mtod(pkt, void *),\n-\t\t\t\t\t   addr, seg_len);\n-\t\t\t\tDATA_LEN(pkt) = seg_len;\n-\t\t\t\twhile (rem_len) {\n-\t\t\t\t\tstruct rte_mbuf *next =\n-\t\t\t\t\t\trte_pktmbuf_alloc(rxq->mp);\n-\n-\t\t\t\t\tif (unlikely(next == NULL)) {\n-\t\t\t\t\t\trte_pktmbuf_free(pkt);\n-\t\t\t\t\t\t++rxq->stats.rx_nombuf;\n-\t\t\t\t\t\tgoto out;\n-\t\t\t\t\t}\n-\t\t\t\t\tNEXT(prev) = next;\n-\t\t\t\t\tSET_DATA_OFF(next, 0);\n-\t\t\t\t\taddr = RTE_PTR_ADD(addr, seg_len);\n-\t\t\t\t\tseg_len = RTE_MIN\n-\t\t\t\t\t\t(rte_pktmbuf_tailroom(next),\n-\t\t\t\t\t\t rem_len);\n-\t\t\t\t\trte_memcpy\n-\t\t\t\t\t\t(rte_pktmbuf_mtod(next, void *),\n-\t\t\t\t\t\t addr, seg_len);\n-\t\t\t\t\tDATA_LEN(next) = seg_len;\n-\t\t\t\t\trem_len -= seg_len;\n-\t\t\t\t\tprev = next;\n-\t\t\t\t\t++NB_SEGS(pkt);\n-\t\t\t\t}\n-\t\t\t} else {\n-\t\t\t\trte_pktmbuf_free_seg(pkt);\n+\t\trxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,\n+\t\t\t\t\t   strd_idx, strd_cnt);\n+\t\tif (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {\n+\t\t\trte_pktmbuf_free_seg(pkt);\n+\t\t\tif (rxq_code == MLX5_RXQ_CODE_DROPPED) {\n \t\t\t\t++rxq->stats.idropped;\n \t\t\t\tcontinue;\n \t\t\t}\n-\t\t} else {\n-\t\t\trte_iova_t buf_iova;\n-\t\t\tstruct rte_mbuf_ext_shared_info *shinfo;\n-\t\t\tuint16_t buf_len = strd_cnt * strd_sz;\n-\t\t\tvoid *buf_addr;\n-\n-\t\t\t/* Increment the refcnt of the whole chunk. 
*/\n-\t\t\t__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);\n-\t\t\tMLX5_ASSERT(__atomic_load_n(&buf->refcnt,\n-\t\t\t\t    __ATOMIC_RELAXED) <= strd_n + 1);\n-\t\t\tbuf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);\n-\t\t\t/*\n-\t\t\t * MLX5 device doesn't use iova but it is necessary in a\n-\t\t\t * case where the Rx packet is transmitted via a\n-\t\t\t * different PMD.\n-\t\t\t */\n-\t\t\tbuf_iova = rte_mempool_virt2iova(buf) +\n-\t\t\t\t   RTE_PTR_DIFF(buf_addr, buf);\n-\t\t\tshinfo = &buf->shinfos[strd_idx];\n-\t\t\trte_mbuf_ext_refcnt_set(shinfo, 1);\n-\t\t\t/*\n-\t\t\t * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when\n-\t\t\t * attaching the stride to mbuf and more offload flags\n-\t\t\t * will be added below by calling rxq_cq_to_mbuf().\n-\t\t\t * Other fields will be overwritten.\n-\t\t\t */\n-\t\t\trte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,\n-\t\t\t\t\t\t  buf_len, shinfo);\n-\t\t\t/* Set mbuf head-room. */\n-\t\t\tSET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);\n-\t\t\tMLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);\n-\t\t\tMLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=\n-\t\t\t\tlen - (hdrm_overlap > 0 ? hdrm_overlap : 0));\n-\t\t\tDATA_LEN(pkt) = len;\n-\t\t\t/*\n-\t\t\t * Copy the last fragment of a packet (up to headroom\n-\t\t\t * size bytes) in case there is a stride overlap with\n-\t\t\t * a next packet's headroom. Allocate a separate mbuf\n-\t\t\t * to store this fragment and link it. Scatter is on.\n-\t\t\t */\n-\t\t\tif (hdrm_overlap > 0) {\n-\t\t\t\tMLX5_ASSERT(rxq->strd_scatter_en);\n-\t\t\t\tstruct rte_mbuf *seg =\n-\t\t\t\t\trte_pktmbuf_alloc(rxq->mp);\n-\n-\t\t\t\tif (unlikely(seg == NULL)) {\n-\t\t\t\t\trte_pktmbuf_free_seg(pkt);\n-\t\t\t\t\t++rxq->stats.rx_nombuf;\n-\t\t\t\t\tbreak;\n-\t\t\t\t}\n-\t\t\t\tSET_DATA_OFF(seg, 0);\n-\t\t\t\trte_memcpy(rte_pktmbuf_mtod(seg, void *),\n-\t\t\t\t\tRTE_PTR_ADD(addr, len - hdrm_overlap),\n-\t\t\t\t\thdrm_overlap);\n-\t\t\t\tDATA_LEN(seg) = hdrm_overlap;\n-\t\t\t\tDATA_LEN(pkt) = len - hdrm_overlap;\n-\t\t\t\tNEXT(pkt) = seg;\n-\t\t\t\tNB_SEGS(pkt) = 2;\n+\t\t\tif (rxq_code == MLX5_RXQ_CODE_NOMBUF) {\n+\t\t\t\t++rxq->stats.rx_nombuf;\n+\t\t\t\tbreak;\n \t\t\t}\n \t\t}\n \t\trxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);\n \t\tif (cqe->lro_num_seg > 1) {\n-\t\t\tmlx5_lro_update_hdr(addr, cqe, len);\n+\t\t\tmlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),\n+\t\t\t\t\t    cqe, len);\n \t\t\tpkt->ol_flags |= PKT_RX_LRO;\n \t\t\tpkt->tso_segsz = len / cqe->lro_num_seg;\n \t\t}\n@@ -1796,7 +1672,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n \t\t*(pkts++) = pkt;\n \t\t++i;\n \t}\n-out:\n \t/* Update the consumer indexes. */\n \trxq->consumed_strd = consumed_strd;\n \trte_io_wmb();\n@@ -1878,6 +1753,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,\n \treturn 0;\n }\n \n+__rte_weak uint16_t\n+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,\n+\t\t       struct rte_mbuf **pkts __rte_unused,\n+\t\t       uint16_t pkts_n __rte_unused)\n+{\n+\treturn 0;\n+}\n+\n __rte_weak int\n mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)\n {\ndiff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h\nindex b243b6f28c..0eafa22d63 100644\n--- a/drivers/net/mlx5/mlx5_rxtx.h\n+++ b/drivers/net/mlx5/mlx5_rxtx.h\n@@ -30,6 +30,7 @@\n #include \"mlx5_utils.h\"\n #include \"mlx5.h\"\n #include \"mlx5_autoconf.h\"\n+#include \"mlx5_mr.h\"\n \n /* Support tunnel matching. 
*/\n #define MLX5_FLOW_TUNNEL 10\n@@ -94,6 +95,12 @@ enum mlx5_rxq_err_state {\n \tMLX5_RXQ_ERR_STATE_NEED_READY,\n };\n \n+enum mlx5_rqx_code {\n+\tMLX5_RXQ_CODE_EXIT = 0,\n+\tMLX5_RXQ_CODE_NOMBUF,\n+\tMLX5_RXQ_CODE_DROPPED,\n+};\n+\n /* RX queue descriptor. */\n struct mlx5_rxq_data {\n \tunsigned int csum:1; /* Enable checksum offloading. */\n@@ -116,6 +123,7 @@ struct mlx5_rxq_data {\n \tvolatile uint32_t *rq_db;\n \tvolatile uint32_t *cq_db;\n \tuint16_t port_id;\n+\tuint32_t elts_ci;\n \tuint32_t rq_ci;\n \tuint16_t consumed_strd; /* Number of consumed strides in WQE. */\n \tuint32_t rq_pi;\n@@ -130,11 +138,8 @@ struct mlx5_rxq_data {\n \tuint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */\n \tvolatile void *wqes;\n \tvolatile struct mlx5_cqe(*cqes)[];\n-\tRTE_STD_C11\n-\tunion  {\n-\t\tstruct rte_mbuf *(*elts)[];\n-\t\tstruct mlx5_mprq_buf *(*mprq_bufs)[];\n-\t};\n+\tstruct rte_mbuf *(*elts)[];\n+\tstruct mlx5_mprq_buf *(*mprq_bufs)[];\n \tstruct rte_mempool *mp;\n \tstruct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */\n \tstruct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */\n@@ -421,6 +426,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);\n int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);\n uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,\n \t\t\t   uint16_t pkts_n);\n+uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,\n+\t\t\t\tuint16_t pkts_n);\n \n /* mlx5_mr.c */\n \n@@ -681,4 +688,187 @@ mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)\n \treturn ci;\n }\n \n+/**\n+ * Replace MPRQ buffer.\n+ *\n+ * @param rxq\n+ *   Pointer to Rx queue structure.\n+ * @param rq_idx\n+ *   RQ index to replace.\n+ */\n+static __rte_always_inline void\n+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx)\n+{\n+\tconst uint32_t strd_n = 1 << rxq->strd_num_n;\n+\tstruct mlx5_mprq_buf *rep = rxq->mprq_repl;\n+\tvolatile struct mlx5_wqe_data_seg *wqe =\n+\t\t&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;\n+\tstruct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];\n+\tvoid *addr;\n+\n+\tif (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) > 1) {\n+\t\tMLX5_ASSERT(rep != NULL);\n+\t\t/* Replace MPRQ buf. */\n+\t\t(*rxq->mprq_bufs)[rq_idx] = rep;\n+\t\t/* Replace WQE. */\n+\t\taddr = mlx5_mprq_buf_addr(rep, strd_n);\n+\t\twqe->addr = rte_cpu_to_be_64((uintptr_t)addr);\n+\t\t/* If there's only one MR, no need to replace LKey in WQE. */\n+\t\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))\n+\t\t\twqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);\n+\t\t/* Stash a mbuf for next replacement. */\n+\t\tif (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))\n+\t\t\trxq->mprq_repl = rep;\n+\t\telse\n+\t\t\trxq->mprq_repl = NULL;\n+\t\t/* Release the old buffer. */\n+\t\tmlx5_mprq_buf_free(buf);\n+\t} else if (unlikely(rxq->mprq_repl == NULL)) {\n+\t\tstruct mlx5_mprq_buf *rep;\n+\n+\t\t/*\n+\t\t * Currently, the MPRQ mempool is out of buffer\n+\t\t * and doing memcpy regardless of the size of Rx\n+\t\t * packet. 
Retry allocation to get back to\n+\t\t * normal.\n+\t\t */\n+\t\tif (!rte_mempool_get(rxq->mprq_mp, (void **)&rep))\n+\t\t\trxq->mprq_repl = rep;\n+\t}\n+}\n+\n+/**\n+ * Attach or copy MPRQ buffer content to a packet.\n+ *\n+ * @param rxq\n+ *   Pointer to Rx queue structure.\n+ * @param pkt\n+ *   Pointer to a packet to fill.\n+ * @param len\n+ *   Packet length.\n+ * @param buf\n+ *   Pointer to a MPRQ buffer to take the data from.\n+ * @param strd_idx\n+ *   Stride index to start from.\n+ * @param strd_cnt\n+ *   Number of strides to consume.\n+ */\n+static __rte_always_inline enum mlx5_rqx_code\n+mprq_buf_to_pkt(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, uint32_t len,\n+\t\tstruct mlx5_mprq_buf *buf, uint16_t strd_idx, uint16_t strd_cnt)\n+{\n+\tconst uint32_t strd_n = 1 << rxq->strd_num_n;\n+\tconst uint16_t strd_sz = 1 << rxq->strd_sz_n;\n+\tconst uint16_t strd_shift =\n+\t\tMLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;\n+\tconst int32_t hdrm_overlap =\n+\t\tlen + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;\n+\tconst uint32_t offset = strd_idx * strd_sz + strd_shift;\n+\tvoid *addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);\n+\n+\t/*\n+\t * Memcpy packets to the target mbuf if:\n+\t * - The size of packet is smaller than mprq_max_memcpy_len.\n+\t * - Out of buffer in the Mempool for Multi-Packet RQ.\n+\t * - The packet's stride overlaps a headroom and scatter is off.\n+\t */\n+\tif (len <= rxq->mprq_max_memcpy_len ||\n+\t    rxq->mprq_repl == NULL ||\n+\t    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {\n+\t\tif (likely(len <=\n+\t\t\t   (uint32_t)(pkt->buf_len - RTE_PKTMBUF_HEADROOM))) {\n+\t\t\trte_memcpy(rte_pktmbuf_mtod(pkt, void *),\n+\t\t\t\t   addr, len);\n+\t\t\tDATA_LEN(pkt) = len;\n+\t\t} else if (rxq->strd_scatter_en) {\n+\t\t\tstruct rte_mbuf *prev = pkt;\n+\t\t\tuint32_t seg_len = RTE_MIN(len, (uint32_t)\n+\t\t\t\t(pkt->buf_len - RTE_PKTMBUF_HEADROOM));\n+\t\t\tuint32_t rem_len = len - seg_len;\n+\n+\t\t\trte_memcpy(rte_pktmbuf_mtod(pkt, void *),\n+\t\t\t\t   addr, seg_len);\n+\t\t\tDATA_LEN(pkt) = seg_len;\n+\t\t\twhile (rem_len) {\n+\t\t\t\tstruct rte_mbuf *next =\n+\t\t\t\t\trte_pktmbuf_alloc(rxq->mp);\n+\n+\t\t\t\tif (unlikely(next == NULL))\n+\t\t\t\t\treturn MLX5_RXQ_CODE_NOMBUF;\n+\t\t\t\tNEXT(prev) = next;\n+\t\t\t\tSET_DATA_OFF(next, 0);\n+\t\t\t\taddr = RTE_PTR_ADD(addr, seg_len);\n+\t\t\t\tseg_len = RTE_MIN(rem_len, (uint32_t)\n+\t\t\t\t\t(next->buf_len - RTE_PKTMBUF_HEADROOM));\n+\t\t\t\trte_memcpy\n+\t\t\t\t\t(rte_pktmbuf_mtod(next, void *),\n+\t\t\t\t\t addr, seg_len);\n+\t\t\t\tDATA_LEN(next) = seg_len;\n+\t\t\t\trem_len -= seg_len;\n+\t\t\t\tprev = next;\n+\t\t\t\t++NB_SEGS(pkt);\n+\t\t\t}\n+\t\t} else {\n+\t\t\treturn MLX5_RXQ_CODE_DROPPED;\n+\t\t}\n+\t} else {\n+\t\trte_iova_t buf_iova;\n+\t\tstruct rte_mbuf_ext_shared_info *shinfo;\n+\t\tuint16_t buf_len = strd_cnt * strd_sz;\n+\t\tvoid *buf_addr;\n+\n+\t\t/* Increment the refcnt of the whole chunk. 
*/\n+\t\t__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);\n+\t\tMLX5_ASSERT(__atomic_load_n(&buf->refcnt,\n+\t\t\t    __ATOMIC_RELAXED) <= strd_n + 1);\n+\t\tbuf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);\n+\t\t/*\n+\t\t * MLX5 device doesn't use iova but it is necessary in a\n+\t\t * case where the Rx packet is transmitted via a\n+\t\t * different PMD.\n+\t\t */\n+\t\tbuf_iova = rte_mempool_virt2iova(buf) +\n+\t\t\t   RTE_PTR_DIFF(buf_addr, buf);\n+\t\tshinfo = &buf->shinfos[strd_idx];\n+\t\trte_mbuf_ext_refcnt_set(shinfo, 1);\n+\t\t/*\n+\t\t * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when\n+\t\t * attaching the stride to mbuf and more offload flags\n+\t\t * will be added below by calling rxq_cq_to_mbuf().\n+\t\t * Other fields will be overwritten.\n+\t\t */\n+\t\trte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,\n+\t\t\t\t\t  buf_len, shinfo);\n+\t\t/* Set mbuf head-room. */\n+\t\tSET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);\n+\t\tMLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);\n+\t\tMLX5_ASSERT(rte_pktmbuf_tailroom(pkt) >=\n+\t\t\tlen - (hdrm_overlap > 0 ? hdrm_overlap : 0));\n+\t\tDATA_LEN(pkt) = len;\n+\t\t/*\n+\t\t * Copy the last fragment of a packet (up to headroom\n+\t\t * size bytes) in case there is a stride overlap with\n+\t\t * a next packet's headroom. Allocate a separate mbuf\n+\t\t * to store this fragment and link it. Scatter is on.\n+\t\t */\n+\t\tif (hdrm_overlap > 0) {\n+\t\t\tMLX5_ASSERT(rxq->strd_scatter_en);\n+\t\t\tstruct rte_mbuf *seg =\n+\t\t\t\trte_pktmbuf_alloc(rxq->mp);\n+\n+\t\t\tif (unlikely(seg == NULL))\n+\t\t\t\treturn MLX5_RXQ_CODE_NOMBUF;\n+\t\t\tSET_DATA_OFF(seg, 0);\n+\t\t\trte_memcpy(rte_pktmbuf_mtod(seg, void *),\n+\t\t\t\tRTE_PTR_ADD(addr, len - hdrm_overlap),\n+\t\t\t\thdrm_overlap);\n+\t\t\tDATA_LEN(seg) = hdrm_overlap;\n+\t\t\tDATA_LEN(pkt) = len - hdrm_overlap;\n+\t\t\tNEXT(pkt) = seg;\n+\t\t\tNB_SEGS(pkt) = 2;\n+\t\t}\n+\t}\n+\treturn MLX5_RXQ_CODE_EXIT;\n+}\n+\n #endif /* RTE_PMD_MLX5_RXTX_H_ */\ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c\nindex aa48775738..469ea8401d 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec.c\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c\n@@ -77,6 +77,177 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n \treturn n;\n }\n \n+/**\n+ * Replenish buffers for RX in bulk.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ */\n+static inline void\n+mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)\n+{\n+\tconst uint16_t q_n = 1 << rxq->elts_n;\n+\tconst uint16_t q_mask = q_n - 1;\n+\tuint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);\n+\tuint16_t elts_idx = rxq->rq_ci & q_mask;\n+\tstruct rte_mbuf **elts = &(*rxq->elts)[elts_idx];\n+\tvolatile struct mlx5_wqe_data_seg *wq =\n+\t\t&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];\n+\tunsigned int i;\n+\n+\tif (n >= rxq->rq_repl_thresh) {\n+\t\tMLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));\n+\t\tMLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >\n+\t\t\t    MLX5_VPMD_DESCS_PER_LOOP);\n+\t\t/* Not to cross queue end. */\n+\t\tn = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);\n+\t\tif (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {\n+\t\t\trxq->stats.rx_nombuf += n;\n+\t\t\treturn;\n+\t\t}\n+\t\tfor (i = 0; i < n; ++i) {\n+\t\t\tvoid *buf_addr;\n+\n+\t\t\t/*\n+\t\t\t * In order to support the mbufs with external attached\n+\t\t\t * data buffer we should use the buf_addr pointer\n+\t\t\t * instead of rte_mbuf_buf_addr(). 
It touches the mbuf\n+\t\t\t * itself and may impact the performance.\n+\t\t\t */\n+\t\t\tbuf_addr = elts[i]->buf_addr;\n+\t\t\twq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +\n+\t\t\t\t\t\t      RTE_PKTMBUF_HEADROOM);\n+\t\t\t/* If there's a single MR, no need to replace LKey. */\n+\t\t\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)\n+\t\t\t\t     > 1))\n+\t\t\t\twq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);\n+\t\t}\n+\t\trxq->rq_ci += n;\n+\t\t/* Prevent overflowing into consumed mbufs. */\n+\t\telts_idx = rxq->rq_ci & q_mask;\n+\t\tfor (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)\n+\t\t\t(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;\n+\t\trte_io_wmb();\n+\t\t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n+\t}\n+}\n+\n+/**\n+ * Replenish buffers for MPRQ RX in bulk.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ */\n+static inline void\n+mlx5_rx_mprq_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)\n+{\n+\tconst uint16_t wqe_n = 1 << rxq->elts_n;\n+\tconst uint32_t strd_n = 1 << rxq->strd_num_n;\n+\tconst uint32_t elts_n = wqe_n * strd_n;\n+\tconst uint32_t wqe_mask = elts_n - 1;\n+\tuint32_t n = elts_n - (rxq->elts_ci - rxq->rq_pi);\n+\tuint32_t elts_idx = rxq->elts_ci & wqe_mask;\n+\tstruct rte_mbuf **elts = &(*rxq->elts)[elts_idx];\n+\n+\t/* Not to cross queue end. */\n+\tif (n >= rxq->rq_repl_thresh) {\n+\t\tMLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n));\n+\t\tMLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(elts_n) >\n+\t\t\t     MLX5_VPMD_DESCS_PER_LOOP);\n+\t\tn = RTE_MIN(n, elts_n - elts_idx);\n+\t\tif (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {\n+\t\t\trxq->stats.rx_nombuf += n;\n+\t\t\treturn;\n+\t\t}\n+\t\trxq->elts_ci += n;\n+\t}\n+}\n+\n+/**\n+ * Copy or attach MPRQ buffers to RX SW ring.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param pkts\n+ *   Pointer to array of packets to be stored.\n+ * @param pkts_n\n+ *   Number of packets to be stored.\n+ *\n+ * @return\n+ *   Number of packets successfully copied/attached (<= pkts_n).\n+ */\n+static inline uint16_t\n+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq,\n+\t\t     struct rte_mbuf **pkts, uint16_t pkts_n)\n+{\n+\tconst uint16_t wqe_n = 1 << rxq->elts_n;\n+\tconst uint16_t wqe_mask = wqe_n - 1;\n+\tconst uint16_t strd_sz = 1 << rxq->strd_sz_n;\n+\tconst uint32_t strd_n = 1 << rxq->strd_num_n;\n+\tconst uint32_t elts_n = wqe_n * strd_n;\n+\tconst uint32_t elts_mask = elts_n - 1;\n+\tuint32_t elts_idx = rxq->rq_pi & elts_mask;\n+\tstruct rte_mbuf **elts = &(*rxq->elts)[elts_idx];\n+\tuint32_t rq_ci = rxq->rq_ci;\n+\tstruct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wqe_mask];\n+\tuint16_t copied = 0;\n+\tuint16_t i = 0;\n+\n+\tfor (i = 0; i < pkts_n; ++i) {\n+\t\tuint16_t strd_cnt;\n+\t\tenum mlx5_rqx_code rxq_code;\n+\n+\t\tif (rxq->consumed_strd == strd_n) {\n+\t\t\t/* Replace WQE if the buffer is still in use. */\n+\t\t\tmprq_buf_replace(rxq, rq_ci & wqe_mask);\n+\t\t\t/* Advance to the next WQE. */\n+\t\t\trxq->consumed_strd = 0;\n+\t\t\trq_ci++;\n+\t\t\tbuf = (*rxq->mprq_bufs)[rq_ci & wqe_mask];\n+\t\t}\n+\n+\t\tif (!elts[i]->pkt_len) {\n+\t\t\trxq->consumed_strd = strd_n;\n+\t\t\trte_pktmbuf_free_seg(elts[i]);\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t\trxq->stats.ipackets -= 1;\n+#endif\n+\t\t\tcontinue;\n+\t\t}\n+\t\tstrd_cnt = (elts[i]->pkt_len / strd_sz) +\n+\t\t\t   ((elts[i]->pkt_len % strd_sz) ? 
1 : 0);\n+\t\trxq_code = mprq_buf_to_pkt(rxq, elts[i], elts[i]->pkt_len,\n+\t\t\t\t\t   buf, rxq->consumed_strd, strd_cnt);\n+\t\trxq->consumed_strd += strd_cnt;\n+\t\tif (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {\n+\t\t\trte_pktmbuf_free_seg(elts[i]);\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t\trxq->stats.ipackets -= 1;\n+\t\t\trxq->stats.ibytes -= elts[i]->pkt_len;\n+#endif\n+\t\t\tif (rxq_code == MLX5_RXQ_CODE_NOMBUF) {\n+\t\t\t\t++rxq->stats.rx_nombuf;\n+\t\t\t\tbreak;\n+\t\t\t}\n+\t\t\tif (rxq_code == MLX5_RXQ_CODE_DROPPED) {\n+\t\t\t\t++rxq->stats.idropped;\n+\t\t\t\tcontinue;\n+\t\t\t}\n+\t\t}\n+\t\tpkts[copied++] = elts[i];\n+\t}\n+\trxq->rq_pi += i;\n+\trxq->cq_ci += i;\n+\trte_io_wmb();\n+\t*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);\n+\tif (rq_ci != rxq->rq_ci) {\n+\t\trxq->rq_ci = rq_ci;\n+\t\trte_io_wmb();\n+\t\t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n+\t}\n+\treturn copied;\n+}\n+\n /**\n  * Receive burst of packets. An errored completion also consumes a mbuf, but the\n  * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed\n@@ -204,7 +375,142 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n \tbool no_cq = false;\n \n \tdo {\n-\t\tnb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err, &no_cq);\n+\t\tnb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn,\n+\t\t\t\t    &err, &no_cq);\n+\t\tif (unlikely(err | rxq->err_state))\n+\t\t\tnb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);\n+\t\ttn += nb_rx;\n+\t\tif (unlikely(no_cq))\n+\t\t\tbreak;\n+\t} while (tn != pkts_n);\n+\treturn tn;\n+}\n+\n+/**\n+ * Receive burst of packets. An errored completion also consumes a mbuf, but the\n+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed\n+ * before returning to application.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param[out] pkts\n+ *   Array to store received packets.\n+ * @param pkts_n\n+ *   Maximum number of packets in array.\n+ * @param[out] err\n+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error\n+ *   packet to handle.\n+ * @param[out] no_cq\n+ *   Pointer to a boolean. Set true if no new CQE seen.\n+ *\n+ * @return\n+ *   Number of packets received including errors (<= pkts_n).\n+ */\n+static inline uint16_t\n+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t uint16_t pkts_n, uint64_t *err, bool *no_cq)\n+{\n+\tconst uint16_t q_n = 1 << rxq->cqe_n;\n+\tconst uint16_t q_mask = q_n - 1;\n+\tconst uint16_t wqe_n = 1 << rxq->elts_n;\n+\tconst uint32_t strd_n = 1 << rxq->strd_num_n;\n+\tconst uint32_t elts_n = wqe_n * strd_n;\n+\tconst uint32_t elts_mask = elts_n - 1;\n+\tvolatile struct mlx5_cqe *cq;\n+\tstruct rte_mbuf **elts;\n+\tuint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;\n+\tuint16_t nocmp_n = 0;\n+\tuint16_t rcvd_pkt = 0;\n+\tuint16_t cp_pkt = 0;\n+\tunsigned int cq_idx = rxq->cq_ci & q_mask;\n+\tunsigned int elts_idx;\n+\n+\tMLX5_ASSERT(rxq->sges_n == 0);\n+\tcq = &(*rxq->cqes)[cq_idx];\n+\trte_prefetch0(cq);\n+\trte_prefetch0(cq + 1);\n+\trte_prefetch0(cq + 2);\n+\trte_prefetch0(cq + 3);\n+\tpkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);\n+\tmlx5_rx_mprq_replenish_bulk_mbuf(rxq);\n+\t/* See if there're unreturned mbufs from compressed CQE. 
*/\n+\trcvd_pkt = rxq->decompressed;\n+\tif (rcvd_pkt > 0) {\n+\t\trcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);\n+\t\tcp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt);\n+\t\trxq->decompressed -= rcvd_pkt;\n+\t\tpkts += cp_pkt;\n+\t}\n+\telts_idx = rxq->rq_pi & elts_mask;\n+\telts = &(*rxq->elts)[elts_idx];\n+\t/* Not to overflow pkts array. */\n+\tpkts_n = RTE_ALIGN_FLOOR(pkts_n - cp_pkt, MLX5_VPMD_DESCS_PER_LOOP);\n+\t/* Not to cross queue end. */\n+\tpkts_n = RTE_MIN(pkts_n, elts_n - elts_idx);\n+\tpkts_n = RTE_MIN(pkts_n, q_n - cq_idx);\n+\t/* Not to move past the allocated mbufs. */\n+\tpkts_n = RTE_MIN(pkts_n, rxq->elts_ci - rxq->rq_pi);\n+\tif (!pkts_n) {\n+\t\t*no_cq = !cp_pkt;\n+\t\treturn cp_pkt;\n+\t}\n+\t/* At this point, there shouldn't be any remaining packets. */\n+\tMLX5_ASSERT(rxq->decompressed == 0);\n+\t/* Process all the CQEs */\n+\tnocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, err, &comp_idx);\n+\t/* If no new CQE seen, return without updating cq_db. */\n+\tif (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {\n+\t\t*no_cq = true;\n+\t\treturn cp_pkt;\n+\t}\n+\t/* Update the consumer indexes for non-compressed CQEs. */\n+\tMLX5_ASSERT(nocmp_n <= pkts_n);\n+\tcp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n);\n+\trcvd_pkt += cp_pkt;\n+\t/* Decompress the last CQE if compressed. */\n+\tif (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\tMLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));\n+\t\trxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],\n+\t\t\t\t\t\t\t&elts[nocmp_n]);\n+\t\t/* Return more packets if needed. */\n+\t\tif (nocmp_n < pkts_n) {\n+\t\t\tuint16_t n = rxq->decompressed;\n+\n+\t\t\tn = RTE_MIN(n, pkts_n - nocmp_n);\n+\t\t\tcp_pkt = rxq_copy_mprq_mbuf_v(rxq, &pkts[cp_pkt], n);\n+\t\t\trcvd_pkt += cp_pkt;\n+\t\t\trxq->decompressed -= n;\n+\t\t}\n+\t}\n+\t*no_cq = !rcvd_pkt;\n+\treturn rcvd_pkt;\n+}\n+\n+/**\n+ * DPDK callback for vectorized MPRQ RX.\n+ *\n+ * @param dpdk_rxq\n+ *   Generic pointer to RX queue structure.\n+ * @param[out] pkts\n+ *   Array to store received packets.\n+ * @param pkts_n\n+ *   Maximum number of packets in array.\n+ *\n+ * @return\n+ *   Number of packets successfully received (<= pkts_n).\n+ */\n+uint16_t\n+mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n+{\n+\tstruct mlx5_rxq_data *rxq = dpdk_rxq;\n+\tuint16_t nb_rx = 0;\n+\tuint16_t tn = 0;\n+\tuint64_t err = 0;\n+\tbool no_cq = false;\n+\n+\tdo {\n+\t\tnb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,\n+\t\t\t\t\t &err, &no_cq);\n \t\tif (unlikely(err | rxq->err_state))\n \t\t\tnb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);\n \t\ttn += nb_rx;\n@@ -229,8 +535,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)\n \tstruct mlx5_rxq_ctrl *ctrl =\n \t\tcontainer_of(rxq, struct mlx5_rxq_ctrl, rxq);\n \n-\tif (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))\n-\t\treturn -ENOTSUP;\n \tif (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)\n \t\treturn -ENOTSUP;\n \tif (rxq->lro)\n@@ -257,8 +561,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)\n \t\treturn -ENOTSUP;\n \tif (!priv->config.rx_vec_en)\n \t\treturn -ENOTSUP;\n-\tif (mlx5_mprq_enabled(dev))\n-\t\treturn -ENOTSUP;\n \t/* All the configured queues should support. 
*/\n \tfor (i = 0; i < priv->rxqs_n; ++i) {\n \t\tstruct mlx5_rxq_data *rxq = (*priv->rxqs)[i];\ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h\nindex ce27074b08..93b4f517bb 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec.h\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h\n@@ -12,7 +12,6 @@\n #include <mlx5_prm.h>\n \n #include \"mlx5_autoconf.h\"\n-\n #include \"mlx5_mr.h\"\n \n /* HW checksum offload capabilities of vectorized Tx. */\n@@ -68,59 +67,4 @@ S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, sop_drop_qpn) ==\n S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) ==\n \t\t  offsetof(struct mlx5_cqe, sop_drop_qpn) + 7);\n \n-/**\n- * Replenish buffers for RX in bulk.\n- *\n- * @param rxq\n- *   Pointer to RX queue structure.\n- */\n-static inline void\n-mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq)\n-{\n-\tconst uint16_t q_n = 1 << rxq->elts_n;\n-\tconst uint16_t q_mask = q_n - 1;\n-\tuint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);\n-\tuint16_t elts_idx = rxq->rq_ci & q_mask;\n-\tstruct rte_mbuf **elts = &(*rxq->elts)[elts_idx];\n-\tvolatile struct mlx5_wqe_data_seg *wq =\n-\t\t&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx];\n-\tunsigned int i;\n-\n-\tif (n >= rxq->rq_repl_thresh) {\n-\t\tMLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n));\n-\t\tMLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) >\n-\t\t\t    MLX5_VPMD_DESCS_PER_LOOP);\n-\t\t/* Not to cross queue end. */\n-\t\tn = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);\n-\t\tif (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {\n-\t\t\trxq->stats.rx_nombuf += n;\n-\t\t\treturn;\n-\t\t}\n-\t\tfor (i = 0; i < n; ++i) {\n-\t\t\tvoid *buf_addr;\n-\n-\t\t\t/*\n-\t\t\t * In order to support the mbufs with external attached\n-\t\t\t * data buffer we should use the buf_addr pointer\n-\t\t\t * instead of rte_mbuf_buf_addr(). It touches the mbuf\n-\t\t\t * itself and may impact the performance.\n-\t\t\t */\n-\t\t\tbuf_addr = elts[i]->buf_addr;\n-\t\t\twq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +\n-\t\t\t\t\t\t      RTE_PKTMBUF_HEADROOM);\n-\t\t\t/* If there's a single MR, no need to replace LKey. */\n-\t\t\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh)\n-\t\t\t\t     > 1))\n-\t\t\t\twq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);\n-\t\t}\n-\t\trxq->rq_ci += n;\n-\t\t/* Prevent overflowing into consumed mbufs. */\n-\t\telts_idx = rxq->rq_ci & q_mask;\n-\t\tfor (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)\n-\t\t\t(*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf;\n-\t\trte_io_wmb();\n-\t\t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n-\t}\n-}\n-\n #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */\n",
    "prefixes": [
        "v2",
        "2/2"
    ]
}
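
Updating the patch uses the put/patch verbs listed at the top and requires write access. A minimal sketch, assuming a Patchwork API token with maintainer rights on the project (the token value below is a placeholder) and that `state` and `archived` are writable fields of the object shown above.

import requests

BASE = "https://patches.dpdk.org/api"
TOKEN = "REPLACE_WITH_API_TOKEN"  # placeholder; use a real per-user token

# Partially update the patch: only the supplied fields are changed.
resp = requests.patch(
    f"{BASE}/patches/81711/",
    headers={"Authorization": f"Token {TOKEN}",
             "Accept": "application/json"},
    json={"state": "accepted", "archived": True},
)
resp.raise_for_status()
print(resp.json()["state"])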