get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.
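
The two write methods differ only in scope: PATCH changes just the fields supplied in the request body, while PUT expects a full representation. Below is a minimal sketch of driving both the read and the write path with Python's requests library; the PATCHWORK_TOKEN environment variable, the hard-coded patch ID, and the "accepted" target state are illustrative assumptions, and state changes additionally require maintainer rights on the project.

import os
import requests

BASE = "https://patches.dpdk.org/api"
PATCH_ID = 29659  # the patch shown in the example response below

# Reads need no authentication; the JSON returned is the object shown below.
patch = requests.get(f"{BASE}/patches/{PATCH_ID}/").json()
print(patch["name"], patch["state"])

# Writes use a Patchwork API token. PATCH is a partial update: only the
# fields present in the request body are changed.
token = os.environ.get("PATCHWORK_TOKEN")  # hypothetical variable holding the token
if token:
    resp = requests.patch(
        f"{BASE}/patches/{PATCH_ID}/",
        headers={"Authorization": f"Token {token}"},
        json={"state": "accepted"},  # example state change
    )
    resp.raise_for_status()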

GET /api/patches/29659/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 29659,
    "url": "http://patches.dpdk.org/api/patches/29659/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/1507195992-12513-2-git-send-email-ophirmu@mellanox.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1507195992-12513-2-git-send-email-ophirmu@mellanox.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1507195992-12513-2-git-send-email-ophirmu@mellanox.com",
    "date": "2017-10-05T09:33:06",
    "name": "[dpdk-dev,v4,1/7] net/mlx4: add simple Tx bypassing Verbs",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "92fe83a1c25cd69715b11df32cab82c89edc298a",
    "submitter": {
        "id": 793,
        "url": "http://patches.dpdk.org/api/people/793/?format=api",
        "name": "Ophir Munk",
        "email": "ophirmu@mellanox.com"
    },
    "delegate": {
        "id": 319,
        "url": "http://patches.dpdk.org/api/users/319/?format=api",
        "username": "fyigit",
        "first_name": "Ferruh",
        "last_name": "Yigit",
        "email": "ferruh.yigit@amd.com"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/1507195992-12513-2-git-send-email-ophirmu@mellanox.com/mbox/",
    "series": [],
    "comments": "http://patches.dpdk.org/api/patches/29659/comments/",
    "check": "success",
    "checks": "http://patches.dpdk.org/api/patches/29659/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 7D9F71AEE9;\n\tThu,  5 Oct 2017 11:33:29 +0200 (CEST)",
            "from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129])\n\tby dpdk.org (Postfix) with ESMTP id 760647CBD\n\tfor <dev@dpdk.org>; Thu,  5 Oct 2017 11:33:26 +0200 (CEST)",
            "from Internal Mail-Server by MTLPINE1 (envelope-from\n\tophirmu@mellanox.com)\n\twith ESMTPS (AES256-SHA encrypted); 5 Oct 2017 11:33:22 +0200",
            "from pegasus05.mtr.labs.mlnx (pegasus05.mtr.labs.mlnx\n\t[10.210.16.100])\n\tby labmailer.mlnx (8.13.8/8.13.8) with ESMTP id v959XMlA015284;\n\tThu, 5 Oct 2017 12:33:22 +0300",
            "from pegasus05.mtr.labs.mlnx (localhost [127.0.0.1])\n\tby pegasus05.mtr.labs.mlnx (8.14.7/8.14.7) with ESMTP id\n\tv959XMSN012575; Thu, 5 Oct 2017 09:33:22 GMT",
            "(from root@localhost)\n\tby pegasus05.mtr.labs.mlnx (8.14.7/8.14.7/Submit) id v959XMHw012574; \n\tThu, 5 Oct 2017 09:33:22 GMT"
        ],
        "From": "Ophir Munk <ophirmu@mellanox.com>",
        "To": "Adrien Mazarguil <adrien.mazarguil@6wind.com>",
        "Cc": "dev@dpdk.org, Thomas Monjalon <thomas@monjalon.net>,\n\tOlga Shern <olgas@mellanox.com>, Matan Azrad <matan@mellanox.com>,\n\tMoti Haimovsky <motih@mellanox.com>",
        "Date": "Thu,  5 Oct 2017 09:33:06 +0000",
        "Message-Id": "<1507195992-12513-2-git-send-email-ophirmu@mellanox.com>",
        "X-Mailer": "git-send-email 1.8.3.1",
        "In-Reply-To": "<1507195992-12513-1-git-send-email-ophirmu@mellanox.com>",
        "References": "<cover.1507141616.git.adrien.mazarguil@6wind.com>\n\t<1507195992-12513-1-git-send-email-ophirmu@mellanox.com>",
        "Subject": "[dpdk-dev] [PATCH v4 1/7] net/mlx4: add simple Tx bypassing Verbs",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "From: Moti Haimovsky <motih@mellanox.com>\n\nModify PMD to send single-buffer packets directly to the device bypassing\nthe Verbs Tx post and poll routines.\n\nSigned-off-by: Moti Haimovsky <motih@mellanox.com>\nAcked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>\n---\n drivers/net/mlx4/mlx4_prm.h  | 120 +++++++++++++++\n drivers/net/mlx4/mlx4_rxtx.c | 337 ++++++++++++++++++++++++++++++++-----------\n drivers/net/mlx4/mlx4_rxtx.h |  28 ++--\n drivers/net/mlx4/mlx4_txq.c  |  51 +++++++\n mk/rte.app.mk                |   2 +-\n 5 files changed, 436 insertions(+), 102 deletions(-)\n create mode 100644 drivers/net/mlx4/mlx4_prm.h",
    "diff": "diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h\nnew file mode 100644\nindex 0000000..085a595\n--- /dev/null\n+++ b/drivers/net/mlx4/mlx4_prm.h\n@@ -0,0 +1,120 @@\n+/*-\n+ *   BSD LICENSE\n+ *\n+ *   Copyright 2017 6WIND S.A.\n+ *   Copyright 2017 Mellanox\n+ *\n+ *   Redistribution and use in source and binary forms, with or without\n+ *   modification, are permitted provided that the following conditions\n+ *   are met:\n+ *\n+ *     * Redistributions of source code must retain the above copyright\n+ *       notice, this list of conditions and the following disclaimer.\n+ *     * Redistributions in binary form must reproduce the above copyright\n+ *       notice, this list of conditions and the following disclaimer in\n+ *       the documentation and/or other materials provided with the\n+ *       distribution.\n+ *     * Neither the name of 6WIND S.A. nor the names of its\n+ *       contributors may be used to endorse or promote products derived\n+ *       from this software without specific prior written permission.\n+ *\n+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n+ *   \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n+ */\n+\n+#ifndef MLX4_PRM_H_\n+#define MLX4_PRM_H_\n+\n+#include <rte_atomic.h>\n+#include <rte_branch_prediction.h>\n+#include <rte_byteorder.h>\n+\n+/* Verbs headers do not support -pedantic. */\n+#ifdef PEDANTIC\n+#pragma GCC diagnostic ignored \"-Wpedantic\"\n+#endif\n+#include <infiniband/mlx4dv.h>\n+#include <infiniband/verbs.h>\n+#ifdef PEDANTIC\n+#pragma GCC diagnostic error \"-Wpedantic\"\n+#endif\n+\n+/* ConnectX-3 Tx queue basic block. */\n+#define MLX4_TXBB_SHIFT 6\n+#define MLX4_TXBB_SIZE (1 << MLX4_TXBB_SHIFT)\n+\n+/* Typical TSO descriptor with 16 gather entries is 352 bytes. */\n+#define MLX4_MAX_WQE_SIZE 512\n+#define MLX4_MAX_WQE_TXBBS (MLX4_MAX_WQE_SIZE / MLX4_TXBB_SIZE)\n+\n+/* Send queue stamping/invalidating information. */\n+#define MLX4_SQ_STAMP_STRIDE 64\n+#define MLX4_SQ_STAMP_DWORDS (MLX4_SQ_STAMP_STRIDE / 4)\n+#define MLX4_SQ_STAMP_SHIFT 31\n+#define MLX4_SQ_STAMP_VAL 0x7fffffff\n+\n+/* Work queue element (WQE) flags. */\n+#define MLX4_BIT_WQE_OWN 0x80000000\n+\n+#define MLX4_SIZE_TO_TXBBS(size) \\\n+\t(RTE_ALIGN((size), (MLX4_TXBB_SIZE)) >> (MLX4_TXBB_SHIFT))\n+\n+/* Send queue information. */\n+struct mlx4_sq {\n+\tuint8_t *buf; /**< SQ buffer. */\n+\tuint8_t *eob; /**< End of SQ buffer */\n+\tuint32_t head; /**< SQ head counter in units of TXBBS. */\n+\tuint32_t tail; /**< SQ tail counter in units of TXBBS. */\n+\tuint32_t txbb_cnt; /**< Num of WQEBB in the Q (should be ^2). */\n+\tuint32_t txbb_cnt_mask; /**< txbbs_cnt mask (txbb_cnt is ^2). */\n+\tuint32_t headroom_txbbs; /**< Num of txbbs that should be kept free. */\n+\tuint32_t *db; /**< Pointer to the doorbell. 
*/\n+\tuint32_t doorbell_qpn; /**< qp number to write to the doorbell. */\n+};\n+\n+#define mlx4_get_send_wqe(sq, n) ((sq)->buf + ((n) * (MLX4_TXBB_SIZE)))\n+\n+/* Completion queue information. */\n+struct mlx4_cq {\n+\tuint8_t *buf; /**< Pointer to the completion queue buffer. */\n+\tuint32_t cqe_cnt; /**< Number of entries in the queue. */\n+\tuint32_t cqe_64:1; /**< CQ entry size is 64 bytes. */\n+\tuint32_t cons_index; /**< Last queue entry that was handled. */\n+\tuint32_t *set_ci_db; /**< Pointer to the completion queue doorbell. */\n+};\n+\n+/**\n+ * Retrieve a CQE entry from a CQ.\n+ *\n+ * cqe = cq->buf + cons_index * cqe_size + cqe_offset\n+ *\n+ * Where cqe_size is 32 or 64 bytes and cqe_offset is 0 or 32 (depending on\n+ * cqe_size).\n+ *\n+ * @param cq\n+ *   CQ to retrieve entry from.\n+ * @param index\n+ *   Entry index.\n+ *\n+ * @return\n+ *   Pointer to CQE entry.\n+ */\n+static inline struct mlx4_cqe *\n+mlx4_get_cqe(struct mlx4_cq *cq, uint32_t index)\n+{\n+\treturn (struct mlx4_cqe *)(cq->buf +\n+\t\t\t\t   ((index & (cq->cqe_cnt - 1)) <<\n+\t\t\t\t    (5 + cq->cqe_64)) +\n+\t\t\t\t   (cq->cqe_64 << 5));\n+}\n+\n+#endif /* MLX4_PRM_H_ */\ndiff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c\nindex b5e7777..35367a2 100644\n--- a/drivers/net/mlx4/mlx4_rxtx.c\n+++ b/drivers/net/mlx4/mlx4_rxtx.c\n@@ -52,15 +52,72 @@\n \n #include <rte_branch_prediction.h>\n #include <rte_common.h>\n+#include <rte_io.h>\n #include <rte_mbuf.h>\n #include <rte_mempool.h>\n #include <rte_prefetch.h>\n \n #include \"mlx4.h\"\n+#include \"mlx4_prm.h\"\n #include \"mlx4_rxtx.h\"\n #include \"mlx4_utils.h\"\n \n /**\n+ * Stamp a WQE so it won't be reused by the HW.\n+ *\n+ * Routine is used when freeing WQE used by the chip or when failing\n+ * building an WQ entry has failed leaving partial information on the queue.\n+ *\n+ * @param sq\n+ *   Pointer to the SQ structure.\n+ * @param index\n+ *   Index of the freed WQE.\n+ * @param num_txbbs\n+ *   Number of blocks to stamp.\n+ *   If < 0 the routine will use the size written in the WQ entry.\n+ * @param owner\n+ *   The value of the WQE owner bit to use in the stamp.\n+ *\n+ * @return\n+ *   The number of Tx basic blocs (TXBB) the WQE contained.\n+ */\n+static int\n+mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)\n+{\n+\tuint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |\n+\t\t\t\t\t  (!!owner << MLX4_SQ_STAMP_SHIFT));\n+\tuint8_t *wqe = mlx4_get_send_wqe(sq, (index & sq->txbb_cnt_mask));\n+\tuint32_t *ptr = (uint32_t *)wqe;\n+\tint i;\n+\tint txbbs_size;\n+\tint num_txbbs;\n+\n+\t/* Extract the size from the control segment of the WQE. */\n+\tnum_txbbs = MLX4_SIZE_TO_TXBBS((((struct mlx4_wqe_ctrl_seg *)\n+\t\t\t\t\t wqe)->fence_size & 0x3f) << 4);\n+\ttxbbs_size = num_txbbs * MLX4_TXBB_SIZE;\n+\t/* Optimize the common case when there is no wrap-around. */\n+\tif (wqe + txbbs_size <= sq->eob) {\n+\t\t/* Stamp the freed descriptor. */\n+\t\tfor (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {\n+\t\t\t*ptr = stamp;\n+\t\t\tptr += MLX4_SQ_STAMP_DWORDS;\n+\t\t}\n+\t} else {\n+\t\t/* Stamp the freed descriptor. 
*/\n+\t\tfor (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {\n+\t\t\t*ptr = stamp;\n+\t\t\tptr += MLX4_SQ_STAMP_DWORDS;\n+\t\t\tif ((uint8_t *)ptr >= sq->eob) {\n+\t\t\t\tptr = (uint32_t *)sq->buf;\n+\t\t\t\tstamp ^= RTE_BE32(0x80000000);\n+\t\t\t}\n+\t\t}\n+\t}\n+\treturn num_txbbs;\n+}\n+\n+/**\n  * Manage Tx completions.\n  *\n  * When sending a burst, mlx4_tx_burst() posts several WRs.\n@@ -80,26 +137,71 @@\n \tunsigned int elts_comp = txq->elts_comp;\n \tunsigned int elts_tail = txq->elts_tail;\n \tconst unsigned int elts_n = txq->elts_n;\n-\tstruct ibv_wc wcs[elts_comp];\n-\tint wcs_n;\n+\tstruct mlx4_cq *cq = &txq->mcq;\n+\tstruct mlx4_sq *sq = &txq->msq;\n+\tstruct mlx4_cqe *cqe;\n+\tuint32_t cons_index = cq->cons_index;\n+\tuint16_t new_index;\n+\tuint16_t nr_txbbs = 0;\n+\tint pkts = 0;\n \n \tif (unlikely(elts_comp == 0))\n \t\treturn 0;\n-\twcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);\n-\tif (unlikely(wcs_n == 0))\n+\t/*\n+\t * Traverse over all CQ entries reported and handle each WQ entry\n+\t * reported by them.\n+\t */\n+\tdo {\n+\t\tcqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);\n+\t\tif (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^\n+\t\t    !!(cons_index & cq->cqe_cnt)))\n+\t\t\tbreak;\n+\t\t/*\n+\t\t * Make sure we read the CQE after we read the ownership bit.\n+\t\t */\n+\t\trte_rmb();\n+\t\tif (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==\n+\t\t\t     MLX4_CQE_OPCODE_ERROR)) {\n+\t\t\tstruct mlx4_err_cqe *cqe_err =\n+\t\t\t\t(struct mlx4_err_cqe *)cqe;\n+\t\t\tERROR(\"%p CQE error - vendor syndrome: 0x%x\"\n+\t\t\t      \" syndrome: 0x%x\\n\",\n+\t\t\t      (void *)txq, cqe_err->vendor_err,\n+\t\t\t      cqe_err->syndrome);\n+\t\t}\n+\t\t/* Get WQE index reported in the CQE. */\n+\t\tnew_index =\n+\t\t\trte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;\n+\t\tdo {\n+\t\t\t/* Free next descriptor. */\n+\t\t\tnr_txbbs +=\n+\t\t\t\tmlx4_txq_stamp_freed_wqe(sq,\n+\t\t\t\t     (sq->tail + nr_txbbs) & sq->txbb_cnt_mask,\n+\t\t\t\t     !!((sq->tail + nr_txbbs) & sq->txbb_cnt));\n+\t\t\tpkts++;\n+\t\t} while (((sq->tail + nr_txbbs) & sq->txbb_cnt_mask) !=\n+\t\t\t new_index);\n+\t\tcons_index++;\n+\t} while (1);\n+\tif (unlikely(pkts == 0))\n \t\treturn 0;\n-\tif (unlikely(wcs_n < 0)) {\n-\t\tDEBUG(\"%p: ibv_poll_cq() failed (wcs_n=%d)\",\n-\t\t      (void *)txq, wcs_n);\n-\t\treturn -1;\n-\t}\n-\telts_comp -= wcs_n;\n+\t/*\n+\t * Update CQ.\n+\t * To prevent CQ overflow we first update CQ consumer and only then\n+\t * the ring consumer.\n+\t */\n+\tcq->cons_index = cons_index;\n+\t*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & 0xffffff);\n+\trte_wmb();\n+\tsq->tail = sq->tail + nr_txbbs;\n+\t/* Update the list of packets posted for transmission. 
*/\n+\telts_comp -= pkts;\n \tassert(elts_comp <= txq->elts_comp);\n \t/*\n-\t * Assume WC status is successful as nothing can be done about it\n-\t * anyway.\n+\t * Assume completion status is successful as nothing can be done about\n+\t * it anyway.\n \t */\n-\telts_tail += wcs_n * txq->elts_comp_cd_init;\n+\telts_tail += pkts;\n \tif (elts_tail >= elts_n)\n \t\telts_tail -= elts_n;\n \ttxq->elts_tail = elts_tail;\n@@ -183,6 +285,119 @@\n }\n \n /**\n+ * Posts a single work request to a send queue.\n+ *\n+ * @param txq\n+ *   Target Tx queue.\n+ * @param pkt\n+ *   Packet to transmit.\n+ * @param send_flags\n+ *   @p MLX4_WQE_CTRL_CQ_UPDATE to request completion on this packet.\n+ *\n+ * @return\n+ *   0 on success, negative errno value otherwise and rte_errno is set.\n+ */\n+static inline int\n+mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt, uint32_t send_flags)\n+{\n+\tstruct mlx4_wqe_ctrl_seg *ctrl;\n+\tstruct mlx4_wqe_data_seg *dseg;\n+\tstruct mlx4_sq *sq = &txq->msq;\n+\tuint32_t head_idx = sq->head & sq->txbb_cnt_mask;\n+\tuint32_t lkey;\n+\tuintptr_t addr;\n+\tint wqe_real_size;\n+\tint nr_txbbs;\n+\tint rc;\n+\n+\t/* Calculate the needed work queue entry size for this packet. */\n+\twqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +\n+\t\t\tpkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);\n+\tnr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);\n+\t/*\n+\t * Check that there is room for this WQE in the send queue and that\n+\t * the WQE size is legal.\n+\t */\n+\tif (((sq->head - sq->tail) + nr_txbbs +\n+\t     sq->headroom_txbbs) >= sq->txbb_cnt ||\n+\t    nr_txbbs > MLX4_MAX_WQE_TXBBS) {\n+\t\trc = ENOSPC;\n+\t\tgoto err;\n+\t}\n+\t/* Get the control and single-data entries of the WQE. */\n+\tctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);\n+\tdseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +\n+\t\t\t\t\t    sizeof(struct mlx4_wqe_ctrl_seg));\n+\t/* Fill the data segment with buffer information. */\n+\taddr = rte_pktmbuf_mtod(pkt, uintptr_t);\n+\trte_prefetch0((volatile void *)addr);\n+\tdseg->addr = rte_cpu_to_be_64(addr);\n+\t/* Memory region key for this memory pool. */\n+\tlkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(pkt));\n+\tif (unlikely(lkey == (uint32_t)-1)) {\n+\t\t/* MR does not exist. */\n+\t\tDEBUG(\"%p: unable to get MP <-> MR association\", (void *)txq);\n+\t\t/*\n+\t\t * Restamp entry in case of failure, make sure that size is\n+\t\t * written correctly.\n+\t\t * Note that we give ownership to the SW, not the HW.\n+\t\t */\n+\t\tctrl->fence_size = (wqe_real_size >> 4) & 0x3f;\n+\t\tmlx4_txq_stamp_freed_wqe(sq, head_idx,\n+\t\t\t\t\t (sq->head & sq->txbb_cnt) ? 0 : 1);\n+\t\trc = EFAULT;\n+\t\tgoto err;\n+\t}\n+\tdseg->lkey = rte_cpu_to_be_32(lkey);\n+\t/*\n+\t * Need a barrier here before writing the byte_count field to\n+\t * make sure that all the data is visible before the\n+\t * byte_count field is set. 
Otherwise, if the segment begins\n+\t * a new cache line, the HCA prefetcher could grab the 64-byte\n+\t * chunk and get a valid (!= 0xffffffff) byte count but\n+\t * stale data, and end up sending the wrong data.\n+\t */\n+\trte_io_wmb();\n+\tif (likely(pkt->data_len))\n+\t\tdseg->byte_count = rte_cpu_to_be_32(pkt->data_len);\n+\telse\n+\t\t/*\n+\t\t * Zero length segment is treated as inline segment\n+\t\t * with zero data.\n+\t\t */\n+\t\tdseg->byte_count = RTE_BE32(0x80000000);\n+\t/*\n+\t * Fill the control parameters for this packet.\n+\t * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC\n+\t * should be calculated.\n+\t */\n+\tctrl->srcrb_flags =\n+\t\trte_cpu_to_be_32(MLX4_WQE_CTRL_SOLICIT |\n+\t\t\t\t (send_flags & MLX4_WQE_CTRL_CQ_UPDATE));\n+\tctrl->fence_size = (wqe_real_size >> 4) & 0x3f;\n+\t/*\n+\t * The caller should prepare \"imm\" in advance in order to support\n+\t * VF to VF communication (when the device is a virtual-function\n+\t * device (VF)).\n+\t */\n+\tctrl->imm = 0;\n+\t/*\n+\t * Make sure descriptor is fully written before setting ownership\n+\t * bit (because HW can start executing as soon as we do).\n+\t */\n+\trte_wmb();\n+\tctrl->owner_opcode =\n+\t\trte_cpu_to_be_32(MLX4_OPCODE_SEND |\n+\t\t\t\t ((sq->head & sq->txbb_cnt) ?\n+\t\t\t\t  MLX4_BIT_WQE_OWN : 0));\n+\tsq->head += nr_txbbs;\n+\treturn 0;\n+err:\n+\trte_errno = rc;\n+\treturn -rc;\n+}\n+\n+/**\n  * DPDK callback for Tx.\n  *\n  * @param dpdk_txq\n@@ -199,13 +414,11 @@\n mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)\n {\n \tstruct txq *txq = (struct txq *)dpdk_txq;\n-\tstruct ibv_send_wr *wr_head = NULL;\n-\tstruct ibv_send_wr **wr_next = &wr_head;\n-\tstruct ibv_send_wr *wr_bad = NULL;\n \tunsigned int elts_head = txq->elts_head;\n \tconst unsigned int elts_n = txq->elts_n;\n \tunsigned int elts_comp_cd = txq->elts_comp_cd;\n \tunsigned int elts_comp = 0;\n+\tunsigned int bytes_sent = 0;\n \tunsigned int i;\n \tunsigned int max;\n \tint err;\n@@ -229,9 +442,7 @@\n \t\t\t(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);\n \t\tstruct txq_elt *elt_next = &(*txq->elts)[elts_head_next];\n \t\tstruct txq_elt *elt = &(*txq->elts)[elts_head];\n-\t\tstruct ibv_send_wr *wr = &elt->wr;\n \t\tunsigned int segs = buf->nb_segs;\n-\t\tunsigned int sent_size = 0;\n \t\tuint32_t send_flags = 0;\n \n \t\t/* Clean up old buffer. */\n@@ -254,93 +465,43 @@\n \t\tif (unlikely(--elts_comp_cd == 0)) {\n \t\t\telts_comp_cd = txq->elts_comp_cd_init;\n \t\t\t++elts_comp;\n-\t\t\tsend_flags |= IBV_SEND_SIGNALED;\n+\t\t\tsend_flags |= MLX4_WQE_CTRL_CQ_UPDATE;\n \t\t}\n \t\tif (likely(segs == 1)) {\n-\t\t\tstruct ibv_sge *sge = &elt->sge;\n-\t\t\tuintptr_t addr;\n-\t\t\tuint32_t length;\n-\t\t\tuint32_t lkey;\n-\n-\t\t\t/* Retrieve buffer information. */\n-\t\t\taddr = rte_pktmbuf_mtod(buf, uintptr_t);\n-\t\t\tlength = buf->data_len;\n-\t\t\t/* Retrieve memory region key for this memory pool. */\n-\t\t\tlkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));\n-\t\t\tif (unlikely(lkey == (uint32_t)-1)) {\n-\t\t\t\t/* MR does not exist. */\n-\t\t\t\tDEBUG(\"%p: unable to get MP <-> MR\"\n-\t\t\t\t      \" association\", (void *)txq);\n-\t\t\t\t/* Clean up Tx element. */\n+\t\t\t/* Update element. */\n+\t\t\telt->buf = buf;\n+\t\t\tRTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);\n+\t\t\t/* Post the packet for sending. 
*/\n+\t\t\terr = mlx4_post_send(txq, buf, send_flags);\n+\t\t\tif (unlikely(err)) {\n+\t\t\t\tif (unlikely(send_flags &\n+\t\t\t\t\t     MLX4_WQE_CTRL_CQ_UPDATE)) {\n+\t\t\t\t\telts_comp_cd = 1;\n+\t\t\t\t\t--elts_comp;\n+\t\t\t\t}\n \t\t\t\telt->buf = NULL;\n \t\t\t\tgoto stop;\n \t\t\t}\n-\t\t\t/* Update element. */\n \t\t\telt->buf = buf;\n-\t\t\tif (txq->priv->vf)\n-\t\t\t\trte_prefetch0((volatile void *)\n-\t\t\t\t\t      (uintptr_t)addr);\n-\t\t\tRTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);\n-\t\t\tsge->addr = addr;\n-\t\t\tsge->length = length;\n-\t\t\tsge->lkey = lkey;\n-\t\t\tsent_size += length;\n+\t\t\tbytes_sent += buf->pkt_len;\n \t\t} else {\n-\t\t\terr = -1;\n+\t\t\terr = -EINVAL;\n+\t\t\trte_errno = -err;\n \t\t\tgoto stop;\n \t\t}\n-\t\tif (sent_size <= txq->max_inline)\n-\t\t\tsend_flags |= IBV_SEND_INLINE;\n \t\telts_head = elts_head_next;\n-\t\t/* Increment sent bytes counter. */\n-\t\ttxq->stats.obytes += sent_size;\n-\t\t/* Set up WR. */\n-\t\twr->sg_list = &elt->sge;\n-\t\twr->num_sge = segs;\n-\t\twr->opcode = IBV_WR_SEND;\n-\t\twr->send_flags = send_flags;\n-\t\t*wr_next = wr;\n-\t\twr_next = &wr->next;\n \t}\n stop:\n \t/* Take a shortcut if nothing must be sent. */\n \tif (unlikely(i == 0))\n \t\treturn 0;\n-\t/* Increment sent packets counter. */\n+\t/* Increment send statistics counters. */\n \ttxq->stats.opackets += i;\n+\ttxq->stats.obytes += bytes_sent;\n+\t/* Make sure that descriptors are written before doorbell record. */\n+\trte_wmb();\n \t/* Ring QP doorbell. */\n-\t*wr_next = NULL;\n-\tassert(wr_head);\n-\terr = ibv_post_send(txq->qp, wr_head, &wr_bad);\n-\tif (unlikely(err)) {\n-\t\tuint64_t obytes = 0;\n-\t\tuint64_t opackets = 0;\n-\n-\t\t/* Rewind bad WRs. */\n-\t\twhile (wr_bad != NULL) {\n-\t\t\tint j;\n-\n-\t\t\t/* Force completion request if one was lost. */\n-\t\t\tif (wr_bad->send_flags & IBV_SEND_SIGNALED) {\n-\t\t\t\telts_comp_cd = 1;\n-\t\t\t\t--elts_comp;\n-\t\t\t}\n-\t\t\t++opackets;\n-\t\t\tfor (j = 0; j < wr_bad->num_sge; ++j)\n-\t\t\t\tobytes += wr_bad->sg_list[j].length;\n-\t\t\telts_head = (elts_head ? elts_head : elts_n) - 1;\n-\t\t\twr_bad = wr_bad->next;\n-\t\t}\n-\t\ttxq->stats.opackets -= opackets;\n-\t\ttxq->stats.obytes -= obytes;\n-\t\ti -= opackets;\n-\t\tDEBUG(\"%p: ibv_post_send() failed, %\" PRIu64 \" packets\"\n-\t\t      \" (%\" PRIu64 \" bytes) rejected: %s\",\n-\t\t      (void *)txq,\n-\t\t      opackets,\n-\t\t      obytes,\n-\t\t      (err <= -1) ? \"Internal error\" : strerror(err));\n-\t}\n+\trte_write32(txq->msq.doorbell_qpn, txq->msq.db);\n \ttxq->elts_head = elts_head;\n \ttxq->elts_comp += elts_comp;\n \ttxq->elts_comp_cd = elts_comp_cd;\ndiff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h\nindex fec998a..cc5951c 100644\n--- a/drivers/net/mlx4/mlx4_rxtx.h\n+++ b/drivers/net/mlx4/mlx4_rxtx.h\n@@ -40,6 +40,7 @@\n #ifdef PEDANTIC\n #pragma GCC diagnostic ignored \"-Wpedantic\"\n #endif\n+#include <infiniband/mlx4dv.h>\n #include <infiniband/verbs.h>\n #ifdef PEDANTIC\n #pragma GCC diagnostic error \"-Wpedantic\"\n@@ -50,6 +51,7 @@\n #include <rte_mempool.h>\n \n #include \"mlx4.h\"\n+#include \"mlx4_prm.h\"\n \n /** Rx queue counters. */\n struct mlx4_rxq_stats {\n@@ -85,8 +87,6 @@ struct rxq {\n \n /** Tx element. */\n struct txq_elt {\n-\tstruct ibv_send_wr wr; /* Work request. */\n-\tstruct ibv_sge sge; /* Scatter/gather element. */\n \tstruct rte_mbuf *buf; /**< Buffer. */\n };\n \n@@ -100,24 +100,26 @@ struct mlx4_txq_stats {\n \n /** Tx queue descriptor. 
*/\n struct txq {\n-\tstruct priv *priv; /**< Back pointer to private data. */\n-\tstruct {\n-\t\tconst struct rte_mempool *mp; /**< Cached memory pool. */\n-\t\tstruct ibv_mr *mr; /**< Memory region (for mp). */\n-\t\tuint32_t lkey; /**< mr->lkey copy. */\n-\t} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */\n-\tstruct ibv_cq *cq; /**< Completion queue. */\n-\tstruct ibv_qp *qp; /**< Queue pair. */\n-\tuint32_t max_inline; /**< Max inline send size. */\n-\tunsigned int elts_n; /**< (*elts)[] length. */\n-\tstruct txq_elt (*elts)[]; /**< Tx elements. */\n+\tstruct mlx4_sq msq; /**< Info for directly manipulating the SQ. */\n+\tstruct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */\n \tunsigned int elts_head; /**< Current index in (*elts)[]. */\n \tunsigned int elts_tail; /**< First element awaiting completion. */\n \tunsigned int elts_comp; /**< Number of completion requests. */\n \tunsigned int elts_comp_cd; /**< Countdown for next completion. */\n \tunsigned int elts_comp_cd_init; /**< Initial value for countdown. */\n+\tunsigned int elts_n; /**< (*elts)[] length. */\n+\tstruct txq_elt (*elts)[]; /**< Tx elements. */\n \tstruct mlx4_txq_stats stats; /**< Tx queue counters. */\n+\tuint32_t max_inline; /**< Max inline send size. */\n+\tstruct {\n+\t\tconst struct rte_mempool *mp; /**< Cached memory pool. */\n+\t\tstruct ibv_mr *mr; /**< Memory region (for mp). */\n+\t\tuint32_t lkey; /**< mr->lkey copy. */\n+\t} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */\n+\tstruct priv *priv; /**< Back pointer to private data. */\n \tunsigned int socket; /**< CPU socket ID for allocations. */\n+\tstruct ibv_cq *cq; /**< Completion queue. */\n+\tstruct ibv_qp *qp; /**< Queue pair. */\n };\n \n /* mlx4_rxq.c */\ndiff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c\nindex e0245b0..fb28ef2 100644\n--- a/drivers/net/mlx4/mlx4_txq.c\n+++ b/drivers/net/mlx4/mlx4_txq.c\n@@ -62,6 +62,7 @@\n #include \"mlx4_autoconf.h\"\n #include \"mlx4_rxtx.h\"\n #include \"mlx4_utils.h\"\n+#include \"mlx4_prm.h\"\n \n /**\n  * Allocate Tx queue elements.\n@@ -242,6 +243,41 @@ struct txq_mp2mr_mbuf_check_data {\n }\n \n /**\n+ * Retrieves information needed in order to directly access the Tx queue.\n+ *\n+ * @param txq\n+ *   Pointer to Tx queue structure.\n+ * @param mlxdv\n+ *   Pointer to device information for this Tx queue.\n+ */\n+static void\n+mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)\n+{\n+\tstruct mlx4_sq *sq = &txq->msq;\n+\tstruct mlx4_cq *cq = &txq->mcq;\n+\tstruct mlx4dv_qp *dqp = mlxdv->qp.out;\n+\tstruct mlx4dv_cq *dcq = mlxdv->cq.out;\n+\tuint32_t sq_size = (uint32_t)dqp->rq.offset - (uint32_t)dqp->sq.offset;\n+\n+\tsq->buf = (uint8_t *)dqp->buf.buf + dqp->sq.offset;\n+\t/* Total length, including headroom and spare WQEs. */\n+\tsq->eob = sq->buf + sq_size;\n+\tsq->head = 0;\n+\tsq->tail = 0;\n+\tsq->txbb_cnt =\n+\t\t(dqp->sq.wqe_cnt << dqp->sq.wqe_shift) >> MLX4_TXBB_SHIFT;\n+\tsq->txbb_cnt_mask = sq->txbb_cnt - 1;\n+\tsq->db = dqp->sdb;\n+\tsq->doorbell_qpn = dqp->doorbell_qpn;\n+\tsq->headroom_txbbs =\n+\t\t(2048 + (1 << dqp->sq.wqe_shift)) >> MLX4_TXBB_SHIFT;\n+\tcq->buf = dcq->buf.buf;\n+\tcq->cqe_cnt = dcq->cqe_cnt;\n+\tcq->set_ci_db = dcq->set_ci_db;\n+\tcq->cqe_64 = (dcq->cqe_size & 64) ? 
1 : 0;\n+}\n+\n+/**\n  * Configure a Tx queue.\n  *\n  * @param dev\n@@ -263,6 +299,9 @@ struct txq_mp2mr_mbuf_check_data {\n \t       unsigned int socket, const struct rte_eth_txconf *conf)\n {\n \tstruct priv *priv = dev->data->dev_private;\n+\tstruct mlx4dv_obj mlxdv;\n+\tstruct mlx4dv_qp dv_qp;\n+\tstruct mlx4dv_cq dv_cq;\n \tstruct txq tmpl = {\n \t\t.priv = priv,\n \t\t.socket = socket\n@@ -370,6 +409,18 @@ struct txq_mp2mr_mbuf_check_data {\n \tDEBUG(\"%p: txq updated with %p\", (void *)txq, (void *)&tmpl);\n \t/* Pre-register known mempools. */\n \trte_mempool_walk(mlx4_txq_mp2mr_iter, txq);\n+\t/* Retrieve device queue information. */\n+\tmlxdv.cq.in = txq->cq;\n+\tmlxdv.cq.out = &dv_cq;\n+\tmlxdv.qp.in = txq->qp;\n+\tmlxdv.qp.out = &dv_qp;\n+\tret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ);\n+\tif (ret) {\n+\t\tERROR(\"%p: failed to obtain information needed for\"\n+\t\t      \" accessing the device queues\", (void *)dev);\n+\t\tgoto error;\n+\t}\n+\tmlx4_txq_fill_dv_obj_info(txq, &mlxdv);\n \treturn 0;\n error:\n \tret = rte_errno;\ndiff --git a/mk/rte.app.mk b/mk/rte.app.mk\nindex 29507dc..1435cb6 100644\n--- a/mk/rte.app.mk\n+++ b/mk/rte.app.mk\n@@ -133,7 +133,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)\n _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KNI)        += -lrte_pmd_kni\n endif\n _LDLIBS-$(CONFIG_RTE_LIBRTE_LIO_PMD)        += -lrte_pmd_lio\n-_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD)       += -lrte_pmd_mlx4 -libverbs\n+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD)       += -lrte_pmd_mlx4 -libverbs -lmlx4\n _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5\n _LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD)        += -lrte_pmd_nfp\n _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL)       += -lrte_pmd_null\n",
    "prefixes": [
        "dpdk-dev",
        "v4",
        "1/7"
    ]
}
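
For scripted consumption, the useful keys in the object above are "state", "check" (the aggregated CI result, with per-check detail behind the "checks" URL) and "mbox", which serves the raw message in a form git am can apply. A short follow-up sketch, assuming only the fields visible in this response and an arbitrary output filename:

import requests

url = "https://patches.dpdk.org/api/patches/29659/"
patch = requests.get(url).json()
print(f'{patch["name"]} [{patch["state"]}, check: {patch["check"]}]')

# "mbox" points at the raw mail; saving it lets you run
# "git am 29659.mbox" inside a DPDK checkout.
mbox = requests.get(patch["mbox"])
mbox.raise_for_status()
with open("29659.mbox", "wb") as f:
    f.write(mbox.content)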