[RFC] net/mlx5: add queue start and stop feature

Message ID 1591774509-5473-1-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Raslan Darawsheh
Series [RFC] net/mlx5: add queue start and stop feature

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko June 10, 2020, 7:35 a.m. UTC
  The mlx5 PMD does not support the queue_start and queue_stop eth_dev API
routines, so a queue cannot be suspended and resumed during device operation.

There is a use case where this feature is crucial for applications:

- a secondary process handles the queue
- the secondary process crashes or aborts
- some mbufs were allocated or in use by the secondary application
- some mbufs were allocated by Rx queues to receive packets
- some mbufs were placed into the send queue
- the queue goes to an undefined state

In this case there is no reliable way for the restarted secondary process
to recover queue handling other than resetting the queue to its initial
state: freeing all involved resources, including the buffers involved in
queue operations, resetting the mbuf pools, and then reinitializing the
queue to a working state (see the sketch after the list below):

- reset the mbuf pool, allocating all mbufs to bring the pool into a
  safe state after the crash and allow safe mbuf free calls
- stop the queue, freeing all potentially involved mbufs
- reset the mbuf pool again
- start the queue, reallocating the mbufs needed
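
For illustration, the recovery sequence for an Rx queue could look like the
minimal sketch below, built on the generic ethdev API. app_reset_mbuf_pool()
is a hypothetical application helper (DPDK provides no generic call for that
step), and rx_burst/tx_burst on the queue are assumed to be already
suspended:

  #include <rte_ethdev.h>
  #include <rte_mempool.h>

  /* Hypothetical helper: drain and refill the pool shared with the
   * crashed secondary so that subsequent mbuf free calls are safe. */
  int app_reset_mbuf_pool(struct rte_mempool *mp);

  static int
  recover_rx_queue(uint16_t port_id, uint16_t queue_id,
                   struct rte_mempool *mp)
  {
          int ret;

          /* 1. Bring the pool into a safe state after the crash. */
          ret = app_reset_mbuf_pool(mp);
          if (ret != 0)
                  return ret;
          /* 2. Stop the queue; the PMD frees mbufs still held by the WQ. */
          ret = rte_eth_dev_rx_queue_stop(port_id, queue_id);
          if (ret != 0)
                  return ret;
          /* 3. Reset the pool again to reclaim the mbufs just released. */
          ret = app_reset_mbuf_pool(mp);
          if (ret != 0)
                  return ret;
          /* 4. Restart the queue; the PMD reallocates the mbufs it needs. */
          return rte_eth_dev_rx_queue_start(port_id, queue_id);
  }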

This patch introduces the queue start/stop feature with some
limitations:

- hairpin queues are not supported
- it is the application's responsibility to synchronize start/stop
  with the datapath routines; rx/tx_burst must be suspended during
  the queue_start/queue_stop calls
- it is the application's responsibility to track queue usage and
  provide coordinated queue_start/queue_stop calls from the
  secondary and primary processes
- Rx queues with the vectorized Rx routine and engaged CQE
  compression are not currently supported by this patch

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/common/mlx5/mlx5_common_mp.h |  10 ++
 drivers/net/mlx5/mlx5.c              |  12 ++
 drivers/net/mlx5/mlx5.h              |   2 +
 drivers/net/mlx5/mlx5_mp.c           |  74 ++++++++++-
 drivers/net/mlx5/mlx5_rxq.c          | 233 +++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h         |   8 ++
 drivers/net/mlx5/mlx5_txq.c          | 215 ++++++++++++++++++++++++++++++++
 7 files changed, 553 insertions(+), 1 deletion(-)
  

Patch

diff --git a/drivers/common/mlx5/mlx5_common_mp.h b/drivers/common/mlx5/mlx5_common_mp.h
index 05466fd..740aa36 100644
--- a/drivers/common/mlx5/mlx5_common_mp.h
+++ b/drivers/common/mlx5/mlx5_common_mp.h
@@ -26,6 +26,10 @@  enum mlx5_mp_req_type {
 	MLX5_MP_REQ_START_RXTX,
 	MLX5_MP_REQ_STOP_RXTX,
 	MLX5_MP_REQ_QUEUE_STATE_MODIFY,
+	MLX5_MP_REQ_QUEUE_RX_STOP,
+	MLX5_MP_REQ_QUEUE_RX_START,
+	MLX5_MP_REQ_QUEUE_TX_STOP,
+	MLX5_MP_REQ_QUEUE_TX_START,
 };
 
 struct mlx5_mp_arg_queue_state_modify {
@@ -34,6 +38,10 @@  struct mlx5_mp_arg_queue_state_modify {
 	enum ibv_wq_state state; /* WQ requested state. */
 };
 
+struct mlx5_mp_arg_queue_id {
+	uint16_t queue_id; /* DPDK queue ID. */
+};
+
 /* Pameters for IPC. */
 struct mlx5_mp_param {
 	enum mlx5_mp_req_type type;
@@ -44,6 +52,8 @@  struct mlx5_mp_param {
 		uintptr_t addr; /* MLX5_MP_REQ_CREATE_MR */
 		struct mlx5_mp_arg_queue_state_modify state_modify;
 		/* MLX5_MP_REQ_QUEUE_STATE_MODIFY */
+		struct mlx5_mp_arg_queue_id queue_id;
+		/* MLX5_MP_REQ_QUEUE_RX/TX_START/STOP */
 	} args;
 };
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7c5e23d..cb35dc8 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1250,6 +1250,10 @@  struct mlx5_dev_ctx_shared *
 	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
 	.rx_queue_release = mlx5_rx_queue_release,
 	.tx_queue_release = mlx5_tx_queue_release,
+	.rx_queue_start = mlx5_rx_queue_start,
+	.rx_queue_stop = mlx5_rx_queue_stop,
+	.tx_queue_start = mlx5_tx_queue_start,
+	.tx_queue_stop = mlx5_tx_queue_stop,
 	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
 	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
 	.mac_addr_remove = mlx5_mac_addr_remove,
@@ -1290,6 +1294,10 @@  struct mlx5_dev_ctx_shared *
 	.xstats_get_names = mlx5_xstats_get_names,
 	.fw_version_get = mlx5_fw_version_get,
 	.dev_infos_get = mlx5_dev_infos_get,
+	.rx_queue_start = mlx5_rx_queue_start,
+	.rx_queue_stop = mlx5_rx_queue_stop,
+	.tx_queue_start = mlx5_tx_queue_start,
+	.tx_queue_stop = mlx5_tx_queue_stop,
 	.rx_descriptor_status = mlx5_rx_descriptor_status,
 	.tx_descriptor_status = mlx5_tx_descriptor_status,
 	.rxq_info_get = mlx5_rxq_info_get,
@@ -1328,6 +1336,10 @@  struct mlx5_dev_ctx_shared *
 	.tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
 	.rx_queue_release = mlx5_rx_queue_release,
 	.tx_queue_release = mlx5_tx_queue_release,
+	.rx_queue_start = mlx5_rx_queue_start,
+	.rx_queue_stop = mlx5_rx_queue_stop,
+	.tx_queue_start = mlx5_tx_queue_start,
+	.tx_queue_stop = mlx5_tx_queue_stop,
 	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
 	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
 	.mac_addr_remove = mlx5_mac_addr_remove,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8c4b234..050bd47 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -890,6 +890,8 @@  int mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts,
 int mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer);
 void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev);
 void mlx5_mp_req_stop_rxtx(struct rte_eth_dev *dev);
+int mlx5_mp_req_queue_control(struct rte_eth_dev *dev, uint16_t queue_id,
+			      enum mlx5_mp_req_type req_type);
 
 /* mlx5_socket.c */
 
diff --git a/drivers/net/mlx5/mlx5_mp.c b/drivers/net/mlx5/mlx5_mp.c
index a2b5c40..e7e32e6 100644
--- a/drivers/net/mlx5/mlx5_mp.c
+++ b/drivers/net/mlx5/mlx5_mp.c
@@ -62,6 +62,30 @@ 
 					(dev, &param->args.state_modify);
 		ret = rte_mp_reply(&mp_res, peer);
 		break;
+	case MLX5_MP_REQ_QUEUE_RX_STOP:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_rx_queue_stop_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_RX_START:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_rx_queue_start_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_TX_STOP:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_tx_queue_stop_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_TX_START:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_tx_queue_start_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
 	default:
 		rte_errno = EINVAL;
 		DRV_LOG(ERR, "port %u invalid mp request type",
@@ -85,7 +109,7 @@ 
 int
 mlx5_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 {
-	struct rte_mp_msg mp_res;
+struct rte_mp_msg mp_res;
 	struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
 	const struct mlx5_mp_param *param =
 		(const struct mlx5_mp_param *)mp_msg->param;
@@ -209,3 +233,51 @@ 
 {
 	mp_req_on_rxtx(dev, MLX5_MP_REQ_STOP_RXTX);
 }
+
+/**
+ * Request Verbs Rx/Tx queue stop or start to the primary process.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param queue_id
+ *   Queue ID to control.
+ * @param req_type
+ *   Request type:
+ *     MLX5_MP_REQ_QUEUE_RX_START - start Rx queue
+ *     MLX5_MP_REQ_QUEUE_TX_START - start Tx queue
+ *     MLX5_MP_REQ_QUEUE_RX_STOP - stop Rx queue
+ *     MLX5_MP_REQ_QUEUE_TX_STOP - stop Tx queue
+ * @return
+ *   0 on success, a negative errno value otherwise and
+ *     rte_errno is set.
+ */
+int
+mlx5_mp_req_queue_control(struct rte_eth_dev *dev, uint16_t queue_id,
+			  enum mlx5_mp_req_type req_type)
+{
+	struct rte_mp_msg mp_req;
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mlx5_mp_param *req = (struct mlx5_mp_param *)mp_req.param;
+	struct mlx5_mp_param *res;
+	struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	struct mlx5_priv *priv;
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	priv = dev->data->dev_private;
+	mp_init_msg(&priv->mp_id, &mp_req, req_type);
+	req->args.queue_id.queue_id = queue_id;
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		DRV_LOG(ERR, "port %u request to primary process failed",
+			dev->data->port_id);
+		return -rte_errno;
+	}
+	MLX5_ASSERT(mp_rep.nb_received == 1);
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mlx5_mp_param *)mp_res->param;
+	ret = res->result;
+	free(mp_rep.msgs);
+	return ret;
+}
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 78046fd..f7792bc 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -439,6 +439,229 @@ 
 	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
 }
 
+/* Fetches and drops all SW-owned and error CQEs to synchronize CQ. */
+static void
+rxq_sync_cq(struct mlx5_rxq_data *rxq)
+{
+	const uint16_t cqe_n = 1 << rxq->cqe_n;
+	const uint16_t cqe_mask = cqe_n - 1;
+	volatile struct mlx5_cqe *cqe;
+	int ret, i;
+
+	i = cqe_n;
+	do {
+		cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
+		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
+		if (ret == MLX5_CQE_STATUS_HW_OWN)
+			break;
+		if (ret == MLX5_CQE_STATUS_ERR) {
+			rxq->cq_ci++;
+			continue;
+		}
+		MLX5_ASSERT(ret == MLX5_CQE_STATUS_SW_OWN);
+		if (MLX5_CQE_FORMAT(cqe->op_own) != MLX5_COMPRESSED) {
+			rxq->cq_ci++;
+			continue;
+		}
+		/* Compute the next non-compressed CQE. */
+		rxq->cq_ci += rte_be_to_cpu_32(cqe->byte_cnt);
+
+	} while (--i);
+	/* Move all CQEs to HW ownership, including possible MiniCQEs. */
+	for (i = 0; i < cqe_n; i++) {
+		cqe = &(*rxq->cqes)[i];
+		cqe->op_own = MLX5_CQE_INVALIDATE;
+	}
+	/* Resync CQE and WQE (WQ in RESET state). */
+	rte_cio_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	rte_cio_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(0);
+	rte_cio_wmb();
+}
+
+/**
+ * Rx queue stop. Device queue goes to the RESET state,
+ * all involved mbufs are freed from WQ.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
+		struct ibv_wq_attr mod = {
+			.attr_mask = IBV_WQ_ATTR_STATE,
+			.wq_state = IBV_WQS_RESET,
+		};
+
+		ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod);
+	} else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */
+		struct mlx5_devx_modify_rq_attr rq_attr;
+
+		memset(&rq_attr, 0, sizeof(rq_attr));
+		rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+		rq_attr.state = MLX5_RQC_STATE_RST;
+		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
+	}
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Rx WQ state to RESET:  %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	/* Remove all processed CQEs. */
+	rxq_sync_cq(rxq);
+	/* Free all involved mbufs. */
+	rxq_free_elts(rxq_ctrl);
+	/* Set the actual queue state. */
+	dev->data->rx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
+	return 0;
+}
+
+/**
+ * Rx queue stop. Device queue goes to the RESET state,
+ * all involved mbufs are freed from WQ.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_queue_stop(struct rte_eth_dev *dev, uint16_t idx)
+{
+	int ret;
+
+	if (dev->data->rx_queue_state[idx] == RTE_ETH_QUEUE_STATE_HAIRPIN) {
+		DRV_LOG(ERR, "Hairpin queue can't be stopped");
+		rte_errno = EINVAL;
+		return -EINVAL;
+	}
+	if (dev->data->rx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STOPPED)
+		return 0;
+	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
+		ret = mlx5_mp_req_queue_control(dev, idx,
+						MLX5_MP_REQ_QUEUE_RX_STOP);
+	} else {
+		ret = mlx5_rx_queue_stop_primary(dev, idx);
+	}
+	return ret;
+}
+
+/**
+ * Rx queue start. Device queue goes to the ready state,
+ * all required mbufs are allocated and WQ is replenished.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_queue_start_primary(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() ==  RTE_PROC_PRIMARY);
+	/* Allocate needed buffers. */
+	ret = rxq_alloc_elts(rxq_ctrl);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot reallocate buffers for Rx WQ");
+		rte_errno = errno;
+		return ret;
+	}
+	rte_cio_wmb();
+	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	rte_cio_wmb();
+	/* Reset RQ consumer before moving queue to READY state. */
+	*rxq->rq_db = rte_cpu_to_be_32(0);
+	rte_cio_wmb();
+	if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
+		struct ibv_wq_attr mod = {
+			.attr_mask = IBV_WQ_ATTR_STATE,
+			.wq_state = IBV_WQS_RDY,
+		};
+
+		ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod);
+	} else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */
+		struct mlx5_devx_modify_rq_attr rq_attr;
+
+		memset(&rq_attr, 0, sizeof(rq_attr));
+		rq_attr.rq_state = MLX5_RQC_STATE_RST;
+		rq_attr.state = MLX5_RQC_STATE_RDY;
+		ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, &rq_attr);
+	}
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Rx WQ state to READY:  %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	/* Reinitialize RQ - set WQEs. */
+	mlx5_rxq_initialize(rxq);
+	rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
+	/* Set actual queue state. */
+	dev->data->rx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
+	return 0;
+}
+
+/**
+ * Rx queue start. Device queue goes to the ready state,
+ * all required mbufs are allocated and WQ is replenished.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_queue_start(struct rte_eth_dev *dev, uint16_t idx)
+{
+	int ret;
+
+	if (dev->data->rx_queue_state[idx] == RTE_ETH_QUEUE_STATE_HAIRPIN) {
+		DRV_LOG(ERR, "Hairpin queue can't be started");
+		rte_errno = EINVAL;
+		return -EINVAL;
+	}
+	if (dev->data->rx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STARTED)
+		return 0;
+	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
+		ret = mlx5_mp_req_queue_control(dev, idx,
+						MLX5_MP_REQ_QUEUE_RX_START);
+	} else {
+		ret = mlx5_rx_queue_start_primary(dev, idx);
+	}
+	return ret;
+}
+
 /**
  * Rx queue presetup checks.
  *
@@ -678,6 +901,9 @@ 
 static int
 mlx5_rxq_obj_release(struct mlx5_rxq_obj *rxq_obj)
 {
+	struct rte_eth_dev_data *dev_data;
+	uint16_t idx;
+
 	MLX5_ASSERT(rxq_obj);
 	if (rte_atomic32_dec_and_test(&rxq_obj->refcnt)) {
 		switch (rxq_obj->type) {
@@ -704,6 +930,11 @@ 
 		if (rxq_obj->channel)
 			claim_zero(mlx5_glue->destroy_comp_channel
 				   (rxq_obj->channel));
+		idx = rxq_obj->rxq_ctrl->rxq.idx;
+		dev_data = rxq_obj->rxq_ctrl->priv->dev_data;
+		if (rxq_obj->type != MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN)
+			dev_data->rx_queue_state[idx] =
+				RTE_ETH_QUEUE_STATE_STOPPED;
 		LIST_REMOVE(rxq_obj, next);
 		rte_free(rxq_obj);
 		return 0;
@@ -1319,6 +1550,7 @@ 
 	rte_atomic32_inc(&tmpl->refcnt);
 	LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+	dev->data->rx_queue_state[idx] = RTE_ETH_QUEUE_STATE_HAIRPIN;
 	return tmpl;
 }
 
@@ -1485,6 +1717,7 @@  struct mlx5_rxq_obj *
 	rte_atomic32_inc(&tmpl->refcnt);
 	LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+	dev->data->rx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
 	return tmpl;
 error:
 	if (tmpl) {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 26621ff..e2f7c3c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -382,6 +382,10 @@  struct mlx5_txq_ctrl {
 int mlx5_mprq_enabled(struct rte_eth_dev *dev);
 int mlx5_mprq_free_mp(struct rte_eth_dev *dev);
 int mlx5_mprq_alloc_mp(struct rte_eth_dev *dev);
+int mlx5_rx_queue_start(struct rte_eth_dev *dev, uint16_t queue_id);
+int mlx5_rx_queue_stop(struct rte_eth_dev *dev, uint16_t queue_id);
+int mlx5_rx_queue_start_primary(struct rte_eth_dev *dev, uint16_t queue_id);
+int mlx5_rx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t queue_id);
 int mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			unsigned int socket, const struct rte_eth_rxconf *conf,
 			struct rte_mempool *mp);
@@ -427,6 +431,10 @@  uint32_t mlx5_hrxq_get(struct rte_eth_dev *dev,
 
 /* mlx5_txq.c */
 
+int mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t queue_id);
+int mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t queue_id);
+int mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t queue_id);
+int mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t queue_id);
 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			unsigned int socket, const struct rte_eth_txconf *conf);
 int mlx5_tx_hairpin_queue_setup
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index f7b548f..ff1fb87 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -136,6 +136,218 @@ 
 	return offloads;
 }
 
+/* Fetches and drops all SW-owned and error CQEs to synchronize CQ. */
+static void
+txq_sync_cq(struct mlx5_txq_data *txq)
+{
+	volatile struct mlx5_cqe *cqe;
+	int ret, i;
+
+	i = txq->cqe_s;
+	do {
+		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
+		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
+				/* No new CQEs in completion queue. */
+				MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
+				break;
+			}
+		}
+		++txq->cq_ci;
+	} while (--i);
+	/* Move all CQEs to HW ownership. */
+	for (i = 0; i < txq->cqe_s; i++) {
+		cqe = &txq->cqes[i];
+		cqe->op_own = MLX5_CQE_INVALIDATE;
+	}
+	/* Resync CQE and WQE (WQ in reset state). */
+	rte_cio_wmb();
+	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
+	rte_cio_wmb();
+}
+
+/**
+ * Tx queue stop. Device queue goes to the idle state,
+ * all involved mbufs are freed from elts/WQ.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Tx queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+			container_of(txq, struct mlx5_txq_ctrl, txq);
+	struct ibv_qp_attr mod = {
+			.qp_state = IBV_QPS_RESET,
+			.port_num = (uint8_t)priv->ibv_port,
+	};
+	struct ibv_qp *qp = txq_ctrl->obj->qp;
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	/* Move QP to RESET state. */
+	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
+			"%s", strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	/* Handle all send completions. */
+	txq_sync_cq(txq);
+	/* Free elts stored in the SQ. */
+	txq_free_elts(txq_ctrl);
+	/* Prevent writing new pkts to SQ by setting no free WQE.*/
+	txq->wqe_ci = txq->wqe_s;
+	txq->wqe_pi = 0;
+	txq->elts_comp = 0;
+	/* Set the actual queue state. */
+	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
+	return 0;
+}
+
+/**
+ * Tx queue stop. Device queue goes to the idle state,
+ * all involved mbufs are freed from elts/WQ.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Tx queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t idx)
+{
+	int ret;
+
+	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_HAIRPIN) {
+		DRV_LOG(ERR, "Hairpin queue can't be stopped");
+		rte_errno = EINVAL;
+		return -EINVAL;
+	}
+	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STOPPED)
+		return 0;
+	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
+		ret = mlx5_mp_req_queue_control(dev, idx,
+						MLX5_MP_REQ_QUEUE_TX_STOP);
+	} else {
+		ret = mlx5_tx_queue_stop_primary(dev, idx);
+	}
+	return ret;
+}
+
+/**
+ * Tx queue start. Device queue goes to the ready state,
+ * the Tx QP is moved to the ready (RTS) state and the SQ is reinitialized.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Tx queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+			container_of(txq, struct mlx5_txq_ctrl, txq);
+	struct ibv_qp_attr mod = {
+			.qp_state = IBV_QPS_RESET,
+			.port_num = (uint8_t)priv->ibv_port,
+	};
+	struct ibv_qp *qp = txq_ctrl->obj->qp;
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() ==  RTE_PROC_PRIMARY);
+	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
+			"%s", strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	mod.qp_state = IBV_QPS_INIT;
+	ret = mlx5_glue->modify_qp(qp, &mod,
+				   (IBV_QP_STATE | IBV_QP_PORT));
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	mod.qp_state = IBV_QPS_RTR;
+	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	mod.qp_state = IBV_QPS_RTS;
+	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	txq_ctrl->txq.wqe_ci = 0;
+	txq_ctrl->txq.wqe_pi = 0;
+	txq_ctrl->txq.elts_comp = 0;
+	/* Set the actual queue state. */
+	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
+	return 0;
+}
+
+/**
+ * Tx queue start. Device queue goes to the ready state,
+ * the Tx QP is moved to the ready (RTS) state and the SQ is reinitialized.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   Tx queue index.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t idx)
+{
+	int ret;
+
+	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_HAIRPIN) {
+		DRV_LOG(ERR, "Hairpin queue can't be started");
+		rte_errno = EINVAL;
+		return -EINVAL;
+	}
+	if (dev->data->tx_queue_state[idx] == RTE_ETH_QUEUE_STATE_STARTED)
+		return 0;
+	if (rte_eal_process_type() ==  RTE_PROC_SECONDARY) {
+		ret = mlx5_mp_req_queue_control(dev, idx,
+						MLX5_MP_REQ_QUEUE_TX_START);
+	} else {
+		ret = mlx5_tx_queue_start_primary(dev, idx);
+	}
+	return ret;
+}
+
 /**
  * Tx queue presetup checks.
  *
@@ -225,6 +437,7 @@ 
 	DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
 		dev->data->port_id, idx);
 	(*priv->txqs)[idx] = &txq_ctrl->txq;
+	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
 	return 0;
 }
 
@@ -275,6 +488,7 @@ 
 	DRV_LOG(DEBUG, "port %u adding Tx queue %u to list",
 		dev->data->port_id, idx);
 	(*priv->txqs)[idx] = &txq_ctrl->txq;
+	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_HAIRPIN;
 	return 0;
 }
 
@@ -1434,6 +1648,7 @@  struct mlx5_txq_ctrl *
 		LIST_REMOVE(txq, next);
 		rte_free(txq);
 		(*priv->txqs)[idx] = NULL;
+		dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STOPPED;
 		return 0;
 	}
 	return 1;