diff mbox series

[RFC,1/1] net/mlx5: set txq affinity in round-robin

Message ID 20210910042347.12820-2-rongweil@nvidia.com (mailing list archive)
State RFC
Delegated to: Raslan Darawsheh
Headers show
Series net/mlx5: set txq affinity in round-robin | expand

Checks

Context Check Description
ci/intel-Testing fail Testing issues
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Rongwei Liu Sept. 10, 2021, 4:23 a.m. UTC
Previously, we set txq affinity to 0 and let firmware
perform round-robin when bonding. Firmware uses a
global counter to assign txq affinity to different
physical ports according to the remainder after division.

There are three disadvantages:
1. The global counter is shared between kernel and dpdk.
2. After restarting pmd or port, the previous counter value
is reused, so the new affinity is unpredictable.
3. There is no way to query which affinity was set by firmware.

In this update, we create one TIS per bonding port
and bind each TIS to one PF port.

Each port starts picking TISs at the one matching its port
index. An upper-layer application can quickly calculate each txq's
affinity without querying.

At DPDK layer, when creating txq with 2 bonding ports, the
affinity is set like:
port 0: 1-->2-->1-->2
port 1: 2-->1-->2-->1
port 2: 1-->2-->1-->2

Note: Only applicable to the DevX API.

Signed-off-by: Jiawei Wang <jiaweiw@nvidia.com>
Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 38 ++++++++++++++++++++++
 drivers/common/mlx5/mlx5_devx_cmds.h | 12 +++++++
 drivers/common/mlx5/mlx5_prm.h       | 26 +++++++++++++++
 drivers/common/mlx5/version.map      |  1 +
 drivers/net/mlx5/linux/mlx5_os.c     | 37 ++++++++++++++++++++++
 drivers/net/mlx5/mlx5.c              | 17 +++-------
 drivers/net/mlx5/mlx5.h              |  8 ++++-
 drivers/net/mlx5/mlx5_devx.c         | 47 ++++++++++++++++++++++++++--
 drivers/net/mlx5/mlx5_tx.h           |  1 +
 drivers/net/mlx5/mlx5_txpp.c         |  4 +--
 10 files changed, 174 insertions(+), 17 deletions(-)

Comments

Kinsella, Ray Sept. 13, 2021, 12:11 p.m. UTC | #1
On 10/09/2021 05:23, Rongwei Liu wrote:
> Previously, we set txq affinity to 0 and let firmware
> to perform round-robin when bonding. Firmware uses a
> global counter to assign txq affinity to different
> physical ports accord to remainder after division.
> 
> There are three dis-advantages:
> 1. The global counter is shared between kernel and dpdk.
> 2. After restarting pmd or port, the previous counter value
> is reused, so the new affinity is unpredictable.
> 3. There is no way to get what affinity is set by firmware.
> 
> In this update, we will create several TISs up to the
> number of bonding ports and bind each TIS to one PF port.
> 
> For each port, it will start to pick up TIS using its port
> index. Upper layer application can quickly calculate each txq's
> affinity without querying.
> 
> At DPDK layer, when creating txq with 2 bonding ports, the
> affinity is set like:
> port 0: 1-->2-->1-->2
> port 1: 2-->1-->2-->1
> port 2: 1-->2-->1-->2
> 
> Note: Only applicable to Devx api.
> 
> Signed-off-by: Jiawei Wang <jiaweiw@nvidia.com>
> Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
> ---
>  drivers/common/mlx5/mlx5_devx_cmds.c | 38 ++++++++++++++++++++++
>  drivers/common/mlx5/mlx5_devx_cmds.h | 12 +++++++
>  drivers/common/mlx5/mlx5_prm.h       | 26 +++++++++++++++
>  drivers/common/mlx5/version.map      |  1 +
>  drivers/net/mlx5/linux/mlx5_os.c     | 37 ++++++++++++++++++++++
>  drivers/net/mlx5/mlx5.c              | 17 +++-------
>  drivers/net/mlx5/mlx5.h              |  8 ++++-
>  drivers/net/mlx5/mlx5_devx.c         | 47 ++++++++++++++++++++++++++--
>  drivers/net/mlx5/mlx5_tx.h           |  1 +
>  drivers/net/mlx5/mlx5_txpp.c         |  4 +--
>  10 files changed, 174 insertions(+), 17 deletions(-)
> 
Acked-by: Ray Kinsella <mdr@ashroe.eu>
diff mbox series

Patch

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 56407cc332..dd04b9820c 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -2765,3 +2765,41 @@  mlx5_devx_cmd_create_crypto_login_obj(void *ctx,
 	crypto_login_obj->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
 	return crypto_login_obj;
 }
+
+/**
+ * Query the device LAG context with the QUERY_LAG DevX command.
+ *
+ * @param[in] ctx
+ *   Pointer to ibv_context, returned from mlx5dv_open_device.
+ * @param[out] lag_ctx
+ *   Pointer to struct mlx5_devx_lag_context, written only on success.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int
+mlx5_devx_cmd_query_lag(struct ibv_context *ctx,
+			struct mlx5_devx_lag_context *lag_ctx)
+{
+	uint32_t in[MLX5_ST_SZ_DW(query_lag_in)] = {0};
+	uint32_t out[MLX5_ST_SZ_DW(query_lag_out)] = {0};
+	void *lctx;
+	int rc;
+
+	MLX5_SET(query_lag_in, in, opcode, MLX5_CMD_OP_QUERY_LAG);
+	rc = mlx5_glue->devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out));
+	if (rc)
+		goto error;
+	lctx = MLX5_ADDR_OF(query_lag_out, out, context);
+	lag_ctx->fdb_selection_mode = MLX5_GET(lag_context, lctx,
+					       fdb_selection_mode);
+	lag_ctx->lag_state = MLX5_GET(lag_context, lctx, lag_state);
+	lag_ctx->tx_remap_affinity_2 = MLX5_GET(lag_context, lctx,
+						tx_remap_affinity_2);
+	lag_ctx->tx_remap_affinity_1 = MLX5_GET(lag_context, lctx,
+						tx_remap_affinity_1);
+	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc; /* Normalize a positive code to negative. */
+	return rc;
+}
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index e576e30f24..641f1b25e3 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -184,6 +184,14 @@  struct mlx5_hca_attr {
 	uint32_t umr_indirect_mkey_disabled:1;
 };
 
+/* LAG context attributes, filled by mlx5_devx_cmd_query_lag(). */
+struct mlx5_devx_lag_context {
+	uint32_t fdb_selection_mode:1;
+	uint32_t lag_state:3;
+	uint32_t tx_remap_affinity_1:4;
+	uint32_t tx_remap_affinity_2:4;
+};
+
 struct mlx5_devx_wq_attr {
 	uint32_t wq_type:4;
 	uint32_t wq_signature:1;
@@ -666,4 +674,8 @@  struct mlx5_devx_obj *
 mlx5_devx_cmd_create_crypto_login_obj(void *ctx,
 				      struct mlx5_devx_crypto_login_attr *attr);
 
+__rte_internal
+int
+mlx5_devx_cmd_query_lag(struct ibv_context *ctx,
+			struct mlx5_devx_lag_context *lag_ctx);
 #endif /* RTE_PMD_MLX5_DEVX_CMDS_H_ */
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d361bcf90e..9cccdd7098 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -1048,6 +1048,7 @@  enum {
 	MLX5_CMD_OP_DEALLOC_PD = 0x801,
 	MLX5_CMD_OP_ACCESS_REGISTER = 0x805,
 	MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816,
+	MLX5_CMD_OP_QUERY_LAG = 0x842,
 	MLX5_CMD_OP_CREATE_TIR = 0x900,
 	MLX5_CMD_OP_MODIFY_TIR = 0x901,
 	MLX5_CMD_OP_CREATE_SQ = 0X904,
@@ -2000,6 +2001,31 @@  struct mlx5_ifc_query_tis_in_bits {
 	u8 reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_lag_context_bits { /* 0x40 bits, sizes are bit widths. */
+	u8 fdb_selection_mode[0x1];
+	u8 reserved_at_1[0x1c];
+	u8 lag_state[0x3];
+	u8 reserved_at_20[0x14];
+	u8 tx_remap_affinity_2[0x4];
+	u8 reserved_at_38[0x4];
+	u8 tx_remap_affinity_1[0x4];
+};
+
+struct mlx5_ifc_query_lag_in_bits {
+	u8 opcode[0x10]; /* Set to MLX5_CMD_OP_QUERY_LAG by the caller. */
+	u8 uid[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_query_lag_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	struct mlx5_ifc_lag_context_bits context; /* Returned LAG context. */
+};
+
 struct mlx5_ifc_alloc_transport_domain_out_bits {
 	u8 status[0x8];
 	u8 reserved_at_8[0x18];
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index e5cb6b7060..e169fbaa40 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -53,6 +53,7 @@  INTERNAL {
 	mlx5_devx_cmd_modify_virtq;
 	mlx5_devx_cmd_qp_query_tis_td;
 	mlx5_devx_cmd_query_hca_attr;
+	mlx5_devx_cmd_query_lag;
 	mlx5_devx_cmd_query_parse_samples;
 	mlx5_devx_cmd_query_virtio_q_counters; # WINDOWS_NO_EXPORT
 	mlx5_devx_cmd_query_virtq;
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 5f8766aa48..f4159021b7 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -977,6 +977,8 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	uint16_t port_id;
 	struct mlx5_port_info vport_info = { .query_flags = 0 };
 	int i;
+	struct mlx5_devx_tis_attr tis_attr = { 0 };
+	struct mlx5_devx_lag_context lag_ctx = {0};
 
 	/* Determine if this port representor is supposed to be spawned. */
 	if (switch_info->representor && dpdk_dev->devargs &&
@@ -1679,6 +1681,41 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		eth_dev->data->representor_id = priv->representor_id;
 	}
 	priv->mp_id.port_id = eth_dev->data->port_id;
+	tis_attr.transport_domain = sh->td->id;
+	if (sh->bond.n_port) {
+		for (i = 0; i < sh->bond.n_port; i++) {
+			/*
+			 * 0 is auto affinity, non-zero value
+			 * to propose port.
+			 */
+			tis_attr.lag_tx_port_affinity = (eth_dev->data->port_id
+					+ i) % sh->bond.n_port + 1;
+			sh->tis[i] = mlx5_devx_cmd_create_tis(sh->ctx,
+					&tis_attr);
+			if (!sh->tis[i]) {
+				DRV_LOG(ERR, "TIS allocation failure %d", i);
+				err = ENOMEM;
+				goto error;
+			}
+		}
+		if (!mlx5_devx_cmd_query_lag(sh->ctx, &lag_ctx)) {
+			sh->lag.tx_remap_affinity[0] =
+				lag_ctx.tx_remap_affinity_1;
+			sh->lag.tx_remap_affinity[1] =
+				lag_ctx.tx_remap_affinity_2;
+		}
+		DRV_LOG(DEBUG, "LAG number of ports : %d, affinity_1 & 2 : %d & %d\n",
+			sh->bond.n_port, lag_ctx.tx_remap_affinity_1,
+			lag_ctx.tx_remap_affinity_2);
+	} else {
+		tis_attr.lag_tx_port_affinity = 0;
+		sh->tis[0] = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
+		if (!sh->tis[0]) {
+			DRV_LOG(ERR, "TIS allocation failure");
+			err = ENOMEM;
+			goto error;
+		}
+	}
 	strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
 	/*
 	 * Store associated network device interface index. This index
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f84e061fe7..b2520b7f77 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1112,7 +1112,6 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 	struct mlx5_dev_ctx_shared *sh;
 	int err = 0;
 	uint32_t i;
-	struct mlx5_devx_tis_attr tis_attr = { 0 };
 
 	MLX5_ASSERT(spawn);
 	/* Secondary process should not create the shared context. */
@@ -1183,13 +1182,6 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 			err = ENOMEM;
 			goto error;
 		}
-		tis_attr.transport_domain = sh->td->id;
-		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
-		if (!sh->tis) {
-			DRV_LOG(ERR, "TIS allocation failure");
-			err = ENOMEM;
-			goto error;
-		}
 		err = mlx5_alloc_rxtx_uars(sh, config);
 		if (err)
 			goto error;
@@ -1254,8 +1246,6 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 	MLX5_ASSERT(sh);
 	if (sh->cnt_id_tbl)
 		mlx5_l3t_destroy(sh->cnt_id_tbl);
-	if (sh->tis)
-		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
 	if (sh->td)
 		claim_zero(mlx5_devx_cmd_destroy(sh->td));
 	if (sh->devx_rx_uar)
@@ -1282,6 +1272,7 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 void
 mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
 {
+	int i = 0;
 	pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 	/* Check the object presence in the list. */
@@ -1337,8 +1328,10 @@  mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
 	}
 	if (sh->pd)
 		claim_zero(mlx5_os_dealloc_pd(sh->pd));
-	if (sh->tis)
-		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+	do {
+		if (sh->tis[i])
+			claim_zero(mlx5_devx_cmd_destroy(sh->tis[i]));
+	} while (++i < sh->bond.n_port);
 	if (sh->td)
 		claim_zero(mlx5_devx_cmd_destroy(sh->td));
 	if (sh->devx_rx_uar)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e02714e231..f7935fdf82 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1118,6 +1118,11 @@  struct mlx5_aso_ct_pools_mng {
 	struct mlx5_aso_sq aso_sq; /* ASO queue objects. */
 };
 
+/* LAG attributes kept in the shared device context. */
+struct mlx5_lag {
+	uint8_t tx_remap_affinity[16]; /* PF port of affinity N at [N - 1]. */
+};
+
 /*
  * Shared Infiniband device context for Master/Representors
  * which belong to same IB device with multiple IB ports.
@@ -1187,8 +1192,9 @@  struct mlx5_dev_ctx_shared {
 	struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
 	struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */
 	void *devx_comp; /* DEVX async comp obj. */
-	struct mlx5_devx_obj *tis; /* TIS object. */
+	struct mlx5_devx_obj *tis[16]; /* TIS object. */
 	struct mlx5_devx_obj *td; /* Transport domain. */
+	struct mlx5_lag lag; /* LAG attributes */
 	void *tx_uar; /* Tx/packet pacing shared UAR. */
 	struct mlx5_flex_parser_profiles fp[MLX5_FLEX_PARSER_MAX];
 	/* Flex parser profiles information. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index a1db53577a..9a5bc127ec 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -888,6 +888,41 @@  mlx5_devx_drop_action_destroy(struct rte_eth_dev *dev)
 	rte_errno = ENOTSUP;
 }
 
+/**
+ * Set TXQ affinity via TIS round-robin.
+ *
+ * @param priv
+ *   Pointer to device private data.
+ * @param idx
+ *   Tx queue index.
+ * @param attr
+ *   Pointer to DevX SQ attributes, tis_num is set by the routine.
+ */
+static void
+__mlx5_set_txq_affinity(struct mlx5_priv *priv, uint16_t idx,
+		struct mlx5_devx_create_sq_attr *attr)
+{
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+	int i, min_tis;
+
+	/*
+	 * The starting TIS is round-robin across ports, like
+	 * port 0: TIS 0, port 1: TIS 1 and so on. Find the TIS with the
+	 * lowest id, assuming TIS ids increase in creation order.
+	 */
+	for (min_tis = 0, i = 1; i < sh->bond.n_port; i++)
+		if (sh->tis[i]->id < sh->tis[min_tis]->id)
+			min_tis = i;
+	if (sh->bond.n_port) {
+		attr->tis_num = sh->tis[idx % sh->bond.n_port]->id;
+		txq_data->lag_port_affinity = (idx + sh->bond.n_port -
+			min_tis) % sh->bond.n_port + 1;
+	} else {
+		attr->tis_num = sh->tis[0]->id; /* Affinity is not written. */
+	}
+}
+
 /**
  * Create the Tx hairpin queue object.
  *
@@ -935,7 +970,11 @@  mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
 	attr.wq_attr.log_hairpin_num_packets =
 			attr.wq_attr.log_hairpin_data_sz -
 			MLX5_HAIRPIN_QUEUE_STRIDE;
-	attr.tis_num = priv->sh->tis->id;
+	__mlx5_set_txq_affinity(priv, idx, &attr);
+	DRV_LOG(INFO, "queue %d tis number %d with affinity %d maps to PF port %d",
+			idx, attr.tis_num, txq_data->lag_port_affinity,
+			priv->sh->lag.tx_remap_affinity
+			[txq_data->lag_port_affinity - 1]);
 	tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr);
 	if (!tmpl->sq) {
 		DRV_LOG(ERR,
@@ -992,7 +1031,6 @@  mlx5_txq_create_devx_sq_resources(struct rte_eth_dev *dev, uint16_t idx,
 		.allow_swp = !!priv->config.swp,
 		.cqn = txq_obj->cq_obj.cq->id,
 		.tis_lst_sz = 1,
-		.tis_num = priv->sh->tis->id,
 		.wq_attr = (struct mlx5_devx_wq_attr){
 			.pd = priv->sh->pdn,
 			.uar_page =
@@ -1000,6 +1038,11 @@  mlx5_txq_create_devx_sq_resources(struct rte_eth_dev *dev, uint16_t idx,
 		},
 		.ts_format = mlx5_ts_format_conv(priv->sh->sq_ts_format),
 	};
+	__mlx5_set_txq_affinity(priv, idx, &sq_attr);
+	DRV_LOG(INFO, "queue %d tis number %d with affinity %d maps to PF port %d",
+			idx, sq_attr.tis_num, txq_data->lag_port_affinity,
+			priv->sh->lag.tx_remap_affinity
+			[txq_data->lag_port_affinity - 1]);
 	/* Create Send Queue object with DevX. */
 	return mlx5_devx_sq_create(priv->sh->ctx, &txq_obj->sq_obj, log_desc_n,
 				   &sq_attr, priv->sh->numa_node);
diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h
index 1a35919371..939fd60fff 100644
--- a/drivers/net/mlx5/mlx5_tx.h
+++ b/drivers/net/mlx5/mlx5_tx.h
@@ -161,6 +161,7 @@  struct mlx5_txq_data {
 	int32_t ts_offset; /* Timestamp field dynamic offset. */
 	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
+	uint8_t lag_port_affinity; /* TXQ affinity */
 #ifndef RTE_ARCH_64
 	rte_spinlock_t *uar_lock;
 	/* UAR access lock required for 32bit implementations */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 4f6da9f2d1..5b4464ce90 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -230,7 +230,7 @@  mlx5_txpp_create_rearm_queue(struct mlx5_dev_ctx_shared *sh)
 		.cd_master = 1,
 		.state = MLX5_SQC_STATE_RST,
 		.tis_lst_sz = 1,
-		.tis_num = sh->tis->id,
+		.tis_num = sh->tis[0]->id,
 		.wq_attr = (struct mlx5_devx_wq_attr){
 			.pd = sh->pdn,
 			.uar_page = mlx5_os_get_devx_uar_page_id(sh->tx_uar),
@@ -433,7 +433,7 @@  mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
 	/* Create send queue object for Clock Queue. */
 	if (sh->txpp.test) {
 		sq_attr.tis_lst_sz = 1;
-		sq_attr.tis_num = sh->tis->id;
+		sq_attr.tis_num = sh->tis[0]->id;
 		sq_attr.non_wire = 0;
 		sq_attr.static_sq_wq = 1;
 	} else {