[3/9] net/mlx5: add Direct Rules configuration support

Message ID 1555276357-4892-4-git-send-email-orika@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series net/mlx5: add Direct Verbs E-Switch support |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Ori Kam April 14, 2019, 9:12 p.m. UTC
  This commit provides the basic configuration needed in order to
support Direct Rules eswitch.

Signed-off-by: Ori Kam <orika@mellanox.com>
---
 drivers/net/mlx5/Makefile         |   5 +
 drivers/net/mlx5/meson.build      |   2 +
 drivers/net/mlx5/mlx5.c           |  52 +++++-
 drivers/net/mlx5/mlx5.h           |  12 ++
 drivers/net/mlx5/mlx5_devx_cmds.c |  42 +++++
 drivers/net/mlx5/mlx5_flow.c      |   2 +-
 drivers/net/mlx5/mlx5_prm.h       | 328 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 437 insertions(+), 6 deletions(-)
  

Comments

Yongseok Koh April 17, 2019, 1:42 a.m. UTC | #1
On Sun, Apr 14, 2019 at 09:12:31PM +0000, Ori Kam wrote:
> This commit provides the basic configuration needed in order to
> support Direct Rules eswitch.

What do you mean by "Direct Rules eswitch"? What is the official name of it?
E-Switch is in the HCA and DR is used by the library? Then, shouldn't it be "E-Switch
with Direct Rules"? Please correct it appropriately.

And I can see many occurrences of 'eswitch' in the commit log and in comments in
the code. Please correct all of them as well.

> Signed-off-by: Ori Kam <orika@mellanox.com>
> ---

The title is "net/mlx5: add Direct Rules configuration support"
Shouldn't it have the word, "E-Switch"?

And it seems to have more than "configuration"?

>  drivers/net/mlx5/Makefile         |   5 +
>  drivers/net/mlx5/meson.build      |   2 +
>  drivers/net/mlx5/mlx5.c           |  52 +++++-
>  drivers/net/mlx5/mlx5.h           |  12 ++
>  drivers/net/mlx5/mlx5_devx_cmds.c |  42 +++++
>  drivers/net/mlx5/mlx5_flow.c      |   2 +-
>  drivers/net/mlx5/mlx5_prm.h       | 328 ++++++++++++++++++++++++++++++++++++++
>  7 files changed, 437 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
> index 93bc869..2b72a33 100644
> --- a/drivers/net/mlx5/Makefile
> +++ b/drivers/net/mlx5/Makefile
> @@ -161,6 +161,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
>  		enum MLX5DV_DR_NS_TYPE_TERMINATING \
>  		$(AUTOCONF_OUTPUT)
>  	$Q sh -- '$<' '$@' \
> +		HAVE_MLX5DV_DR_ESWITCH \
> +		infiniband/mlx5dv.h \
> +		enum MLX5DV_DR_NS_DOMAIN_FDB_BYPASS \
> +		$(AUTOCONF_OUTPUT)
> +	$Q sh -- '$<' '$@' \

The macro name should start with HAVE_IBV_FLOW_.
How about HAVE_IBV_FLOW_DV_ESW_DIRECT_RULES?

>  		HAVE_IBV_DEVX_OBJ \
>  		infiniband/mlx5dv.h \
>  		func mlx5dv_devx_obj_create \
> diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
> index 0037e15..9dfd28d 100644
> --- a/drivers/net/mlx5/meson.build
> +++ b/drivers/net/mlx5/meson.build
> @@ -113,6 +113,8 @@ if build
>  		'MLX5DV_FLOW_ACTION_COUNTERS_DEVX' ],
>  		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
>  		'MLX5DV_DR_NS_TYPE_TERMINATING' ],
> +		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h',
> +		'MLX5DV_DR_NS_DOMAIN_FDB_BYPASS' ],

Same here.

>  		[ 'HAVE_SUPPORTED_40000baseKR4_Full', 'linux/ethtool.h',
>  		'SUPPORTED_40000baseKR4_Full' ],
>  		[ 'HAVE_SUPPORTED_40000baseCR4_Full', 'linux/ethtool.h',
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 9ff50df..938ba1c 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -101,6 +101,9 @@
>  /* Allow L3 VXLAN flow creation. */
>  #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
>  
> +/* Activate DV eswitch flow steering. */
> +#define MLX5_DV_ESWITCH_EN "dv_eswitch_en"
> +

Can we set a rule to use 'esw'/'ESW' in the code, just like dv/tcf/verbs and so on?
Or, what's the difference between E-Switch and FDB?

>  /* Activate DV flow steering. */
>  #define MLX5_DV_FLOW_EN "dv_flow_en"
>  
> @@ -344,6 +347,18 @@ struct mlx5_dev_spawn_data {
>  	}
>  	pthread_mutex_init(&sh->dv_mutex, NULL);
>  	sh->tx_ns = ns;
> +#ifdef HAVE_MLX5DV_DR_ESWITCH
> +	if (priv->config.dv_eswitch_en) {
> +		ns  = mlx5_glue->dr_create_ns(sh->ctx,
> +					      MLX5DV_DR_NS_DOMAIN_FDB_BYPASS);
> +		if (!ns) {
> +			DRV_LOG(ERR, "FDB mlx5dv_dr_create_ns failed");
> +			err = errno;
> +			goto error;
> +		}
> +		sh->fdb_ns = ns;
> +	}
> +#endif
>  	sh->dv_refcnt++;
>  	priv->dr_shared = 1;
>  	return 0;
> @@ -358,6 +373,10 @@ struct mlx5_dev_spawn_data {
>  		mlx5dv_dr_destroy_ns(sh->tx_ns);
>  		sh->tx_ns = NULL;
>  	}
> +	if (sh->fdb_ns) {
> +		mlx5_glue->dr_destroy_ns(sh->fdb_ns);
> +		sh->fdb_ns = NULL;
> +	}
>  	return err;
>  #else
>  	(void)priv;
> @@ -393,6 +412,12 @@ struct mlx5_dev_spawn_data {
>  		mlx5dv_dr_destroy_ns(sh->tx_ns);
>  		sh->tx_ns = NULL;
>  	}
> +#ifdef HAVE_MLX5DV_DR_ESWITCH
> +	if (sh->fdb_ns) {
> +		mlx5_glue->dr_destroy_ns(sh->fdb_ns);
> +		sh->fdb_ns = NULL;
> +	}
> +#endif
>  	pthread_mutex_destroy(&sh->dv_mutex);
>  #else
>  	(void)priv;
> @@ -861,6 +886,8 @@ struct mlx5_dev_spawn_data {
>  		config->l3_vxlan_en = !!tmp;
>  	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
>  		config->vf_nl_en = !!tmp;
> +	} else if (strcmp(MLX5_DV_ESWITCH_EN, key) == 0) {
> +		config->dv_eswitch_en = !!tmp;

Do we really need to make it configurable? What is the purpose of doing that? If
esw dr isn't supported, it can fall back to tcf but, if supported, why not use
it? We still have dv_flow_en. If dv_flow_en is disabled, we should disable dv
esw too. But we need not configure the two individually. Thoughts?

>  	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
>  		config->dv_flow_en = !!tmp;
>  	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
> @@ -905,6 +932,7 @@ struct mlx5_dev_spawn_data {
>  		MLX5_RX_VEC_EN,
>  		MLX5_L3_VXLAN_EN,
>  		MLX5_VF_NL_EN,
> +		MLX5_DV_ESWITCH_EN,
>  		MLX5_DV_FLOW_EN,
>  		MLX5_MR_EXT_MEMSEG_EN,
>  		MLX5_REPRESENTOR,
> @@ -1458,11 +1486,6 @@ struct mlx5_dev_spawn_data {
>  			priv->tcf_context = NULL;
>  		}
>  	}
> -	if (config.dv_flow_en) {
> -		err = mlx5_alloc_shared_dr(priv);
> -		if (err)
> -			goto error;
> -	}
>  	TAILQ_INIT(&priv->flows);
>  	TAILQ_INIT(&priv->ctrl_flows);
>  	/* Hint libmlx5 to use PMD allocator for data plane resources */
> @@ -1484,8 +1507,26 @@ struct mlx5_dev_spawn_data {
>  	 * Verbs context returned by ibv_open_device().
>  	 */
>  	mlx5_link_update(eth_dev, 0);
> +#ifdef HAVE_IBV_DEVX_OBJ
> +	err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
> +	if (err) {
> +		err = -err;
> +		goto error;
> +	}
> +#endif
> +#ifdef HAVE_MLX5DV_DR_ESWITCH
> +	if (!config.hca_attr.eswitch_manager)
> +		config.dv_eswitch_en = 0;
> +#else
> +	config.dv_eswitch_en = 0;
> +#endif
>  	/* Store device configuration on private structure. */
>  	priv->config = config;
> +	if (config.dv_flow_en) {
> +		err = mlx5_alloc_shared_dr(priv);
> +		if (err)
> +			goto error;
> +	}
>  	/* Supported Verbs flow priority number detection. */
>  	err = mlx5_flow_discover_priorities(eth_dev);
>  	if (err < 0) {
> @@ -1876,6 +1917,7 @@ struct mlx5_dev_spawn_data {
>  			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
>  			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
>  		},
> +		.dv_eswitch_en = 1,
>  	};
>  	/* Device specific configuration. */
>  	switch (pci_dev->id.device_id) {
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 14c7f3c..33a4127 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -138,6 +138,11 @@ struct mlx5_devx_counter_set {
>  	int id; /* Flow counter ID */
>  };
>  
> +/* HCA attributes. */
> +struct mlx5_hca_attr {
> +	uint32_t eswitch_manager:1;
> +};
> +
>  /* Flow list . */
>  TAILQ_HEAD(mlx5_flows, rte_flow);
>  
> @@ -171,6 +176,7 @@ struct mlx5_dev_config {
>  	/* Whether memseg should be extended for MR creation. */
>  	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
>  	unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */
> +	unsigned int dv_eswitch_en:1; /* Enable eswitch DV flow. */
>  	unsigned int dv_flow_en:1; /* Enable DV flow. */
>  	unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */
>  	unsigned int devx:1; /* Whether devx interface is available or not. */
> @@ -192,6 +198,7 @@ struct mlx5_dev_config {
>  	int txqs_inline; /* Queue number threshold for inlining. */
>  	int txqs_vec; /* Queue number threshold for vectorized Tx. */
>  	int inline_max_packet_sz; /* Max packet size for inlining. */
> +	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
>  };
>  
>  /**
> @@ -241,6 +248,7 @@ struct mlx5_flow_tbl_resource {
>  };
>  
>  #define MLX5_MAX_TABLES 1024
> +#define MLX5_MAX_TABLES_FDB 32
>  #define MLX5_GROUP_FACTOR 1
>  
>  /*
> @@ -260,6 +268,8 @@ struct mlx5_ibv_shared {
>  	/* Shared DV/DR flow data section. */
>  	pthread_mutex_t dv_mutex; /* DV context mutex. */
>  	uint32_t dv_refcnt; /* DV/DR data reference counter. */
> +	void *fdb_ns; /* FDB Direct Rules name space handle. */
> +	struct mlx5_flow_tbl_resource fdb_tbl[MLX5_MAX_TABLES_FDB];
>  	void *rx_ns; /* RX Direct Rules name space handle. */
>  	struct mlx5_flow_tbl_resource rx_tbl[MLX5_MAX_TABLES];
>  	/* RX Direct Rules tables. */
> @@ -539,4 +549,6 @@ int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
>  int mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_counter_set *dcx,
>  				     int clear,
>  				     uint64_t *pkts, uint64_t *bytes);
> +int mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
> +				 struct mlx5_hca_attr *attr);
>  #endif /* RTE_PMD_MLX5_H_ */
> diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
> index a9dff58..3caea41 100644
> --- a/drivers/net/mlx5/mlx5_devx_cmds.c
> +++ b/drivers/net/mlx5/mlx5_devx_cmds.c
> @@ -105,3 +105,45 @@ int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
>  	*bytes = MLX5_GET64(traffic_counter, stats, octets);
>  	return 0;
>  }
> +
> +/**
> + * Query HCA attributes.

This needs to be more informative. What is queried here? Please specify in detail.

> + *
> + * @param[in] ctx
> + *   ibv contexts returned from mlx5dv_open_device.
> + * @param[out] attr
> + *   Attributes device values.
> + *
> + * @return
> + *   0 on success, a negative value otherwise.
> + */
> +int
> +mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
> +			     struct mlx5_hca_attr *attr)
> +{
> +	uint32_t in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {0};
> +	uint32_t out[MLX5_ST_SZ_DW(query_hca_cap_out)] = {0};
> +	void *hcattr;
> +	int status, syndrome, rc;
> +
> +	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
> +	MLX5_SET(query_hca_cap_in, in, op_mod,
> +		 MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE |
> +		 MLX5_HCA_CAP_OPMOD_GET_CUR);
> +
> +	rc = mlx5_glue->devx_general_cmd(ctx,
> +					 in, sizeof(in), out, sizeof(out));
> +	if (rc)
> +		return rc;
> +	status = MLX5_GET(query_hca_cap_out, out, status);
> +	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
> +	if (status) {
> +		DRV_LOG(DEBUG, "Failed to query devx HCA capabilities, "
> +			"status %x, syndrome = %x",
> +			status, syndrome);
> +		return -1;
> +	}
> +	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
> +	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr, eswitch_manager);
> +	return 0;
> +}
> diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> index a0683ee..83abc14 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -1784,7 +1784,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
>  	struct mlx5_priv *priv = dev->data->dev_private;
>  	enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX;
>  
> -	if (attr->transfer)
> +	if (attr->transfer && !priv->config.dv_eswitch_en)

To make sure it works as intended, a critical precondition MUST be met.
	"If dv_flow_en is set, dv_eswitch_en is also set."

Think about a case where
	attr->transfer is set
	dv_eswitch_en is set
	dv_flow_en is unset

MLX5_FLOW_TYPE_VERBS can't handle 'transfer' case, can it?

>  		type = MLX5_FLOW_TYPE_TCF;
>  	else
>  		type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV :
> diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> index b15266f..b25d4e8 100644
> --- a/drivers/net/mlx5/mlx5_prm.h
> +++ b/drivers/net/mlx5/mlx5_prm.h
> @@ -529,6 +529,7 @@ enum {
>  };
>  
>  enum {
> +	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
>  	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
>  	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
>  };
> @@ -591,6 +592,333 @@ struct mlx5_ifc_query_flow_counter_in_bits {
>  	u8         flow_counter_id[0x20];
>  };
>  

Please fix all the indentation violations from here.

> +enum {
> +	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
> +	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,
> +};
> +
> +enum {
> +	MLX5_HCA_CAP_OPMOD_GET_MAX   = 0,
> +	MLX5_HCA_CAP_OPMOD_GET_CUR   = 1,
> +};
> +
> +struct mlx5_ifc_cmd_hca_cap_bits {
> +	u8         reserved_at_0[0x30];
> +	u8         vhca_id[0x10];
> +	u8         reserved_at_40[0x40];
> +	u8         log_max_srq_sz[0x8];
> +	u8         log_max_qp_sz[0x8];
> +	u8         reserved_at_90[0xb];
> +	u8         log_max_qp[0x5];
> +	u8         reserved_at_a0[0xb];
> +	u8         log_max_srq[0x5];
> +	u8         reserved_at_b0[0x10];
> +	u8         reserved_at_c0[0x8];
> +	u8         log_max_cq_sz[0x8];
> +	u8         reserved_at_d0[0xb];
> +	u8         log_max_cq[0x5];
> +	u8         log_max_eq_sz[0x8];
> +	u8         reserved_at_e8[0x2];
> +	u8         log_max_mkey[0x6];
> +	u8         reserved_at_f0[0x8];
> +	u8         dump_fill_mkey[0x1];
> +	u8         reserved_at_f9[0x3];
> +	u8         log_max_eq[0x4];
> +	u8         max_indirection[0x8];
> +	u8         fixed_buffer_size[0x1];
> +	u8         log_max_mrw_sz[0x7];
> +	u8         force_teardown[0x1];
> +	u8         reserved_at_111[0x1];
> +	u8         log_max_bsf_list_size[0x6];
> +	u8         umr_extended_translation_offset[0x1];
> +	u8         null_mkey[0x1];
> +	u8         log_max_klm_list_size[0x6];
> +	u8         reserved_at_120[0xa];
> +	u8         log_max_ra_req_dc[0x6];
> +	u8         reserved_at_130[0xa];
> +	u8         log_max_ra_res_dc[0x6];
> +	u8         reserved_at_140[0xa];
> +	u8         log_max_ra_req_qp[0x6];
> +	u8         reserved_at_150[0xa];
> +	u8         log_max_ra_res_qp[0x6];
> +	u8         end_pad[0x1];
> +	u8         cc_query_allowed[0x1];
> +	u8         cc_modify_allowed[0x1];
> +	u8         start_pad[0x1];
> +	u8         cache_line_128byte[0x1];
> +	u8         reserved_at_165[0xa];
> +	u8         qcam_reg[0x1];
> +	u8         gid_table_size[0x10];
> +	u8         out_of_seq_cnt[0x1];
> +	u8         vport_counters[0x1];
> +	u8         retransmission_q_counters[0x1];
> +	u8         debug[0x1];
> +	u8         modify_rq_counter_set_id[0x1];
> +	u8         rq_delay_drop[0x1];
> +	u8         max_qp_cnt[0xa];
> +	u8         pkey_table_size[0x10];
> +	u8         vport_group_manager[0x1];
> +	u8         vhca_group_manager[0x1];
> +	u8         ib_virt[0x1];
> +	u8         eth_virt[0x1];
> +	u8         vnic_env_queue_counters[0x1];
> +	u8         ets[0x1];
> +	u8         nic_flow_table[0x1];
> +	u8         eswitch_manager[0x1];
> +	u8         device_memory[0x1];
> +	u8         mcam_reg[0x1];
> +	u8         pcam_reg[0x1];
> +	u8         local_ca_ack_delay[0x5];
> +	u8         port_module_event[0x1];
> +	u8         enhanced_error_q_counters[0x1];
> +	u8         ports_check[0x1];
> +	u8         reserved_at_1b3[0x1];
> +	u8         disable_link_up[0x1];
> +	u8         beacon_led[0x1];
> +	u8         port_type[0x2];
> +	u8         num_ports[0x8];
> +	u8         reserved_at_1c0[0x1];
> +	u8         pps[0x1];
> +	u8         pps_modify[0x1];
> +	u8         log_max_msg[0x5];
> +	u8         reserved_at_1c8[0x4];
> +	u8         max_tc[0x4];
> +	u8         temp_warn_event[0x1];
> +	u8         dcbx[0x1];
> +	u8         general_notification_event[0x1];
> +	u8         reserved_at_1d3[0x2];
> +	u8         fpga[0x1];
> +	u8         rol_s[0x1];
> +	u8         rol_g[0x1];
> +	u8         reserved_at_1d8[0x1];
> +	u8         wol_s[0x1];
> +	u8         wol_g[0x1];
> +	u8         wol_a[0x1];
> +	u8         wol_b[0x1];
> +	u8         wol_m[0x1];
> +	u8         wol_u[0x1];
> +	u8         wol_p[0x1];
> +	u8         stat_rate_support[0x10];
> +	u8         reserved_at_1f0[0xc];
> +	u8         cqe_version[0x4];
> +	u8         compact_address_vector[0x1];
> +	u8         striding_rq[0x1];
> +	u8         reserved_at_202[0x1];
> +	u8         ipoib_enhanced_offloads[0x1];
> +	u8         ipoib_basic_offloads[0x1];
> +	u8         reserved_at_205[0x1];
> +	u8         repeated_block_disabled[0x1];
> +	u8         umr_modify_entity_size_disabled[0x1];
> +	u8         umr_modify_atomic_disabled[0x1];
> +	u8         umr_indirect_mkey_disabled[0x1];
> +	u8         umr_fence[0x2];
> +	u8         reserved_at_20c[0x3];
> +	u8         drain_sigerr[0x1];
> +	u8         cmdif_checksum[0x2];
> +	u8         sigerr_cqe[0x1];
> +	u8         reserved_at_213[0x1];
> +	u8         wq_signature[0x1];
> +	u8         sctr_data_cqe[0x1];
> +	u8         reserved_at_216[0x1];
> +	u8         sho[0x1];
> +	u8         tph[0x1];
> +	u8         rf[0x1];
> +	u8         dct[0x1];
> +	u8         qos[0x1];
> +	u8         eth_net_offloads[0x1];
> +	u8         roce[0x1];
> +	u8         atomic[0x1];
> +	u8         reserved_at_21f[0x1];
> +	u8         cq_oi[0x1];
> +	u8         cq_resize[0x1];
> +	u8         cq_moderation[0x1];
> +	u8         reserved_at_223[0x3];
> +	u8         cq_eq_remap[0x1];
> +	u8         pg[0x1];
> +	u8         block_lb_mc[0x1];
> +	u8         reserved_at_229[0x1];
> +	u8         scqe_break_moderation[0x1];
> +	u8         cq_period_start_from_cqe[0x1];
> +	u8         cd[0x1];
> +	u8         reserved_at_22d[0x1];
> +	u8         apm[0x1];
> +	u8         vector_calc[0x1];
> +	u8         umr_ptr_rlky[0x1];
> +	u8	   imaicl[0x1];
> +	u8         reserved_at_232[0x4];
> +	u8         qkv[0x1];
> +	u8         pkv[0x1];
> +	u8         set_deth_sqpn[0x1];
> +	u8         reserved_at_239[0x3];
> +	u8         xrc[0x1];
> +	u8         ud[0x1];
> +	u8         uc[0x1];
> +	u8         rc[0x1];
> +	u8         uar_4k[0x1];
> +	u8         reserved_at_241[0x9];
> +	u8         uar_sz[0x6];
> +	u8         reserved_at_250[0x8];
> +	u8         log_pg_sz[0x8];
> +	u8         bf[0x1];
> +	u8         driver_version[0x1];
> +	u8         pad_tx_eth_packet[0x1];
> +	u8         reserved_at_263[0x8];
> +	u8         log_bf_reg_size[0x5];
> +	u8         reserved_at_270[0xb];
> +	u8         lag_master[0x1];
> +	u8         num_lag_ports[0x4];
> +	u8         reserved_at_280[0x10];
> +	u8         max_wqe_sz_sq[0x10];
> +	u8         reserved_at_2a0[0x10];
> +	u8         max_wqe_sz_rq[0x10];
> +	u8         max_flow_counter_31_16[0x10];
> +	u8         max_wqe_sz_sq_dc[0x10];
> +	u8         reserved_at_2e0[0x7];
> +	u8         max_qp_mcg[0x19];
> +	u8         reserved_at_300[0x10];
> +	u8         flow_counter_bulk_alloc[0x08];
> +	u8         log_max_mcg[0x8];
> +	u8         reserved_at_320[0x3];
> +	u8         log_max_transport_domain[0x5];
> +	u8         reserved_at_328[0x3];
> +	u8         log_max_pd[0x5];
> +	u8         reserved_at_330[0xb];
> +	u8         log_max_xrcd[0x5];
> +	u8         nic_receive_steering_discard[0x1];
> +	u8         receive_discard_vport_down[0x1];
> +	u8         transmit_discard_vport_down[0x1];
> +	u8         reserved_at_343[0x5];
> +	u8         log_max_flow_counter_bulk[0x8];
> +	u8         max_flow_counter_15_0[0x10];
> +	u8         reserved_at_360[0x3];
> +	u8         log_max_rq[0x5];
> +	u8         reserved_at_368[0x3];
> +	u8         log_max_sq[0x5];
> +	u8         reserved_at_370[0x3];
> +	u8         log_max_tir[0x5];
> +	u8         reserved_at_378[0x3];
> +	u8         log_max_tis[0x5];
> +	u8         basic_cyclic_rcv_wqe[0x1];
> +	u8         reserved_at_381[0x2];
> +	u8         log_max_rmp[0x5];
> +	u8         reserved_at_388[0x3];
> +	u8         log_max_rqt[0x5];
> +	u8         reserved_at_390[0x3];
> +	u8         log_max_rqt_size[0x5];
> +	u8         reserved_at_398[0x3];
> +	u8         log_max_tis_per_sq[0x5];
> +	u8         ext_stride_num_range[0x1];
> +	u8         reserved_at_3a1[0x2];
> +	u8         log_max_stride_sz_rq[0x5];
> +	u8         reserved_at_3a8[0x3];
> +	u8         log_min_stride_sz_rq[0x5];
> +	u8         reserved_at_3b0[0x3];
> +	u8         log_max_stride_sz_sq[0x5];
> +	u8         reserved_at_3b8[0x3];
> +	u8         log_min_stride_sz_sq[0x5];
> +	u8         hairpin[0x1];
> +	u8         reserved_at_3c1[0x2];
> +	u8         log_max_hairpin_queues[0x5];
> +	u8         reserved_at_3c8[0x3];
> +	u8         log_max_hairpin_wq_data_sz[0x5];
> +	u8         reserved_at_3d0[0x3];
> +	u8         log_max_hairpin_num_packets[0x5];
> +	u8         reserved_at_3d8[0x3];
> +	u8         log_max_wq_sz[0x5];
> +	u8         nic_vport_change_event[0x1];
> +	u8         disable_local_lb_uc[0x1];
> +	u8         disable_local_lb_mc[0x1];
> +	u8         log_min_hairpin_wq_data_sz[0x5];
> +	u8         reserved_at_3e8[0x3];
> +	u8         log_max_vlan_list[0x5];
> +	u8         reserved_at_3f0[0x3];
> +	u8         log_max_current_mc_list[0x5];
> +	u8         reserved_at_3f8[0x3];
> +	u8         log_max_current_uc_list[0x5];
> +	u8         general_obj_types[0x40];
> +	u8         reserved_at_440[0x20];
> +	u8         reserved_at_460[0x10];
> +	u8         max_num_eqs[0x10];
> +	u8         reserved_at_480[0x3];
> +	u8         log_max_l2_table[0x5];
> +	u8         reserved_at_488[0x8];
> +	u8         log_uar_page_sz[0x10];
> +	u8         reserved_at_4a0[0x20];
> +	u8         device_frequency_mhz[0x20];
> +	u8         device_frequency_khz[0x20];
> +	u8         reserved_at_500[0x20];
> +	u8	   num_of_uars_per_page[0x20];
> +	u8         flex_parser_protocols[0x20];
> +	u8         reserved_at_560[0x20];
> +	u8         reserved_at_580[0x3c];
> +	u8         mini_cqe_resp_stride_index[0x1];
> +	u8         cqe_128_always[0x1];
> +	u8         cqe_compression_128[0x1];
> +	u8         cqe_compression[0x1];
> +	u8         cqe_compression_timeout[0x10];
> +	u8         cqe_compression_max_num[0x10];
> +	u8         reserved_at_5e0[0x10];
> +	u8         tag_matching[0x1];
> +	u8         rndv_offload_rc[0x1];
> +	u8         rndv_offload_dc[0x1];
> +	u8         log_tag_matching_list_sz[0x5];
> +	u8         reserved_at_5f8[0x3];
> +	u8         log_max_xrq[0x5];
> +	u8	   affiliate_nic_vport_criteria[0x8];
> +	u8	   native_port_num[0x8];
> +	u8	   num_vhca_ports[0x8];
> +	u8	   reserved_at_618[0x6];
> +	u8	   sw_owner_id[0x1];
> +	u8	   reserved_at_61f[0x1e1];
> +};
> +
> +struct mlx5_ifc_qos_cap_bits {
> +	u8         packet_pacing[0x1];
> +	u8         esw_scheduling[0x1];
> +	u8         esw_bw_share[0x1];
> +	u8         esw_rate_limit[0x1];
> +	u8         reserved_at_4[0x1];
> +	u8         packet_pacing_burst_bound[0x1];
> +	u8         packet_pacing_typical_size[0x1];
> +	u8         flow_meter_srtcm[0x1];
> +	u8         reserved_at_8[0x8];
> +	u8         log_max_flow_meter[0x8];
> +	u8         flow_meter_reg_id[0x8];
> +	u8         reserved_at_25[0x20];
> +	u8         packet_pacing_max_rate[0x20];
> +	u8         packet_pacing_min_rate[0x20];
> +	u8         reserved_at_80[0x10];
> +	u8         packet_pacing_rate_table_size[0x10];
> +	u8         esw_element_type[0x10];
> +	u8         esw_tsar_type[0x10];
> +	u8         reserved_at_c0[0x10];
> +	u8         max_qos_para_vport[0x10];
> +	u8         max_tsar_bw_share[0x20];
> +	u8         reserved_at_100[0x6e8];
> +};
> +
> +union mlx5_ifc_hca_cap_union_bits {
> +	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
> +	struct mlx5_ifc_qos_cap_bits qos_cap;
> +	u8         reserved_at_0[0x8000];
> +};
> +
> +struct mlx5_ifc_query_hca_cap_out_bits {
> +	u8         status[0x8];
> +	u8         reserved_at_8[0x18];
> +	u8         syndrome[0x20];
> +	u8         reserved_at_40[0x40];
> +	union mlx5_ifc_hca_cap_union_bits capability;
> +};
> +
> +struct mlx5_ifc_query_hca_cap_in_bits {
> +	u8         opcode[0x10];
> +	u8         reserved_at_10[0x10];
> +	u8         reserved_at_20[0x10];
> +	u8         op_mod[0x10];
> +	u8         reserved_at_40[0x40];
> +};
> +
>  /* CQE format mask. */
>  #define MLX5E_CQE_FORMAT_MASK 0xc
>  
> -- 
> 1.8.3.1

Thanks
Yongseok
  
Ori Kam April 17, 2019, 6:19 a.m. UTC | #2
Hi Koh,

PSB

> -----Original Message-----
> From: Yongseok Koh
> Sent: Wednesday, April 17, 2019 4:42 AM
> To: Ori Kam <orika@mellanox.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; Matan Azrad
> <matan@mellanox.com>; Slava Ovsiienko <viacheslavo@mellanox.com>; Moti
> Haimovsky <motih@mellanox.com>; dev@dpdk.org
> Subject: Re: [PATCH 3/9] net/mlx5: add Direct Rules configuration support
> 
> On Sun, Apr 14, 2019 at 09:12:31PM +0000, Ori Kam wrote:
> > This commit provides the basic configuration needed in order to
> > support Direct Rules eswitch.
> 
> What do you mean my "Direct Rules eswitch"? What is the official name of it?
> E-Switch is in HCA and DR is use by library? Then, shouldn't it be "E-Switch
> with Direct Rules"? Please correct it appropriately.
>

 
Will fix.

> And I can see many of 'eswitch' in commit log or comment in the code. Please
> correct all of them as well.
> 
> > Signed-off-by: Ori Kam <orika@mellanox.com>
> > ---
> 
> The title is "net/mlx5: add Direct Rules configuration support"
> Shouldn't it have the word, "E-Switch"?
> 
> And it seems to have more than "configuration"?
> 
> >  drivers/net/mlx5/Makefile         |   5 +
> >  drivers/net/mlx5/meson.build      |   2 +
> >  drivers/net/mlx5/mlx5.c           |  52 +++++-
> >  drivers/net/mlx5/mlx5.h           |  12 ++
> >  drivers/net/mlx5/mlx5_devx_cmds.c |  42 +++++
> >  drivers/net/mlx5/mlx5_flow.c      |   2 +-
> >  drivers/net/mlx5/mlx5_prm.h       | 328
> ++++++++++++++++++++++++++++++++++++++
> >  7 files changed, 437 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
> > index 93bc869..2b72a33 100644
> > --- a/drivers/net/mlx5/Makefile
> > +++ b/drivers/net/mlx5/Makefile
> > @@ -161,6 +161,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-
> config-h.sh
> >  		enum MLX5DV_DR_NS_TYPE_TERMINATING \
> >  		$(AUTOCONF_OUTPUT)
> >  	$Q sh -- '$<' '$@' \
> > +		HAVE_MLX5DV_DR_ESWITCH \
> > +		infiniband/mlx5dv.h \
> > +		enum MLX5DV_DR_NS_DOMAIN_FDB_BYPASS \
> > +		$(AUTOCONF_OUTPUT)
> > +	$Q sh -- '$<' '$@' \
> 
> Should start from HAVE_IBV_FLOW_
> How about HAVE_IBV_FLOW_DV_ESW_DIRECT_RULES?
> 

As stated in the previous patch, the NIC DR define uses this format.

> >  		HAVE_IBV_DEVX_OBJ \
> >  		infiniband/mlx5dv.h \
> >  		func mlx5dv_devx_obj_create \
> > diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
> > index 0037e15..9dfd28d 100644
> > --- a/drivers/net/mlx5/meson.build
> > +++ b/drivers/net/mlx5/meson.build
> > @@ -113,6 +113,8 @@ if build
> >  		'MLX5DV_FLOW_ACTION_COUNTERS_DEVX' ],
> >  		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
> >  		'MLX5DV_DR_NS_TYPE_TERMINATING' ],
> > +		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h',
> > +		'MLX5DV_DR_NS_DOMAIN_FDB_BYPASS' ],
> 
> Same here.

Same comment as before.

> 
> >  		[ 'HAVE_SUPPORTED_40000baseKR4_Full', 'linux/ethtool.h',
> >  		'SUPPORTED_40000baseKR4_Full' ],
> >  		[ 'HAVE_SUPPORTED_40000baseCR4_Full', 'linux/ethtool.h',
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> > index 9ff50df..938ba1c 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -101,6 +101,9 @@
> >  /* Allow L3 VXLAN flow creation. */
> >  #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
> >
> > +/* Activate DV eswitch flow steering. */
> > +#define MLX5_DV_ESWITCH_EN "dv_eswitch_en"
> > +
> 
> We can set a rule to use 'esw'/'ESW' in the code just like dv/tcf/verbs and so
> on?
> Or, what's the difference between E-Switch and FDB?

I'm not sure I understand your comment.
E-Switch rules are located in the device FDB.
We can create E-Switch rules either using tcf or using DR, which is part of DV steering.
This parameter is used to select whether the E-Switch rules engine will be tcf or DV.

> 
> >  /* Activate DV flow steering. */
> >  #define MLX5_DV_FLOW_EN "dv_flow_en"
> >
> > @@ -344,6 +347,18 @@ struct mlx5_dev_spawn_data {
> >  	}
> >  	pthread_mutex_init(&sh->dv_mutex, NULL);
> >  	sh->tx_ns = ns;
> > +#ifdef HAVE_MLX5DV_DR_ESWITCH
> > +	if (priv->config.dv_eswitch_en) {
> > +		ns  = mlx5_glue->dr_create_ns(sh->ctx,
> > +
> MLX5DV_DR_NS_DOMAIN_FDB_BYPASS);
> > +		if (!ns) {
> > +			DRV_LOG(ERR, "FDB mlx5dv_dr_create_ns failed");
> > +			err = errno;
> > +			goto error;
> > +		}
> > +		sh->fdb_ns = ns;
> > +	}
> > +#endif
> >  	sh->dv_refcnt++;
> >  	priv->dr_shared = 1;
> >  	return 0;
> > @@ -358,6 +373,10 @@ struct mlx5_dev_spawn_data {
> >  		mlx5dv_dr_destroy_ns(sh->tx_ns);
> >  		sh->tx_ns = NULL;
> >  	}
> > +	if (sh->fdb_ns) {
> > +		mlx5_glue->dr_destroy_ns(sh->fdb_ns);
> > +		sh->fdb_ns = NULL;
> > +	}
> >  	return err;
> >  #else
> >  	(void)priv;
> > @@ -393,6 +412,12 @@ struct mlx5_dev_spawn_data {
> >  		mlx5dv_dr_destroy_ns(sh->tx_ns);
> >  		sh->tx_ns = NULL;
> >  	}
> > +#ifdef HAVE_MLX5DV_DR_ESWITCH
> > +	if (sh->fdb_ns) {
> > +		mlx5_glue->dr_destroy_ns(sh->fdb_ns);
> > +		sh->fdb_ns = NULL;
> > +	}
> > +#endif
> >  	pthread_mutex_destroy(&sh->dv_mutex);
> >  #else
> >  	(void)priv;
> > @@ -861,6 +886,8 @@ struct mlx5_dev_spawn_data {
> >  		config->l3_vxlan_en = !!tmp;
> >  	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
> >  		config->vf_nl_en = !!tmp;
> > +	} else if (strcmp(MLX5_DV_ESWITCH_EN, key) == 0) {
> > +		config->dv_eswitch_en = !!tmp;
> 
> Do we really need to make it configurable? What is the purpose of doing that?
> If
> esw dr isn't supported, it can fall back to tcf but, if supported, why not using
> it? We still have dv_flow_en. If dv_flow_en is disabled, we should disable dv
> esw too. But we need not configure the two individually. Thoughts?
> 

I agree, and this is the default value, but since there are some basic initializations that are done for the E-Switch,
for example opening the name space, I can see that in some cases the user would like to disable this option.

> >  	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
> >  		config->dv_flow_en = !!tmp;
> >  	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
> > @@ -905,6 +932,7 @@ struct mlx5_dev_spawn_data {
> >  		MLX5_RX_VEC_EN,
> >  		MLX5_L3_VXLAN_EN,
> >  		MLX5_VF_NL_EN,
> > +		MLX5_DV_ESWITCH_EN,
> >  		MLX5_DV_FLOW_EN,
> >  		MLX5_MR_EXT_MEMSEG_EN,
> >  		MLX5_REPRESENTOR,
> > @@ -1458,11 +1486,6 @@ struct mlx5_dev_spawn_data {
> >  			priv->tcf_context = NULL;
> >  		}
> >  	}
> > -	if (config.dv_flow_en) {
> > -		err = mlx5_alloc_shared_dr(priv);
> > -		if (err)
> > -			goto error;
> > -	}
> >  	TAILQ_INIT(&priv->flows);
> >  	TAILQ_INIT(&priv->ctrl_flows);
> >  	/* Hint libmlx5 to use PMD allocator for data plane resources */
> > @@ -1484,8 +1507,26 @@ struct mlx5_dev_spawn_data {
> >  	 * Verbs context returned by ibv_open_device().
> >  	 */
> >  	mlx5_link_update(eth_dev, 0);
> > +#ifdef HAVE_IBV_DEVX_OBJ
> > +	err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
> > +	if (err) {
> > +		err = -err;
> > +		goto error;
> > +	}
> > +#endif
> > +#ifdef HAVE_MLX5DV_DR_ESWITCH
> > +	if (!config.hca_attr.eswitch_manager)
> > +		config.dv_eswitch_en = 0;
> > +#else
> > +	config.dv_eswitch_en = 0;
> > +#endif
> >  	/* Store device configuration on private structure. */
> >  	priv->config = config;
> > +	if (config.dv_flow_en) {
> > +		err = mlx5_alloc_shared_dr(priv);
> > +		if (err)
> > +			goto error;
> > +	}
> >  	/* Supported Verbs flow priority number detection. */
> >  	err = mlx5_flow_discover_priorities(eth_dev);
> >  	if (err < 0) {
> > @@ -1876,6 +1917,7 @@ struct mlx5_dev_spawn_data {
> >  			.max_memcpy_len =
> MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
> >  			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
> >  		},
> > +		.dv_eswitch_en = 1,
> >  	};
> >  	/* Device specific configuration. */
> >  	switch (pci_dev->id.device_id) {
> > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> > index 14c7f3c..33a4127 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -138,6 +138,11 @@ struct mlx5_devx_counter_set {
> >  	int id; /* Flow counter ID */
> >  };
> >
> > +/* HCA attributes. */
> > +struct mlx5_hca_attr {
> > +	uint32_t eswitch_manager:1;
> > +};
> > +
> >  /* Flow list . */
> >  TAILQ_HEAD(mlx5_flows, rte_flow);
> >
> > @@ -171,6 +176,7 @@ struct mlx5_dev_config {
> >  	/* Whether memseg should be extended for MR creation. */
> >  	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
> >  	unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */
> > +	unsigned int dv_eswitch_en:1; /* Enable eswitch DV flow. */
> >  	unsigned int dv_flow_en:1; /* Enable DV flow. */
> >  	unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */
> >  	unsigned int devx:1; /* Whether devx interface is available or not. */
> > @@ -192,6 +198,7 @@ struct mlx5_dev_config {
> >  	int txqs_inline; /* Queue number threshold for inlining. */
> >  	int txqs_vec; /* Queue number threshold for vectorized Tx. */
> >  	int inline_max_packet_sz; /* Max packet size for inlining. */
> > +	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
> >  };
> >
> >  /**
> > @@ -241,6 +248,7 @@ struct mlx5_flow_tbl_resource {
> >  };
> >
> >  #define MLX5_MAX_TABLES 1024
> > +#define MLX5_MAX_TABLES_FDB 32
> >  #define MLX5_GROUP_FACTOR 1
> >
> >  /*
> > @@ -260,6 +268,8 @@ struct mlx5_ibv_shared {
> >  	/* Shared DV/DR flow data section. */
> >  	pthread_mutex_t dv_mutex; /* DV context mutex. */
> >  	uint32_t dv_refcnt; /* DV/DR data reference counter. */
> > +	void *fdb_ns; /* FDB Direct Rules name space handle. */
> > +	struct mlx5_flow_tbl_resource fdb_tbl[MLX5_MAX_TABLES_FDB];
> >  	void *rx_ns; /* RX Direct Rules name space handle. */
> >  	struct mlx5_flow_tbl_resource rx_tbl[MLX5_MAX_TABLES];
> >  	/* RX Direct Rules tables. */
> > @@ -539,4 +549,6 @@ int mlx5_devx_cmd_flow_counter_alloc(struct
> ibv_context *ctx,
> >  int mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_counter_set
> *dcx,
> >  				     int clear,
> >  				     uint64_t *pkts, uint64_t *bytes);
> > +int mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
> > +				 struct mlx5_hca_attr *attr);
> >  #endif /* RTE_PMD_MLX5_H_ */
> > diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c
> b/drivers/net/mlx5/mlx5_devx_cmds.c
> > index a9dff58..3caea41 100644
> > --- a/drivers/net/mlx5/mlx5_devx_cmds.c
> > +++ b/drivers/net/mlx5/mlx5_devx_cmds.c
> > @@ -105,3 +105,45 @@ int mlx5_devx_cmd_flow_counter_free(struct
> mlx5dv_devx_obj *obj)
> >  	*bytes = MLX5_GET64(traffic_counter, stats, octets);
> >  	return 0;
> >  }
> > +
> > +/**
> > + * Query HCA attributes.
> 
> This needs to be more informative. What is being queried here? Please specify in detail.
> 

O.K.

> > + *
> > + * @param[in] ctx
> > + *   ibv contexts returned from mlx5dv_open_device.
> > + * @param[out] attr
> > + *   Attributes device values.
> > + *
> > + * @return
> > + *   0 on success, a negative value otherwise.
> > + */
> > +int
> > +mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
> > +			     struct mlx5_hca_attr *attr)
> > +{
> > +	uint32_t in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {0};
> > +	uint32_t out[MLX5_ST_SZ_DW(query_hca_cap_out)] = {0};
> > +	void *hcattr;
> > +	int status, syndrome, rc;
> > +
> > +	MLX5_SET(query_hca_cap_in, in, opcode,
> MLX5_CMD_OP_QUERY_HCA_CAP);
> > +	MLX5_SET(query_hca_cap_in, in, op_mod,
> > +		 MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE |
> > +		 MLX5_HCA_CAP_OPMOD_GET_CUR);
> > +
> > +	rc = mlx5_glue->devx_general_cmd(ctx,
> > +					 in, sizeof(in), out, sizeof(out));
> > +	if (rc)
> > +		return rc;
> > +	status = MLX5_GET(query_hca_cap_out, out, status);
> > +	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
> > +	if (status) {
> > +		DRV_LOG(DEBUG, "Failed to query devx HCA capabilities, "
> > +			"status %x, syndrome = %x",
> > +			status, syndrome);
> > +		return -1;
> > +	}
> > +	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
> > +	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr,
> eswitch_manager);
> > +	return 0;
> > +}
> > diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> > index a0683ee..83abc14 100644
> > --- a/drivers/net/mlx5/mlx5_flow.c
> > +++ b/drivers/net/mlx5/mlx5_flow.c
> > @@ -1784,7 +1784,7 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,
> >  	struct mlx5_priv *priv = dev->data->dev_private;
> >  	enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX;
> >
> > -	if (attr->transfer)
> > +	if (attr->transfer && !priv->config.dv_eswitch_en)
> 
> To make sure it works as intended, a critical precondition MUST be met:
> 	"If dv_eswitch_en is set, dv_flow_en is also set."
> 
> Think about a case where
> 	attr->transfer is set
> 	dv_eswitch_en is set
> 	dv_flow_en is unset
> 
> MLX5_FLOW_TYPE_VERBS can't handle 'transfer' case, can it?
> 

If (dv_flow_en == 0) then dv_eswitch_en = 0

> >  		type = MLX5_FLOW_TYPE_TCF;
> >  	else
> >  		type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV :
> > diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> > index b15266f..b25d4e8 100644
> > --- a/drivers/net/mlx5/mlx5_prm.h
> > +++ b/drivers/net/mlx5/mlx5_prm.h
> > @@ -529,6 +529,7 @@ enum {
> >  };
> >
> >  enum {
> > +	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
> >  	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
> >  	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
> >  };
> > @@ -591,6 +592,333 @@ struct mlx5_ifc_query_flow_counter_in_bits {
> >  	u8         flow_counter_id[0x20];
> >  };
> >
> 
> Please fix all the indentation violation from here.
> 

It is the same indentation as all other cmd structures.
It is also a copy from the kernel and is based on automatic generation of the PRM.

> > +enum {
> > +	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
> > +	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,
> > +};
> > +
> > +enum {
> > +	MLX5_HCA_CAP_OPMOD_GET_MAX   = 0,
> > +	MLX5_HCA_CAP_OPMOD_GET_CUR   = 1,
> > +};
> > +
> > +struct mlx5_ifc_cmd_hca_cap_bits {
> > +	u8         reserved_at_0[0x30];
> > +	u8         vhca_id[0x10];
> > +	u8         reserved_at_40[0x40];
> > +	u8         log_max_srq_sz[0x8];
> > +	u8         log_max_qp_sz[0x8];
> > +	u8         reserved_at_90[0xb];
> > +	u8         log_max_qp[0x5];
> > +	u8         reserved_at_a0[0xb];
> > +	u8         log_max_srq[0x5];
> > +	u8         reserved_at_b0[0x10];
> > +	u8         reserved_at_c0[0x8];
> > +	u8         log_max_cq_sz[0x8];
> > +	u8         reserved_at_d0[0xb];
> > +	u8         log_max_cq[0x5];
> > +	u8         log_max_eq_sz[0x8];
> > +	u8         reserved_at_e8[0x2];
> > +	u8         log_max_mkey[0x6];
> > +	u8         reserved_at_f0[0x8];
> > +	u8         dump_fill_mkey[0x1];
> > +	u8         reserved_at_f9[0x3];
> > +	u8         log_max_eq[0x4];
> > +	u8         max_indirection[0x8];
> > +	u8         fixed_buffer_size[0x1];
> > +	u8         log_max_mrw_sz[0x7];
> > +	u8         force_teardown[0x1];
> > +	u8         reserved_at_111[0x1];
> > +	u8         log_max_bsf_list_size[0x6];
> > +	u8         umr_extended_translation_offset[0x1];
> > +	u8         null_mkey[0x1];
> > +	u8         log_max_klm_list_size[0x6];
> > +	u8         reserved_at_120[0xa];
> > +	u8         log_max_ra_req_dc[0x6];
> > +	u8         reserved_at_130[0xa];
> > +	u8         log_max_ra_res_dc[0x6];
> > +	u8         reserved_at_140[0xa];
> > +	u8         log_max_ra_req_qp[0x6];
> > +	u8         reserved_at_150[0xa];
> > +	u8         log_max_ra_res_qp[0x6];
> > +	u8         end_pad[0x1];
> > +	u8         cc_query_allowed[0x1];
> > +	u8         cc_modify_allowed[0x1];
> > +	u8         start_pad[0x1];
> > +	u8         cache_line_128byte[0x1];
> > +	u8         reserved_at_165[0xa];
> > +	u8         qcam_reg[0x1];
> > +	u8         gid_table_size[0x10];
> > +	u8         out_of_seq_cnt[0x1];
> > +	u8         vport_counters[0x1];
> > +	u8         retransmission_q_counters[0x1];
> > +	u8         debug[0x1];
> > +	u8         modify_rq_counter_set_id[0x1];
> > +	u8         rq_delay_drop[0x1];
> > +	u8         max_qp_cnt[0xa];
> > +	u8         pkey_table_size[0x10];
> > +	u8         vport_group_manager[0x1];
> > +	u8         vhca_group_manager[0x1];
> > +	u8         ib_virt[0x1];
> > +	u8         eth_virt[0x1];
> > +	u8         vnic_env_queue_counters[0x1];
> > +	u8         ets[0x1];
> > +	u8         nic_flow_table[0x1];
> > +	u8         eswitch_manager[0x1];
> > +	u8         device_memory[0x1];
> > +	u8         mcam_reg[0x1];
> > +	u8         pcam_reg[0x1];
> > +	u8         local_ca_ack_delay[0x5];
> > +	u8         port_module_event[0x1];
> > +	u8         enhanced_error_q_counters[0x1];
> > +	u8         ports_check[0x1];
> > +	u8         reserved_at_1b3[0x1];
> > +	u8         disable_link_up[0x1];
> > +	u8         beacon_led[0x1];
> > +	u8         port_type[0x2];
> > +	u8         num_ports[0x8];
> > +	u8         reserved_at_1c0[0x1];
> > +	u8         pps[0x1];
> > +	u8         pps_modify[0x1];
> > +	u8         log_max_msg[0x5];
> > +	u8         reserved_at_1c8[0x4];
> > +	u8         max_tc[0x4];
> > +	u8         temp_warn_event[0x1];
> > +	u8         dcbx[0x1];
> > +	u8         general_notification_event[0x1];
> > +	u8         reserved_at_1d3[0x2];
> > +	u8         fpga[0x1];
> > +	u8         rol_s[0x1];
> > +	u8         rol_g[0x1];
> > +	u8         reserved_at_1d8[0x1];
> > +	u8         wol_s[0x1];
> > +	u8         wol_g[0x1];
> > +	u8         wol_a[0x1];
> > +	u8         wol_b[0x1];
> > +	u8         wol_m[0x1];
> > +	u8         wol_u[0x1];
> > +	u8         wol_p[0x1];
> > +	u8         stat_rate_support[0x10];
> > +	u8         reserved_at_1f0[0xc];
> > +	u8         cqe_version[0x4];
> > +	u8         compact_address_vector[0x1];
> > +	u8         striding_rq[0x1];
> > +	u8         reserved_at_202[0x1];
> > +	u8         ipoib_enhanced_offloads[0x1];
> > +	u8         ipoib_basic_offloads[0x1];
> > +	u8         reserved_at_205[0x1];
> > +	u8         repeated_block_disabled[0x1];
> > +	u8         umr_modify_entity_size_disabled[0x1];
> > +	u8         umr_modify_atomic_disabled[0x1];
> > +	u8         umr_indirect_mkey_disabled[0x1];
> > +	u8         umr_fence[0x2];
> > +	u8         reserved_at_20c[0x3];
> > +	u8         drain_sigerr[0x1];
> > +	u8         cmdif_checksum[0x2];
> > +	u8         sigerr_cqe[0x1];
> > +	u8         reserved_at_213[0x1];
> > +	u8         wq_signature[0x1];
> > +	u8         sctr_data_cqe[0x1];
> > +	u8         reserved_at_216[0x1];
> > +	u8         sho[0x1];
> > +	u8         tph[0x1];
> > +	u8         rf[0x1];
> > +	u8         dct[0x1];
> > +	u8         qos[0x1];
> > +	u8         eth_net_offloads[0x1];
> > +	u8         roce[0x1];
> > +	u8         atomic[0x1];
> > +	u8         reserved_at_21f[0x1];
> > +	u8         cq_oi[0x1];
> > +	u8         cq_resize[0x1];
> > +	u8         cq_moderation[0x1];
> > +	u8         reserved_at_223[0x3];
> > +	u8         cq_eq_remap[0x1];
> > +	u8         pg[0x1];
> > +	u8         block_lb_mc[0x1];
> > +	u8         reserved_at_229[0x1];
> > +	u8         scqe_break_moderation[0x1];
> > +	u8         cq_period_start_from_cqe[0x1];
> > +	u8         cd[0x1];
> > +	u8         reserved_at_22d[0x1];
> > +	u8         apm[0x1];
> > +	u8         vector_calc[0x1];
> > +	u8         umr_ptr_rlky[0x1];
> > +	u8	   imaicl[0x1];
> > +	u8         reserved_at_232[0x4];
> > +	u8         qkv[0x1];
> > +	u8         pkv[0x1];
> > +	u8         set_deth_sqpn[0x1];
> > +	u8         reserved_at_239[0x3];
> > +	u8         xrc[0x1];
> > +	u8         ud[0x1];
> > +	u8         uc[0x1];
> > +	u8         rc[0x1];
> > +	u8         uar_4k[0x1];
> > +	u8         reserved_at_241[0x9];
> > +	u8         uar_sz[0x6];
> > +	u8         reserved_at_250[0x8];
> > +	u8         log_pg_sz[0x8];
> > +	u8         bf[0x1];
> > +	u8         driver_version[0x1];
> > +	u8         pad_tx_eth_packet[0x1];
> > +	u8         reserved_at_263[0x8];
> > +	u8         log_bf_reg_size[0x5];
> > +	u8         reserved_at_270[0xb];
> > +	u8         lag_master[0x1];
> > +	u8         num_lag_ports[0x4];
> > +	u8         reserved_at_280[0x10];
> > +	u8         max_wqe_sz_sq[0x10];
> > +	u8         reserved_at_2a0[0x10];
> > +	u8         max_wqe_sz_rq[0x10];
> > +	u8         max_flow_counter_31_16[0x10];
> > +	u8         max_wqe_sz_sq_dc[0x10];
> > +	u8         reserved_at_2e0[0x7];
> > +	u8         max_qp_mcg[0x19];
> > +	u8         reserved_at_300[0x10];
> > +	u8         flow_counter_bulk_alloc[0x08];
> > +	u8         log_max_mcg[0x8];
> > +	u8         reserved_at_320[0x3];
> > +	u8         log_max_transport_domain[0x5];
> > +	u8         reserved_at_328[0x3];
> > +	u8         log_max_pd[0x5];
> > +	u8         reserved_at_330[0xb];
> > +	u8         log_max_xrcd[0x5];
> > +	u8         nic_receive_steering_discard[0x1];
> > +	u8         receive_discard_vport_down[0x1];
> > +	u8         transmit_discard_vport_down[0x1];
> > +	u8         reserved_at_343[0x5];
> > +	u8         log_max_flow_counter_bulk[0x8];
> > +	u8         max_flow_counter_15_0[0x10];
> > +	u8         reserved_at_360[0x3];
> > +	u8         log_max_rq[0x5];
> > +	u8         reserved_at_368[0x3];
> > +	u8         log_max_sq[0x5];
> > +	u8         reserved_at_370[0x3];
> > +	u8         log_max_tir[0x5];
> > +	u8         reserved_at_378[0x3];
> > +	u8         log_max_tis[0x5];
> > +	u8         basic_cyclic_rcv_wqe[0x1];
> > +	u8         reserved_at_381[0x2];
> > +	u8         log_max_rmp[0x5];
> > +	u8         reserved_at_388[0x3];
> > +	u8         log_max_rqt[0x5];
> > +	u8         reserved_at_390[0x3];
> > +	u8         log_max_rqt_size[0x5];
> > +	u8         reserved_at_398[0x3];
> > +	u8         log_max_tis_per_sq[0x5];
> > +	u8         ext_stride_num_range[0x1];
> > +	u8         reserved_at_3a1[0x2];
> > +	u8         log_max_stride_sz_rq[0x5];
> > +	u8         reserved_at_3a8[0x3];
> > +	u8         log_min_stride_sz_rq[0x5];
> > +	u8         reserved_at_3b0[0x3];
> > +	u8         log_max_stride_sz_sq[0x5];
> > +	u8         reserved_at_3b8[0x3];
> > +	u8         log_min_stride_sz_sq[0x5];
> > +	u8         hairpin[0x1];
> > +	u8         reserved_at_3c1[0x2];
> > +	u8         log_max_hairpin_queues[0x5];
> > +	u8         reserved_at_3c8[0x3];
> > +	u8         log_max_hairpin_wq_data_sz[0x5];
> > +	u8         reserved_at_3d0[0x3];
> > +	u8         log_max_hairpin_num_packets[0x5];
> > +	u8         reserved_at_3d8[0x3];
> > +	u8         log_max_wq_sz[0x5];
> > +	u8         nic_vport_change_event[0x1];
> > +	u8         disable_local_lb_uc[0x1];
> > +	u8         disable_local_lb_mc[0x1];
> > +	u8         log_min_hairpin_wq_data_sz[0x5];
> > +	u8         reserved_at_3e8[0x3];
> > +	u8         log_max_vlan_list[0x5];
> > +	u8         reserved_at_3f0[0x3];
> > +	u8         log_max_current_mc_list[0x5];
> > +	u8         reserved_at_3f8[0x3];
> > +	u8         log_max_current_uc_list[0x5];
> > +	u8         general_obj_types[0x40];
> > +	u8         reserved_at_440[0x20];
> > +	u8         reserved_at_460[0x10];
> > +	u8         max_num_eqs[0x10];
> > +	u8         reserved_at_480[0x3];
> > +	u8         log_max_l2_table[0x5];
> > +	u8         reserved_at_488[0x8];
> > +	u8         log_uar_page_sz[0x10];
> > +	u8         reserved_at_4a0[0x20];
> > +	u8         device_frequency_mhz[0x20];
> > +	u8         device_frequency_khz[0x20];
> > +	u8         reserved_at_500[0x20];
> > +	u8	   num_of_uars_per_page[0x20];
> > +	u8         flex_parser_protocols[0x20];
> > +	u8         reserved_at_560[0x20];
> > +	u8         reserved_at_580[0x3c];
> > +	u8         mini_cqe_resp_stride_index[0x1];
> > +	u8         cqe_128_always[0x1];
> > +	u8         cqe_compression_128[0x1];
> > +	u8         cqe_compression[0x1];
> > +	u8         cqe_compression_timeout[0x10];
> > +	u8         cqe_compression_max_num[0x10];
> > +	u8         reserved_at_5e0[0x10];
> > +	u8         tag_matching[0x1];
> > +	u8         rndv_offload_rc[0x1];
> > +	u8         rndv_offload_dc[0x1];
> > +	u8         log_tag_matching_list_sz[0x5];
> > +	u8         reserved_at_5f8[0x3];
> > +	u8         log_max_xrq[0x5];
> > +	u8	   affiliate_nic_vport_criteria[0x8];
> > +	u8	   native_port_num[0x8];
> > +	u8	   num_vhca_ports[0x8];
> > +	u8	   reserved_at_618[0x6];
> > +	u8	   sw_owner_id[0x1];
> > +	u8	   reserved_at_61f[0x1e1];
> > +};
> > +
> > +struct mlx5_ifc_qos_cap_bits {
> > +	u8         packet_pacing[0x1];
> > +	u8         esw_scheduling[0x1];
> > +	u8         esw_bw_share[0x1];
> > +	u8         esw_rate_limit[0x1];
> > +	u8         reserved_at_4[0x1];
> > +	u8         packet_pacing_burst_bound[0x1];
> > +	u8         packet_pacing_typical_size[0x1];
> > +	u8         flow_meter_srtcm[0x1];
> > +	u8         reserved_at_8[0x8];
> > +	u8         log_max_flow_meter[0x8];
> > +	u8         flow_meter_reg_id[0x8];
> > +	u8         reserved_at_25[0x20];
> > +	u8         packet_pacing_max_rate[0x20];
> > +	u8         packet_pacing_min_rate[0x20];
> > +	u8         reserved_at_80[0x10];
> > +	u8         packet_pacing_rate_table_size[0x10];
> > +	u8         esw_element_type[0x10];
> > +	u8         esw_tsar_type[0x10];
> > +	u8         reserved_at_c0[0x10];
> > +	u8         max_qos_para_vport[0x10];
> > +	u8         max_tsar_bw_share[0x20];
> > +	u8         reserved_at_100[0x6e8];
> > +};
> > +
> > +union mlx5_ifc_hca_cap_union_bits {
> > +	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
> > +	struct mlx5_ifc_qos_cap_bits qos_cap;
> > +	u8         reserved_at_0[0x8000];
> > +};
> > +
> > +struct mlx5_ifc_query_hca_cap_out_bits {
> > +	u8         status[0x8];
> > +	u8         reserved_at_8[0x18];
> > +	u8         syndrome[0x20];
> > +	u8         reserved_at_40[0x40];
> > +	union mlx5_ifc_hca_cap_union_bits capability;
> > +};
> > +
> > +struct mlx5_ifc_query_hca_cap_in_bits {
> > +	u8         opcode[0x10];
> > +	u8         reserved_at_10[0x10];
> > +	u8         reserved_at_20[0x10];
> > +	u8         op_mod[0x10];
> > +	u8         reserved_at_40[0x40];
> > +};
> > +
> >  /* CQE format mask. */
> >  #define MLX5E_CQE_FORMAT_MASK 0xc
> >
> > --
> > 1.8.3.1
> 
> Thanks
> Yongseok
  

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 93bc869..2b72a33 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -161,6 +161,11 @@  mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum MLX5DV_DR_NS_TYPE_TERMINATING \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_MLX5DV_DR_ESWITCH \
+		infiniband/mlx5dv.h \
+		enum MLX5DV_DR_NS_DOMAIN_FDB_BYPASS \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_IBV_DEVX_OBJ \
 		infiniband/mlx5dv.h \
 		func mlx5dv_devx_obj_create \
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 0037e15..9dfd28d 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -113,6 +113,8 @@  if build
 		'MLX5DV_FLOW_ACTION_COUNTERS_DEVX' ],
 		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
 		'MLX5DV_DR_NS_TYPE_TERMINATING' ],
+		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h',
+		'MLX5DV_DR_NS_DOMAIN_FDB_BYPASS' ],
 		[ 'HAVE_SUPPORTED_40000baseKR4_Full', 'linux/ethtool.h',
 		'SUPPORTED_40000baseKR4_Full' ],
 		[ 'HAVE_SUPPORTED_40000baseCR4_Full', 'linux/ethtool.h',
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9ff50df..938ba1c 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -101,6 +101,9 @@ 
 /* Allow L3 VXLAN flow creation. */
 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
 
+/* Activate DV eswitch flow steering. */
+#define MLX5_DV_ESWITCH_EN "dv_eswitch_en"
+
 /* Activate DV flow steering. */
 #define MLX5_DV_FLOW_EN "dv_flow_en"
 
@@ -344,6 +347,18 @@  struct mlx5_dev_spawn_data {
 	}
 	pthread_mutex_init(&sh->dv_mutex, NULL);
 	sh->tx_ns = ns;
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	if (priv->config.dv_eswitch_en) {
+		ns  = mlx5_glue->dr_create_ns(sh->ctx,
+					      MLX5DV_DR_NS_DOMAIN_FDB_BYPASS);
+		if (!ns) {
+			DRV_LOG(ERR, "FDB mlx5dv_dr_create_ns failed");
+			err = errno;
+			goto error;
+		}
+		sh->fdb_ns = ns;
+	}
+#endif
 	sh->dv_refcnt++;
 	priv->dr_shared = 1;
 	return 0;
@@ -358,6 +373,10 @@  struct mlx5_dev_spawn_data {
 		mlx5dv_dr_destroy_ns(sh->tx_ns);
 		sh->tx_ns = NULL;
 	}
+	if (sh->fdb_ns) {
+		mlx5_glue->dr_destroy_ns(sh->fdb_ns);
+		sh->fdb_ns = NULL;
+	}
 	return err;
 #else
 	(void)priv;
@@ -393,6 +412,12 @@  struct mlx5_dev_spawn_data {
 		mlx5dv_dr_destroy_ns(sh->tx_ns);
 		sh->tx_ns = NULL;
 	}
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	if (sh->fdb_ns) {
+		mlx5_glue->dr_destroy_ns(sh->fdb_ns);
+		sh->fdb_ns = NULL;
+	}
+#endif
 	pthread_mutex_destroy(&sh->dv_mutex);
 #else
 	(void)priv;
@@ -861,6 +886,8 @@  struct mlx5_dev_spawn_data {
 		config->l3_vxlan_en = !!tmp;
 	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
 		config->vf_nl_en = !!tmp;
+	} else if (strcmp(MLX5_DV_ESWITCH_EN, key) == 0) {
+		config->dv_eswitch_en = !!tmp;
 	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
 		config->dv_flow_en = !!tmp;
 	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
@@ -905,6 +932,7 @@  struct mlx5_dev_spawn_data {
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
 		MLX5_VF_NL_EN,
+		MLX5_DV_ESWITCH_EN,
 		MLX5_DV_FLOW_EN,
 		MLX5_MR_EXT_MEMSEG_EN,
 		MLX5_REPRESENTOR,
@@ -1458,11 +1486,6 @@  struct mlx5_dev_spawn_data {
 			priv->tcf_context = NULL;
 		}
 	}
-	if (config.dv_flow_en) {
-		err = mlx5_alloc_shared_dr(priv);
-		if (err)
-			goto error;
-	}
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1484,8 +1507,26 @@  struct mlx5_dev_spawn_data {
 	 * Verbs context returned by ibv_open_device().
 	 */
 	mlx5_link_update(eth_dev, 0);
+#ifdef HAVE_IBV_DEVX_OBJ
+	err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
+	if (err) {
+		err = -err;
+		goto error;
+	}
+#endif
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	if (!config.hca_attr.eswitch_manager)
+		config.dv_eswitch_en = 0;
+#else
+	config.dv_eswitch_en = 0;
+#endif
 	/* Store device configuration on private structure. */
 	priv->config = config;
+	if (config.dv_flow_en) {
+		err = mlx5_alloc_shared_dr(priv);
+		if (err)
+			goto error;
+	}
 	/* Supported Verbs flow priority number detection. */
 	err = mlx5_flow_discover_priorities(eth_dev);
 	if (err < 0) {
@@ -1876,6 +1917,7 @@  struct mlx5_dev_spawn_data {
 			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
 			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
 		},
+		.dv_eswitch_en = 1,
 	};
 	/* Device specific configuration. */
 	switch (pci_dev->id.device_id) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 14c7f3c..33a4127 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -138,6 +138,11 @@  struct mlx5_devx_counter_set {
 	int id; /* Flow counter ID */
 };
 
+/* HCA attributes. */
+struct mlx5_hca_attr {
+	uint32_t eswitch_manager:1;
+};
+
 /* Flow list . */
 TAILQ_HEAD(mlx5_flows, rte_flow);
 
@@ -171,6 +176,7 @@  struct mlx5_dev_config {
 	/* Whether memseg should be extended for MR creation. */
 	unsigned int l3_vxlan_en:1; /* Enable L3 VXLAN flow creation. */
 	unsigned int vf_nl_en:1; /* Enable Netlink requests in VF mode. */
+	unsigned int dv_eswitch_en:1; /* Enable eswitch DV flow. */
 	unsigned int dv_flow_en:1; /* Enable DV flow. */
 	unsigned int swp:1; /* Tx generic tunnel checksum and TSO offload. */
 	unsigned int devx:1; /* Whether devx interface is available or not. */
@@ -192,6 +198,7 @@  struct mlx5_dev_config {
 	int txqs_inline; /* Queue number threshold for inlining. */
 	int txqs_vec; /* Queue number threshold for vectorized Tx. */
 	int inline_max_packet_sz; /* Max packet size for inlining. */
+	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 };
 
 /**
@@ -241,6 +248,7 @@  struct mlx5_flow_tbl_resource {
 };
 
 #define MLX5_MAX_TABLES 1024
+#define MLX5_MAX_TABLES_FDB 32
 #define MLX5_GROUP_FACTOR 1
 
 /*
@@ -260,6 +268,8 @@  struct mlx5_ibv_shared {
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_refcnt; /* DV/DR data reference counter. */
+	void *fdb_ns; /* FDB Direct Rules name space handle. */
+	struct mlx5_flow_tbl_resource fdb_tbl[MLX5_MAX_TABLES_FDB];
 	void *rx_ns; /* RX Direct Rules name space handle. */
 	struct mlx5_flow_tbl_resource rx_tbl[MLX5_MAX_TABLES];
 	/* RX Direct Rules tables. */
@@ -539,4 +549,6 @@  int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
 int mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_counter_set *dcx,
 				     int clear,
 				     uint64_t *pkts, uint64_t *bytes);
+int mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
+				 struct mlx5_hca_attr *attr);
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
index a9dff58..3caea41 100644
--- a/drivers/net/mlx5/mlx5_devx_cmds.c
+++ b/drivers/net/mlx5/mlx5_devx_cmds.c
@@ -105,3 +105,45 @@  int mlx5_devx_cmd_flow_counter_free(struct mlx5dv_devx_obj *obj)
 	*bytes = MLX5_GET64(traffic_counter, stats, octets);
 	return 0;
 }
+
+/**
+ * Query HCA attributes.
+ *
+ * @param[in] ctx
+ *   ibv contexts returned from mlx5dv_open_device.
+ * @param[out] attr
+ *   Attributes device values.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int
+mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
+			     struct mlx5_hca_attr *attr)
+{
+	uint32_t in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {0};
+	uint32_t out[MLX5_ST_SZ_DW(query_hca_cap_out)] = {0};
+	void *hcattr;
+	int status, syndrome, rc;
+
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod,
+		 MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE |
+		 MLX5_HCA_CAP_OPMOD_GET_CUR);
+
+	rc = mlx5_glue->devx_general_cmd(ctx,
+					 in, sizeof(in), out, sizeof(out));
+	if (rc)
+		return rc;
+	status = MLX5_GET(query_hca_cap_out, out, status);
+	syndrome = MLX5_GET(query_hca_cap_out, out, syndrome);
+	if (status) {
+		DRV_LOG(DEBUG, "Failed to query devx HCA capabilities, "
+			"status %x, syndrome = %x",
+			status, syndrome);
+		return -1;
+	}
+	hcattr = MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+	attr->eswitch_manager = MLX5_GET(cmd_hca_cap, hcattr, eswitch_manager);
+	return 0;
+}
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index a0683ee..83abc14 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -1784,7 +1784,7 @@  uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 	struct mlx5_priv *priv = dev->data->dev_private;
 	enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX;
 
-	if (attr->transfer)
+	if (attr->transfer && !priv->config.dv_eswitch_en)
 		type = MLX5_FLOW_TYPE_TCF;
 	else
 		type = priv->config.dv_flow_en ? MLX5_FLOW_TYPE_DV :
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index b15266f..b25d4e8 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -529,6 +529,7 @@  enum {
 };
 
 enum {
+	MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
 	MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939,
 	MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b,
 };
@@ -591,6 +592,333 @@  struct mlx5_ifc_query_flow_counter_in_bits {
 	u8         flow_counter_id[0x20];
 };
 
+enum { /* op_mod capability group; bit 0 is left for the MAX/CUR selector. */
+	MLX5_GET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
+	MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP        = 0xc << 1,
+};
+
+enum { /* op_mod selector, OR-ed into bit 0 of the capability group value. */
+	MLX5_HCA_CAP_OPMOD_GET_MAX   = 0,
+	MLX5_HCA_CAP_OPMOD_GET_CUR   = 1,
+};
+
+struct mlx5_ifc_cmd_hca_cap_bits { /* General device capability layout. */
+	u8         reserved_at_0[0x30];
+	u8         vhca_id[0x10];
+	u8         reserved_at_40[0x40];
+	u8         log_max_srq_sz[0x8];
+	u8         log_max_qp_sz[0x8];
+	u8         reserved_at_90[0xb];
+	u8         log_max_qp[0x5];
+	u8         reserved_at_a0[0xb];
+	u8         log_max_srq[0x5];
+	u8         reserved_at_b0[0x10];
+	u8         reserved_at_c0[0x8];
+	u8         log_max_cq_sz[0x8];
+	u8         reserved_at_d0[0xb];
+	u8         log_max_cq[0x5];
+	u8         log_max_eq_sz[0x8];
+	u8         reserved_at_e8[0x2];
+	u8         log_max_mkey[0x6];
+	u8         reserved_at_f0[0x8];
+	u8         dump_fill_mkey[0x1];
+	u8         reserved_at_f9[0x3];
+	u8         log_max_eq[0x4];
+	u8         max_indirection[0x8];
+	u8         fixed_buffer_size[0x1];
+	u8         log_max_mrw_sz[0x7];
+	u8         force_teardown[0x1];
+	u8         reserved_at_111[0x1];
+	u8         log_max_bsf_list_size[0x6];
+	u8         umr_extended_translation_offset[0x1];
+	u8         null_mkey[0x1];
+	u8         log_max_klm_list_size[0x6];
+	u8         reserved_at_120[0xa];
+	u8         log_max_ra_req_dc[0x6];
+	u8         reserved_at_130[0xa];
+	u8         log_max_ra_res_dc[0x6];
+	u8         reserved_at_140[0xa];
+	u8         log_max_ra_req_qp[0x6];
+	u8         reserved_at_150[0xa];
+	u8         log_max_ra_res_qp[0x6];
+	u8         end_pad[0x1];
+	u8         cc_query_allowed[0x1];
+	u8         cc_modify_allowed[0x1];
+	u8         start_pad[0x1];
+	u8         cache_line_128byte[0x1];
+	u8         reserved_at_165[0xa];
+	u8         qcam_reg[0x1];
+	u8         gid_table_size[0x10];
+	u8         out_of_seq_cnt[0x1];
+	u8         vport_counters[0x1];
+	u8         retransmission_q_counters[0x1];
+	u8         debug[0x1];
+	u8         modify_rq_counter_set_id[0x1];
+	u8         rq_delay_drop[0x1];
+	u8         max_qp_cnt[0xa];
+	u8         pkey_table_size[0x10];
+	u8         vport_group_manager[0x1];
+	u8         vhca_group_manager[0x1];
+	u8         ib_virt[0x1];
+	u8         eth_virt[0x1];
+	u8         vnic_env_queue_counters[0x1];
+	u8         ets[0x1];
+	u8         nic_flow_table[0x1];
+	u8         eswitch_manager[0x1]; /* Copied into mlx5_hca_attr. */
+	u8         device_memory[0x1];
+	u8         mcam_reg[0x1];
+	u8         pcam_reg[0x1];
+	u8         local_ca_ack_delay[0x5];
+	u8         port_module_event[0x1];
+	u8         enhanced_error_q_counters[0x1];
+	u8         ports_check[0x1];
+	u8         reserved_at_1b3[0x1];
+	u8         disable_link_up[0x1];
+	u8         beacon_led[0x1];
+	u8         port_type[0x2];
+	u8         num_ports[0x8];
+	u8         reserved_at_1c0[0x1];
+	u8         pps[0x1];
+	u8         pps_modify[0x1];
+	u8         log_max_msg[0x5];
+	u8         reserved_at_1c8[0x4];
+	u8         max_tc[0x4];
+	u8         temp_warn_event[0x1];
+	u8         dcbx[0x1];
+	u8         general_notification_event[0x1];
+	u8         reserved_at_1d3[0x2];
+	u8         fpga[0x1];
+	u8         rol_s[0x1];
+	u8         rol_g[0x1];
+	u8         reserved_at_1d8[0x1];
+	u8         wol_s[0x1];
+	u8         wol_g[0x1];
+	u8         wol_a[0x1];
+	u8         wol_b[0x1];
+	u8         wol_m[0x1];
+	u8         wol_u[0x1];
+	u8         wol_p[0x1];
+	u8         stat_rate_support[0x10];
+	u8         reserved_at_1f0[0xc];
+	u8         cqe_version[0x4];
+	u8         compact_address_vector[0x1];
+	u8         striding_rq[0x1];
+	u8         reserved_at_202[0x1];
+	u8         ipoib_enhanced_offloads[0x1];
+	u8         ipoib_basic_offloads[0x1];
+	u8         reserved_at_205[0x1];
+	u8         repeated_block_disabled[0x1];
+	u8         umr_modify_entity_size_disabled[0x1];
+	u8         umr_modify_atomic_disabled[0x1];
+	u8         umr_indirect_mkey_disabled[0x1];
+	u8         umr_fence[0x2];
+	u8         reserved_at_20c[0x3];
+	u8         drain_sigerr[0x1];
+	u8         cmdif_checksum[0x2];
+	u8         sigerr_cqe[0x1];
+	u8         reserved_at_213[0x1];
+	u8         wq_signature[0x1];
+	u8         sctr_data_cqe[0x1];
+	u8         reserved_at_216[0x1];
+	u8         sho[0x1];
+	u8         tph[0x1];
+	u8         rf[0x1];
+	u8         dct[0x1];
+	u8         qos[0x1];
+	u8         eth_net_offloads[0x1];
+	u8         roce[0x1];
+	u8         atomic[0x1];
+	u8         reserved_at_21f[0x1];
+	u8         cq_oi[0x1];
+	u8         cq_resize[0x1];
+	u8         cq_moderation[0x1];
+	u8         reserved_at_223[0x3];
+	u8         cq_eq_remap[0x1];
+	u8         pg[0x1];
+	u8         block_lb_mc[0x1];
+	u8         reserved_at_229[0x1];
+	u8         scqe_break_moderation[0x1];
+	u8         cq_period_start_from_cqe[0x1];
+	u8         cd[0x1];
+	u8         reserved_at_22d[0x1];
+	u8         apm[0x1];
+	u8         vector_calc[0x1];
+	u8         umr_ptr_rlky[0x1];
+	u8	   imaicl[0x1];
+	u8         reserved_at_232[0x4];
+	u8         qkv[0x1];
+	u8         pkv[0x1];
+	u8         set_deth_sqpn[0x1];
+	u8         reserved_at_239[0x3];
+	u8         xrc[0x1];
+	u8         ud[0x1];
+	u8         uc[0x1];
+	u8         rc[0x1];
+	u8         uar_4k[0x1];
+	u8         reserved_at_241[0x9];
+	u8         uar_sz[0x6];
+	u8         reserved_at_250[0x8];
+	u8         log_pg_sz[0x8];
+	u8         bf[0x1];
+	u8         driver_version[0x1];
+	u8         pad_tx_eth_packet[0x1];
+	u8         reserved_at_263[0x8];
+	u8         log_bf_reg_size[0x5];
+	u8         reserved_at_270[0xb];
+	u8         lag_master[0x1];
+	u8         num_lag_ports[0x4];
+	u8         reserved_at_280[0x10];
+	u8         max_wqe_sz_sq[0x10];
+	u8         reserved_at_2a0[0x10];
+	u8         max_wqe_sz_rq[0x10];
+	u8         max_flow_counter_31_16[0x10];
+	u8         max_wqe_sz_sq_dc[0x10];
+	u8         reserved_at_2e0[0x7];
+	u8         max_qp_mcg[0x19];
+	u8         reserved_at_300[0x10];
+	u8         flow_counter_bulk_alloc[0x08];
+	u8         log_max_mcg[0x8];
+	u8         reserved_at_320[0x3];
+	u8         log_max_transport_domain[0x5];
+	u8         reserved_at_328[0x3];
+	u8         log_max_pd[0x5];
+	u8         reserved_at_330[0xb];
+	u8         log_max_xrcd[0x5];
+	u8         nic_receive_steering_discard[0x1];
+	u8         receive_discard_vport_down[0x1];
+	u8         transmit_discard_vport_down[0x1];
+	u8         reserved_at_343[0x5];
+	u8         log_max_flow_counter_bulk[0x8];
+	u8         max_flow_counter_15_0[0x10];
+	u8         reserved_at_360[0x3];
+	u8         log_max_rq[0x5];
+	u8         reserved_at_368[0x3];
+	u8         log_max_sq[0x5];
+	u8         reserved_at_370[0x3];
+	u8         log_max_tir[0x5];
+	u8         reserved_at_378[0x3];
+	u8         log_max_tis[0x5];
+	u8         basic_cyclic_rcv_wqe[0x1];
+	u8         reserved_at_381[0x2];
+	u8         log_max_rmp[0x5];
+	u8         reserved_at_388[0x3];
+	u8         log_max_rqt[0x5];
+	u8         reserved_at_390[0x3];
+	u8         log_max_rqt_size[0x5];
+	u8         reserved_at_398[0x3];
+	u8         log_max_tis_per_sq[0x5];
+	u8         ext_stride_num_range[0x1];
+	u8         reserved_at_3a1[0x2];
+	u8         log_max_stride_sz_rq[0x5];
+	u8         reserved_at_3a8[0x3];
+	u8         log_min_stride_sz_rq[0x5];
+	u8         reserved_at_3b0[0x3];
+	u8         log_max_stride_sz_sq[0x5];
+	u8         reserved_at_3b8[0x3];
+	u8         log_min_stride_sz_sq[0x5];
+	u8         hairpin[0x1];
+	u8         reserved_at_3c1[0x2];
+	u8         log_max_hairpin_queues[0x5];
+	u8         reserved_at_3c8[0x3];
+	u8         log_max_hairpin_wq_data_sz[0x5];
+	u8         reserved_at_3d0[0x3];
+	u8         log_max_hairpin_num_packets[0x5];
+	u8         reserved_at_3d8[0x3];
+	u8         log_max_wq_sz[0x5];
+	u8         nic_vport_change_event[0x1];
+	u8         disable_local_lb_uc[0x1];
+	u8         disable_local_lb_mc[0x1];
+	u8         log_min_hairpin_wq_data_sz[0x5];
+	u8         reserved_at_3e8[0x3];
+	u8         log_max_vlan_list[0x5];
+	u8         reserved_at_3f0[0x3];
+	u8         log_max_current_mc_list[0x5];
+	u8         reserved_at_3f8[0x3];
+	u8         log_max_current_uc_list[0x5];
+	u8         general_obj_types[0x40];
+	u8         reserved_at_440[0x20];
+	u8         reserved_at_460[0x10];
+	u8         max_num_eqs[0x10];
+	u8         reserved_at_480[0x3];
+	u8         log_max_l2_table[0x5];
+	u8         reserved_at_488[0x8];
+	u8         log_uar_page_sz[0x10];
+	u8         reserved_at_4a0[0x20];
+	u8         device_frequency_mhz[0x20];
+	u8         device_frequency_khz[0x20];
+	u8         reserved_at_500[0x20];
+	u8	   num_of_uars_per_page[0x20];
+	u8         flex_parser_protocols[0x20];
+	u8         reserved_at_560[0x20];
+	u8         reserved_at_580[0x3c];
+	u8         mini_cqe_resp_stride_index[0x1];
+	u8         cqe_128_always[0x1];
+	u8         cqe_compression_128[0x1];
+	u8         cqe_compression[0x1];
+	u8         cqe_compression_timeout[0x10];
+	u8         cqe_compression_max_num[0x10];
+	u8         reserved_at_5e0[0x10];
+	u8         tag_matching[0x1];
+	u8         rndv_offload_rc[0x1];
+	u8         rndv_offload_dc[0x1];
+	u8         log_tag_matching_list_sz[0x5];
+	u8         reserved_at_5f8[0x3];
+	u8         log_max_xrq[0x5];
+	u8	   affiliate_nic_vport_criteria[0x8];
+	u8	   native_port_num[0x8];
+	u8	   num_vhca_ports[0x8];
+	u8	   reserved_at_618[0x6];
+	u8	   sw_owner_id[0x1];
+	u8	   reserved_at_61f[0x1e1]; /* Field widths sum to 0x800. */
+};
+
+struct mlx5_ifc_qos_cap_bits { /* QoS capability layout (QOS_CAP op_mod). */
+	u8         packet_pacing[0x1];
+	u8         esw_scheduling[0x1];
+	u8         esw_bw_share[0x1];
+	u8         esw_rate_limit[0x1];
+	u8         reserved_at_4[0x1];
+	u8         packet_pacing_burst_bound[0x1];
+	u8         packet_pacing_typical_size[0x1];
+	u8         flow_meter_srtcm[0x1];
+	u8         reserved_at_8[0x8];
+	u8         log_max_flow_meter[0x8];
+	u8         flow_meter_reg_id[0x8];
+	u8         reserved_at_20[0x20]; /* Name follows the real offset. */
+	u8         packet_pacing_max_rate[0x20];
+	u8         packet_pacing_min_rate[0x20];
+	u8         reserved_at_80[0x10];
+	u8         packet_pacing_rate_table_size[0x10];
+	u8         esw_element_type[0x10];
+	u8         esw_tsar_type[0x10];
+	u8         reserved_at_c0[0x10];
+	u8         max_qos_para_vport[0x10];
+	u8         max_tsar_bw_share[0x20];
+	u8         reserved_at_100[0x6e8]; /* NOTE(review): total is 0x7e8. */
+};
+
+union mlx5_ifc_hca_cap_union_bits { /* One view per capability group. */
+	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+	struct mlx5_ifc_qos_cap_bits qos_cap;
+	u8         reserved_at_0[0x8000]; /* Largest member: sets the size. */
+};
+
+struct mlx5_ifc_query_hca_cap_out_bits { /* QUERY_HCA_CAP output layout. */
+	u8         status[0x8]; /* Non-zero: the query reports failure. */
+	u8         reserved_at_8[0x18];
+	u8         syndrome[0x20]; /* Logged with status on failure. */
+	u8         reserved_at_40[0x40];
+	union mlx5_ifc_hca_cap_union_bits capability; /* Starts at 0x80. */
+};
+
+struct mlx5_ifc_query_hca_cap_in_bits { /* QUERY_HCA_CAP input layout. */
+	u8         opcode[0x10]; /* Set to MLX5_CMD_OP_QUERY_HCA_CAP. */
+	u8         reserved_at_10[0x10];
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10]; /* Capability group OR-ed with MAX/CUR. */
+	u8         reserved_at_40[0x40];
+};
+
 /* CQE format mask. */
 #define MLX5E_CQE_FORMAT_MASK 0xc