[v2,05/13] net/mlx5: add IB shared context alloc/free functions

Message ID 1553533414-9911-6-git-send-email-viacheslavo@mellanox.com
State Changes Requested, archived
Delegated to: Shahaf Shuler
Headers show
Series
  • net/mlx5: add support for multiport IB devices
Related show

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Viacheslav Ovsiienko March 25, 2019, 5:03 p.m.
The Mellanox NICs support SR-IOV and have E-Switch feature.
When SR-IOV is set up in switchdev mode and E-Switch is enabled
we have so called VF representors in the system. All representors
belonging to the same E-Switch are created on the basis of the
single PCI function and with current implementation each representor
has its own dedicated Infiniband device and operates within its
own Infiniband context. It is proposed to provide representors
as ports of the single Infiniband device and operate on the
shared Infiniband context saving various resources. This patch
introduces appropriate structures.

Also, the functions to allocate and free shared IB context for
multiport are added. The IB device context, Protection Domain,
device attributes, Infiniband names are going to be relocated
to the shared structure from the device private one.
mlx5_dev_spawn() is updated to support shared context.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c | 298 +++++++++++++++++++++++++++++++++---------------
 drivers/net/mlx5/mlx5.h |  32 ++++++
 2 files changed, 235 insertions(+), 95 deletions(-)

Comments

Shahaf Shuler March 26, 2019, 12:10 p.m. | #1
Monday, March 25, 2019 7:03 PM, Viacheslav Ovsiienko:
> Subject: [PATCH v2 05/13] net/mlx5: add IB shared context alloc/free
> functions
> 
> The Mellanox NICs support SR-IOV and have E-Switch feature.
> When SR-IOV is set up in switchdev mode and E-Switch is enabled we have
> so called VF representors in the system. All representors belonging to the
> same E-Switch are created on the basis of the single PCI function and with
> current implementation each representor has its own dedicated Infiniband
> device and operates within its own Infiniband context. It is proposed to
> provide representors as ports of the single Infiniband device and operate on
> the shared Infiniband context saving various resources. This patch introduces
> appropriate structures.
> 
> Also, the functions to allocate and free shared IB context for multiport are
> added. The IB device context, Protection Domain, device attributes,
> Infiniband names are going to be relocated to the shared structure from the
> device private one.
> mlx5_dev_spawn() is updated to support shared context.
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c | 298 +++++++++++++++++++++++++++++++++---
> ------------
>  drivers/net/mlx5/mlx5.h |  32 ++++++
>  2 files changed, 235 insertions(+), 95 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 4338254..4c9621a 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -140,6 +140,141 @@ struct mlx5_dev_spawn_data {
>  	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */  };
> 
> +static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list =
> +LIST_HEAD_INITIALIZER();
> +
> +/**
> + * Allocate shared IB device context. If there is multiport device the
> + * master and representors will share this context, if there is single
> + * port dedicated IB device, the context will be used by only given
> + * port due to unification.
> + *
> + * Routine first searches the context for the specified IB device name,
> + * if found the shared context assumed and reference counter is
> incremented.
> + * If no context found the new one is created and initialized with
> +specified
> + * IB device context and parameters.
> + *
> + * @param[in] spawn
> + *   Pointer to the IB device attributes (name, port, etc).
> + *
> + * @return
> + *   Pointer to mlx5_ibv_shared object on success,
> + *   otherwise NULL and rte_errno is set.
> + */
> +static struct mlx5_ibv_shared *
> +mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn) {

Sorry Slava, I mislead you.
Even though it is not expected for probe to be called by multiple thread it is safer to protect this list as it is global resources.

It is not on the critical datapath, so no big concerns w/ mutex. 
Please add proper synchronization when manipulating this shared contexts. 

> +	struct mlx5_ibv_shared *sh;
> +	int err = 0;
> +
> +	assert(spawn);
> +	/* Secondary process should not create the shared context. */
> +	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +	/* Search for IB context by device name. */
> +	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
> +		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
> +			sh->refcnt++;
> +			return sh;
> +		}
> +	}
> +	/* No device found, we have to create new shared context. */
> +	assert(spawn->max_port);
> +	sh = rte_zmalloc("ethdev shared ib context",
> +			 sizeof(struct mlx5_ibv_shared) +
> +			 spawn->max_port *
> +			 sizeof(struct mlx5_ibv_shared_port),
> +			 RTE_CACHE_LINE_SIZE);
> +	if (!sh) {
> +		DRV_LOG(ERR, "shared context allocation failure");
> +		rte_errno  = ENOMEM;
> +		return NULL;
> +	}
> +	/* Try to open IB device with DV first, then usual Verbs. */
> +	errno = 0;
> +	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
> +	if (sh->ctx) {
> +		sh->devx = 1;
> +		DRV_LOG(DEBUG, "DevX is supported");
> +	} else {
> +		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
> +		if (!sh->ctx) {
> +			err = errno ? errno : ENODEV;
> +			goto error;
> +		}
> +		DRV_LOG(DEBUG, "DevX is NOT supported");
> +	}
> +	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh-
> >device_attr);
> +	if (err) {
> +		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
> +		goto error;
> +	}
> +	sh->refcnt = 1;
> +	sh->max_port = spawn->max_port;
> +	strncpy(sh->ibdev_name, sh->ctx->device->name,
> +		sizeof(sh->ibdev_name));
> +	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
> +		sizeof(sh->ibdev_path));
> +	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
> +	if (sh->pd == NULL) {
> +		DRV_LOG(ERR, "PD allocation failure");
> +		err = ENOMEM;
> +		goto error;
> +	}
> +	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
> +	return sh;
> +error:
> +	assert(sh);
> +	if (sh->pd)
> +		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
> +	if (sh->ctx)
> +		claim_zero(mlx5_glue->close_device(sh->ctx));
> +	rte_free(sh);
> +	assert(err > 0);
> +	rte_errno = err;
> +	return NULL;
> +}
> +
> +/**
> + * Free shared IB device context. Decrement counter and if zero free
> + * all allocated resources and close handles.
> + *
> + * @param[in] sh
> + *   Pointer to mlx5_ibv_shared object to free
> + */
> +static void
> +mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh) { #ifndef NDEBUG
> +	/* Check the object presence in the list. */
> +	struct mlx5_ibv_shared *lctx;
> +
> +	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
> +		if (lctx == sh)
> +			break;
> +	assert(lctx);
> +	if (lctx != sh) {
> +		DRV_LOG(ERR, "Freeing non-existing shared IB context");
> +		return;
> +	}
> +#endif
> +	assert(sh);
> +	assert(sh->refcnt);
> +	if (--sh->refcnt)
> +		return;
> +	/* Zero reference counter, we should release resources. */
> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +		assert(sh->secondary);
> +		assert(sh->ctx);
> +		assert(!sh->pd);
> +	}
> +	LIST_REMOVE(sh, next);
> +	if (sh->pd)
> +		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
> +	if (sh->ctx)
> +		claim_zero(mlx5_glue->close_device(sh->ctx));
> +	rte_free(sh);
> +}
> +
> +
>  /**
>   * Prepare shared data between primary and secondary process.
>   */
> @@ -289,12 +424,10 @@ struct mlx5_dev_spawn_data {
>  	}
>  	mlx5_mprq_free_mp(dev);
>  	mlx5_mr_release(dev);
> -	if (priv->pd != NULL) {
> -		assert(priv->ctx != NULL);
> -		claim_zero(mlx5_glue->dealloc_pd(priv->pd));
> -		claim_zero(mlx5_glue->close_device(priv->ctx));
> -	} else
> -		assert(priv->ctx == NULL);
> +	assert(priv->sh);
> +	if (priv->sh)
> +		mlx5_free_shared_ibctx(priv->sh);
> +	priv->sh = NULL;
>  	if (priv->rss_conf.rss_key != NULL)
>  		rte_free(priv->rss_conf.rss_key);
>  	if (priv->reta_idx != NULL)
> @@ -744,11 +877,8 @@ struct mlx5_dev_spawn_data {
>  	       struct mlx5_dev_config config)
>  {
>  	const struct mlx5_switch_info *switch_info = &spawn->info;
> -	struct ibv_device *ibv_dev = spawn->ibv_dev;
> -	struct ibv_context *ctx = NULL;
> -	struct ibv_device_attr_ex attr;
> +	struct mlx5_ibv_shared *sh;
>  	struct ibv_port_attr port_attr;
> -	struct ibv_pd *pd = NULL;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
>  	struct rte_eth_dev *eth_dev = NULL;
>  	struct mlx5_priv *priv = NULL;
> @@ -807,18 +937,47 @@ struct mlx5_dev_spawn_data {
>  	}
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> -	errno = 0;
> -	ctx = mlx5_glue->dv_open_device(ibv_dev);
> -	if (ctx) {
> -		config.devx = 1;
> -		DRV_LOG(DEBUG, "DEVX is supported");
> -	} else {
> -		ctx = mlx5_glue->open_device(ibv_dev);
> -		if (!ctx) {
> -			rte_errno = errno ? errno : ENODEV;
> -			return NULL;
> +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +		eth_dev = rte_eth_dev_attach_secondary(name);
> +		if (eth_dev == NULL) {
> +			DRV_LOG(ERR, "can not attach rte ethdev");
> +			rte_errno = ENOMEM;
> +			err = rte_errno;
> +			goto error;
> +		}
> +		eth_dev->device = dpdk_dev;
> +		eth_dev->dev_ops = &mlx5_dev_sec_ops;
> +		err = mlx5_uar_init_secondary(eth_dev);
> +		if (err) {
> +			err = rte_errno;
> +			goto error;
>  		}
> +		/* Receive command fd from primary process */
> +		err = mlx5_socket_connect(eth_dev);
> +		if (err < 0) {
> +			err = rte_errno;
> +			goto error;
> +		}
> +		/* Remap UAR for Tx queues. */
> +		err = mlx5_tx_uar_remap(eth_dev, err);
> +		if (err) {
> +			err = rte_errno;
> +			goto error;
> +		}
> +		/*
> +		 * Ethdev pointer is still required as input since
> +		 * the primary device is not accessible from the
> +		 * secondary process.
> +		 */
> +		eth_dev->rx_pkt_burst =
> mlx5_select_rx_function(eth_dev);
> +		eth_dev->tx_pkt_burst =
> mlx5_select_tx_function(eth_dev);
> +		return eth_dev;
>  	}
> +	sh = mlx5_alloc_shared_ibctx(spawn);
> +	if (!sh)
> +		return NULL;
> +	config.devx = sh->devx;
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;  #endif @@
> -832,7 +991,7 @@ struct mlx5_dev_spawn_data {  #ifdef
> HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
> #endif
> -	mlx5_glue->dv_query_device(ctx, &dv_attr);
> +	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
>  	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
>  		if (dv_attr.flags &
> MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
>  			DRV_LOG(DEBUG, "enhanced MPW is supported");
> @@ -917,51 +1076,8 @@ struct mlx5_dev_spawn_data {
>  		" old OFED/rdma-core version or firmware configuration");
> #endif
>  	config.mpls_en = mpls_en;
> -	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> -	if (err) {
> -		DEBUG("ibv_query_device_ex() failed");
> -		goto error;
> -	}
> -	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
> -	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> -		eth_dev = rte_eth_dev_attach_secondary(name);
> -		if (eth_dev == NULL) {
> -			DRV_LOG(ERR, "can not attach rte ethdev");
> -			rte_errno = ENOMEM;
> -			err = rte_errno;
> -			goto error;
> -		}
> -		eth_dev->device = dpdk_dev;
> -		eth_dev->dev_ops = &mlx5_dev_sec_ops;
> -		err = mlx5_uar_init_secondary(eth_dev);
> -		if (err) {
> -			err = rte_errno;
> -			goto error;
> -		}
> -		/* Receive command fd from primary process */
> -		err = mlx5_socket_connect(eth_dev);
> -		if (err < 0) {
> -			err = rte_errno;
> -			goto error;
> -		}
> -		/* Remap UAR for Tx queues. */
> -		err = mlx5_tx_uar_remap(eth_dev, err);
> -		if (err) {
> -			err = rte_errno;
> -			goto error;
> -		}
> -		/*
> -		 * Ethdev pointer is still required as input since
> -		 * the primary device is not accessible from the
> -		 * secondary process.
> -		 */
> -		eth_dev->rx_pkt_burst =
> mlx5_select_rx_function(eth_dev);
> -		eth_dev->tx_pkt_burst =
> mlx5_select_tx_function(eth_dev);
> -		claim_zero(mlx5_glue->close_device(ctx));
> -		return eth_dev;
> -	}
>  	/* Check port status. */
> -	err = mlx5_glue->query_port(ctx, spawn->ibv_port, &port_attr);
> +	err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
>  	if (err) {
>  		DRV_LOG(ERR, "port query failed: %s", strerror(err));
>  		goto error;
> @@ -975,13 +1091,7 @@ struct mlx5_dev_spawn_data {
>  		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
>  			mlx5_glue->port_state_str(port_attr.state),
>  			port_attr.state);
> -	/* Allocate protection domain. */
> -	pd = mlx5_glue->alloc_pd(ctx);
> -	if (pd == NULL) {
> -		DRV_LOG(ERR, "PD allocation failure");
> -		err = ENOMEM;
> -		goto error;
> -	}
> +	/* Allocate private eth device data. */
>  	priv = rte_zmalloc("ethdev private structure",
>  			   sizeof(*priv),
>  			   RTE_CACHE_LINE_SIZE);
> @@ -990,13 +1100,11 @@ struct mlx5_dev_spawn_data {
>  		err = ENOMEM;
>  		goto error;
>  	}
> -	priv->ctx = ctx;
> -	strncpy(priv->ibdev_name, priv->ctx->device->name,
> -		sizeof(priv->ibdev_name));
> -	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
> -		sizeof(priv->ibdev_path));
> -	priv->device_attr = attr;
> -	priv->pd = pd;
> +	priv->sh = sh;
> +	priv->ctx = sh->ctx;
> +	priv->ibv_port = spawn->ibv_port;
> +	priv->device_attr = sh->device_attr;
> +	priv->pd = sh->pd;
>  	priv->mtu = ETHER_MTU;
>  #ifndef RTE_ARCH_64
>  	/* Initialize UAR access locks for 32bit implementations. */ @@ -
> 1062,7 +1170,8 @@ struct mlx5_dev_spawn_data {
>  			strerror(rte_errno));
>  		goto error;
>  	}
> -	config.hw_csum = !!(attr.device_cap_flags_ex &
> IBV_DEVICE_RAW_IP_CSUM);
> +	config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
> +			    IBV_DEVICE_RAW_IP_CSUM);
>  	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  		(config.hw_csum ? "" : "not "));
>  #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ @@ -1076,7
> +1185,7 @@ struct mlx5_dev_spawn_data {
>  	}
>  #endif
>  	config.ind_table_max_size =
> -		attr.rss_caps.max_rwq_indirection_table_size;
> +		sh->device_attr.rss_caps.max_rwq_indirection_table_size;
>  	/*
>  	 * Remove this check once DPDK supports larger/variable
>  	 * indirection tables.
> @@ -1085,18 +1194,18 @@ struct mlx5_dev_spawn_data {
>  		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  		config.ind_table_max_size);
> -	config.hw_vlan_strip = !!(attr.raw_packet_caps &
> +	config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
> 
> IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  		(config.hw_vlan_strip ? "" : "not "));
> -	config.hw_fcs_strip = !!(attr.raw_packet_caps &
> +	config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
>  				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  		(config.hw_fcs_strip ? "" : "not "));  #if
> defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
> -	hw_padding = !!attr.rx_pad_end_addr_align;
> +	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
>  #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
> -	hw_padding = !!(attr.device_cap_flags_ex &
> +	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
>  			IBV_DEVICE_PCI_WRITE_END_PADDING);
>  #endif
>  	if (config.hw_padding && !hw_padding) { @@ -1105,11 +1214,11 @@
> struct mlx5_dev_spawn_data {
>  	} else if (config.hw_padding) {
>  		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
>  	}
> -	config.tso = (attr.tso_caps.max_tso > 0 &&
> -		      (attr.tso_caps.supported_qpts &
> +	config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
> +		      (sh->device_attr.tso_caps.supported_qpts &
>  		       (1 << IBV_QPT_RAW_PACKET)));
>  	if (config.tso)
> -		config.tso_max_payload_sz = attr.tso_caps.max_tso;
> +		config.tso_max_payload_sz = sh-
> >device_attr.tso_caps.max_tso;
>  	/*
>  	 * MPW is disabled by default, while the Enhanced MPW is enabled
>  	 * by default.
> @@ -1243,7 +1352,7 @@ struct mlx5_dev_spawn_data {
>  		}
>  	}
>  #ifdef HAVE_MLX5DV_DR
> -		ns = mlx5dv_dr_create_ns(ctx,
> +		ns = mlx5dv_dr_create_ns(sh->ctx,
> 
> MLX5DV_DR_NS_DOMAIN_INGRESS_BYPASS);
>  		if (ns == NULL) {
>  			DRV_LOG(ERR, "mlx5dv_dr_create_ns failed"); @@ -
> 1251,7 +1360,7 @@ struct mlx5_dev_spawn_data {
>  			goto error;
>  		}
>  		priv->rx_ns = ns;
> -		ns = mlx5dv_dr_create_ns(ctx,
> +		ns = mlx5dv_dr_create_ns(sh->ctx,
> 
> MLX5DV_DR_NS_DOMAIN_EGRESS_BYPASS);
>  		if (ns == NULL) {
>  			DRV_LOG(ERR, "mlx5dv_dr_create_ns failed"); @@ -
> 1268,7 +1377,8 @@ struct mlx5_dev_spawn_data {
>  		.free = &mlx5_free_verbs_buf,
>  		.data = priv,
>  	};
> -	mlx5_glue->dv_set_context_attr(ctx,
> MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
> +	mlx5_glue->dv_set_context_attr(sh->ctx,
> +				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
>  				       (void *)((uintptr_t)&alctr));
>  	/* Bring Ethernet device up. */
>  	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", @@ -
> 1322,15 +1432,13 @@ struct mlx5_dev_spawn_data {
>  		if (eth_dev != NULL)
>  			eth_dev->data->dev_private = NULL;
>  	}
> -	if (pd)
> -		claim_zero(mlx5_glue->dealloc_pd(pd));
>  	if (eth_dev != NULL) {
>  		/* mac_addrs must not be freed alone because part of
> dev_private */
>  		eth_dev->data->mac_addrs = NULL;
>  		rte_eth_dev_release_port(eth_dev);
>  	}
> -	if (ctx)
> -		claim_zero(mlx5_glue->close_device(ctx));
> +	if (sh)
> +		mlx5_free_shared_ibctx(sh);
>  	assert(err > 0);
>  	rte_errno = err;
>  	return NULL;
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> d6967a3..4568de7 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -196,10 +196,42 @@ struct mlx5_flow_tbl_resource {  #define
> MLX5_MAX_TABLES 1024  #define MLX5_GROUP_FACTOR 1
> 
> +/* Per port data of shared IB device. */ struct mlx5_ibv_shared_port {
> +	uint32_t ih_port_id;
> +	/*
> +	 * Interrupt handler port_id. Used by shared interrupt
> +	 * handler to find the corresponding rte_eth device
> +	 * by IB port index. If value is equal or greater
> +	 * RTE_MAX_ETHPORTS it means there is no subhandler
> +	 * installed for specified IB port index.
> +	 */
> +};
> +
> +/*
> + * Shared Infiniband device context for Master/Representors
> + * which belong to same IB device with multiple IB ports.
> + **/
> +struct mlx5_ibv_shared {
> +	LIST_ENTRY(mlx5_ibv_shared) next;
> +	uint32_t refcnt;
> +	uint32_t devx:1; /* Opened with DV. */
> +	uint32_t max_port; /* Maximal IB device port index. */
> +	struct ibv_context *ctx; /* Verbs/DV context. */
> +	struct ibv_pd *pd; /* Protection Domain. */
> +	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
> +	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for
> secondary */
> +	struct ibv_device_attr_ex device_attr; /* Device properties. */
> +	struct rte_intr_handle intr_handle; /* Interrupt handler for device.
> */
> +	struct mlx5_ibv_shared_port port[]; /* per device port data array. */
> +};
> +
>  struct mlx5_priv {
>  	LIST_ENTRY(mlx5_priv) mem_event_cb;
>  	/**< Called by memory event callback. */
>  	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
> +	struct mlx5_ibv_shared *sh; /* Shared IB device context. */
> +	uint32_t ibv_port; /* IB device port number. */
>  	struct ibv_context *ctx; /* Verbs context. */
>  	struct ibv_device_attr_ex device_attr; /* Device properties. */
>  	struct ibv_pd *pd; /* Protection Domain. */
> --
> 1.8.3.1

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4338254..4c9621a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -140,6 +140,141 @@  struct mlx5_dev_spawn_data {
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
 };
 
+static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
+
+/**
+ * Allocate shared IB device context. If there is multiport device the
+ * master and representors will share this context, if there is single
+ * port dedicated IB device, the context will be used by only given
+ * port due to unification.
+ *
+ * Routine first searches the context for the specified IB device name;
+ * if found, the shared context is assumed and its reference counter is incremented.
+ * If no context found the new one is created and initialized with specified
+ * IB device context and parameters.
+ *
+ * @param[in] spawn
+ *   Pointer to the IB device attributes (name, port, etc).
+ *
+ * @return
+ *   Pointer to mlx5_ibv_shared object on success,
+ *   otherwise NULL and rte_errno is set.
+ */
+static struct mlx5_ibv_shared *
+mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
+{
+	struct mlx5_ibv_shared *sh;
+	int err = 0;
+
+	assert(spawn);
+	/* Secondary process should not create the shared context. */
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	/* Search for IB context by device name. */
+	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
+		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
+			sh->refcnt++;
+			return sh;
+		}
+	}
+	/* No device found, we have to create new shared context. */
+	assert(spawn->max_port);
+	sh = rte_zmalloc("ethdev shared ib context",
+			 sizeof(struct mlx5_ibv_shared) +
+			 spawn->max_port *
+			 sizeof(struct mlx5_ibv_shared_port),
+			 RTE_CACHE_LINE_SIZE);
+	if (!sh) {
+		DRV_LOG(ERR, "shared context allocation failure");
+		rte_errno  = ENOMEM;
+		return NULL;
+	}
+	/* Try to open IB device with DV first, then usual Verbs. */
+	errno = 0;
+	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
+	if (sh->ctx) {
+		sh->devx = 1;
+		DRV_LOG(DEBUG, "DevX is supported");
+	} else {
+		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
+		if (!sh->ctx) {
+			err = errno ? errno : ENODEV;
+			goto error;
+		}
+		DRV_LOG(DEBUG, "DevX is NOT supported");
+	}
+	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
+	if (err) {
+		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
+		goto error;
+	}
+	sh->refcnt = 1;
+	sh->max_port = spawn->max_port;
+	strncpy(sh->ibdev_name, sh->ctx->device->name,
+		sizeof(sh->ibdev_name));
+	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
+		sizeof(sh->ibdev_path));
+	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
+	if (sh->pd == NULL) {
+		DRV_LOG(ERR, "PD allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
+	return sh;
+error:
+	assert(sh);
+	if (sh->pd)
+		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
+	if (sh->ctx)
+		claim_zero(mlx5_glue->close_device(sh->ctx));
+	rte_free(sh);
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Free shared IB device context. Decrement counter and if zero free
+ * all allocated resources and close handles.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_ibv_shared object to free
+ */
+static void
+mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
+{
+#ifndef NDEBUG
+	/* Check the object presence in the list. */
+	struct mlx5_ibv_shared *lctx;
+
+	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
+		if (lctx == sh)
+			break;
+	assert(lctx);
+	if (lctx != sh) {
+		DRV_LOG(ERR, "Freeing non-existing shared IB context");
+		return;
+	}
+#endif
+	assert(sh);
+	assert(sh->refcnt);
+	if (--sh->refcnt)
+		return;
+	/* Zero reference counter, we should release resources. */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		assert(sh->secondary);
+		assert(sh->ctx);
+		assert(!sh->pd);
+	}
+	LIST_REMOVE(sh, next);
+	if (sh->pd)
+		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
+	if (sh->ctx)
+		claim_zero(mlx5_glue->close_device(sh->ctx));
+	rte_free(sh);
+}
+
+
 /**
  * Prepare shared data between primary and secondary process.
  */
@@ -289,12 +424,10 @@  struct mlx5_dev_spawn_data {
 	}
 	mlx5_mprq_free_mp(dev);
 	mlx5_mr_release(dev);
-	if (priv->pd != NULL) {
-		assert(priv->ctx != NULL);
-		claim_zero(mlx5_glue->dealloc_pd(priv->pd));
-		claim_zero(mlx5_glue->close_device(priv->ctx));
-	} else
-		assert(priv->ctx == NULL);
+	assert(priv->sh);
+	if (priv->sh)
+		mlx5_free_shared_ibctx(priv->sh);
+	priv->sh = NULL;
 	if (priv->rss_conf.rss_key != NULL)
 		rte_free(priv->rss_conf.rss_key);
 	if (priv->reta_idx != NULL)
@@ -744,11 +877,8 @@  struct mlx5_dev_spawn_data {
 	       struct mlx5_dev_config config)
 {
 	const struct mlx5_switch_info *switch_info = &spawn->info;
-	struct ibv_device *ibv_dev = spawn->ibv_dev;
-	struct ibv_context *ctx = NULL;
-	struct ibv_device_attr_ex attr;
+	struct mlx5_ibv_shared *sh;
 	struct ibv_port_attr port_attr;
-	struct ibv_pd *pd = NULL;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
 	struct rte_eth_dev *eth_dev = NULL;
 	struct mlx5_priv *priv = NULL;
@@ -807,18 +937,47 @@  struct mlx5_dev_spawn_data {
 	}
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	errno = 0;
-	ctx = mlx5_glue->dv_open_device(ibv_dev);
-	if (ctx) {
-		config.devx = 1;
-		DRV_LOG(DEBUG, "DEVX is supported");
-	} else {
-		ctx = mlx5_glue->open_device(ibv_dev);
-		if (!ctx) {
-			rte_errno = errno ? errno : ENODEV;
-			return NULL;
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
+		if (eth_dev == NULL) {
+			DRV_LOG(ERR, "can not attach rte ethdev");
+			rte_errno = ENOMEM;
+			err = rte_errno;
+			goto error;
+		}
+		eth_dev->device = dpdk_dev;
+		eth_dev->dev_ops = &mlx5_dev_sec_ops;
+		err = mlx5_uar_init_secondary(eth_dev);
+		if (err) {
+			err = rte_errno;
+			goto error;
 		}
+		/* Receive command fd from primary process */
+		err = mlx5_socket_connect(eth_dev);
+		if (err < 0) {
+			err = rte_errno;
+			goto error;
+		}
+		/* Remap UAR for Tx queues. */
+		err = mlx5_tx_uar_remap(eth_dev, err);
+		if (err) {
+			err = rte_errno;
+			goto error;
+		}
+		/*
+		 * Ethdev pointer is still required as input since
+		 * the primary device is not accessible from the
+		 * secondary process.
+		 */
+		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+		return eth_dev;
 	}
+	sh = mlx5_alloc_shared_ibctx(spawn);
+	if (!sh)
+		return NULL;
+	config.devx = sh->devx;
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -832,7 +991,7 @@  struct mlx5_dev_spawn_data {
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
 #endif
-	mlx5_glue->dv_query_device(ctx, &dv_attr);
+	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
 	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
 		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
 			DRV_LOG(DEBUG, "enhanced MPW is supported");
@@ -917,51 +1076,8 @@  struct mlx5_dev_spawn_data {
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config.mpls_en = mpls_en;
-	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
-	if (err) {
-		DEBUG("ibv_query_device_ex() failed");
-		goto error;
-	}
-	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		eth_dev = rte_eth_dev_attach_secondary(name);
-		if (eth_dev == NULL) {
-			DRV_LOG(ERR, "can not attach rte ethdev");
-			rte_errno = ENOMEM;
-			err = rte_errno;
-			goto error;
-		}
-		eth_dev->device = dpdk_dev;
-		eth_dev->dev_ops = &mlx5_dev_sec_ops;
-		err = mlx5_uar_init_secondary(eth_dev);
-		if (err) {
-			err = rte_errno;
-			goto error;
-		}
-		/* Receive command fd from primary process */
-		err = mlx5_socket_connect(eth_dev);
-		if (err < 0) {
-			err = rte_errno;
-			goto error;
-		}
-		/* Remap UAR for Tx queues. */
-		err = mlx5_tx_uar_remap(eth_dev, err);
-		if (err) {
-			err = rte_errno;
-			goto error;
-		}
-		/*
-		 * Ethdev pointer is still required as input since
-		 * the primary device is not accessible from the
-		 * secondary process.
-		 */
-		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
-		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
-		claim_zero(mlx5_glue->close_device(ctx));
-		return eth_dev;
-	}
 	/* Check port status. */
-	err = mlx5_glue->query_port(ctx, spawn->ibv_port, &port_attr);
+	err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
 	if (err) {
 		DRV_LOG(ERR, "port query failed: %s", strerror(err));
 		goto error;
@@ -975,13 +1091,7 @@  struct mlx5_dev_spawn_data {
 		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
 			mlx5_glue->port_state_str(port_attr.state),
 			port_attr.state);
-	/* Allocate protection domain. */
-	pd = mlx5_glue->alloc_pd(ctx);
-	if (pd == NULL) {
-		DRV_LOG(ERR, "PD allocation failure");
-		err = ENOMEM;
-		goto error;
-	}
+	/* Allocate private eth device data. */
 	priv = rte_zmalloc("ethdev private structure",
 			   sizeof(*priv),
 			   RTE_CACHE_LINE_SIZE);
@@ -990,13 +1100,11 @@  struct mlx5_dev_spawn_data {
 		err = ENOMEM;
 		goto error;
 	}
-	priv->ctx = ctx;
-	strncpy(priv->ibdev_name, priv->ctx->device->name,
-		sizeof(priv->ibdev_name));
-	strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
-		sizeof(priv->ibdev_path));
-	priv->device_attr = attr;
-	priv->pd = pd;
+	priv->sh = sh;
+	priv->ctx = sh->ctx;
+	priv->ibv_port = spawn->ibv_port;
+	priv->device_attr = sh->device_attr;
+	priv->pd = sh->pd;
 	priv->mtu = ETHER_MTU;
 #ifndef RTE_ARCH_64
 	/* Initialize UAR access locks for 32bit implementations. */
@@ -1062,7 +1170,8 @@  struct mlx5_dev_spawn_data {
 			strerror(rte_errno));
 		goto error;
 	}
-	config.hw_csum = !!(attr.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM);
+	config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
+			    IBV_DEVICE_RAW_IP_CSUM);
 	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 		(config.hw_csum ? "" : "not "));
 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
@@ -1076,7 +1185,7 @@  struct mlx5_dev_spawn_data {
 	}
 #endif
 	config.ind_table_max_size =
-		attr.rss_caps.max_rwq_indirection_table_size;
+		sh->device_attr.rss_caps.max_rwq_indirection_table_size;
 	/*
 	 * Remove this check once DPDK supports larger/variable
 	 * indirection tables.
@@ -1085,18 +1194,18 @@  struct mlx5_dev_spawn_data {
 		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 		config.ind_table_max_size);
-	config.hw_vlan_strip = !!(attr.raw_packet_caps &
+	config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
 				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 		(config.hw_vlan_strip ? "" : "not "));
-	config.hw_fcs_strip = !!(attr.raw_packet_caps &
+	config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
 				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 		(config.hw_fcs_strip ? "" : "not "));
 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
-	hw_padding = !!attr.rx_pad_end_addr_align;
+	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
-	hw_padding = !!(attr.device_cap_flags_ex &
+	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
 			IBV_DEVICE_PCI_WRITE_END_PADDING);
 #endif
 	if (config.hw_padding && !hw_padding) {
@@ -1105,11 +1214,11 @@  struct mlx5_dev_spawn_data {
 	} else if (config.hw_padding) {
 		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
 	}
-	config.tso = (attr.tso_caps.max_tso > 0 &&
-		      (attr.tso_caps.supported_qpts &
+	config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
+		      (sh->device_attr.tso_caps.supported_qpts &
 		       (1 << IBV_QPT_RAW_PACKET)));
 	if (config.tso)
-		config.tso_max_payload_sz = attr.tso_caps.max_tso;
+		config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
 	/*
 	 * MPW is disabled by default, while the Enhanced MPW is enabled
 	 * by default.
@@ -1243,7 +1352,7 @@  struct mlx5_dev_spawn_data {
 		}
 	}
 #ifdef HAVE_MLX5DV_DR
-		ns = mlx5dv_dr_create_ns(ctx,
+		ns = mlx5dv_dr_create_ns(sh->ctx,
 					 MLX5DV_DR_NS_DOMAIN_INGRESS_BYPASS);
 		if (ns == NULL) {
 			DRV_LOG(ERR, "mlx5dv_dr_create_ns failed");
@@ -1251,7 +1360,7 @@  struct mlx5_dev_spawn_data {
 			goto error;
 		}
 		priv->rx_ns = ns;
-		ns = mlx5dv_dr_create_ns(ctx,
+		ns = mlx5dv_dr_create_ns(sh->ctx,
 					 MLX5DV_DR_NS_DOMAIN_EGRESS_BYPASS);
 		if (ns == NULL) {
 			DRV_LOG(ERR, "mlx5dv_dr_create_ns failed");
@@ -1268,7 +1377,8 @@  struct mlx5_dev_spawn_data {
 		.free = &mlx5_free_verbs_buf,
 		.data = priv,
 	};
-	mlx5_glue->dv_set_context_attr(ctx, MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+	mlx5_glue->dv_set_context_attr(sh->ctx,
+				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
 				       (void *)((uintptr_t)&alctr));
 	/* Bring Ethernet device up. */
 	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
@@ -1322,15 +1432,13 @@  struct mlx5_dev_spawn_data {
 		if (eth_dev != NULL)
 			eth_dev->data->dev_private = NULL;
 	}
-	if (pd)
-		claim_zero(mlx5_glue->dealloc_pd(pd));
 	if (eth_dev != NULL) {
 		/* mac_addrs must not be freed alone because part of dev_private */
 		eth_dev->data->mac_addrs = NULL;
 		rte_eth_dev_release_port(eth_dev);
 	}
-	if (ctx)
-		claim_zero(mlx5_glue->close_device(ctx));
+	if (sh)
+		mlx5_free_shared_ibctx(sh);
 	assert(err > 0);
 	rte_errno = err;
 	return NULL;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index d6967a3..4568de7 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -196,10 +196,42 @@  struct mlx5_flow_tbl_resource {
 #define MLX5_MAX_TABLES 1024
 #define MLX5_GROUP_FACTOR 1
 
+/* Per port data of shared IB device. */
+struct mlx5_ibv_shared_port {
+	uint32_t ih_port_id;
+	/*
+	 * Interrupt handler port_id. Used by shared interrupt
+	 * handler to find the corresponding rte_eth device
+	 * by IB port index. If value is equal or greater
+	 * RTE_MAX_ETHPORTS it means there is no subhandler
+	 * installed for specified IB port index.
+	 */
+};
+
+/*
+ * Shared Infiniband device context for Master/Representors
+ * which belong to same IB device with multiple IB ports.
+ **/
+struct mlx5_ibv_shared {
+	LIST_ENTRY(mlx5_ibv_shared) next;
+	uint32_t refcnt;
+	uint32_t devx:1; /* Opened with DV. */
+	uint32_t max_port; /* Maximal IB device port index. */
+	struct ibv_context *ctx; /* Verbs/DV context. */
+	struct ibv_pd *pd; /* Protection Domain. */
+	char ibdev_name[IBV_SYSFS_NAME_MAX]; /* IB device name. */
+	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
+	struct ibv_device_attr_ex device_attr; /* Device properties. */
+	struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
+	struct mlx5_ibv_shared_port port[]; /* per device port data array. */
+};
+
 struct mlx5_priv {
 	LIST_ENTRY(mlx5_priv) mem_event_cb;
 	/**< Called by memory event callback. */
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
+	struct mlx5_ibv_shared *sh; /* Shared IB device context. */
+	uint32_t ibv_port; /* IB device port number. */
 	struct ibv_context *ctx; /* Verbs context. */
 	struct ibv_device_attr_ex device_attr; /* Device properties. */
 	struct ibv_pd *pd; /* Protection Domain. */