[v2,3/7] net/mlx5: split PCI from generic probing code
diff mbox series

Message ID 20180614083047.10812-4-adrien.mazarguil@6wind.com
State Superseded, archived
Headers show
Series
  • net/mlx5: add port representor support
Related show

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail apply issues

Commit Message

Adrien Mazarguil June 14, 2018, 8:34 a.m. UTC
All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. All ports of a Verbs device (mlx5_dev_spawn()).
3. A given port of a Verbs device (mlx5_dev_spawn_one()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Fixed device naming. A port suffix is now appended only if several IB
  ports happen to be detected.
- Added separate message to distinguish missing kernel drivers from other
  initialization errors, as it was confusing.
---
 drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
 1 file changed, 209 insertions(+), 131 deletions(-)

Comments

Xueming Li June 16, 2018, 8:29 a.m. UTC | #1
Reviewed-by: Xueming Li <xuemingl@mellanox.com>

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is currently supplied by a PCI
> lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time, as follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Fixed device naming. A port suffix is now appended only if several IB
>   ports happen to be detected.
> - Added separate message to distinguish missing kernel drivers from other
>   initialization errors, as it was confusing.
> ---
>  drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
>  1 file changed, 209 insertions(+), 131 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 1a5391e63..01dcf25b9 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)  }
> 
>  /**
> - * DPDK callback to register a PCI device.
> - *
> - * This function creates an Ethernet device for each port of a given
> - * PCI device.
> + * Spawn an Ethernet device from Verbs information.
>   *
> - * @param[in] pci_drv
> - *   PCI driver structure (mlx5_driver).
> - * @param[in] pci_dev
> - *   PCI device information.
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + * @param[in] attr
> + *   Verbs device attributes.
> + * @param port
> + *   Verbs port to use (indexed from 1).
>   *
>   * @return
> - *   0 on success, a negative errno value otherwise and rte_errno is set.
> + *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> + *   is set.
>   */
> -static int
> -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> -	       struct rte_pci_device *pci_dev)
> +static struct rte_eth_dev *
> +mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> +		   struct ibv_device *ibv_dev,
> +		   int vf,
> +		   const struct ibv_device_attr_ex *attr,
> +		   unsigned int port)
>  {
> -	struct ibv_device **list = NULL;
> -	struct ibv_device *ibv_dev;
> -	struct ibv_context *ctx = NULL;
> -	struct ibv_device_attr_ex attr;
> +	struct ibv_context *ctx;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> +	struct rte_eth_dev *eth_dev = NULL;
>  	int err = 0;
> -	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
>  	unsigned int tunnel_en = 0;
> @@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	unsigned int mprq_max_stride_size_n = 0;
>  	unsigned int mprq_min_stride_num_n = 0;
>  	unsigned int mprq_max_stride_num_n = 0;
> -	int i;
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };  #endif
> 
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> -	assert(pci_drv == &mlx5_driver);
> -	list = mlx5_glue->get_device_list(&i);
> -	if (list == NULL) {
> -		assert(errno);
> -		err = errno;
> -		if (errno == ENOSYS)
> -			DRV_LOG(ERR,
> -				"cannot list devices, is ib_uverbs loaded?");
> -		goto error;
> -	}
> -	assert(i >= 0);
> -	/*
> -	 * For each listed device, check related sysfs entry against
> -	 * the provided PCI ID.
> -	 */
> -	while (i != 0) {
> -		struct rte_pci_addr pci_addr;
> -
> -		--i;
> -		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
> -		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
> -			continue;
> -		if ((pci_dev->addr.domain != pci_addr.domain) ||
> -		    (pci_dev->addr.bus != pci_addr.bus) ||
> -		    (pci_dev->addr.devid != pci_addr.devid) ||
> -		    (pci_dev->addr.function != pci_addr.function))
> -			continue;
> -		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> -			list[i]->name);
> -		vf = ((pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		ctx = mlx5_glue->open_device(list[i]);
> -		rte_errno = errno;
> -		err = rte_errno;
> -		break;
> -	}
> -	if (ctx == NULL) {
> -		switch (err) {
> -		case 0:
> -			DRV_LOG(ERR,
> -				"cannot access device, is mlx5_ib loaded?");
> -			err = ENODEV;
> -			break;
> -		case EINVAL:
> -			DRV_LOG(ERR,
> -				"cannot use device, are drivers up to date?");
> -			break;
> -		}
> -		goto error;
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		return NULL;
>  	}
> -	ibv_dev = list[i];
> -	DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;  #endif @@ -822,20 +773,11 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> -	if (err) {
> -		DEBUG("ibv_query_device_ex() failed");
> -		goto error;
> -	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
> +	{
>  		char name[RTE_ETH_NAME_MAX_LEN];
> -		int len;
> -		uint32_t port = i + 1; /* ports are indexed from one */
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
> -		struct rte_eth_dev *eth_dev = NULL;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			},
>  		};
> 
> -		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> -			 pci_dev->addr.domain, pci_dev->addr.bus,
> -			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (attr.orig_attr.phys_port_cnt > 1)
> -			snprintf(name + len, sizeof(name), " port %u", i);
> +		if (attr->orig_attr.phys_port_cnt > 1)
> +			snprintf(name, sizeof(name), "%s port %u",
> +				 dpdk_dev->name, port);
> +		else
> +			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
>  			if (eth_dev == NULL) {
> @@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				err = rte_errno;
>  				goto error;
>  			}
> -			eth_dev->device = &pci_dev->device;
> +			eth_dev->device = dpdk_dev;
>  			eth_dev->dev_ops = &mlx5_dev_sec_ops;
>  			err = mlx5_uar_init_secondary(eth_dev);
>  			if (err) {
> @@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				mlx5_select_rx_function(eth_dev);
>  			eth_dev->tx_pkt_burst =
>  				mlx5_select_tx_function(eth_dev);
> -			rte_eth_dev_probing_finish(eth_dev);
> -			continue;
> +			mlx5_glue->close_device(ctx);
> +			return eth_dev;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		if (!ctx)
> -			ctx = mlx5_glue->open_device(ibv_dev);
> -		if (ctx == NULL) {
> -			err = ENODEV;
> -			goto port_error;
> -		}
>  		/* Check port status. */
>  		err = mlx5_glue->query_port(ctx, port, &port_attr);
>  		if (err) {
> @@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = attr;
> +		priv->device_attr = *attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> -		err = mlx5_args(&config, pci_dev->device.devargs);
> +		err = mlx5_args(&config, dpdk_dev->devargs);
>  		if (err) {
>  			err = rte_errno;
>  			DRV_LOG(ERR, "failed to process device arguments: %s",
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		config.hw_csum = !!(attr.device_cap_flags_ex &
> +		config.hw_csum = !!(attr->device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!attr.max_counter_sets;
> +		config.flow_counter_en = !!attr->max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes = %d", @@ -971,7 +907,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -			attr.rss_caps.max_rwq_indirection_table_size;
> +			attr->rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(attr.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(attr.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!attr.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr->rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = (attr.tso_caps.max_tso > 0 &&
> -			      (attr.tso_caps.supported_qpts &
> +		config.tso = (attr->tso_caps.max_tso > 0 &&
> +			      (attr->tso_caps.supported_qpts &
>  			       (1 << IBV_QPT_RAW_PACKET)));
>  		if (config.tso)
> -			config.tso_max_payload_sz = attr.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr->tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this device"
> @@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		eth_dev->data->dev_private = priv;
>  		priv->dev_data = eth_dev->data;
>  		eth_dev->data->mac_addrs = priv->mac;
> -		eth_dev->device = &pci_dev->device;
> -		rte_eth_copy_pci_info(eth_dev, pci_dev);
> +		eth_dev->device = dpdk_dev;
>  		eth_dev->device->driver = &mlx5_driver.driver;
>  		err = mlx5_uar_init_primary(eth_dev);
>  		if (err) {
> @@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> -		/*
> -		 * Each eth_dev instance is assigned its own Verbs context,
> -		 * since this one is consumed, let the next iteration open
> -		 * another.
> -		 */
> -		ctx = NULL;
> -		continue;
> +		return eth_dev;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
> @@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
>  		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
> -		break;
>  	}
> -	/*
> -	 * XXX if something went wrong in the loop above, there is a resource
> -	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
> -	 * long as the dpdk does not provide a way to deallocate a ethdev and a
> -	 * way to enumerate the registered ethdevs to free the previous ones.
> -	 */
>  error:
>  	if (ctx)
>  		claim_zero(mlx5_glue->close_device(ctx));
> -	if (list)
> -		mlx5_glue->free_device_list(list);
> -	if (err) {
> -		rte_errno = err;
> +	assert(err > 0);
> +	rte_errno = err;
> +	return NULL;
> +}
> +
> +/**
> + * Spawn Ethernet devices from Verbs information, one per detected port.
> + *
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + *
> + * @return
> + *   A NULL-terminated list of Ethernet device objects on success, NULL
> + *   otherwise and rte_errno is set. Caller is expected to release list
> + *   memory through free().
> + */
> +static struct rte_eth_dev **
> +mlx5_dev_spawn(struct rte_device *dpdk_dev,
> +	       struct ibv_device *ibv_dev,
> +	       int vf)
> +{
> +	struct rte_eth_dev **eth_list = NULL;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex attr;
> +	unsigned int i;
> +	int ret;
> +
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		if (rte_errno == ENODEV)
> +			DRV_LOG(ERR,
> +				"cannot access device, is mlx5_ib loaded?");
> +		else
> +			DRV_LOG(ERR,
> +				"cannot use device, are drivers up to date?");
> +		return NULL;
> +	}
> +	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> +	mlx5_glue->close_device(ctx);
> +	if (ret) {
> +		rte_errno = ret;
> +		DRV_LOG(ERR, "unable to query device information: %s",
> +			strerror(rte_errno));
> +		return NULL;
> +	}
> +	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +	eth_list = malloc(sizeof(*eth_list) *
> +			  (attr.orig_attr.phys_port_cnt + 1));
> +	if (!eth_list) {
> +		rte_errno = errno;
> +		return NULL;
> +	}
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> +		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> +						 &attr, i + 1);
> +		if (eth_list[i])
> +			continue;
> +		/* Save rte_errno and roll back in case of failure. */
> +		ret = rte_errno;
> +		while (i--) {
> +			mlx5_dev_close(eth_list[i]);
> +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +				rte_free(eth_list[i]->data->dev_private);
> +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> +		}
> +		free(eth_list);
> +		rte_errno = ret;
> +		return NULL;
> +	}
> +	eth_list[i] = NULL;
> +	return eth_list;
> +}
> +
> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +	       struct rte_pci_device *pci_dev) {
> +	struct ibv_device **ibv_list;
> +	struct rte_eth_dev **eth_list = NULL;
> +	int vf;
> +	int ret;
> +
> +	assert(pci_drv == &mlx5_driver);
> +	switch (pci_dev->id.device_id) {
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +		vf = 1;
> +		break;
> +	default:
> +		vf = 0;
> +	}
> +	errno = 0;
> +	ibv_list = mlx5_glue->get_device_list(&ret);
> +	if (!ibv_list) {
> +		rte_errno = errno ? errno : ENOSYS;
> +		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> -	return 0;
> +	while (ret-- > 0) {
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
> +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +			continue;
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> +			ibv_list[ret]->name);
> +		break;
> +	}
> +	if (ret >= 0)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> +	mlx5_glue->free_device_list(ibv_list);
> +	if (!ret) {
> +		DRV_LOG(WARNING,
> +			"no Verbs device matches PCI device " PCI_PRI_FMT ","
> +			" are kernel drivers loaded?",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function);
> +		rte_errno = ENOENT;
> +		ret = -rte_errno;
> +	} else if (!eth_list || !*eth_list) {
> +		DRV_LOG(ERR,
> +			"probe of PCI device " PCI_PRI_FMT " aborted after"
> +			" encountering an error: %s",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function,
> +			strerror(rte_errno));
> +		ret = -rte_errno;
> +	} else {
> +		for (ret = 0; eth_list[ret]; ++ret) {
> +			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			rte_eth_dev_probing_finish(eth_list[ret]);
> +		}
> +		ret = 0;
> +	}
> +	free(eth_list);
> +	return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0
Shahaf Shuler June 17, 2018, 10:14 a.m. UTC | #2
Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> Subject: [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is
> currently supplied by a PCI lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time, as
> follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Fixed device naming. A port suffix is now appended only if several IB
>   ports happen to be detected.
> - Added separate message to distinguish missing kernel drivers from other
>   initialization errors, as it was confusing.

[...]

> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +	       struct rte_pci_device *pci_dev) {
> +	struct ibv_device **ibv_list;
> +	struct rte_eth_dev **eth_list = NULL;
> +	int vf;
> +	int ret;
> +
> +	assert(pci_drv == &mlx5_driver);
> +	switch (pci_dev->id.device_id) {
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +		vf = 1;
> +		break;
> +	default:
> +		vf = 0;
> +	}

Even though I couldn't find any functional bug, I think it is logically more correct to determine if pci device is vf after we know this is Mellanox device. 
Meaning the above block should be ...

> +	errno = 0;
> +	ibv_list = mlx5_glue->get_device_list(&ret);
> +	if (!ibv_list) {
> +		rte_errno = errno ? errno : ENOSYS;
> +		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> -	return 0;
> +	while (ret-- > 0) {
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]-
> >name);
> +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +			continue;
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +		DRV_LOG(INFO, "PCI information matches, using device
> \"%s\"",
> +			ibv_list[ret]->name);
> +		break;
> +	}

Here. 

> +	if (ret >= 0)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret],
> vf);
> +	mlx5_glue->free_device_list(ibv_list);
> +	if (!ret) {
> +		DRV_LOG(WARNING,
> +			"no Verbs device matches PCI device " PCI_PRI_FMT
> ","
> +			" are kernel drivers loaded?",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function);
> +		rte_errno = ENOENT;
> +		ret = -rte_errno;
> +	} else if (!eth_list || !*eth_list) {
> +		DRV_LOG(ERR,
> +			"probe of PCI device " PCI_PRI_FMT " aborted after"
> +			" encountering an error: %s",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function,
> +			strerror(rte_errno));
> +		ret = -rte_errno;
> +	} else {
> +		for (ret = 0; eth_list[ret]; ++ret) {
> +			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			rte_eth_dev_probing_finish(eth_list[ret]);
> +		}
> +		ret = 0;
> +	}
> +	free(eth_list);
> +	return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0
Adrien Mazarguil June 27, 2018, 1:31 p.m. UTC | #3
On Sun, Jun 17, 2018 at 10:14:58AM +0000, Shahaf Shuler wrote:
> Thursday, June 14, 2018 11:35 AM, Adrien Mazarguil:
> > Subject: [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
> > 
> > All the generic probing code needs is an IB device. While this device is
> > currently supplied by a PCI lookup, other methods will be added soon.
> > 
> > This patch divides the original function, which has become huge over time, as
> > follows:
> > 
> > 1. PCI-specific (mlx5_pci_probe()).
> > 2. All ports of a Verbs device (mlx5_dev_spawn()).
> > 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> > 
> > - Fixed device naming. A port suffix is now appended only if several IB
> >   ports happen to be detected.
> > - Added separate message to distinguish missing kernel drivers from other
> >   initialization errors, as it was confusing.
<snip>
> > +static int
> > +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> > +	       struct rte_pci_device *pci_dev) {
> > +	struct ibv_device **ibv_list;
> > +	struct rte_eth_dev **eth_list = NULL;
> > +	int vf;
> > +	int ret;
> > +
> > +	assert(pci_drv == &mlx5_driver);
> > +	switch (pci_dev->id.device_id) {
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> > +		vf = 1;
> > +		break;
> > +	default:
> > +		vf = 0;
> > +	}
> 
> Even though I couldn't find any functional bug, I think it is logically more correct to determine if pci device is vf after we know this is Mellanox device. 
> Meaning the above block should be ...

<snip> 
> > +	while (ret-- > 0) {
> > +		struct rte_pci_addr pci_addr;
> > +
> > +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]-
> > >name);
> > +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> > +			continue;
> > +		if (pci_dev->addr.domain != pci_addr.domain ||
> > +		    pci_dev->addr.bus != pci_addr.bus ||
> > +		    pci_dev->addr.devid != pci_addr.devid ||
> > +		    pci_dev->addr.function != pci_addr.function)
> > +			continue;
> > +		DRV_LOG(INFO, "PCI information matches, using device
> > \"%s\"",
> > +			ibv_list[ret]->name);
> > +		break;
> > +	}
> 
> Here. 

No problem, I will update.

Patch
diff mbox series

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1a5391e63..01dcf25b9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -635,30 +635,34 @@  mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ * @param[in] attr
+ *   Verbs device attributes.
+ * @param port
+ *   Verbs port to use (indexed from 1).
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-	       struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
+		   struct ibv_device *ibv_dev,
+		   int vf,
+		   const struct ibv_device_attr_ex *attr,
+		   unsigned int port)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
-	struct ibv_context *ctx = NULL;
-	struct ibv_device_attr_ex attr;
+	struct ibv_context *ctx;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
 	int err = 0;
-	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -670,71 +674,18 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_max_stride_size_n = 0;
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
-	int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
 #endif
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	assert(pci_drv == &mlx5_driver);
-	list = mlx5_glue->get_device_list(&i);
-	if (list == NULL) {
-		assert(errno);
-		err = errno;
-		if (errno == ENOSYS)
-			DRV_LOG(ERR,
-				"cannot list devices, is ib_uverbs loaded?");
-		goto error;
-	}
-	assert(i >= 0);
-	/*
-	 * For each listed device, check related sysfs entry against
-	 * the provided PCI ID.
-	 */
-	while (i != 0) {
-		struct rte_pci_addr pci_addr;
-
-		--i;
-		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-			continue;
-		if ((pci_dev->addr.domain != pci_addr.domain) ||
-		    (pci_dev->addr.bus != pci_addr.bus) ||
-		    (pci_dev->addr.devid != pci_addr.devid) ||
-		    (pci_dev->addr.function != pci_addr.function))
-			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-			list[i]->name);
-		vf = ((pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		ctx = mlx5_glue->open_device(list[i]);
-		rte_errno = errno;
-		err = rte_errno;
-		break;
-	}
-	if (ctx == NULL) {
-		switch (err) {
-		case 0:
-			DRV_LOG(ERR,
-				"cannot access device, is mlx5_ib loaded?");
-			err = ENODEV;
-			break;
-		case EINVAL:
-			DRV_LOG(ERR,
-				"cannot use device, are drivers up to date?");
-			break;
-		}
-		goto error;
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		return NULL;
 	}
-	ibv_dev = list[i];
-	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -822,20 +773,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
-	if (err) {
-		DEBUG("ibv_query_device_ex() failed");
-		goto error;
-	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -859,11 +801,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
-			 pci_dev->addr.domain, pci_dev->addr.bus,
-			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
+		if (attr->orig_attr.phys_port_cnt > 1)
+			snprintf(name, sizeof(name), "%s port %u",
+				 dpdk_dev->name, port);
+		else
+			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -872,7 +814,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				err = rte_errno;
 				goto error;
 			}
-			eth_dev->device = &pci_dev->device;
+			eth_dev->device = dpdk_dev;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			err = mlx5_uar_init_secondary(eth_dev);
 			if (err) {
@@ -900,16 +842,10 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				mlx5_select_rx_function(eth_dev);
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
-			rte_eth_dev_probing_finish(eth_dev);
-			continue;
+			mlx5_glue->close_device(ctx);
+			return eth_dev;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
-		}
 		/* Check port status. */
 		err = mlx5_glue->query_port(ctx, port, &port_attr);
 		if (err) {
@@ -947,23 +883,23 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = attr;
+		priv->device_attr = *attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, pci_dev->device.devargs);
+		err = mlx5_args(&config, dpdk_dev->devargs);
 		if (err) {
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
 			goto port_error;
 		}
-		config.hw_csum = !!(attr.device_cap_flags_ex &
+		config.hw_csum = !!(attr->device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr.max_counter_sets;
+		config.flow_counter_en = !!attr->max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -971,7 +907,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			attr.rss_caps.max_rwq_indirection_table_size;
+			attr->rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -979,28 +915,28 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(attr.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr.rx_pad_end_addr_align;
+		config.hw_padding = !!attr->rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = (attr.tso_caps.max_tso > 0 &&
-			      (attr.tso_caps.supported_qpts &
+		config.tso = (attr->tso_caps.max_tso > 0 &&
+			      (attr->tso_caps.supported_qpts &
 			       (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz = attr.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr->tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1041,8 +977,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
 		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = &pci_dev->device;
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device = dpdk_dev;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
@@ -1160,13 +1095,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		/*
-		 * Each eth_dev instance is assigned its own Verbs context,
-		 * since this one is consumed, let the next iteration open
-		 * another.
-		 */
-		ctx = NULL;
-		continue;
+		return eth_dev;
 port_error:
 		if (priv)
 			rte_free(priv);
@@ -1174,24 +1103,173 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			claim_zero(mlx5_glue->dealloc_pd(pd));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
-		break;
 	}
-	/*
-	 * XXX if something went wrong in the loop above, there is a resource
-	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-	 * long as the dpdk does not provide a way to deallocate a ethdev and a
-	 * way to enumerate the registered ethdevs to free the previous ones.
-	 */
 error:
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (list)
-		mlx5_glue->free_device_list(list);
-	if (err) {
-		rte_errno = err;
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Spawn Ethernet devices from Verbs information, one per detected port.
+ *
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ *
+ * @return
+ *   A NULL-terminated list of Ethernet device objects on success, NULL
+ *   otherwise and rte_errno is set. Caller is expected to release list
+ *   memory through free().
+ */
+static struct rte_eth_dev **
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct ibv_device *ibv_dev,
+	       int vf)
+{
+	struct rte_eth_dev **eth_list = NULL;
+	struct ibv_context *ctx;
+	struct ibv_device_attr_ex attr;
+	unsigned int i;
+	int ret;
+
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		if (rte_errno == ENODEV)
+			DRV_LOG(ERR,
+				"cannot access device, is mlx5_ib loaded?");
+		else
+			DRV_LOG(ERR,
+				"cannot use device, are drivers up to date?");
+		return NULL;
+	}
+	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
+	mlx5_glue->close_device(ctx);
+	if (ret) {
+		rte_errno = ret;
+		DRV_LOG(ERR, "unable to query device information: %s",
+			strerror(rte_errno));
+		return NULL;
+	}
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	eth_list = malloc(sizeof(*eth_list) *
+			  (attr.orig_attr.phys_port_cnt + 1));
+	if (!eth_list) {
+		rte_errno = errno;
+		return NULL;
+	}
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
+		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
+						 &attr, i + 1);
+		if (eth_list[i])
+			continue;
+		/* Save rte_errno and roll back in case of failure. */
+		ret = rte_errno;
+		while (i--) {
+			mlx5_dev_close(eth_list[i]);
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				rte_free(eth_list[i]->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+		}
+		free(eth_list);
+		rte_errno = ret;
+		return NULL;
+	}
+	eth_list[i] = NULL;
+	return eth_list;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function creates an Ethernet device for each port of a given
+ * PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+	       struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **ibv_list;
+	struct rte_eth_dev **eth_list = NULL;
+	int vf;
+	int ret;
+
+	assert(pci_drv == &mlx5_driver);
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+		vf = 1;
+		break;
+	default:
+		vf = 0;
+	}
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
-	return 0;
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+			continue;
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+			ibv_list[ret]->name);
+		break;
+	}
+	if (ret >= 0)
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	mlx5_glue->free_device_list(ibv_list);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI device " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+	} else if (!eth_list || !*eth_list) {
+		DRV_LOG(ERR,
+			"probe of PCI device " PCI_PRI_FMT " aborted after"
+			" encountering an error: %s",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function,
+			strerror(rte_errno));
+		ret = -rte_errno;
+	} else {
+		for (ret = 0; eth_list[ret]; ++ret) {
+			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+			rte_eth_dev_probing_finish(eth_list[ret]);
+		}
+		ret = 0;
+	}
+	free(eth_list);
+	return ret;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {