[v2,6/7] net/mlx5: probe all port representors
diff mbox series

Message ID 20180614083047.10812-7-adrien.mazarguil@6wind.com
State Superseded, archived
Headers show
Series
  • net/mlx5: add port representor support
Related show

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail apply issues

Commit Message

Adrien Mazarguil June 14, 2018, 8:35 a.m. UTC
Probe existing port representors in addition to their master device and
associate them automatically.

To avoid name collision between Ethernet devices, their names use the same
convention as ixgbe and i40e PMDs, that is, instead of only a PCI address
in DBDF notation:

- "net_{DBDF}_0" for master/switch devices.
- "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
  representors.

Both optionally suffixed with "_port_{num}" instead of " port {num}" for
devices that expose several Verbs ports (note this is never the case on
mlx5, but kept for historical reasons for the time being).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
--
v2 changes:

- Added representor information to dev_infos_get(). DPDK port ID of master
  device is now stored in the private structure to retrieve it
  conveniently.
- Master device is assigned dummy representor ID value -1 to better
  distinguish from the the first actual representor reported by
  dev_infos_get() as those are indexed from 0.
- Added RTE_ETH_DEV_REPRESENTOR device flag.
---
 drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
 drivers/net/mlx5/mlx5.h        |   9 ++-
 drivers/net/mlx5/mlx5_ethdev.c | 151 ++++++++++++++++++++++++++++++++----
 drivers/net/mlx5/mlx5_mac.c    |   2 +-
 drivers/net/mlx5/mlx5_stats.c  |   6 +-
 5 files changed, 252 insertions(+), 54 deletions(-)

Comments

Xueming Li June 16, 2018, 8:57 a.m. UTC | #1
Reviewed-by: Xueming Li <xuemingl@mellanox.com>

Minor comments inside:

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Thursday, June 14, 2018 4:35 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> Probe existing port representors in addition to their master device and associate them automatically.
> 
> To avoid name collision between Ethernet devices, their names use the same convention as ixgbe and
> i40e PMDs, that is, instead of only a PCI address in DBDF notation:
> 
> - "net_{DBDF}_0" for master/switch devices.
> - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
>   representors.
> 
> Both optionally suffixed with "_port_{num}" instead of " port {num}" for devices that expose several
> Verbs ports (note this is never the case on mlx5, but kept for historical reasons for the time being).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> --
> v2 changes:
> 
> - Added representor information to dev_infos_get(). DPDK port ID of master
>   device is now stored in the private structure to retrieve it
>   conveniently.
> - Master device is assigned dummy representor ID value -1 to better
>   distinguish from the the first actual representor reported by
>   dev_infos_get() as those are indexed from 0.
> - Added RTE_ETH_DEV_REPRESENTOR device flag.
> ---
>  drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
>  drivers/net/mlx5/mlx5.h        |   9 ++-
>  drivers/net/mlx5/mlx5_ethdev.c | 151 ++++++++++++++++++++++++++++++++----
>  drivers/net/mlx5/mlx5_mac.c    |   2 +-
>  drivers/net/mlx5/mlx5_stats.c  |   6 +-
>  5 files changed, 252 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 498f80c89..716c9d9a5 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  	if (ret)
>  		DRV_LOG(WARNING, "port %u some flows still remain",
>  			dev->data->port_id);
> +	if (!priv->representor &&
> +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
>  	memset(priv, 0, sizeof(*priv));
>  }
> 
> @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
>   *   Verbs device attributes.
>   * @param port
>   *   Verbs port to use (indexed from 1).
> + * @param master
> + *   Master device in case @p ibv_dev is a port representor.
> + * @param rep_id
> + *   Representor identifier when @p master is non-NULL.
>   *
>   * @return
>   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		   struct ibv_device *ibv_dev,
>  		   int vf,
>  		   const struct ibv_device_attr_ex *attr,
> -		   unsigned int port)
> +		   unsigned int port,
> +		   struct rte_eth_dev *master,
> +		   unsigned int rep_id)
>  {
>  	struct ibv_context *ctx;
>  	struct ibv_port_attr port_attr;
> @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		" old OFED/rdma-core version or firmware configuration");  #endif
>  	config.mpls_en = mpls_en;
> -	if (attr->orig_attr.phys_port_cnt > 1)
> -		snprintf(name, sizeof(name), "%s port %u",
> -			 dpdk_dev->name, port);
> +	if (!master)
> +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
>  	else
> -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> +			 dpdk_dev->name, rep_id);
> +	if (attr->orig_attr.phys_port_cnt > 1)
> +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  		eth_dev = rte_eth_dev_attach_secondary(name);
>  		if (eth_dev == NULL) {
> @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	priv->port = port;
>  	priv->pd = pd;
>  	priv->mtu = ETHER_MTU;
> +	/*
> +	 * Allocate a switch domain for master devices and share it with
> +	 * port representors.
> +	 */
> +	if (!master) {
> +		priv->representor = 0;
> +		priv->master_id = -1; /* Updated once known. */
> +		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;

Domain_id will override below.

> +		priv->rep_id = -1; /* Dummy unique value. */
> +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> +		if (err) {
> +			err = rte_errno;
> +			DRV_LOG(ERR, "unable to allocate switch domain: %s",
> +				strerror(rte_errno));
> +			goto error;
> +		}
> +	} else {
> +		priv->representor = 1;
> +		priv->master_id =
> +			((struct priv *)master->data->dev_private)->master_id;
> +		priv->domain_id =
> +			((struct priv *)master->data->dev_private)->domain_id;
> +		priv->rep_id = rep_id;
> +	}

Do you think such information should be set as well in secondary process?

>  	err = mlx5_args(&config, dpdk_dev->devargs);
>  	if (err) {
>  		err = rte_errno;
> @@ -964,6 +1000,18 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  		err = ENOMEM;
>  		goto error;
>  	}
> +	/*
> +	 * Now that eth_dev is allocated and its port ID is known, make
> +	 * non-representor ports target their own port ID as master for
> +	 * convenience.
> +	 *
> +	 * Master port ID is already set for actual representors. Those only
> +	 * need the right device flag.
> +	 */
> +	if (!master)
> +		priv->master_id = eth_dev->data->port_id;
> +	else
> +		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
>  	eth_dev->data->dev_private = priv;
>  	priv->dev_data = eth_dev->data;
>  	eth_dev->data->mac_addrs = priv->mac;
> @@ -1083,8 +1131,12 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  	return eth_dev;
>  error:
> -	if (priv)
> +	if (priv) {
> +		if (!priv->representor &&
> +		    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> +			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
>  		rte_free(priv);
> +	}
>  	if (pd)
>  		claim_zero(mlx5_glue->dealloc_pd(pd));
>  	if (eth_dev)
> @@ -1097,12 +1149,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,  }
> 
>  /**
> - * Spawn Ethernet devices from Verbs information, one per detected port.
> + * Spawn Ethernet devices from Verbs information, one per detected port
> + and
> + * port representor.
>   *
>   * @param dpdk_dev
>   *   Backing DPDK device.
>   * @param ibv_dev
> - *   Verbs device.
> + *   NULL-terminated list of Verbs devices. First entry is the master device
> + *   (mandatory), followed by optional representors.
>   * @param vf
>   *   If nonzero, enable VF-specific features.
>   *
> @@ -1113,17 +1167,21 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>   */
>  static struct rte_eth_dev **
>  mlx5_dev_spawn(struct rte_device *dpdk_dev,
> -	       struct ibv_device *ibv_dev,
> +	       struct ibv_device **ibv_dev,
>  	       int vf)
>  {
>  	struct rte_eth_dev **eth_list = NULL;
>  	struct ibv_context *ctx;
>  	struct ibv_device_attr_ex attr;
> +	void *tmp;
>  	unsigned int i;
> +	unsigned int j = 0;
> +	unsigned int n = 0;
>  	int ret;
> 
> +next:
>  	errno = 0;
> -	ctx = mlx5_glue->open_device(ibv_dev);
> +	ctx = mlx5_glue->open_device(ibv_dev[j]);
>  	if (!ctx) {
>  		rte_errno = errno ? errno : ENODEV;
>  		if (rte_errno == ENODEV)
> @@ -1132,7 +1190,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		else
>  			DRV_LOG(ERR,
>  				"cannot use device, are drivers up to date?");
> -		return NULL;
> +		goto error;
>  	}
>  	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
>  	mlx5_glue->close_device(ctx);
> @@ -1140,34 +1198,42 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
>  		rte_errno = ret;
>  		DRV_LOG(ERR, "unable to query device information: %s",
>  			strerror(rte_errno));
> -		return NULL;
> +		goto error;
>  	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	eth_list = malloc(sizeof(*eth_list) *
> -			  (attr.orig_attr.phys_port_cnt + 1));
> -	if (!eth_list) {
> +	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
> +		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
> +	tmp = realloc(eth_list, sizeof(*eth_list) *
> +		      (n + attr.orig_attr.phys_port_cnt + 1));
> +	if (!tmp) {
>  		rte_errno = errno;
> -		return NULL;
> +		goto error;
>  	}
> +	eth_list = tmp;
>  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> -						 &attr, i + 1);
> -		if (eth_list[i])
> -			continue;
> -		/* Save rte_errno and roll back in case of failure. */
> -		ret = rte_errno;
> -		while (i--) {
> -			mlx5_dev_close(eth_list[i]);
> -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> -				rte_free(eth_list[i]->data->dev_private);
> -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> -		}
> -		free(eth_list);
> -		rte_errno = ret;
> -		return NULL;
> +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j], vf,
> +						 &attr, i + 1,
> +						 j ? eth_list[0] : NULL,
> +						 j - 1);
> +		if (!eth_list[n])
> +			goto error;
> +		++n;
>  	}
> -	eth_list[i] = NULL;
> +	if (ibv_dev[++j])
> +		goto next;
> +	eth_list[n] = NULL;
>  	return eth_list;
> +error:
> +	/* Save rte_errno and roll back in case of failure. */
> +	ret = rte_errno;
> +	while (n--) {
> +		mlx5_dev_close(eth_list[n]);
> +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +			rte_free(eth_list[n]->data->dev_private);
> +		claim_zero(rte_eth_dev_release_port(eth_list[n]));
> +	}
> +	free(eth_list);
> +	rte_errno = ret;
> +	return NULL;
>  }
> 
>  /**
> @@ -1282,7 +1348,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				ibv_match[ret]->name, ret - 1);
>  	}
>  	if (n)
> -		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match, vf);
>  	mlx5_glue->free_device_list(ibv_list);
>  	if (!n) {
>  		DRV_LOG(WARNING,
> @@ -1302,7 +1368,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		ret = -rte_errno;
>  	} else {
>  		for (ret = 0; eth_list[ret]; ++ret) {
> +			uint32_t restore = eth_list[ret]->data->dev_flags;
> +
>  			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			/* Restore non-PCI flags cleared by the above call. */
> +			eth_list[ret]->data->dev_flags |= restore;
>  			rte_eth_dev_probing_finish(eth_list[ret]);
>  		}
>  		ret = 0;
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 997b04a33..0fe467140 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -161,6 +161,10 @@ struct priv {
>  	uint16_t mtu; /* Configured MTU. */
>  	uint8_t port; /* Physical port number. */
>  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> +	unsigned int representor:1; /* Device is a port representor. */
> +	uint16_t master_id; /* DPDK port ID of switch domain master. */
> +	uint16_t domain_id; /* Switch domain identifier. */
> +	unsigned int rep_id; /* Port representor identifier. */
>  	/* RX/TX queues. */
>  	unsigned int rxqs_n; /* RX queues array size. */
>  	unsigned int txqs_n; /* TX queues array size. */ @@ -209,9 +213,12 @@ int mlx5_getenv_int(const
> char *);
> 
>  /* mlx5_ethdev.c */
> 
> +int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +			   char (*ifname)[IF_NAMESIZE]);
>  int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);  int
> mlx5_ifindex(const struct rte_eth_dev *dev); -int mlx5_ifreq(const struct rte_eth_dev *dev, int req,
> struct ifreq *ifr);
> +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	       int master);
>  int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);  int mlx5_set_flags(struct rte_eth_dev *dev,
> unsigned int keep,
>  		   unsigned int flags);
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index
> 90488af33..9d579659e 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -93,7 +93,7 @@ struct ethtool_link_settings {  #endif
> 
>  /**
> - * Get interface name from private structure.
> + * Get master interface name from private structure.
>   *
>   * @param[in] dev
>   *   Pointer to Ethernet device.
> @@ -104,7 +104,8 @@ struct ethtool_link_settings {
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
> +mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> +		       char (*ifname)[IF_NAMESIZE])
>  {
>  	struct priv *priv = dev->data->dev_private;
>  	DIR *dir;
> @@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])  }
> 
>  /**
> + * Get interface name from private structure.
> + *
> + * This is a port representor-aware version of mlx5_get_master_ifname().
> + *
> + * @param[in] dev
> + *   Pointer to Ethernet device.
> + * @param[out] ifname
> + *   Interface name output buffer.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +int
> +mlx5_get_ifname(const struct rte_eth_dev *dev, char
> +(*ifname)[IF_NAMESIZE]) {
> +	struct priv *priv = dev->data->dev_private;
> +	int ret;
> +	char master[IF_NAMESIZE];
> +	FILE *file;
> +	DIR *dir;
> +	uint64_t phys_switch_id;
> +
> +	if (!priv->representor)
> +		return mlx5_get_master_ifname(dev, ifname);
> +	ret = mlx5_get_master_ifname(dev, &master);
> +	if (ret)
> +		return ret;
> +	{
> +		MKSTR(path, "%s/device/net/%s/phys_switch_id",
> +		      priv->ibdev_path, master);
> +
> +		file = fopen(path, "rb");
> +	}
> +	if (!file) {
> +		rte_errno = errno;
> +		return -rte_errno;
> +	}
> +	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
> +	fclose(file);
> +	if (ret != 1) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	{
> +		MKSTR(path, "%s/device/net/%s/subsystem",
> +		      priv->ibdev_path, master);
> +
> +		dir = opendir(path);
> +	}
> +	if (!dir) {
> +		rte_errno = errno;
> +		return -rte_errno;
> +	}
> +	/*
> +	 * Scan network interfaces to find one with matching phys_switch_id
> +	 * and phys_switch_name.
> +	 */
> +	do {
> +		struct dirent *dent;
> +		uint64_t phys_switch_id_rep;
> +		int rep_id;
> +
> +		ret = -ENOENT;
> +		dent = readdir(dir);
> +		if (!dent)
> +			break;
> +		{
> +			MKSTR(path,
> +			      "%s/device/net/%s/subsystem/%s/phys_switch_id",
> +			      priv->ibdev_path, master, dent->d_name);
> +
> +			file = fopen(path, "rb");
> +		}
> +		if (!file)
> +			continue;
> +		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
> +		fclose(file);
> +		if (ret != 1)
> +			continue;
> +		if (phys_switch_id_rep != phys_switch_id)
> +			continue;
> +		{
> +			MKSTR(path,
> +			      "%s/device/net/%s/subsystem/%s/phys_port_name",
> +			      priv->ibdev_path, master, dent->d_name);
> +
> +			file = fopen(path, "rb");
> +		}
> +		if (!file)
> +			continue;
> +		ret = fscanf(file, "%d", &rep_id);
> +		fclose(file);
> +		if (ret != 1)
> +			continue;
> +		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
> +			continue;
> +		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
> +		ret = 0;
> +		break;
> +	} while (1);
> +	closedir(dir);
> +	if (ret)
> +		rte_errno = -ret;
> +	return ret;
> +}
> +
> +/**
>   * Get the interface index from device name.
>   *
>   * @param[in] dev
> @@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
>   *   Request number to pass to ioctl().
>   * @param[out] ifr
>   *   Interface request structure output buffer.
> + * @param master
> + *   When device is a port representor, perform request on master device
> + *   instead.
>   *
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
>  int
> -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
> +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> +	   int master)
>  {
>  	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
>  	int ret = 0;
> @@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
>  		rte_errno = errno;
>  		return -rte_errno;
>  	}
> -	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> +	if (master)
> +		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
> +	else
> +		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
>  	if (ret)
>  		goto error;
>  	ret = ioctl(sock, req, ifr);
> @@ -258,7 +373,7 @@ int
>  mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
> 
>  	if (ret)
>  		return ret;
> @@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)  {
>  	struct ifreq request = { .ifr_mtu = mtu, };
> 
> -	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
> +	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
>  }
> 
>  /**
> @@ -302,13 +417,13 @@ int
>  mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)  {
>  	struct ifreq request;
> -	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
> +	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
> 
>  	if (ret)
>  		return ret;
>  	request.ifr_flags &= keep;
>  	request.ifr_flags |= flags & ~keep;
> -	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
> +	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
>  }
> 
>  /**
> @@ -477,6 +592,12 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
>  	info->speed_capa = priv->link_speed_capa;
>  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
>  	mlx5_set_default_params(dev, info);
> +	if (rte_eth_dev_is_valid_port(priv->master_id)) {
> +		info->switch_info.name =
> +			rte_eth_devices[priv->master_id].data->name;
> +		info->switch_info.domain_id = priv->domain_id;
> +		info->switch_info.port_id = priv->rep_id;
> +	}
>  }
> 
>  /**
> @@ -540,7 +661,7 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	int link_speed = 0;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -550,7 +671,7 @@
> mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&edata;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s", @@ -611,7 +732,7 @@
> mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	uint64_t sc;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
>  			dev->data->port_id, strerror(rte_errno)); @@ -621,7 +742,7 @@
> mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
>  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
>  				(ifr.ifr_flags & IFF_RUNNING));
>  	ifr.ifr_data = (void *)&gcmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
> @@ -638,7 +759,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> 
>  	*ecmd = gcmd;
>  	ifr.ifr_data = (void *)ecmd;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(DEBUG,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
> @@ -801,7 +922,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
>  	int ret;
> 
>  	ifr.ifr_data = (void *)&ethpause;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
> @@ -854,7 +975,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
>  		ethpause.tx_pause = 1;
>  	else
>  		ethpause.tx_pause = 0;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
> diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c index 672a47619..12ee37f55
> 100644
> --- a/drivers/net/mlx5/mlx5_mac.c
> +++ b/drivers/net/mlx5/mlx5_mac.c
> @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
>  	struct ifreq request;
>  	int ret;
> 
> -	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
> +	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
>  	if (ret)
>  		return ret;
>  	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff --git
> a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index 875dd1027..91f3d474a 100644
> --- a/drivers/net/mlx5/mlx5_stats.c
> +++ b/drivers/net/mlx5/mlx5_stats.c
> @@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
>  	et_stats->cmd = ETHTOOL_GSTATS;
>  	et_stats->n_stats = xstats_ctrl->stats_n;
>  	ifr.ifr_data = (caddr_t)et_stats;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING,
>  			"port %u unable to read statistic values from device", @@ -194,7 +194,7 @@
> mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
> 
>  	drvinfo.cmd = ETHTOOL_GDRVINFO;
>  	ifr.ifr_data = (caddr_t)&drvinfo;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to query number of statistics",
>  			dev->data->port_id);
> @@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
>  	strings->string_set = ETH_SS_STATS;
>  	strings->len = dev_stats_n;
>  	ifr.ifr_data = (caddr_t)strings;
> -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
>  	if (ret) {
>  		DRV_LOG(WARNING, "port %u unable to get statistic names",
>  			dev->data->port_id);
> --
> 2.11.0
Shahaf Shuler June 17, 2018, 10:15 a.m. UTC | #2
Hi Adrien,

Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > Sent: Thursday, June 14, 2018 4:35 PM
> > To: Shahaf Shuler <shahafs@mellanox.com>
> > Cc: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > representors
> >
> > Probe existing port representors in addition to their master device and
> associate them automatically.
> >
> > To avoid name collision between Ethernet devices, their names use the
> > same convention as ixgbe and i40e PMDs, that is, instead of only a PCI
> address in DBDF notation:
> >
> > - "net_{DBDF}_0" for master/switch devices.

This is breaking compatibility for application using the device names in order to attach them to the application (e.g. OVS-DPDK). 
Before this patch the naming scheme for non-representor port is "{DBDF}". 

Can we preserve the compatibility and add appropriate suffix for the representor case? 

> > - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> >   representors.
> >
> > Both optionally suffixed with "_port_{num}" instead of " port {num}"
> > for devices that expose several Verbs ports (note this is never the case on
> mlx5, but kept for historical reasons for the time being).
> >
> > (Patch based on prior work from Yuanhan Liu)
> >
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> >
> > - Added representor information to dev_infos_get(). DPDK port ID of
> master
> >   device is now stored in the private structure to retrieve it
> >   conveniently.
> > - Master device is assigned dummy representor ID value -1 to better
> >   distinguish from the the first actual representor reported by
> >   dev_infos_get() as those are indexed from 0.
> > - Added RTE_ETH_DEV_REPRESENTOR device flag.
> > ---
> >  drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
> >  drivers/net/mlx5/mlx5.h        |   9 ++-
> >  drivers/net/mlx5/mlx5_ethdev.c | 151
> ++++++++++++++++++++++++++++++++----
> >  drivers/net/mlx5/mlx5_mac.c    |   2 +-
> >  drivers/net/mlx5/mlx5_stats.c  |   6 +-
> >  5 files changed, 252 insertions(+), 54 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 498f80c89..716c9d9a5 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
> >  	if (ret)
> >  		DRV_LOG(WARNING, "port %u some flows still remain",
> >  			dev->data->port_id);
> > +	if (!priv->representor &&
> > +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
> >  	memset(priv, 0, sizeof(*priv));
> >  }
> >
> > @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev
> *dev)
> >   *   Verbs device attributes.
> >   * @param port
> >   *   Verbs port to use (indexed from 1).
> > + * @param master
> > + *   Master device in case @p ibv_dev is a port representor.
> > + * @param rep_id
> > + *   Representor identifier when @p master is non-NULL.
> >   *
> >   * @return
> >   *   A valid Ethernet device object on success, NULL otherwise and
> rte_errno
> > @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  		   struct ibv_device *ibv_dev,
> >  		   int vf,
> >  		   const struct ibv_device_attr_ex *attr,
> > -		   unsigned int port)
> > +		   unsigned int port,
> > +		   struct rte_eth_dev *master,
> > +		   unsigned int rep_id)
> >  {
> >  	struct ibv_context *ctx;
> >  	struct ibv_port_attr port_attr;
> > @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  		" old OFED/rdma-core version or firmware configuration");
> #endif
> >  	config.mpls_en = mpls_en;
> > -	if (attr->orig_attr.phys_port_cnt > 1)
> > -		snprintf(name, sizeof(name), "%s port %u",
> > -			 dpdk_dev->name, port);
> > +	if (!master)
> > +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev-
> >name);
> >  	else
> > -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> > +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> > +			 dpdk_dev->name, rep_id);
> > +	if (attr->orig_attr.phys_port_cnt > 1)
> > +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> > +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
> >  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> >  		eth_dev = rte_eth_dev_attach_secondary(name);
> >  		if (eth_dev == NULL) {
> > @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  	priv->port = port;
> >  	priv->pd = pd;
> >  	priv->mtu = ETHER_MTU;
> > +	/*
> > +	 * Allocate a switch domain for master devices and share it with
> > +	 * port representors.
> > +	 */
> > +	if (!master) {
> > +		priv->representor = 0;
> > +		priv->master_id = -1; /* Updated once known. */
> > +		priv->domain_id =
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> 
> Domain_id will override below.
> 
> > +		priv->rep_id = -1; /* Dummy unique value. */
> > +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> > +		if (err) {
> > +			err = rte_errno;
> > +			DRV_LOG(ERR, "unable to allocate switch domain:
> %s",
> > +				strerror(rte_errno));
> > +			goto error;
> > +		}
> > +	} else {
> > +		priv->representor = 1;
> > +		priv->master_id =
> > +			((struct priv *)master->data->dev_private)-
> >master_id;
> > +		priv->domain_id =
> > +			((struct priv *)master->data->dev_private)-
> >domain_id;
> > +		priv->rep_id = rep_id;
> > +	}
> 
> Do you think such information should be set as well in secondary process?
> 
> >  	err = mlx5_args(&config, dpdk_dev->devargs);
> >  	if (err) {
> >  		err = rte_errno;
> > @@ -964,6 +1000,18 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >  		err = ENOMEM;
> >  		goto error;
> >  	}
> > +	/*
> > +	 * Now that eth_dev is allocated and its port ID is known, make
> > +	 * non-representor ports target their own port ID as master for
> > +	 * convenience.
> > +	 *
> > +	 * Master port ID is already set for actual representors. Those only
> > +	 * need the right device flag.
> > +	 */
> > +	if (!master)
> > +		priv->master_id = eth_dev->data->port_id;
> > +	else
> > +		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
> >  	eth_dev->data->dev_private = priv;
> >  	priv->dev_data = eth_dev->data;
> >  	eth_dev->data->mac_addrs = priv->mac; @@ -1083,8 +1131,12 @@
> > mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  	rte_rwlock_write_unlock(&mlx5_shared_data-
> >mem_event_rwlock);
> >  	return eth_dev;
> >  error:
> > -	if (priv)
> > +	if (priv) {
> > +		if (!priv->representor &&
> > +		    priv->domain_id !=
> RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +			claim_zero(rte_eth_switch_domain_free(priv-
> >domain_id));
> >  		rte_free(priv);
> > +	}
> >  	if (pd)
> >  		claim_zero(mlx5_glue->dealloc_pd(pd));
> >  	if (eth_dev)
> > @@ -1097,12 +1149,14 @@ mlx5_dev_spawn_one(struct rte_device
> > *dpdk_dev,  }
> >
> >  /**
> > - * Spawn Ethernet devices from Verbs information, one per detected
> port.
> > + * Spawn Ethernet devices from Verbs information, one per detected
> > + port and
> > + * port representor.
> >   *
> >   * @param dpdk_dev
> >   *   Backing DPDK device.
> >   * @param ibv_dev
> > - *   Verbs device.
> > + *   NULL-terminated list of Verbs devices. First entry is the master device
> > + *   (mandatory), followed by optional representors.
> >   * @param vf
> >   *   If nonzero, enable VF-specific features.
> >   *
> > @@ -1113,17 +1167,21 @@ mlx5_dev_spawn_one(struct rte_device
> *dpdk_dev,
> >   */
> >  static struct rte_eth_dev **
> >  mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > -	       struct ibv_device *ibv_dev,
> > +	       struct ibv_device **ibv_dev,
> >  	       int vf)
> >  {
> >  	struct rte_eth_dev **eth_list = NULL;
> >  	struct ibv_context *ctx;
> >  	struct ibv_device_attr_ex attr;
> > +	void *tmp;
> >  	unsigned int i;
> > +	unsigned int j = 0;
> > +	unsigned int n = 0;
> >  	int ret;
> >
> > +next:
> >  	errno = 0;
> > -	ctx = mlx5_glue->open_device(ibv_dev);
> > +	ctx = mlx5_glue->open_device(ibv_dev[j]);
> >  	if (!ctx) {
> >  		rte_errno = errno ? errno : ENODEV;
> >  		if (rte_errno == ENODEV)
> > @@ -1132,7 +1190,7 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
> >  		else
> >  			DRV_LOG(ERR,
> >  				"cannot use device, are drivers up to date?");
> > -		return NULL;
> > +		goto error;
> >  	}
> >  	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> >  	mlx5_glue->close_device(ctx);
> > @@ -1140,34 +1198,42 @@ mlx5_dev_spawn(struct rte_device
> *dpdk_dev,
> >  		rte_errno = ret;
> >  		DRV_LOG(ERR, "unable to query device information: %s",
> >  			strerror(rte_errno));
> > -		return NULL;
> > +		goto error;
> >  	}
> > -	DRV_LOG(INFO, "%u port(s) detected",
> attr.orig_attr.phys_port_cnt);
> > -	eth_list = malloc(sizeof(*eth_list) *
> > -			  (attr.orig_attr.phys_port_cnt + 1));
> > -	if (!eth_list) {
> > +	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
> > +		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
> > +	tmp = realloc(eth_list, sizeof(*eth_list) *
> > +		      (n + attr.orig_attr.phys_port_cnt + 1));
> > +	if (!tmp) {
> >  		rte_errno = errno;
> > -		return NULL;
> > +		goto error;
> >  	}
> > +	eth_list = tmp;
> >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > -						 &attr, i + 1);
> > -		if (eth_list[i])
> > -			continue;
> > -		/* Save rte_errno and roll back in case of failure. */
> > -		ret = rte_errno;
> > -		while (i--) {
> > -			mlx5_dev_close(eth_list[i]);
> > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > -				rte_free(eth_list[i]->data->dev_private);
> > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > -		}
> > -		free(eth_list);
> > -		rte_errno = ret;
> > -		return NULL;
> > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> vf,
> > +						 &attr, i + 1,
> > +						 j ? eth_list[0] : NULL,
> > +						 j - 1);

The representor id is according to the sort made by qsort (based on device names).
A better way may be to set it according to the sysfs information, like you do in the mlx5_get_ifname function.
What do you think? 

> > +		if (!eth_list[n])
> > +			goto error;
> > +		++n;
> >  	}
> > -	eth_list[i] = NULL;
> > +	if (ibv_dev[++j])
> > +		goto next;
> > +	eth_list[n] = NULL;
> >  	return eth_list;
> > +error:
> > +	/* Save rte_errno and roll back in case of failure. */
> > +	ret = rte_errno;
> > +	while (n--) {
> > +		mlx5_dev_close(eth_list[n]);
> > +		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > +			rte_free(eth_list[n]->data->dev_private);
> > +		claim_zero(rte_eth_dev_release_port(eth_list[n]));
> > +	}
> > +	free(eth_list);
> > +	rte_errno = ret;
> > +	return NULL;
> >  }
> >
> >  /**
> > @@ -1282,7 +1348,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
> >  				ibv_match[ret]->name, ret - 1);
> >  	}
> >  	if (n)
> > -		eth_list = mlx5_dev_spawn(&pci_dev->device,
> ibv_match[0], vf);
> > +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match,
> vf);
> >  	mlx5_glue->free_device_list(ibv_list);
> >  	if (!n) {
> >  		DRV_LOG(WARNING,
> > @@ -1302,7 +1368,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
> >  		ret = -rte_errno;
> >  	} else {
> >  		for (ret = 0; eth_list[ret]; ++ret) {
> > +			uint32_t restore = eth_list[ret]->data->dev_flags;
> > +
> >  			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> > +			/* Restore non-PCI flags cleared by the above call. */
> > +			eth_list[ret]->data->dev_flags |= restore;
> >  			rte_eth_dev_probing_finish(eth_list[ret]);
> >  		}
> >  		ret = 0;
> > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > 997b04a33..0fe467140 100644
> > --- a/drivers/net/mlx5/mlx5.h
> > +++ b/drivers/net/mlx5/mlx5.h
> > @@ -161,6 +161,10 @@ struct priv {
> >  	uint16_t mtu; /* Configured MTU. */
> >  	uint8_t port; /* Physical port number. */
> >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > +	unsigned int representor:1; /* Device is a port representor. */

Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR from eth_dev->data->dev_flags. 

> > +	uint16_t master_id; /* DPDK port ID of switch domain master. */
> > +	uint16_t domain_id; /* Switch domain identifier. */
> > +	unsigned int rep_id; /* Port representor identifier. */
> >  	/* RX/TX queues. */
> >  	unsigned int rxqs_n; /* RX queues array size. */
> >  	unsigned int txqs_n; /* TX queues array size. */ @@ -209,9 +213,12
> > @@ int mlx5_getenv_int(const char *);
> >
> >  /* mlx5_ethdev.c */
> >
> > +int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> > +			   char (*ifname)[IF_NAMESIZE]);
> >  int mlx5_get_ifname(const struct rte_eth_dev *dev, char
> > (*ifname)[IF_NAMESIZE]);  int mlx5_ifindex(const struct rte_eth_dev
> > *dev); -int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct
> > ifreq *ifr);
> > +int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> > +	       int master);
> >  int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);  int
> > mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
> >  		   unsigned int flags);
> > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > b/drivers/net/mlx5/mlx5_ethdev.c index 90488af33..9d579659e 100644
> > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > @@ -93,7 +93,7 @@ struct ethtool_link_settings {  #endif
> >
> >  /**
> > - * Get interface name from private structure.
> > + * Get master interface name from private structure.
> >   *
> >   * @param[in] dev
> >   *   Pointer to Ethernet device.
> > @@ -104,7 +104,8 @@ struct ethtool_link_settings {
> >   *   0 on success, a negative errno value otherwise and rte_errno is set.
> >   */
> >  int
> > -mlx5_get_ifname(const struct rte_eth_dev *dev, char
> > (*ifname)[IF_NAMESIZE])
> > +mlx5_get_master_ifname(const struct rte_eth_dev *dev,
> > +		       char (*ifname)[IF_NAMESIZE])
> >  {
> >  	struct priv *priv = dev->data->dev_private;
> >  	DIR *dir;
> > @@ -179,6 +180,113 @@ mlx5_get_ifname(const struct rte_eth_dev *dev,
> > char (*ifname)[IF_NAMESIZE])  }
> >
> >  /**
> > + * Get interface name from private structure.
> > + *
> > + * This is a port representor-aware version of mlx5_get_master_ifname().
> > + *
> > + * @param[in] dev
> > + *   Pointer to Ethernet device.
> > + * @param[out] ifname
> > + *   Interface name output buffer.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +int
> > +mlx5_get_ifname(const struct rte_eth_dev *dev, char
> > +(*ifname)[IF_NAMESIZE]) {
> > +	struct priv *priv = dev->data->dev_private;
> > +	int ret;
> > +	char master[IF_NAMESIZE];
> > +	FILE *file;
> > +	DIR *dir;
> > +	uint64_t phys_switch_id;
> > +
> > +	if (!priv->representor)
> > +		return mlx5_get_master_ifname(dev, ifname);
> > +	ret = mlx5_get_master_ifname(dev, &master);
> > +	if (ret)
> > +		return ret;
> > +	{
> > +		MKSTR(path, "%s/device/net/%s/phys_switch_id",
> > +		      priv->ibdev_path, master);
> > +
> > +		file = fopen(path, "rb");
> > +	}
> > +	if (!file) {
> > +		rte_errno = errno;
> > +		return -rte_errno;
> > +	}
> > +	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
> > +	fclose(file);
> > +	if (ret != 1) {
> > +		rte_errno = EINVAL;
> > +		return -rte_errno;
> > +	}
> > +	{
> > +		MKSTR(path, "%s/device/net/%s/subsystem",
> > +		      priv->ibdev_path, master);
> > +
> > +		dir = opendir(path);
> > +	}
> > +	if (!dir) {
> > +		rte_errno = errno;
> > +		return -rte_errno;
> > +	}
> > +	/*
> > +	 * Scan network interfaces to find one with matching phys_switch_id
> > +	 * and phys_switch_name.
> > +	 */
> > +	do {
> > +		struct dirent *dent;
> > +		uint64_t phys_switch_id_rep;
> > +		int rep_id;
> > +
> > +		ret = -ENOENT;
> > +		dent = readdir(dir);
> > +		if (!dent)
> > +			break;
> > +		{
> > +			MKSTR(path,
> > +
> "%s/device/net/%s/subsystem/%s/phys_switch_id",
> > +			      priv->ibdev_path, master, dent->d_name);
> > +
> > +			file = fopen(path, "rb");
> > +		}
> > +		if (!file)
> > +			continue;
> > +		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
> > +		fclose(file);
> > +		if (ret != 1)
> > +			continue;
> > +		if (phys_switch_id_rep != phys_switch_id)
> > +			continue;
> > +		{
> > +			MKSTR(path,
> > +
> "%s/device/net/%s/subsystem/%s/phys_port_name",
> > +			      priv->ibdev_path, master, dent->d_name);
> > +
> > +			file = fopen(path, "rb");
> > +		}
> > +		if (!file)
> > +			continue;
> > +		ret = fscanf(file, "%d", &rep_id);
> > +		fclose(file);
> > +		if (ret != 1)
> > +			continue;
> > +		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
> > +			continue;
> > +		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
> > +		ret = 0;
> > +		break;
> > +	} while (1);
> > +	closedir(dir);
> > +	if (ret)
> > +		rte_errno = -ret;
> > +	return ret;
> > +}
> > +
> > +/**
> >   * Get the interface index from device name.
> >   *
> >   * @param[in] dev
> > @@ -214,12 +322,16 @@ mlx5_ifindex(const struct rte_eth_dev *dev)
> >   *   Request number to pass to ioctl().
> >   * @param[out] ifr
> >   *   Interface request structure output buffer.
> > + * @param master
> > + *   When device is a port representor, perform request on master device
> > + *   instead.
> >   *
> >   * @return
> >   *   0 on success, a negative errno value otherwise and rte_errno is set.
> >   */
> >  int
> > -mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
> > +mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
> > +	   int master)
> >  {
> >  	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
> >  	int ret = 0;
> > @@ -228,7 +340,10 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req,
> struct ifreq *ifr)
> >  		rte_errno = errno;
> >  		return -rte_errno;
> >  	}
> > -	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> > +	if (master)
> > +		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
> > +	else
> > +		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
> >  	if (ret)
> >  		goto error;
> >  	ret = ioctl(sock, req, ifr);
> > @@ -258,7 +373,7 @@ int
> >  mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)  {
> >  	struct ifreq request;
> > -	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
> > +	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
> >
> >  	if (ret)
> >  		return ret;
> > @@ -282,7 +397,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t
> mtu)  {
> >  	struct ifreq request = { .ifr_mtu = mtu, };
> >
> > -	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
> > +	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
> >  }
> >
> >  /**
> > @@ -302,13 +417,13 @@ int
> >  mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int
> flags)  {
> >  	struct ifreq request;
> > -	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
> > +	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
> >
> >  	if (ret)
> >  		return ret;
> >  	request.ifr_flags &= keep;
> >  	request.ifr_flags |= flags & ~keep;
> > -	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
> > +	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
> >  }
> >
> >  /**
> > @@ -477,6 +592,12 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev,
> struct rte_eth_dev_info *info)
> >  	info->speed_capa = priv->link_speed_capa;
> >  	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
> >  	mlx5_set_default_params(dev, info);
> > +	if (rte_eth_dev_is_valid_port(priv->master_id)) {
> > +		info->switch_info.name =
> > +			rte_eth_devices[priv->master_id].data->name;
> > +		info->switch_info.domain_id = priv->domain_id;
> > +		info->switch_info.port_id = priv->rep_id;
> > +	}
> >  }
> >
> >  /**
> > @@ -540,7 +661,7 @@ mlx5_link_update_unlocked_gset(struct
> rte_eth_dev *dev,
> >  	int link_speed = 0;
> >  	int ret;
> >
> > -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed:
> %s",
> >  			dev->data->port_id, strerror(rte_errno)); @@ -550,7
> +671,7 @@
> > mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
> >  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
> >  				(ifr.ifr_flags & IFF_RUNNING));
> >  	ifr.ifr_data = (void *)&edata;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed:
> %s", @@ -611,7
> > +732,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> >  	uint64_t sc;
> >  	int ret;
> >
> > -	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed:
> %s",
> >  			dev->data->port_id, strerror(rte_errno)); @@ -621,7
> +742,7 @@
> > mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
> >  	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
> >  				(ifr.ifr_flags & IFF_RUNNING));
> >  	ifr.ifr_data = (void *)&gcmd;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(DEBUG,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GLINKSETTINGS)"
> > @@ -638,7 +759,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev
> > *dev,
> >
> >  	*ecmd = gcmd;
> >  	ifr.ifr_data = (void *)ecmd;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(DEBUG,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GLINKSETTINGS)"
> > @@ -801,7 +922,7 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev,
> struct rte_eth_fc_conf *fc_conf)
> >  	int ret;
> >
> >  	ifr.ifr_data = (void *)&ethpause;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_GPAUSEPARAM) failed:"
> > @@ -854,7 +975,7 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev,
> struct rte_eth_fc_conf *fc_conf)
> >  		ethpause.tx_pause = 1;
> >  	else
> >  		ethpause.tx_pause = 0;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u ioctl(SIOCETHTOOL,
> ETHTOOL_SPAUSEPARAM)"
> > diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
> > index 672a47619..12ee37f55
> > 100644
> > --- a/drivers/net/mlx5/mlx5_mac.c
> > +++ b/drivers/net/mlx5/mlx5_mac.c
> > @@ -49,7 +49,7 @@ mlx5_get_mac(struct rte_eth_dev *dev, uint8_t
> (*mac)[ETHER_ADDR_LEN])
> >  	struct ifreq request;
> >  	int ret;
> >
> > -	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
> > +	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
> >  	if (ret)
> >  		return ret;
> >  	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); diff -
> -git
> > a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c index
> > 875dd1027..91f3d474a 100644
> > --- a/drivers/net/mlx5/mlx5_stats.c
> > +++ b/drivers/net/mlx5/mlx5_stats.c
> > @@ -146,7 +146,7 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev,
> uint64_t *stats)
> >  	et_stats->cmd = ETHTOOL_GSTATS;
> >  	et_stats->n_stats = xstats_ctrl->stats_n;
> >  	ifr.ifr_data = (caddr_t)et_stats;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING,
> >  			"port %u unable to read statistic values from device",
> @@ -194,7
> > +194,7 @@ mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
> >
> >  	drvinfo.cmd = ETHTOOL_GDRVINFO;
> >  	ifr.ifr_data = (caddr_t)&drvinfo;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u unable to query number of
> statistics",
> >  			dev->data->port_id);
> > @@ -244,7 +244,7 @@ mlx5_xstats_init(struct rte_eth_dev *dev)
> >  	strings->string_set = ETH_SS_STATS;
> >  	strings->len = dev_stats_n;
> >  	ifr.ifr_data = (caddr_t)strings;
> > -	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
> > +	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
> >  	if (ret) {
> >  		DRV_LOG(WARNING, "port %u unable to get statistic
> names",
> >  			dev->data->port_id);
> > --
> > 2.11.0
Shahaf Shuler June 24, 2018, 1:33 p.m. UTC | #3
One more input, 

Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors

[...]

> > > +	eth_list = tmp;
> > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > -						 &attr, i + 1);
> > > -		if (eth_list[i])
> > > -			continue;
> > > -		/* Save rte_errno and roll back in case of failure. */
> > > -		ret = rte_errno;
> > > -		while (i--) {
> > > -			mlx5_dev_close(eth_list[i]);
> > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > -				rte_free(eth_list[i]->data->dev_private);
> > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > -		}
> > > -		free(eth_list);
> > > -		rte_errno = ret;
> > > -		return NULL;
> > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > vf,
> > > +						 &attr, i + 1,
> > > +						 j ? eth_list[0] : NULL,
> > > +						 j - 1);
> 
> The representor id is according to the sort made by qsort (based on device
> names).
> A better way may be to set it according to the sysfs information, like you do
> in the mlx5_get_ifname function.
> What do you think?

In fact relaying on linear increasing port numbers is dangerous. In may break on special scenarios like BlueField.
In BlueField there are representors between the x86 and the ARM cores. Those are not VF representors. The phys_port_name of those is -1 and each of them belongs to different phys_switch_id.

We can argue whether it is correct/not to assign them w/ -1 value, but I think the suggested approach above can detect the right "vf_id" for those and not break the current behavior on x86.  
Let me know if you have other suggestions.
Adrien Mazarguil June 27, 2018, 1:32 p.m. UTC | #4
On Sat, Jun 16, 2018 at 08:57:51AM +0000, Xueming(Steven) Li wrote:
> Reviewed-by: Xueming Li <xuemingl@mellanox.com>
> 
> Minor comments inside:
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > Sent: Thursday, June 14, 2018 4:35 PM
> > To: Shahaf Shuler <shahafs@mellanox.com>
> > Cc: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > Probe existing port representors in addition to their master device and associate them automatically.
> > 
> > To avoid name collision between Ethernet devices, their names use the same convention as ixgbe and
> > i40e PMDs, that is, instead of only a PCI address in DBDF notation:
> > 
> > - "net_{DBDF}_0" for master/switch devices.
> > - "net_{DBDF}_representor_{rep}" with "rep" starting from 0 for port
> >   representors.
> > 
> > Both optionally suffixed with "_port_{num}" instead of " port {num}" for devices that expose several
> > Verbs ports (note this is never the case on mlx5, but kept for historical reasons for the time being).
> > 
> > (Patch based on prior work from Yuanhan Liu)
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > --
> > v2 changes:
> > 
> > - Added representor information to dev_infos_get(). DPDK port ID of master
> >   device is now stored in the private structure to retrieve it
> >   conveniently.
> > - Master device is assigned dummy representor ID value -1 to better
> >   distinguish from the the first actual representor reported by
> >   dev_infos_get() as those are indexed from 0.
> > - Added RTE_ETH_DEV_REPRESENTOR device flag.
> > ---
> >  drivers/net/mlx5/mlx5.c        | 138 ++++++++++++++++++++++++--------
> >  drivers/net/mlx5/mlx5.h        |   9 ++-
> >  drivers/net/mlx5/mlx5_ethdev.c | 151 ++++++++++++++++++++++++++++++++----
> >  drivers/net/mlx5/mlx5_mac.c    |   2 +-
> >  drivers/net/mlx5/mlx5_stats.c  |   6 +-
> >  5 files changed, 252 insertions(+), 54 deletions(-)
> > 
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 498f80c89..716c9d9a5 100644
<snip>
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -304,6 +304,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
> >  	if (ret)
> >  		DRV_LOG(WARNING, "port %u some flows still remain",
> >  			dev->data->port_id);
> > +	if (!priv->representor &&
> > +	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
> > +		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
> >  	memset(priv, 0, sizeof(*priv));
> >  }
> > 
> > @@ -648,6 +651,10 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
> >   *   Verbs device attributes.
> >   * @param port
> >   *   Verbs port to use (indexed from 1).
> > + * @param master
> > + *   Master device in case @p ibv_dev is a port representor.
> > + * @param rep_id
> > + *   Representor identifier when @p master is non-NULL.
> >   *
> >   * @return
> >   *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> > @@ -658,7 +665,9 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  		   struct ibv_device *ibv_dev,
> >  		   int vf,
> >  		   const struct ibv_device_attr_ex *attr,
> > -		   unsigned int port)
> > +		   unsigned int port,
> > +		   struct rte_eth_dev *master,
> > +		   unsigned int rep_id)
> >  {
> >  	struct ibv_context *ctx;
> >  	struct ibv_port_attr port_attr;
> > @@ -802,11 +811,14 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  		" old OFED/rdma-core version or firmware configuration");  #endif
> >  	config.mpls_en = mpls_en;
> > -	if (attr->orig_attr.phys_port_cnt > 1)
> > -		snprintf(name, sizeof(name), "%s port %u",
> > -			 dpdk_dev->name, port);
> > +	if (!master)
> > +		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
> >  	else
> > -		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> > +		snprintf(name, sizeof(name), "net_%s_representor_%u",
> > +			 dpdk_dev->name, rep_id);
> > +	if (attr->orig_attr.phys_port_cnt > 1)
> > +		snprintf(name, sizeof(name), "%s_port_%u", name, port);
> > +	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
> >  	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> >  		eth_dev = rte_eth_dev_attach_secondary(name);
> >  		if (eth_dev == NULL) {
> > @@ -883,6 +895,30 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> >  	priv->port = port;
> >  	priv->pd = pd;
> >  	priv->mtu = ETHER_MTU;
> > +	/*
> > +	 * Allocate a switch domain for master devices and share it with
> > +	 * port representors.
> > +	 */
> > +	if (!master) {
> > +		priv->representor = 0;
> > +		priv->master_id = -1; /* Updated once known. */
> > +		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
> 
> Domain_id will override below.

It's done as a safety measure. If rte_eth_switch_domain_alloc() happened to
fail, rte_eth_switch_domain_free() would otherwise be attempted on an
uninitialized value when cleaning up priv, possibly destroying an unrelated
domain. This is prevented thanks to RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID.

> > +		priv->rep_id = -1; /* Dummy unique value. */
> > +		err = rte_eth_switch_domain_alloc(&priv->domain_id);
> > +		if (err) {
> > +			err = rte_errno;
> > +			DRV_LOG(ERR, "unable to allocate switch domain: %s",
> > +				strerror(rte_errno));
> > +			goto error;
> > +		}
> > +	} else {
> > +		priv->representor = 1;
> > +		priv->master_id =
> > +			((struct priv *)master->data->dev_private)->master_id;
> > +		priv->domain_id =
> > +			((struct priv *)master->data->dev_private)->domain_id;
> > +		priv->rep_id = rep_id;
> > +	}
> 
> Do you think such information should be set as well in secondary process?

Unless I'm mistaken, it's implicitly the case as secondaries do not allocate
their own private structure, they inherit it from the primary.

Thanks for the review.
Adrien Mazarguil June 27, 2018, 1:32 p.m. UTC | #5
On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> Hi Adrien,
> 
> Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > Sent: Thursday, June 14, 2018 4:35 PM
> > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > Cc: dev@dpdk.org
> > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > Probe existing port representors in addition to their master device and
> > associate them automatically.
> > >
> > > To avoid name collision between Ethernet devices, their names use the
> > > same convention as ixgbe and i40e PMDs, that is, instead of only a PCI
> > address in DBDF notation:
> > >
> > > - "net_{DBDF}_0" for master/switch devices.
> 
> This is breaking compatibility for application using the device names in order to attach them to the application (e.g. OVS-DPDK). 
> Before this patch the naming scheme for non-representor port is "{DBDF}". 
> 
> Can we preserve the compatibility and add appropriate suffix for the representor case? 

There's one issue if representors are hot-plugged. The name of the master
device, which happens to be that of the switch domain, cannot be
updated. The form "net_{DBDF}_0" seems expected for PMDs that support
representors (see ixgbe and i40e).

Now since representor hot-plugging is not supported yet, I guess we could
postpone this problem by keeping the old format in the meantime, however
ideally, these applications should not rely on it. The only safe assumption
they can make is the uniqueness of any given name among ethdevs.

PCI bus addresses, if needed, should be retrieved by looking at the
underlying bus object.

By the way, while thinking again about a past comment from Xueming [1],
maybe it's finally time to remove support for multiple Verbs ports on mlx5
after all. This should drop another unnecessary loop and the need for the
unused "port %u" suffix at all while naming the device.

So how about the following plan for v3:

- Adding a patch that drops support for multiple Verbs ports (note for
  Xueming, yes I changed my mind *again* :)

- If you really think this will break OVS (please confirm), then when no
  "representor" parameter is provided (regardless of the presence of any
  representors), name format will use the usual "{DBDF}" notation as you
  suggested.

- Otherwise as soon as a "representor" is found on the command line, the new
  format will be used, again regardless of the presence of any representors.

- In both cases, representors if any, will be named according to the format
  specified in this patch.

[1] https://mails.dpdk.org/archives/dev/2018-June/104015.html

<snip>
> > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > -						 &attr, i + 1);
> > > -		if (eth_list[i])
> > > -			continue;
> > > -		/* Save rte_errno and roll back in case of failure. */
> > > -		ret = rte_errno;
> > > -		while (i--) {
> > > -			mlx5_dev_close(eth_list[i]);
> > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > -				rte_free(eth_list[i]->data->dev_private);
> > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > -		}
> > > -		free(eth_list);
> > > -		rte_errno = ret;
> > > -		return NULL;
> > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > vf,
> > > +						 &attr, i + 1,
> > > +						 j ? eth_list[0] : NULL,
> > > +						 j - 1);
> 
> The representor id is according to the sort made by qsort (based on device names).
> A better way may be to set it according to the sysfs information, like you do in the mlx5_get_ifname function.
> What do you think? 

I agree that the current approach sucks, hence the big fat warnings I left
around (see discussion with Xueming [2]). Problem is that the needed
information is not yet known at this stage; there is no private structure to
rely on to use mlx5_get_ifname() directly.

I'd also rather see these assumptions go in any case. I'll attempt to
improve things for v3 in preparation of allowing representors to be probed
on their own anytime, possibly even before the master device.

[2] https://mails.dpdk.org/archives/dev/2018-June/104059.html

<snip>
> > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> > > 997b04a33..0fe467140 100644
> > > --- a/drivers/net/mlx5/mlx5.h
> > > +++ b/drivers/net/mlx5/mlx5.h
> > > @@ -161,6 +161,10 @@ struct priv {
> > >  	uint16_t mtu; /* Configured MTU. */
> > >  	uint8_t port; /* Physical port number. */
> > >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > > +	unsigned int representor:1; /* Device is a port representor. */
> 
> Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR from eth_dev->data->dev_flags. 

Problem is that this flag can only be set once the ethdev is fully
instantiated and can't be relied on internally where needed (e.g. during
clean up in error handling code). It's reported to applications but not used
internally.

As a device property, it's actually pretty similar to the VF bit or
offloaded capabilities where checking exposed information would be
needlessly complex.

Now maybe it could be part of struct mlx5_dev_config as well. I initially
assumed this object was only for user-provided parameters but looks like
it's not the case. I intend to move it there for v3.
Adrien Mazarguil June 27, 2018, 1:32 p.m. UTC | #6
On Sun, Jun 24, 2018 at 01:33:31PM +0000, Shahaf Shuler wrote:
> One more input, 
> 
> Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> [...]
> 
> > > > +	eth_list = tmp;
> > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > > -						 &attr, i + 1);
> > > > -		if (eth_list[i])
> > > > -			continue;
> > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > -		ret = rte_errno;
> > > > -		while (i--) {
> > > > -			mlx5_dev_close(eth_list[i]);
> > > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > > -				rte_free(eth_list[i]->data->dev_private);
> > > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > -		}
> > > > -		free(eth_list);
> > > > -		rte_errno = ret;
> > > > -		return NULL;
> > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > > vf,
> > > > +						 &attr, i + 1,
> > > > +						 j ? eth_list[0] : NULL,
> > > > +						 j - 1);
> > 
> > The representor id is according to the sort made by qsort (based on device
> > names).
> > A better way may be to set it according to the sysfs information, like you do
> > in the mlx5_get_ifname function.
> > What do you think?
> 
> In fact relaying on linear increasing port numbers is dangerous. In may break on special scenarios like BlueField.
> In BlueField there are representors between the x86 and the ARM cores. Those are not VF representors. The phys_port_name of those is -1 and each of them belongs to different phys_switch_id.
> 
> We can argue whether it is correct/not to assign them w/ -1 value, but I think the suggested approach above can detect the right "vf_id" for those and not break the current behavior on x86.  
> Let me know if you have other suggestions. 

I didn't know that. Assuming that with these, there is exactly only one
representor per device, I think we can manage, the main issue being that
"-1" will be difficult to parse as a valid "representor" argument which uses
"-" for ranges.

Anyway, I suggest to deal with Bluefield specifics in a subsequent series.
This one focuses on and is validated with VF representors only.
Xueming Li June 27, 2018, 5:30 p.m. UTC | #7
> -----Original Message-----
> From: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Sent: Wednesday, June 27, 2018 9:32 PM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: Xueming(Steven) Li <xuemingl@mellanox.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > Hi Adrien,
> >
> > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > Cc: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > > >
> > > > Probe existing port representors in addition to their master
> > > > device and
> > > associate them automatically.
> > > >
> > > > To avoid name collision between Ethernet devices, their names use
> > > > the same convention as ixgbe and i40e PMDs, that is, instead of
> > > > only a PCI
> > > address in DBDF notation:
> > > >
> > > > - "net_{DBDF}_0" for master/switch devices.
> >
> > This is breaking compatibility for application using the device names in order to attach them to the
> application (e.g. OVS-DPDK).
> > Before this patch the naming scheme for non-representor port is "{DBDF}".
> >
> > Can we preserve the compatibility and add appropriate suffix for the representor case?
> 
> There's one issue if representors are hot-plugged. The name of the master device, which happens to be
> that of the switch domain, cannot be updated. The form "net_{DBDF}_0" seems expected for PMDs that
> support representors (see ixgbe and i40e).
> 
> Now since representor hot-plugging is not supported yet, I guess we could postpone this problem by
> keeping the old format in the meantime, however ideally, these applications should not rely on it. The
> only safe assumption they can make is the uniqueness of any given name among ethdevs.
> 
> PCI bus addresses, if needed, should be retrieved by looking at the underlying bus object.
> 
> By the way, while thinking again about a past comment from Xueming [1], maybe it's finally time to
> remove support for multiple Verbs ports on mlx5 after all. This should drop another unnecessary loop
> and the need for the unused "port %u" suffix at all while naming the device.
> 
> So how about the following plan for v3:
> 
> - Adding a patch that drops support for multiple Verbs ports (note for
>   Xueming, yes I changed my mind *again* :)
> 
> - If you really think this will break OVS (please confirm), then when no
>   "representor" parameter is provided (regardless of the presence of any
>   representors), name format will use the usual "{DBDF}" notation as you
>   suggested.
> 
> - Otherwise as soon as a "representor" is found on the command line, the new
>   format will be used, again regardless of the presence of any representors.

The port creation sequence of upcoming hot plug looks like this:
0000:81:00.1
0000:81:00.1,representor=0
0000:81:00.1,representor=1

So the PF attaching comes always w/o "representor" parameter.

> 
> - In both cases, representors if any, will be named according to the format
>   specified in this patch.
> 
> [1]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2
> 018-
> June%2F104015.html&data=02%7C01%7Cxuemingl%40mellanox.com%7Cad9a1b32e5e241e375d208d5dc32778b%7Ca652971
> c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657031666639542&sdata=w4oNeWXwKXS0%2BNSZsYQaneW%2BkFxvWIHZFHLoM
> fLOxkg%3D&reserved=0
> 
> <snip>
> > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > > -						 &attr, i + 1);
> > > > -		if (eth_list[i])
> > > > -			continue;
> > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > -		ret = rte_errno;
> > > > -		while (i--) {
> > > > -			mlx5_dev_close(eth_list[i]);
> > > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > > -				rte_free(eth_list[i]->data->dev_private);
> > > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > -		}
> > > > -		free(eth_list);
> > > > -		rte_errno = ret;
> > > > -		return NULL;
> > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > > vf,
> > > > +						 &attr, i + 1,
> > > > +						 j ? eth_list[0] : NULL,
> > > > +						 j - 1);
> >
> > The representor id is according to the sort made by qsort (based on device names).
> > A better way may be to set it according to the sysfs information, like you do in the mlx5_get_ifname
> function.
> > What do you think?
> 
> I agree that the current approach sucks, hence the big fat warnings I left around (see discussion with
> Xueming [2]). Problem is that the needed information is not yet known at this stage; there is no
> private structure to rely on to use mlx5_get_ifname() directly.
> 
> I'd also rather see these assumptions go in any case. I'll attempt to improve things for v3 in
> preparation of allowing representors to be probed on their own anytime, possibly even before the
> master device.
> 
> [2]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2
> 018-
> June%2F104059.html&data=02%7C01%7Cxuemingl%40mellanox.com%7Cad9a1b32e5e241e375d208d5dc32778b%7Ca652971
> c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657031666639542&sdata=5eOWb69duEB%2BkIW1ZGkv%2FLxkZfwErQOd%2FV7
> nDpN2jOg%3D&reserved=0
> 
> <snip>
> > > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> > > > index
> > > > 997b04a33..0fe467140 100644
> > > > --- a/drivers/net/mlx5/mlx5.h
> > > > +++ b/drivers/net/mlx5/mlx5.h
> > > > @@ -161,6 +161,10 @@ struct priv {
> > > >  	uint16_t mtu; /* Configured MTU. */
> > > >  	uint8_t port; /* Physical port number. */
> > > >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > > > +	unsigned int representor:1; /* Device is a port representor. */
> >
> > Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR from eth_dev->data->dev_flags.
> 
> Problem is that this flag can only be set once the ethdev is fully instantiated and can't be relied on
> internally where needed (e.g. during clean up in error handling code). It's reported to applications
> but not used internally.
> 
> As a device property, it's actually pretty similar to the VF bit or offloaded capabilities where
> checking exposed information would be needlessly complex.
> 
> Now maybe it could be part of struct mlx5_dev_config as well. I initially assumed this object was only
> for user-provided parameters but looks like it's not the case. I intend to move it there for v3.
> 
> --
> Adrien Mazarguil
> 6WIND
Shahaf Shuler June 28, 2018, 5:57 a.m. UTC | #8
Wednesday, June 27, 2018 4:33 PM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Sun, Jun 24, 2018 at 01:33:31PM +0000, Shahaf Shuler wrote:
> > One more input,
> >
> > Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> >
> > [...]
> >
> > > > > +	eth_list = tmp;
> > > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev,
> ibv_dev, vf,
> > > > > -						 &attr, i + 1);
> > > > > -		if (eth_list[i])
> > > > > -			continue;
> > > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > > -		ret = rte_errno;
> > > > > -		while (i--) {
> > > > > -			mlx5_dev_close(eth_list[i]);
> > > > > -			if (rte_eal_process_type() ==
> RTE_PROC_PRIMARY)
> > > > > -				rte_free(eth_list[i]->data-
> >dev_private);
> > > > > -
> 	claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > > -		}
> > > > > -		free(eth_list);
> > > > > -		rte_errno = ret;
> > > > > -		return NULL;
> > > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev,
> ibv_dev[j],
> > > > vf,
> > > > > +						 &attr, i + 1,
> > > > > +						 j ? eth_list[0] : NULL,
> > > > > +						 j - 1);
> > >
> > > The representor id is according to the sort made by qsort (based on
> > > device names).
> > > A better way may be to set it according to the sysfs information,
> > > like you do in the mlx5_get_ifname function.
> > > What do you think?
> >
> > In fact relaying on linear increasing port numbers is dangerous. In may
> break on special scenarios like BlueField.
> > In BlueField there are representors between the x86 and the ARM cores.
> Those are not VF representors. The phys_port_name of those is -1 and each
> of them belongs to different phys_switch_id.
> >
> > We can argue whether it is correct/not to assign them w/ -1 value, but I
> think the suggested approach above can detect the right "vf_id" for those
> and not break the current behavior on x86.
> > Let me know if you have other suggestions.
> 
> I didn't know that. Assuming that with these, there is exactly only one
> representor per device, I think we can manage, the main issue being that "-
> 1" will be difficult to parse as a valid "representor" argument which uses "-"
> for ranges.

The -1 value is not for the representor id, It is for the id of the entity which exists on the other size of the representor. 
The repesentor index is still 0, meaning the command line -w <pci_bdf>,representor=0 is correct on this case.

The problems comes from the assumption you do in your code about the representor id.
What you do currently is to receive the representors and qsort them by device name. then you assign the priv->rep_id based on the qsort output.
Later on when querying the if_name (mlx5_get_ifname) you assume that the phys_port_name of representor (which include the enumeration of what exists on its other side) is the same.

For x86 it probably works. On BlueField it breaks, as from some reason the phys_port_name is -1. 

My suggestion is to set the priv->rep_id based on the phys_port_name instead of qsort output. 

> 
> Anyway, I suggest to deal with Bluefield specifics in a subsequent series.
> This one focuses on and is validated with VF representors only.

It is related to VF representors only. BlueField is just an example. 

> 
> --
> Adrien Mazarguil
> 6WIND
Shahaf Shuler June 28, 2018, 6:01 a.m. UTC | #9
Wednesday, June 27, 2018 4:32 PM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > Hi Adrien,
> >
> > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > Cc: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > > >
> > > > Probe existing port representors in addition to their master
> > > > device and
> > > associate them automatically.
> > > >
> > > > To avoid name collision between Ethernet devices, their names use
> > > > the same convention as ixgbe and i40e PMDs, that is, instead of
> > > > only a PCI
> > > address in DBDF notation:
> > > >
> > > > - "net_{DBDF}_0" for master/switch devices.
> >
> > This is breaking compatibility for application using the device names in
> order to attach them to the application (e.g. OVS-DPDK).
> > Before this patch the naming scheme for non-representor port is "{DBDF}".
> >
> > Can we preserve the compatibility and add appropriate suffix for the
> representor case?
> 
> There's one issue if representors are hot-plugged. The name of the master
> device, which happens to be that of the switch domain, cannot be updated.
> The form "net_{DBDF}_0" seems expected for PMDs that support
> representors (see ixgbe and i40e).
> 
> Now since representor hot-plugging is not supported yet, I guess we could
> postpone this problem by keeping the old format in the meantime, however
> ideally, these applications should not rely on it. The only safe assumption
> they can make is the uniqueness of any given name among ethdevs.
> 
> PCI bus addresses, if needed, should be retrieved by looking at the
> underlying bus object.

Am not sure I understand. Those application attach the device to the application based on its name, which happens to be the PCI address in case of mlx5. 

> 
> By the way, while thinking again about a past comment from Xueming [1],
> maybe it's finally time to remove support for multiple Verbs ports on mlx5
> after all. This should drop another unnecessary loop and the need for the
> unused "port %u" suffix at all while naming the device.
> 
> So how about the following plan for v3:
> 
> - Adding a patch that drops support for multiple Verbs ports (note for
>   Xueming, yes I changed my mind *again* :)

I am OK w/ that. 

> 
> - If you really think this will break OVS (please confirm),

It will. 

 then when no
>   "representor" parameter is provided (regardless of the presence of any
>   representors), name format will use the usual "{DBDF}" notation as you
>   suggested.
> 
> - Otherwise as soon as a "representor" is found on the command line, the
> new
>   format will be used, again regardless of the presence of any representors.
> 
> - In both cases, representors if any, will be named according to the format
>   specified in this patch.

Can we do the following?
In case representor is found the naming will be DBDF_representor_%d
In case no-representor naming will be DBDF

Just removing the net prefix.  

> 
> [1]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fma
> ils.dpdk.org%2Farchives%2Fdev%2F2018-
> June%2F104015.html&data=02%7C01%7Cshahafs%40mellanox.com%7C0037
> 6c6df5044ac9f8f908d5dc32778f%7Ca652971c7d2e4d9ba6a4d149256f461b%7C
> 0%7C0%7C636657031665047796&sdata=XXYtvW3J3i3Xzkn%2B8YBKYK1b2D6P
> 5eUiD2h4VqLUJD8%3D&reserved=0
> 
> <snip>
> > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > > > -						 &attr, i + 1);
> > > > -		if (eth_list[i])
> > > > -			continue;
> > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > -		ret = rte_errno;
> > > > -		while (i--) {
> > > > -			mlx5_dev_close(eth_list[i]);
> > > > -			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > > > -				rte_free(eth_list[i]->data->dev_private);
> > > > -			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > -		}
> > > > -		free(eth_list);
> > > > -		rte_errno = ret;
> > > > -		return NULL;
> > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j],
> > > vf,
> > > > +						 &attr, i + 1,
> > > > +						 j ? eth_list[0] : NULL,
> > > > +						 j - 1);
> >
> > The representor id is according to the sort made by qsort (based on device
> names).
> > A better way may be to set it according to the sysfs information, like you do
> in the mlx5_get_ifname function.
> > What do you think?
> 
> I agree that the current approach sucks, hence the big fat warnings I left
> around (see discussion with Xueming [2]). Problem is that the needed
> information is not yet known at this stage; there is no private structure to
> rely on to use mlx5_get_ifname() directly.
> 
> I'd also rather see these assumptions go in any case. I'll attempt to improve
> things for v3 in preparation of allowing representors to be probed on their
> own anytime, possibly even before the master device.
> 
> [2]
> https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fma
> ils.dpdk.org%2Farchives%2Fdev%2F2018-
> June%2F104059.html&data=02%7C01%7Cshahafs%40mellanox.com%7C0037
> 6c6df5044ac9f8f908d5dc32778f%7Ca652971c7d2e4d9ba6a4d149256f461b%7C
> 0%7C0%7C636657031665047796&sdata=jWLFP6GMdQ6C88r1v%2BYZx7iKH3k
> ZDhVpgP4am9F11PU%3D&reserved=0
> 
> <snip>
> > > > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> > > > index
> > > > 997b04a33..0fe467140 100644
> > > > --- a/drivers/net/mlx5/mlx5.h
> > > > +++ b/drivers/net/mlx5/mlx5.h
> > > > @@ -161,6 +161,10 @@ struct priv {
> > > >  	uint16_t mtu; /* Configured MTU. */
> > > >  	uint8_t port; /* Physical port number. */
> > > >  	unsigned int isolated:1; /* Whether isolated mode is enabled. */
> > > > +	unsigned int representor:1; /* Device is a port representor. */
> >
> > Why we need above flag? Why can't we use RTE_ETH_DEV_REPRESENTOR
> from eth_dev->data->dev_flags.
> 
> Problem is that this flag can only be set once the ethdev is fully instantiated
> and can't be relied on internally where needed (e.g. during clean up in error
> handling code). It's reported to applications but not used internally.
> 
> As a device property, it's actually pretty similar to the VF bit or offloaded
> capabilities where checking exposed information would be needlessly
> complex.
> 
> Now maybe it could be part of struct mlx5_dev_config as well. I initially
> assumed this object was only for user-provided parameters but looks like it's
> not the case. I intend to move it there for v3.
> 
> --
> Adrien Mazarguil
> 6WIND
Adrien Mazarguil June 28, 2018, 8:45 a.m. UTC | #10
On Thu, Jun 28, 2018 at 06:01:54AM +0000, Shahaf Shuler wrote:
> Wednesday, June 27, 2018 4:32 PM, Adrien Mazarguil:
> > Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > > Hi Adrien,
> > >
> > > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > > >
> > > > > -----Original Message-----
> > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> > > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > > Cc: dev@dpdk.org
> > > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > > representors
> > > > >
> > > > > Probe existing port representors in addition to their master
> > > > > device and
> > > > associate them automatically.
> > > > >
> > > > > To avoid name collision between Ethernet devices, their names use
> > > > > the same convention as ixgbe and i40e PMDs, that is, instead of
> > > > > only a PCI
> > > > address in DBDF notation:
> > > > >
> > > > > - "net_{DBDF}_0" for master/switch devices.
> > >
> > > This is breaking compatibility for application using the device names in
> > order to attach them to the application (e.g. OVS-DPDK).
> > > Before this patch the naming scheme for non-representor port is "{DBDF}".
> > >
> > > Can we preserve the compatibility and add appropriate suffix for the
> > representor case?
> > 
> > There's one issue if representors are hot-plugged. The name of the master
> > device, which happens to be that of the switch domain, cannot be updated.
> > The form "net_{DBDF}_0" seems expected for PMDs that support
> > representors (see ixgbe and i40e).
> > 
> > Now since representor hot-plugging is not supported yet, I guess we could
> > postpone this problem by keeping the old format in the meantime, however
> > ideally, these applications should not rely on it. The only safe assumption
> > they can make is the uniqueness of any given name among ethdevs.
> > 
> > PCI bus addresses, if needed, should be retrieved by looking at the
> > underlying bus object.
> 
> Am not sure I understand. Those application attach the device to the application based on its name, which happens to be the PCI address in case of mlx5. 

I'm only saying it's not future-proof seeing what happens when they rely on
it. Moreover this forces them to convert the name to some binary form
instead of e.g. simply checking RTE_DEV_TO_PCI(dev->device)->addr where
needed and only use the name as some kind of opaque identifier.

> > By the way, while thinking again about a past comment from Xueming [1],
> > maybe it's finally time to remove support for multiple Verbs ports on mlx5
> > after all. This should drop another unnecessary loop and the need for the
> > unused "port %u" suffix at all while naming the device.
> > 
> > So how about the following plan for v3:
> > 
> > - Adding a patch that drops support for multiple Verbs ports (note for
> >   Xueming, yes I changed my mind *again* :)
> 
> I am OK w/ that. 
> 
> > 
> > - If you really think this will break OVS (please confirm),
> 
> It will. 

Out of curiosity, can you point me to the relevant code in OVS? Maybe
something can be done on their side.

Either ixgbe and i40e are unaffected by the very same change, or they also
break OVS, in which case there's an issue we need to solve with the
representor interface in DPDK before it's too late.

>  then when no
> >   "representor" parameter is provided (regardless of the presence of any
> >   representors), name format will use the usual "{DBDF}" notation as you
> >   suggested.
> > 
> > - Otherwise as soon as a "representor" is found on the command line, the
> > new
> >   format will be used, again regardless of the presence of any representors.
> > 
> > - In both cases, representors if any, will be named according to the format
> >   specified in this patch.
> 
> Can we do the following?
> In case representor is found the naming will be DBDF_representor_%d
> In case no-representor naming will be DBDF
> 
> Just removing the net prefix.  

Yes, I'll remove it. We'll standardize on the naming used for ixgbe/i40e
only once the above concerns are addressed.
Shahaf Shuler June 28, 2018, 9:06 a.m. UTC | #11
Thursday, June 28, 2018 11:46 AM, Adrien Mazarguil:
> Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> 
> On Thu, Jun 28, 2018 at 06:01:54AM +0000, Shahaf Shuler wrote:
> > Wednesday, June 27, 2018 4:32 PM, Adrien Mazarguil:
> > > Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > representors
> > >
> > > On Sun, Jun 17, 2018 at 10:15:07AM +0000, Shahaf Shuler wrote:
> > > > Hi Adrien,
> > > >
> > > > Saturday, June 16, 2018 11:58 AM, Xueming(Steven) Li:
> > > > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > > representors
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien
> Mazarguil
> > > > > > Sent: Thursday, June 14, 2018 4:35 PM
> > > > > > To: Shahaf Shuler <shahafs@mellanox.com>
> > > > > > Cc: dev@dpdk.org
> > > > > > Subject: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > > > representors
> > > > > >
> > > > > > Probe existing port representors in addition to their master
> > > > > > device and
> > > > > associate them automatically.
> > > > > >
> > > > > > To avoid name collision between Ethernet devices, their names
> > > > > > use the same convention as ixgbe and i40e PMDs, that is,
> > > > > > instead of only a PCI
> > > > > address in DBDF notation:
> > > > > >
> > > > > > - "net_{DBDF}_0" for master/switch devices.
> > > >
> > > > This is breaking compatibility for application using the device
> > > > names in
> > > order to attach them to the application (e.g. OVS-DPDK).
> > > > Before this patch the naming scheme for non-representor port is
> "{DBDF}".
> > > >
> > > > Can we preserve the compatibility and add appropriate suffix for
> > > > the
> > > representor case?
> > >
> > > There's one issue if representors are hot-plugged. The name of the
> > > master device, which happens to be that of the switch domain, cannot be
> updated.
> > > The form "net_{DBDF}_0" seems expected for PMDs that support
> > > representors (see ixgbe and i40e).
> > >
> > > Now since representor hot-plugging is not supported yet, I guess we
> > > could postpone this problem by keeping the old format in the
> > > meantime, however ideally, these applications should not rely on it.
> > > The only safe assumption they can make is the uniqueness of any given
> name among ethdevs.
> > >
> > > PCI bus addresses, if needed, should be retrieved by looking at the
> > > underlying bus object.
> >
> > Am not sure I understand. Those application attach the device to the
> application based on its name, which happens to be the PCI address in case
> of mlx5.
> 
> I'm only saying it's not future-proof seeing what happens when they rely on
> it. Moreover this forces them to convert the name to some binary form
> instead of e.g. simply checking RTE_DEV_TO_PCI(dev->device)->addr where
> needed and only use the name as some kind of opaque identifier.
> 
> > > By the way, while thinking again about a past comment from Xueming
> > > [1], maybe it's finally time to remove support for multiple Verbs
> > > ports on mlx5 after all. This should drop another unnecessary loop
> > > and the need for the unused "port %u" suffix at all while naming the
> device.
> > >
> > > So how about the following plan for v3:
> > >
> > > - Adding a patch that drops support for multiple Verbs ports (note for
> > >   Xueming, yes I changed my mind *again* :)
> >
> > I am OK w/ that.
> >
> > >
> > > - If you really think this will break OVS (please confirm),
> >
> > It will.
> 
> Out of curiosity, can you point me to the relevant code in OVS? Maybe
> something can be done on their side.

For example the command to add new port to the ovs bridge is done on the following syntax
ovs-vsctl add-port ovsbr0 dpdk0 \        
    -- set interface dpdk0 type=dpdk \   
    options:dpdk-devargs="0000:81:00.0" \
    ofport_request=1                    

OVS users/automation are using the PCI address for Mellanox PMDs. w/ your change they will need to use net_0000:81:00.0 to attach the port. 

> 
> Either ixgbe and i40e are unaffected by the very same change, or they also
> break OVS, in which case there's an issue we need to solve with the
> representor interface in DPDK before it's too late.
> 
> >  then when no
> > >   "representor" parameter is provided (regardless of the presence of any
> > >   representors), name format will use the usual "{DBDF}" notation as you
> > >   suggested.
> > >
> > > - Otherwise as soon as a "representor" is found on the command line,
> > > the new
> > >   format will be used, again regardless of the presence of any
> representors.
> > >
> > > - In both cases, representors if any, will be named according to the
> format
> > >   specified in this patch.
> >
> > Can we do the following?
> > In case representor is found the naming will be DBDF_representor_%d In
> > case no-representor naming will be DBDF
> >
> > Just removing the net prefix.
> 
> Yes, I'll remove it. We'll standardize on the naming used for ixgbe/i40e only
> once the above concerns are addressed.
> 
> --
> Adrien Mazarguil
> 6WIND
Adrien Mazarguil June 28, 2018, 9:13 a.m. UTC | #12
On Thu, Jun 28, 2018 at 05:57:03AM +0000, Shahaf Shuler wrote:
> Wednesday, June 27, 2018 4:33 PM, Adrien Mazarguil:
> > Subject: Re: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port representors
> > 
> > On Sun, Jun 24, 2018 at 01:33:31PM +0000, Shahaf Shuler wrote:
> > > One more input,
> > >
> > > Sunday, June 17, 2018 1:15 PM, Shahaf Shuler:
> > > > Subject: RE: [dpdk-dev] [PATCH v2 6/7] net/mlx5: probe all port
> > > > representors
> > >
> > > [...]
> > >
> > > > > > +	eth_list = tmp;
> > > > > >  	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > > > > > -		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev,
> > ibv_dev, vf,
> > > > > > -						 &attr, i + 1);
> > > > > > -		if (eth_list[i])
> > > > > > -			continue;
> > > > > > -		/* Save rte_errno and roll back in case of failure. */
> > > > > > -		ret = rte_errno;
> > > > > > -		while (i--) {
> > > > > > -			mlx5_dev_close(eth_list[i]);
> > > > > > -			if (rte_eal_process_type() ==
> > RTE_PROC_PRIMARY)
> > > > > > -				rte_free(eth_list[i]->data-
> > >dev_private);
> > > > > > -
> > 	claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > > > > > -		}
> > > > > > -		free(eth_list);
> > > > > > -		rte_errno = ret;
> > > > > > -		return NULL;
> > > > > > +		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev,
> > ibv_dev[j],
> > > > > vf,
> > > > > > +						 &attr, i + 1,
> > > > > > +						 j ? eth_list[0] : NULL,
> > > > > > +						 j - 1);
> > > >
> > > > The representor id is according to the sort made by qsort (based on
> > > > device names).
> > > > A better way may be to set it according to the sysfs information,
> > > > like you do in the mlx5_get_ifname function.
> > > > What do you think?
> > >
> > > In fact relaying on linear increasing port numbers is dangerous. In may
> > break on special scenarios like BlueField.
> > > In BlueField there are representors between the x86 and the ARM cores.
> > Those are not VF representors. The phys_port_name of those is -1 and each
> > of them belongs to different phys_switch_id.
> > >
> > > We can argue whether it is correct/not to assign them w/ -1 value, but I
> > think the suggested approach above can detect the right "vf_id" for those
> > and not break the current behavior on x86.
> > > Let me know if you have other suggestions.
> > 
> > I didn't know that. Assuming that with these, there is exactly only one
> > representor per device, I think we can manage, the main issue being that "-
> > 1" will be difficult to parse as a valid "representor" argument which uses "-"
> > for ranges.
> 
> The -1 value is not for the representor id, It is for the id of the entity which exists on the other size of the representor. 
> The repesentor index is still 0, meaning the command line -w <pci_bdf>,representor=0 is correct on this case.
> 
> The problems comes from the assumption you do in your code about the representor id.
> What you do currently is to receive the representors and qsort them by device name. then you assign the priv->rep_id based on the qsort output.
> Later on when querying the if_name (mlx5_get_ifname) you assume that the phys_port_name of representor (which include the enumeration of what exists on its other side) is the same.
> 
> For x86 it probably works. On BlueField it breaks, as from some reason the phys_port_name is -1. 
> 
> My suggestion is to set the priv->rep_id based on the phys_port_name instead of qsort output. 

Yes, understood. The only drawback using this approach is that mlx5 devices
won't be usable at all if no netdevice can be associated with them (e.g. in
case it was moved to another netns). Currently all matching IB devs are
probed regardless, except they are handled as normal devices when the PMD
can't determine whether it's dealing with a representor.

> > Anyway, I suggest to deal with Bluefield specifics in a subsequent series.
> > This one focuses on and is validated with VF representors only.
> 
> It is related to VF representors only. BlueField is just an example. 

By "BlueField specifics", I mean the translation of -1 to 0 which so far is
specific to BlueField. Another patch is needed for that.

For devices where representors are properly numbered starting from 0, we must
rely on the uninterpreted phys_port_name value directly, which must be a
positive integer instead of a qsort() interpretation in order to properly
handle holes in the sequence due to missing devices (netns).

I intend to modify this patch as described.

Patch
diff mbox series

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 498f80c89..716c9d9a5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -304,6 +304,9 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 	if (ret)
 		DRV_LOG(WARNING, "port %u some flows still remain",
 			dev->data->port_id);
+	if (!priv->representor &&
+	    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+		claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 	memset(priv, 0, sizeof(*priv));
 }
 
@@ -648,6 +651,10 @@  mlx5_uar_init_secondary(struct rte_eth_dev *dev)
  *   Verbs device attributes.
  * @param port
  *   Verbs port to use (indexed from 1).
+ * @param master
+ *   Master device in case @p ibv_dev is a port representor.
+ * @param rep_id
+ *   Representor identifier when @p master is non-NULL.
  *
  * @return
  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
@@ -658,7 +665,9 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		   struct ibv_device *ibv_dev,
 		   int vf,
 		   const struct ibv_device_attr_ex *attr,
-		   unsigned int port)
+		   unsigned int port,
+		   struct rte_eth_dev *master,
+		   unsigned int rep_id)
 {
 	struct ibv_context *ctx;
 	struct ibv_port_attr port_attr;
@@ -802,11 +811,14 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		" old OFED/rdma-core version or firmware configuration");
 #endif
 	config.mpls_en = mpls_en;
-	if (attr->orig_attr.phys_port_cnt > 1)
-		snprintf(name, sizeof(name), "%s port %u",
-			 dpdk_dev->name, port);
+	if (!master)
+		snprintf(name, sizeof(name), "net_%s_0", dpdk_dev->name);
 	else
-		snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+		snprintf(name, sizeof(name), "net_%s_representor_%u",
+			 dpdk_dev->name, rep_id);
+	if (attr->orig_attr.phys_port_cnt > 1)
+		snprintf(name, sizeof(name), "%s_port_%u", name, port);
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		eth_dev = rte_eth_dev_attach_secondary(name);
 		if (eth_dev == NULL) {
@@ -883,6 +895,30 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	priv->port = port;
 	priv->pd = pd;
 	priv->mtu = ETHER_MTU;
+	/*
+	 * Allocate a switch domain for master devices and share it with
+	 * port representors.
+	 */
+	if (!master) {
+		priv->representor = 0;
+		priv->master_id = -1; /* Updated once known. */
+		priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+		priv->rep_id = -1; /* Dummy unique value. */
+		err = rte_eth_switch_domain_alloc(&priv->domain_id);
+		if (err) {
+			err = rte_errno;
+			DRV_LOG(ERR, "unable to allocate switch domain: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+	} else {
+		priv->representor = 1;
+		priv->master_id =
+			((struct priv *)master->data->dev_private)->master_id;
+		priv->domain_id =
+			((struct priv *)master->data->dev_private)->domain_id;
+		priv->rep_id = rep_id;
+	}
 	err = mlx5_args(&config, dpdk_dev->devargs);
 	if (err) {
 		err = rte_errno;
@@ -964,6 +1000,18 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 		err = ENOMEM;
 		goto error;
 	}
+	/*
+	 * Now that eth_dev is allocated and its port ID is known, make
+	 * non-representor ports target their own port ID as master for
+	 * convenience.
+	 *
+	 * Master port ID is already set for actual representors. Those only
+	 * need the right device flag.
+	 */
+	if (!master)
+		priv->master_id = eth_dev->data->port_id;
+	else
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
 	eth_dev->data->dev_private = priv;
 	priv->dev_data = eth_dev->data;
 	eth_dev->data->mac_addrs = priv->mac;
@@ -1083,8 +1131,12 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	return eth_dev;
 error:
-	if (priv)
+	if (priv) {
+		if (!priv->representor &&
+		    priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
+	}
 	if (pd)
 		claim_zero(mlx5_glue->dealloc_pd(pd));
 	if (eth_dev)
@@ -1097,12 +1149,14 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 }
 
 /**
- * Spawn Ethernet devices from Verbs information, one per detected port.
+ * Spawn Ethernet devices from Verbs information, one per detected port and
+ * port representor.
  *
  * @param dpdk_dev
  *   Backing DPDK device.
  * @param ibv_dev
- *   Verbs device.
+ *   NULL-terminated list of Verbs devices. First entry is the master device
+ *   (mandatory), followed by optional representors.
  * @param vf
  *   If nonzero, enable VF-specific features.
  *
@@ -1113,17 +1167,21 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
  */
 static struct rte_eth_dev **
 mlx5_dev_spawn(struct rte_device *dpdk_dev,
-	       struct ibv_device *ibv_dev,
+	       struct ibv_device **ibv_dev,
 	       int vf)
 {
 	struct rte_eth_dev **eth_list = NULL;
 	struct ibv_context *ctx;
 	struct ibv_device_attr_ex attr;
+	void *tmp;
 	unsigned int i;
+	unsigned int j = 0;
+	unsigned int n = 0;
 	int ret;
 
+next:
 	errno = 0;
-	ctx = mlx5_glue->open_device(ibv_dev);
+	ctx = mlx5_glue->open_device(ibv_dev[j]);
 	if (!ctx) {
 		rte_errno = errno ? errno : ENODEV;
 		if (rte_errno == ENODEV)
@@ -1132,7 +1190,7 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		else
 			DRV_LOG(ERR,
 				"cannot use device, are drivers up to date?");
-		return NULL;
+		goto error;
 	}
 	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
 	mlx5_glue->close_device(ctx);
@@ -1140,34 +1198,42 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		rte_errno = ret;
 		DRV_LOG(ERR, "unable to query device information: %s",
 			strerror(rte_errno));
-		return NULL;
+		goto error;
 	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	eth_list = malloc(sizeof(*eth_list) *
-			  (attr.orig_attr.phys_port_cnt + 1));
-	if (!eth_list) {
+	DRV_LOG(INFO, "%u port(s) detected on \"%s\"",
+		attr.orig_attr.phys_port_cnt, ibv_dev[j]->name);
+	tmp = realloc(eth_list, sizeof(*eth_list) *
+		      (n + attr.orig_attr.phys_port_cnt + 1));
+	if (!tmp) {
 		rte_errno = errno;
-		return NULL;
+		goto error;
 	}
+	eth_list = tmp;
 	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
-		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
-						 &attr, i + 1);
-		if (eth_list[i])
-			continue;
-		/* Save rte_errno and roll back in case of failure. */
-		ret = rte_errno;
-		while (i--) {
-			mlx5_dev_close(eth_list[i]);
-			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-				rte_free(eth_list[i]->data->dev_private);
-			claim_zero(rte_eth_dev_release_port(eth_list[i]));
-		}
-		free(eth_list);
-		rte_errno = ret;
-		return NULL;
+		eth_list[n] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev[j], vf,
+						 &attr, i + 1,
+						 j ? eth_list[0] : NULL,
+						 j - 1);
+		if (!eth_list[n])
+			goto error;
+		++n;
 	}
-	eth_list[i] = NULL;
+	if (ibv_dev[++j])
+		goto next;
+	eth_list[n] = NULL;
 	return eth_list;
+error:
+	/* Save rte_errno and roll back in case of failure. */
+	ret = rte_errno;
+	while (n--) {
+		mlx5_dev_close(eth_list[n]);
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+			rte_free(eth_list[n]->data->dev_private);
+		claim_zero(rte_eth_dev_release_port(eth_list[n]));
+	}
+	free(eth_list);
+	rte_errno = ret;
+	return NULL;
 }
 
 /**
@@ -1282,7 +1348,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				ibv_match[ret]->name, ret - 1);
 	}
 	if (n)
-		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_match, vf);
 	mlx5_glue->free_device_list(ibv_list);
 	if (!n) {
 		DRV_LOG(WARNING,
@@ -1302,7 +1368,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		ret = -rte_errno;
 	} else {
 		for (ret = 0; eth_list[ret]; ++ret) {
+			uint32_t restore = eth_list[ret]->data->dev_flags;
+
 			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+			/* Restore non-PCI flags cleared by the above call. */
+			eth_list[ret]->data->dev_flags |= restore;
 			rte_eth_dev_probing_finish(eth_list[ret]);
 		}
 		ret = 0;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 997b04a33..0fe467140 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -161,6 +161,10 @@  struct priv {
 	uint16_t mtu; /* Configured MTU. */
 	uint8_t port; /* Physical port number. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
+	unsigned int representor:1; /* Device is a port representor. */
+	uint16_t master_id; /* DPDK port ID of switch domain master. */
+	uint16_t domain_id; /* Switch domain identifier. */
+	unsigned int rep_id; /* Port representor identifier. */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
@@ -209,9 +213,12 @@  int mlx5_getenv_int(const char *);
 
 /* mlx5_ethdev.c */
 
+int mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+			   char (*ifname)[IF_NAMESIZE]);
 int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
 int mlx5_ifindex(const struct rte_eth_dev *dev);
-int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
+int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	       int master);
 int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
 int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
 		   unsigned int flags);
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 90488af33..9d579659e 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -93,7 +93,7 @@  struct ethtool_link_settings {
 #endif
 
 /**
- * Get interface name from private structure.
+ * Get master interface name from private structure.
  *
  * @param[in] dev
  *   Pointer to Ethernet device.
@@ -104,7 +104,8 @@  struct ethtool_link_settings {
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+mlx5_get_master_ifname(const struct rte_eth_dev *dev,
+		       char (*ifname)[IF_NAMESIZE])
 {
 	struct priv *priv = dev->data->dev_private;
 	DIR *dir;
@@ -179,6 +180,113 @@  mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 }
 
 /**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+{
+	struct priv *priv = dev->data->dev_private;
+	int ret;
+	char master[IF_NAMESIZE];
+	FILE *file;
+	DIR *dir;
+	uint64_t phys_switch_id;
+
+	if (!priv->representor)
+		return mlx5_get_master_ifname(dev, ifname);
+	ret = mlx5_get_master_ifname(dev, &master);
+	if (ret)
+		return ret;
+	{
+		MKSTR(path, "%s/device/net/%s/phys_switch_id",
+		      priv->ibdev_path, master);
+
+		file = fopen(path, "rb");
+	}
+	if (!file) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	ret = fscanf(file, "%" SCNx64, &phys_switch_id);
+	fclose(file);
+	if (ret != 1) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	{
+		MKSTR(path, "%s/device/net/%s/subsystem",
+		      priv->ibdev_path, master);
+
+		dir = opendir(path);
+	}
+	if (!dir) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/*
+	 * Scan network interfaces to find one with matching phys_switch_id
+	 * and phys_switch_name.
+	 */
+	do {
+		struct dirent *dent;
+		uint64_t phys_switch_id_rep;
+		int rep_id;
+
+		ret = -ENOENT;
+		dent = readdir(dir);
+		if (!dent)
+			break;
+		{
+			MKSTR(path,
+			      "%s/device/net/%s/subsystem/%s/phys_switch_id",
+			      priv->ibdev_path, master, dent->d_name);
+
+			file = fopen(path, "rb");
+		}
+		if (!file)
+			continue;
+		ret = fscanf(file, "%" SCNx64, &phys_switch_id_rep);
+		fclose(file);
+		if (ret != 1)
+			continue;
+		if (phys_switch_id_rep != phys_switch_id)
+			continue;
+		{
+			MKSTR(path,
+			      "%s/device/net/%s/subsystem/%s/phys_port_name",
+			      priv->ibdev_path, master, dent->d_name);
+
+			file = fopen(path, "rb");
+		}
+		if (!file)
+			continue;
+		ret = fscanf(file, "%d", &rep_id);
+		fclose(file);
+		if (ret != 1)
+			continue;
+		if (rep_id < 0 || (unsigned int)rep_id != priv->rep_id)
+			continue;
+		strlcpy(*ifname, dent->d_name, sizeof(*ifname));
+		ret = 0;
+		break;
+	} while (1);
+	closedir(dir);
+	if (ret)
+		rte_errno = -ret;
+	return ret;
+}
+
+/**
  * Get the interface index from device name.
  *
  * @param[in] dev
@@ -214,12 +322,16 @@  mlx5_ifindex(const struct rte_eth_dev *dev)
  *   Request number to pass to ioctl().
  * @param[out] ifr
  *   Interface request structure output buffer.
+ * @param master
+ *   When device is a port representor, perform request on master device
+ *   instead.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
+	   int master)
 {
 	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
 	int ret = 0;
@@ -228,7 +340,10 @@  mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
 		rte_errno = errno;
 		return -rte_errno;
 	}
-	ret = mlx5_get_ifname(dev, &ifr->ifr_name);
+	if (master)
+		ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
+	else
+		ret = mlx5_get_ifname(dev, &ifr->ifr_name);
 	if (ret)
 		goto error;
 	ret = ioctl(sock, req, ifr);
@@ -258,7 +373,7 @@  int
 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
 
 	if (ret)
 		return ret;
@@ -282,7 +397,7 @@  mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 {
 	struct ifreq request = { .ifr_mtu = mtu, };
 
-	return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+	return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
 }
 
 /**
@@ -302,13 +417,13 @@  int
 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 {
 	struct ifreq request;
-	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+	int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
 
 	if (ret)
 		return ret;
 	request.ifr_flags &= keep;
 	request.ifr_flags |= flags & ~keep;
-	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+	return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
 }
 
 /**
@@ -477,6 +592,12 @@  mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	info->speed_capa = priv->link_speed_capa;
 	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
 	mlx5_set_default_params(dev, info);
+	if (rte_eth_dev_is_valid_port(priv->master_id)) {
+		info->switch_info.name =
+			rte_eth_devices[priv->master_id].data->name;
+		info->switch_info.domain_id = priv->domain_id;
+		info->switch_info.port_id = priv->rep_id;
+	}
 }
 
 /**
@@ -540,7 +661,7 @@  mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	int link_speed = 0;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -550,7 +671,7 @@  mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&edata;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
@@ -611,7 +732,7 @@  mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	uint64_t sc;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 			dev->data->port_id, strerror(rte_errno));
@@ -621,7 +742,7 @@  mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
 				(ifr.ifr_flags & IFF_RUNNING));
 	ifr.ifr_data = (void *)&gcmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -638,7 +759,7 @@  mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 
 	*ecmd = gcmd;
 	ifr.ifr_data = (void *)ecmd;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(DEBUG,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
@@ -801,7 +922,7 @@  mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 	int ret;
 
 	ifr.ifr_data = (void *)&ethpause;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
@@ -854,7 +975,7 @@  mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 		ethpause.tx_pause = 1;
 	else
 		ethpause.tx_pause = 0;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 672a47619..12ee37f55 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -49,7 +49,7 @@  mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
 	struct ifreq request;
 	int ret;
 
-	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+	ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request, 0);
 	if (ret)
 		return ret;
 	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index 875dd1027..91f3d474a 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -146,7 +146,7 @@  mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
 	et_stats->cmd = ETHTOOL_GSTATS;
 	et_stats->n_stats = xstats_ctrl->stats_n;
 	ifr.ifr_data = (caddr_t)et_stats;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING,
 			"port %u unable to read statistic values from device",
@@ -194,7 +194,7 @@  mlx5_ethtool_get_stats_n(struct rte_eth_dev *dev) {
 
 	drvinfo.cmd = ETHTOOL_GDRVINFO;
 	ifr.ifr_data = (caddr_t)&drvinfo;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to query number of statistics",
 			dev->data->port_id);
@@ -244,7 +244,7 @@  mlx5_xstats_init(struct rte_eth_dev *dev)
 	strings->string_set = ETH_SS_STATS;
 	strings->len = dev_stats_n;
 	ifr.ifr_data = (caddr_t)strings;
-	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
 	if (ret) {
 		DRV_LOG(WARNING, "port %u unable to get statistic names",
 			dev->data->port_id);