[v6,01/18] net/mana: add basic driver, build environment and doc

Message ID 1661899911-13086-2-git-send-email-longli@linuxonhyperv.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Series Introduce Microsoft Azure Network Adapter (MANA) PMD

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Long Li Aug. 30, 2022, 10:51 p.m. UTC
  From: Long Li <longli@microsoft.com>

MANA is a PCI device. It uses IB verbs to access hardware through the
kernel RDMA layer. This patch introduces build environment and basic
device probe functions.

Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add meson build check for required functions from RDMA direct verb header file
v4:
Remove extra "\n" in logging code.
Use "r" in place of "rb" in fopen() to read text files.

 MAINTAINERS                       |   6 +
 doc/guides/nics/features/mana.ini |  10 +
 doc/guides/nics/index.rst         |   1 +
 doc/guides/nics/mana.rst          |  66 +++
 drivers/net/mana/mana.c           | 704 ++++++++++++++++++++++++++++++
 drivers/net/mana/mana.h           | 210 +++++++++
 drivers/net/mana/meson.build      |  44 ++
 drivers/net/mana/mp.c             | 235 ++++++++++
 drivers/net/mana/version.map      |   3 +
 drivers/net/meson.build           |   1 +
 10 files changed, 1280 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/version.map
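For illustration of the probe flow discussed in the review below: the driver matches each ibv_device against the rte_pci_device being probed through a mana_ibv_device_to_pci_addr() helper whose body is not part of this hunk. The following is a minimal, hypothetical sketch of how such a lookup is commonly done, by reading the IB device's sysfs uevent file and parsing PCI_SLOT_NAME (which would also fit the changelog notes about getline() and opening text files with "r"). It is an assumption about the approach, not the driver's actual code.

    /*
     * Illustrative only: map an IB device to its PCI address by parsing
     * PCI_SLOT_NAME from <ibdev_path>/device/uevent. Error handling is minimal.
     */
    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include <infiniband/verbs.h>
    #include <rte_pci.h>

    static int
    example_ibv_to_pci_addr(const struct ibv_device *ibdev,
    			struct rte_pci_addr *addr)
    {
    	char path[PATH_MAX];
    	char *line = NULL;
    	size_t len = 0;
    	FILE *f;
    	int ret = -ENOENT;

    	snprintf(path, sizeof(path), "%s/device/uevent", ibdev->ibdev_path);

    	f = fopen(path, "r");	/* text file, so "r" rather than "rb" */
    	if (f == NULL)
    		return -errno;

    	while (getline(&line, &len, f) != -1) {
    		/* e.g. "PCI_SLOT_NAME=0000:00:02.0" */
    		if (strncmp(line, "PCI_SLOT_NAME=", 14) == 0) {
    			line[strcspn(line, "\n")] = '\0';
    			ret = rte_pci_addr_parse(line + 14, addr);
    			break;
    		}
    	}

    	free(line);
    	fclose(f);
    	return ret;
    }

The quoted probe loop then compares the parsed domain, bus, devid and function against pci_dev->addr.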
  

Comments

lihuisong (C) Aug. 31, 2022, 1:32 a.m. UTC | #1
在 2022/8/31 6:51, longli@linuxonhyperv.com 写道:
> From: Long Li <longli@microsoft.com>
>
> MANA is a PCI device. It uses IB verbs to access hardware through the
> kernel RDMA layer. This patch introduces build environment and basic
> device probe functions.
>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
> Change log:
> v2:
> Fix typos.
> Make the driver build only on x86-64 and Linux.
> Remove unused header files.
> Change port definition to uint16_t or uint8_t (for IB).
> Use getline() in place of fgets() to read and truncate a line.
> v3:
> Add meson build check for required functions from RDMA direct verb header file
> v4:
> Remove extra "\n" in logging code.
> Use "r" in place of "rb" in fopen() to read text files.
>
> [snip]
> +
> +static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
> +			      struct rte_pci_device *pci_dev,
> +			      struct rte_ether_addr *mac_addr)
> +{
> +	struct ibv_device **ibv_list;
> +	int ibv_idx;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex dev_attr;
> +	int num_devices;
> +	int ret = 0;
> +	uint8_t port;
> +	struct mana_priv *priv = NULL;
> +	struct rte_eth_dev *eth_dev = NULL;
> +	bool found_port;
> +
> +	ibv_list = ibv_get_device_list(&num_devices);
> +	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
> +		struct ibv_device *ibdev = ibv_list[ibv_idx];
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
> +			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
> +
> +		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
> +			continue;
> +
> +		/* Ignore if this IB device is not this PCI device */
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +
> +		ctx = ibv_open_device(ibdev);
> +		if (!ctx) {
> +			DRV_LOG(ERR, "Failed to open IB device %s",
> +				ibdev->name);
> +			continue;
> +		}
> +
> +		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
> +		DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
> +			dev_attr.orig_attr.phys_port_cnt);
> +		found_port = false;
> +
> +		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
> +		     port++) {
> +			struct ibv_parent_domain_init_attr attr = {};
> +			struct rte_ether_addr addr;
> +			char address[64];
> +			char name[RTE_ETH_NAME_MAX_LEN];
> +
> +			ret = get_port_mac(ibdev, port, &addr);
> +			if (ret)
> +				continue;
> +
> +			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
> +				continue;
> +
> +			rte_ether_format_addr(address, sizeof(address), &addr);
> +			DRV_LOG(INFO, "device located port %u address %s",
> +				port, address);
> +			found_port = true;
> +
> +			priv = rte_zmalloc_socket(NULL, sizeof(*priv),
> +						  RTE_CACHE_LINE_SIZE,
> +						  SOCKET_ID_ANY);
> +			if (!priv) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			snprintf(name, sizeof(name), "%s_port%d",
> +				 pci_dev->device.name, port);
> +
> +			if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +				int fd;
> +
> +				eth_dev = rte_eth_dev_attach_secondary(name);
> +				if (!eth_dev) {
> +					DRV_LOG(ERR, "Can't attach to dev %s",
> +						name);
> +					ret = -ENOMEM;
> +					goto failed;
> +				}
> +
> +				eth_dev->device = &pci_dev->device;
> +				eth_dev->dev_ops = &mana_dev_sec_ops;
> +				ret = mana_proc_priv_init(eth_dev);
> +				if (ret)
> +					goto failed;
> +				priv->process_priv = eth_dev->process_private;
> +
> +				/* Get the IB FD from the primary process */
> +				fd = mana_mp_req_verbs_cmd_fd(eth_dev);
> +				if (fd < 0) {
> +					DRV_LOG(ERR, "Failed to get FD %d", fd);
> +					ret = -ENODEV;
> +					goto failed;
> +				}
> +
> +				ret = mana_map_doorbell_secondary(eth_dev, fd);
> +				if (ret) {
> +					DRV_LOG(ERR, "Failed secondary map %d",
> +						fd);
> +					goto failed;
> +				}
> +
> +				/* fd is not used after mapping doorbell */
> +				close(fd);
> +
> +				rte_spinlock_lock(&mana_shared_data->lock);
> +				mana_shared_data->secondary_cnt++;
> +				mana_local_data.secondary_cnt++;
> +				rte_spinlock_unlock(&mana_shared_data->lock);
> +
> +				rte_eth_copy_pci_info(eth_dev, pci_dev);
> +				rte_eth_dev_probing_finish(eth_dev);
> +
> +				/* Impossible to have more than one port
> +				 * matching a MAC address
> +				 */
> +				continue;
> +			}
> +
> +			eth_dev = rte_eth_dev_allocate(name);
> +			if (!eth_dev) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			eth_dev->data->mac_addrs =
> +				rte_calloc("mana_mac", 1,
> +					   sizeof(struct rte_ether_addr), 0);
> +			if (!eth_dev->data->mac_addrs) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs);
> +
> +			priv->ib_pd = ibv_alloc_pd(ctx);
> +			if (!priv->ib_pd) {
> +				DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			/* Create a parent domain with the port number */
> +			attr.pd = priv->ib_pd;
> +			attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
> +			attr.pd_context = (void *)(uint64_t)port;
> +			priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
> +			if (!priv->ib_parent_pd) {
> +				DRV_LOG(ERR,
> +					"ibv_alloc_parent_domain failed port %d",
> +					port);
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			priv->ib_ctx = ctx;
> +			priv->port_id = eth_dev->data->port_id;
> +			priv->dev_port = port;
> +			eth_dev->data->dev_private = priv;
> +			priv->dev_data = eth_dev->data;
> +
> +			priv->max_rx_queues = dev_attr.orig_attr.max_qp;
> +			priv->max_tx_queues = dev_attr.orig_attr.max_qp;
> +
> +			priv->max_rx_desc =
> +				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
> +					dev_attr.orig_attr.max_cqe);
> +			priv->max_tx_desc =
> +				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
> +					dev_attr.orig_attr.max_cqe);
> +
> +			priv->max_send_sge = dev_attr.orig_attr.max_sge;
> +			priv->max_recv_sge = dev_attr.orig_attr.max_sge;
> +
> +			priv->max_mr = dev_attr.orig_attr.max_mr;
> +			priv->max_mr_size = dev_attr.orig_attr.max_mr_size;
> +
> +			DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
> +				name, priv->max_rx_queues, priv->max_rx_desc,
> +				priv->max_send_sge);
> +
> +			rte_spinlock_lock(&mana_shared_data->lock);
> +			mana_shared_data->primary_cnt++;
> +			rte_spinlock_unlock(&mana_shared_data->lock);
> +
> +			eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV;
> +
> +			eth_dev->device = &pci_dev->device;
> +			eth_dev->data->dev_flags |=
> +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
> +

Please do not use the temporary macro. Please review this patch:

f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue xstats")

This patch requires that per queue statistics are filled in 
.xstats_get() by PMD.

> +			DRV_LOG(INFO, "device %s at port %u",
> +				name, eth_dev->data->port_id);
> +
> +			eth_dev->rx_pkt_burst = mana_rx_burst_removed;
> +			eth_dev->tx_pkt_burst = mana_tx_burst_removed;
> +			eth_dev->dev_ops = &mana_dev_ops;
> +
> +			rte_eth_copy_pci_info(eth_dev, pci_dev);
> +			rte_eth_dev_probing_finish(eth_dev);
> +		}
> +
> +		/* Secondary process doesn't need an ibv_ctx. It maps the
> +		 * doorbell pages using the IB cmd_fd passed from the primary
> +		 * process and sends messages to the primary process for memory
> +		 * registrations.
> +		 */
> +		if (!found_port || rte_eal_process_type() == RTE_PROC_SECONDARY)
> +			ibv_close_device(ctx);
> +	}
> +
> +	ibv_free_device_list(ibv_list);
> +	return 0;
> +
> +failed:
> +	/* Free the resources for the failed port */
> +	if (priv) {
> +		if (priv->ib_parent_pd)
> +			ibv_dealloc_pd(priv->ib_parent_pd);
> +
> +		if (priv->ib_pd)
> +			ibv_dealloc_pd(priv->ib_pd);
> +	}
> +
> +	if (eth_dev)
> +		rte_eth_dev_release_port(eth_dev);
> +
> +	rte_free(priv);
> +
> +	ibv_close_device(ctx);
> +	ibv_free_device_list(ibv_list);
> +
> +	return ret;
> +}
> [snip]
>
  
Long Li Aug. 31, 2022, 6:05 p.m. UTC | #2
> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment
> and doc
> 
> 
> 在 2022/8/31 6:51, longli@linuxonhyperv.com 写道:
> > From: Long Li <longli@microsoft.com>
> >
> > MANA is a PCI device. It uses IB verbs to access hardware through the
> > kernel RDMA layer. This patch introduces build environment and basic
> > device probe functions.
> >
> > Signed-off-by: Long Li <longli@microsoft.com>
> >
> > [snip]
> > +			eth_dev->device = &pci_dev->device;
> > +			eth_dev->data->dev_flags |=
> > +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
> > +
> 
> Please do not use the temporary macro. Please review this patch:
> 
> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue xstats")
> 
> This patch requires that per queue statistics are filled in
> .xstats_get() by PMD.

Thanks for pointing this out.

It seems some PMDs are still depending on this flag for xstats.

MANA doesn't implement xstats_get() currently, so this flag is useful. Is it okay to keep using this flag until it is finally time to remove it from all PMDs, or until MANA implements xstats?
  
fengchengwen Sept. 2, 2022, 12:09 p.m. UTC | #3
On 2022/8/31 6:51, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> MANA is a PCI device. It uses IB verbs to access hardware through the
> kernel RDMA layer. This patch introduces build environment and basic
> device probe functions.
> 
> Signed-off-by: Long Li <longli@microsoft.com>

...

> +static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
> +				  const void *peer)
> +{
> +	struct rte_eth_dev *dev;
> +	const struct mana_mp_param *param =
> +		(const struct mana_mp_param *)mp_msg->param;
> +	struct rte_mp_msg mp_res = { 0 };
> +	struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param;
> +	int ret;
> +	struct mana_priv *priv;
> +
> +	if (!rte_eth_dev_is_valid_port(param->port_id)) {
> +		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
> +		return -ENODEV;
> +	}
> +
> +	dev = &rte_eth_devices[param->port_id];
> +	priv = dev->data->dev_private;
> +
> +	mp_init_msg(&mp_res, param->type, param->port_id);
> +
> +	switch (param->type) {
> +	case MANA_MP_REQ_VERBS_CMD_FD:
> +		mp_res.num_fds = 1;
> +		mp_res.fds[0] = priv->ib_ctx->cmd_fd;

Is the cmd_fd a system-level handle?

If it's a process-private handle, it should not be used directly in another process.

> +		res->result = 0;
> +		ret = rte_mp_reply(&mp_res, peer);
> +		break;
> +
> +	default:
> +		DRV_LOG(ERR, "Port %u unknown primary MP type %u",
> +			param->port_id, param->type);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +
  
Long Li Sept. 2, 2022, 7:45 p.m. UTC | #4
> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment and
> doc
> 
> On 2022/8/31 6:51, longli@linuxonhyperv.com wrote:
> > From: Long Li <longli@microsoft.com>
> >
> > MANA is a PCI device. It uses IB verbs to access hardware through the
> > kernel RDMA layer. This patch introduces build environment and basic
> > device probe functions.
> >
> > Signed-off-by: Long Li <longli@microsoft.com>
> 
> ...
> 
> > +static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
> > +				  const void *peer)
> > +{
> > +	struct rte_eth_dev *dev;
> > +	const struct mana_mp_param *param =
> > +		(const struct mana_mp_param *)mp_msg->param;
> > +	struct rte_mp_msg mp_res = { 0 };
> > +	struct mana_mp_param *res = (struct mana_mp_param
> *)mp_res.param;
> > +	int ret;
> > +	struct mana_priv *priv;
> > +
> > +	if (!rte_eth_dev_is_valid_port(param->port_id)) {
> > +		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
> > +		return -ENODEV;
> > +	}
> > +
> > +	dev = &rte_eth_devices[param->port_id];
> > +	priv = dev->data->dev_private;
> > +
> > +	mp_init_msg(&mp_res, param->type, param->port_id);
> > +
> > +	switch (param->type) {
> > +	case MANA_MP_REQ_VERBS_CMD_FD:
> > +		mp_res.num_fds = 1;
> > +		mp_res.fds[0] = priv->ib_ctx->cmd_fd;
> 
> Is the cmd_fd a system-level handle?
>
> If it's a process-private handle, it should not be used directly in another process.

According to rte_mp_xxx semantics, the file handle is duplicated to another process; it's not used directly. It's required for the secondary process to map the same doorbell pages.

> 
> > +		res->result = 0;
> > +		ret = rte_mp_reply(&mp_res, peer);
> > +		break;
> > +
> > +	default:
> > +		DRV_LOG(ERR, "Port %u unknown primary MP type %u",
> > +			param->port_id, param->type);
> > +		ret = -EINVAL;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
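For reference alongside this exchange: the secondary-process helper mana_mp_req_verbs_cmd_fd(), which the probe code calls and the quoted handler answers, is not shown in this hunk. A hypothetical sketch of what such a request could look like over the EAL multi-process channel follows; the message name, the example_ identifiers and the parameter layout are assumptions, not the driver's actual definitions.

    /*
     * Illustrative only: a secondary process asks the primary for the verbs
     * cmd_fd. rte_mp passes descriptors over an AF_UNIX socket (SCM_RIGHTS),
     * so fds[0] in the reply is the secondary's own duplicate of the fd.
     */
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    #include <rte_eal.h>
    #include <rte_string_fns.h>

    #define EXAMPLE_MP_NAME "example_mp_mana"
    #define EXAMPLE_MP_REQ_VERBS_CMD_FD 1	/* stand-in for MANA_MP_REQ_VERBS_CMD_FD */

    struct example_mp_param {
    	int type;
    	uint16_t port_id;
    	int result;
    };

    static int
    example_mp_req_verbs_cmd_fd(uint16_t port_id)
    {
    	struct rte_mp_msg req;
    	struct rte_mp_reply reply;
    	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
    	struct example_mp_param *param = (struct example_mp_param *)req.param;
    	int fd = -1;

    	memset(&req, 0, sizeof(req));
    	rte_strlcpy(req.name, EXAMPLE_MP_NAME, sizeof(req.name));
    	req.len_param = sizeof(*param);
    	param->type = EXAMPLE_MP_REQ_VERBS_CMD_FD;
    	param->port_id = port_id;

    	if (rte_mp_request_sync(&req, &reply, &ts) < 0 || reply.nb_received != 1)
    		return -1;

    	/* the primary's handler set num_fds = 1 and fds[0] = cmd_fd */
    	if (reply.msgs[0].num_fds == 1)
    		fd = reply.msgs[0].fds[0];

    	free(reply.msgs);	/* the reply array is allocated by the EAL */
    	return fd;
    }

On the primary side this corresponds to the MANA_MP_REQ_VERBS_CMD_FD case in the quoted mana_mp_primary_handle(), which replies with num_fds = 1 and the context's cmd_fd.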
  
fengchengwen Sept. 3, 2022, 1:44 a.m. UTC | #5
On 2022/9/3 3:45, Long Li wrote:
>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment and
>> doc
>>
>> On 2022/8/31 6:51, longli@linuxonhyperv.com wrote:
>>> From: Long Li <longli@microsoft.com>
>>>
>>> MANA is a PCI device. It uses IB verbs to access hardware through the
>>> kernel RDMA layer. This patch introduces build environment and basic
>>> device probe functions.
>>>
>>> Signed-off-by: Long Li <longli@microsoft.com>
>> ...
>>
>>> +static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
>>> +				  const void *peer)
>>> +{
>>> +	struct rte_eth_dev *dev;
>>> +	const struct mana_mp_param *param =
>>> +		(const struct mana_mp_param *)mp_msg->param;
>>> +	struct rte_mp_msg mp_res = { 0 };
>>> +	struct mana_mp_param *res = (struct mana_mp_param
>> *)mp_res.param;
>>> +	int ret;
>>> +	struct mana_priv *priv;
>>> +
>>> +	if (!rte_eth_dev_is_valid_port(param->port_id)) {
>>> +		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
>>> +		return -ENODEV;
>>> +	}
>>> +
>>> +	dev = &rte_eth_devices[param->port_id];
>>> +	priv = dev->data->dev_private;
>>> +
>>> +	mp_init_msg(&mp_res, param->type, param->port_id);
>>> +
>>> +	switch (param->type) {
>>> +	case MANA_MP_REQ_VERBS_CMD_FD:
>>> +		mp_res.num_fds = 1;
>>> +		mp_res.fds[0] = priv->ib_ctx->cmd_fd;
>> Is the cmd_fd a system-level handle?
>>
>> If it's a process-private handle, it should not be used directly in another process.
> According to rte_mp_xxx semantics, the file handle is duplicated to another process; it's not used directly. It's required for the secondary process to map the same doorbell pages.


You are right. I notice that rte_mp_xxx uses AF_UNIX, which can handle it.

Thanks for clarifying.


  
lihuisong (C) Sept. 5, 2022, 7:15 a.m. UTC | #6
在 2022/9/1 2:05, Long Li 写道:
>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment
>> and doc
>>
>>
>> 在 2022/8/31 6:51, longli@linuxonhyperv.com 写道:
>>> From: Long Li <longli@microsoft.com>
>>>
>>> MANA is a PCI device. It uses IB verbs to access hardware through the
>>> kernel RDMA layer. This patch introduces build environment and basic
>>> device probe functions.
>>>
>>> Signed-off-by: Long Li <longli@microsoft.com>
>>>
>>> [snip]
>>> +			eth_dev->device = &pci_dev->device;
>>> +			eth_dev->data->dev_flags |=
>>> +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
>>> +
>> Please do not use the temporary macro. Please review this patch:
>>
>> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue xstats")
>>
>> This patch requires that per queue statistics are filled in
>> .xstats_get() by PMD.
> Thanks for pointing this out.
>
> It seems some PMDs are still depending on this flag for xstats.
>
> MANA doesn't implement xstats_get() currently, so this flag is useful. Is it okay to keep using this flag until it is finally time to remove it from all PMDs, or until MANA implements xstats?
Yes, your xstats_get() isn't implemented now. Per-queue stats should be filled
in by the xstats API, and the stats API cannot see per-queue stats, so
stats_get() in the driver shouldn't fill them (I suggest deleting that from
patch 17/18).

I guess this flag can be removed if the PMD does not support xstats.
>
>
  
Long Li Sept. 7, 2022, 1:36 a.m. UTC | #7
> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment
> and doc
> 
> 
> 在 2022/9/1 2:05, Long Li 写道:
> >> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
> >> environment and doc
> >>
> >>
> >> 在 2022/8/31 6:51, longli@linuxonhyperv.com 写道:
> >>> From: Long Li <longli@microsoft.com>
> >>>
> >>> MANA is a PCI device. It uses IB verbs to access hardware through
> >>> the kernel RDMA layer. This patch introduces build environment and
> >>> basic device probe functions.
> >>>
> >>> Signed-off-by: Long Li <longli@microsoft.com>
> >>>
> >>> [snip]
> >>> +			eth_dev->device = &pci_dev->device;
> >>> +			eth_dev->data->dev_flags |=
> >>> +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
> >>> +
> >> Please do not use the temporary macro. Please review this patch:
> >>
> >> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue
> >> xstats")
> >>
> >> This patch requires that per queue statistics are filled in
> >> .xstats_get() by PMD.
> > Thanks for pointing this out.
> >
> > It seems some PMDs are still depending on this flag for xstats.
> >
> > MANA doesn't implement xstats_get() currently, so this flag is useful. Is it
> > okay to keep using this flag until it is finally time to remove it from all
> > PMDs, or until MANA implements xstats?
> Yes, your xstats_get() isn't implemented now. Per-queue stats should be filled
> in by the xstats API, and the stats API cannot see per-queue stats, so
> stats_get() in the driver shouldn't fill them (I suggest deleting that from
> patch 17/18).
>
> I guess this flag can be removed if the PMD does not support xstats.
> >
> >

I don't understand your suggestion. An application can call rte_eth_stats_get() to get port stats, and this will call into stats_get() in the driver, as implemented in patch 17/18.

When the flag RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS is set, an application can also use rte_eth_xstats_get() to get port stats even if the driver doesn't implement xstats_get().
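For context, a minimal application-side sketch of the two calls being discussed (not tied to any particular PMD; the port is assumed to be configured and started):

    /*
     * Illustrative only: rte_eth_stats_get() always goes through the PMD's
     * stats_get(); rte_eth_xstats_get() uses the usual two-step pattern of
     * querying the required array size first.
     */
    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <rte_ethdev.h>

    static void
    example_dump_stats(uint16_t port_id)
    {
    	struct rte_eth_stats stats;
    	struct rte_eth_xstat *xstats;
    	int n, i;

    	if (rte_eth_stats_get(port_id, &stats) == 0)
    		printf("ipackets=%" PRIu64 " opackets=%" PRIu64 "\n",
    		       stats.ipackets, stats.opackets);

    	n = rte_eth_xstats_get(port_id, NULL, 0);	/* required count */
    	if (n <= 0)
    		return;

    	xstats = calloc(n, sizeof(*xstats));
    	if (xstats == NULL)
    		return;

    	n = rte_eth_xstats_get(port_id, xstats, n);
    	for (i = 0; i < n; i++)
    		printf("xstat id %" PRIu64 " value %" PRIu64 "\n",
    		       xstats[i].id, xstats[i].value);
    	free(xstats);
    }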
  
lihuisong (C) Sept. 7, 2022, 2:16 a.m. UTC | #8
在 2022/9/7 9:36, Long Li 写道:
>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment
>> and doc
>>
>>
>> 在 2022/9/1 2:05, Long Li 写道:
>>>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
>>>> environment and doc
>>>>
>>>>
>>>> 在 2022/8/31 6:51, longli@linuxonhyperv.com 写道:
>>>>> From: Long Li <longli@microsoft.com>
>>>>>
>>>>> MANA is a PCI device. It uses IB verbs to access hardware through
>>>>> the kernel RDMA layer. This patch introduces build environment and
>>>>> basic device probe functions.
>>>>>
>>>>> Signed-off-by: Long Li <longli@microsoft.com>
>>>>>
>>>>> [snip]
>>>>> +			eth_dev->device = &pci_dev->device;
>>>>> +			eth_dev->data->dev_flags |=
>>>>> +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
>>>>> +
>>>> Please do not use the temporary macro. Please review this patch:
>>>>
>>>> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue
>>>> xstats")
>>>>
>>>> This patch requires that per queue statistics are filled in
>>>> .xstats_get() by PMD.
>>> Thanks for pointing this out.
>>>
>>> It seems some PMDs are still depending on this flag for xstats.
>>>
>>> MANA doesn't implement xstats_get() currently, so this flag is useful. Is it
>>> okay to keep using this flag until it is finally time to remove it from all
>>> PMDs, or until MANA implements xstats?
>> Yes, your xstats_get() isn't implemented now. Per-queue stats should be filled
>> in by the xstats API, and the stats API cannot see per-queue stats, so
>> stats_get() in the driver shouldn't fill them (I suggest deleting that from
>> patch 17/18).
>>
>> I guess this flag can be removed if the PMD does not support xstats.
>>>
> I don't understand your suggestion. An application can call rte_eth_stats_get() to get port stats, and this will call into stats_get() in the driver, as implemented in patch 17/18.
>
> When the flag RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS is set, an application can also use rte_eth_xstats_get() to get port stats even if the driver doesn't implement xstats_get().

I think a new PMD should follow the announced switch Ferruh mentioned;
otherwise, the switch will never be completed.

I suggest that the mana driver implement a simple xstats_get() to fill
per-queue stats if you want to support per-queue stats.

@Ferruh, what do you think?
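To make the suggestion above concrete, here is a rough sketch of what a minimal per-queue xstats implementation in a PMD could look like. The struct example_rxq layout and the counter name are illustrative assumptions, not mana's actual internals; a real driver would wire both callbacks into its eth_dev_ops and typically also cover Tx queues.

    /*
     * Illustrative only: report one "rx_qN_packets" counter per Rx queue.
     * With callbacks like these, the RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS flag
     * would not be needed.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <ethdev_driver.h>

    struct example_rxq {
    	uint64_t packets;	/* assumed per-queue counter */
    };

    static int
    example_xstats_get_names(struct rte_eth_dev *dev,
    			 struct rte_eth_xstat_name *names, unsigned int size)
    {
    	unsigned int i, nb = dev->data->nb_rx_queues;

    	if (names == NULL || size < nb)
    		return nb;	/* tell ethdev how many entries are needed */

    	for (i = 0; i < nb; i++)
    		snprintf(names[i].name, sizeof(names[i].name),
    			 "rx_q%u_packets", i);

    	return nb;
    }

    static int
    example_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
    		   unsigned int n)
    {
    	unsigned int i, nb = dev->data->nb_rx_queues;

    	if (xstats == NULL || n < nb)
    		return nb;

    	for (i = 0; i < nb; i++) {
    		struct example_rxq *rxq = dev->data->rx_queues[i];

    		xstats[i].id = i;
    		xstats[i].value = (rxq != NULL) ? rxq->packets : 0;
    	}

    	return nb;
    }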
  
Long Li Sept. 7, 2022, 2:26 a.m. UTC | #9
> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment
> and doc
> 
> 
> 在 2022/9/7 9:36, Long Li 写道:
> >> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
> >> environment and doc
> >>
> >>
> >> 在 2022/9/1 2:05, Long Li 写道:
> >>>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
> >>>> environment and doc
> >>>>
> >>>>
> >>>> 在 2022/8/31 6:51, longli@linuxonhyperv.com 写道:
> >>>>> From: Long Li <longli@microsoft.com>
> >>>>>
> >>>>> MANA is a PCI device. It uses IB verbs to access hardware through
> >>>>> the kernel RDMA layer. This patch introduces build environment and
> >>>>> basic device probe functions.
> >>>>>
> >>>>> Signed-off-by: Long Li <longli@microsoft.com>
> >>>>>
> >>>>> [snip]
> >>>>> +			eth_dev->device = &pci_dev->device;
> >>>>> +			eth_dev->data->dev_flags |=
> >>>>> +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
> >>>>> +
> >>>> Please do not use the temporary macro. Please review this patch:
> >>>>
> >>>> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue
> >>>> xstats")
> >>>>
> >>>> This patch requires that per queue statistics are filled in
> >>>> .xstats_get() by PMD.
> >>> Thanks for pointing this out.
> >>>
> >>> It seems some PMDs are still depending on this flag for xstats.
> >>>
> >>> MANA doesn't implement xstats_get() currently, this flag is useful.
> >>> Is it
> >> okay to keep using this flag before it's finally the time to remove
> >> it from all PMDs, or when MANA implements xstats?
> >> Yes, your xstats doesn't implement now. Per queue stats should be
> >> filled in xstats API, and the stats API cannot see per queue stats,
> >> so stats API in driver shouldn't fill it(suggest that delete it from patch
> 17/18).
> >>
> >> I guess this flag can be removed if PMD does not support xstats.
> >>>
> > I don't understand your suggestion. An application can call
> rte_eth_stats_get() to get port stats, and this will call into stats_get() in the
> driver, as implemented in patch 17/18.
> >
> > When flag RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS  is set, an application
> can also use rte_eth_xstats_get() to get port stats even the driver doesn't
> implement xstats_get().
> 
> I think new PMD should follow the announced switch Ferruh mentioned,
> otherwise, the switch will never be completed.

Now I understand more. I prefer to keep patch 17/18; otherwise an application calling rte_eth_stats_get() will fail (before the switch happens).

> 
> Suggest that mana driver can implement a simple xstats_get() to fill per
> queue stats if you want to support per queue stats.
> 
> @Ferruh, what do you think?
  
Ferruh Yigit Sept. 7, 2022, 11:11 a.m. UTC | #10
On 9/7/2022 3:16 AM, lihuisong (C) wrote:
> 
> 
> On 2022/9/7 9:36, Long Li wrote:
>>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build 
>>> environment
>>> and doc
>>>
>>>
>>> On 2022/9/1 2:05, Long Li wrote:
>>>>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
>>>>> environment and doc
>>>>>
>>>>>
>>>>> On 2022/8/31 6:51, longli@linuxonhyperv.com wrote:
>>>>>> From: Long Li <longli@microsoft.com>
>>>>>>
>>>>>> MANA is a PCI device. It uses IB verbs to access hardware through
>>>>>> the kernel RDMA layer. This patch introduces build environment and
>>>>>> basic device probe functions.
>>>>>>
>>>>>> Signed-off-by: Long Li <longli@microsoft.com>
>>>>>> ---
>>>>>> Change log:
>>>>>> v2:
>>>>>> Fix typos.
>>>>>> Make the driver build only on x86-64 and Linux.
>>>>>> Remove unused header files.
>>>>>> Change port definition to uint16_t or uint8_t (for IB).
>>>>>> Use getline() in place of fgets() to read and truncate a line.
>>>>>> v3:
>>>>>> Add meson build check for required functions from RDMA direct verb
>>>>>> header file
>>>>>> v4:
>>>>>> Remove extra "\n" in logging code.
>>>>>> Use "r" in place of "rb" in fopen() to read text files.
>>>>>>
>>>>>> [snip]
>>>>>> +                 eth_dev->device = &pci_dev->device;
>>>>>> +                 eth_dev->data->dev_flags |=
>>>>>> +                         RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
>>>>>> +
>>>>> Please do not use the temporary macro. Please review this patch:
>>>>>
>>>>> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue
>>>>> xstats")
>>>>>
>>>>> This patch requires that per queue statistics are filled in
>>>>> .xstats_get() by PMD.
>>>> Thanks for pointing this out.
>>>>
>>>> It seems some PMDs are still depending on this flag for xstats.
>>>>
>>>> MANA doesn't implement xstats_get() currently, this flag is useful. 
>>>> Is it
>>> okay to keep using this flag before it's finally the time to remove 
>>> it from all
>>> PMDs, or when MANA implements xstats?
>>> Yes, your xstats doesn't implement now. Per queue stats should be 
>>> filled in
>>> xstats API, and the stats API cannot see per queue stats, so stats 
>>> API in driver
>>> shouldn't fill it(suggest that delete it from patch 17/18).
>>>
>>> I guess this flag can be removed if PMD does not support xstats.
>>>>
>> I don't understand your suggestion. An application can call 
>> rte_eth_stats_get() to get port stats, and this will call into 
>> stats_get() in the driver, as implemented in patch 17/18.
>>
>> When flag RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS  is set, an application 
>> can also use rte_eth_xstats_get() to get port stats even the driver 
>> doesn't implement xstats_get().
> 
> I think new PMD should follow the announced switch Ferruh mentioned,
> otherwise, the switch will never be completed.
> 
> Suggest that mana driver can implement a simple xstats_get() to fill per
> queue stats if you want to support per queue stats.
> 
> @Ferruh, what do you think?
> 

Hi Huisong,

Thanks for the reminder; yes, it makes sense to implement the new method in
new drivers.


Long,

There is a long-term plan to move queue stats from the basic stats structure
to xstats. The reason is that an increasing number of queues makes the basic
stats struct too big; on the other hand, xstats is more flexible and needs no
fixed-size array.

You can remove the 'RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS' flag, and the driver
won't support queue stats in xstats by default. To have that
support you will need to implement xstats, later or in this set.
When queue stats are implemented in xstats, please remember to stop
updating 'stats->q_*' in basic stats.

Also, in 17/18 the feature 'Stats per queue' seems to be added, but that is
not correct; the feature name is misleading here. It is actually about queue
stats mapping, please check 'doc/guides/nics/features.rst'.
So can you please drop that feature.
  
Long Li Sept. 7, 2022, 6:12 p.m. UTC | #11
> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment
> and doc
> 
> On 9/7/2022 3:16 AM, lihuisong (C) wrote:
> >
> >
> > On 2022/9/7 9:36, Long Li wrote:
> >>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
> >>> environment and doc
> >>>
> >>>
> >>> On 2022/9/1 2:05, Long Li wrote:
> >>>>> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build
> >>>>> environment and doc
> >>>>>
> >>>>>
> >>>>> On 2022/8/31 6:51, longli@linuxonhyperv.com wrote:
> >>>>>> From: Long Li <longli@microsoft.com>
> >>>>>>
> >>>>>> MANA is a PCI device. It uses IB verbs to access hardware through
> >>>>>> the kernel RDMA layer. This patch introduces build environment and
> >>>>>> basic device probe functions.
> >>>>>>
> >>>>>> Signed-off-by: Long Li <longli@microsoft.com>
> >>>>>> ---
> >>>>>> Change log:
> >>>>>> v2:
> >>>>>> Fix typos.
> >>>>>> Make the driver build only on x86-64 and Linux.
> >>>>>> Remove unused header files.
> >>>>>> Change port definition to uint16_t or uint8_t (for IB).
> >>>>>> Use getline() in place of fgets() to read and truncate a line.
> >>>>>> v3:
> >>>>>> Add meson build check for required functions from RDMA direct
> verb
> >>>>>> header file
> >>>>>> v4:
> >>>>>> Remove extra "\n" in logging code.
> >>>>>> Use "r" in place of "rb" in fopen() to read text files.
> >>>>>>
> >>>>>> [snip]
> >>>>>> +                 eth_dev->device = &pci_dev->device;
> >>>>>> +                 eth_dev->data->dev_flags |=
> >>>>>> +                         RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
> >>>>>> +
> >>>>> Please do not use the temporary macro. Please review this patch:
> >>>>>
> >>>>> f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue
> >>>>> xstats")
> >>>>>
> >>>>> This patch requires that per queue statistics are filled in
> >>>>> .xstats_get() by PMD.
> >>>> Thanks for pointing this out.
> >>>>
> >>>> It seems some PMDs are still depending on this flag for xstats.
> >>>>
> >>>> MANA doesn't implement xstats_get() currently, this flag is useful.
> >>>> Is it
> >>> okay to keep using this flag before it's finally the time to remove
> >>> it from all
> >>> PMDs, or when MANA implements xstats?
> >>> Yes, your xstats doesn't implement now. Per queue stats should be
> >>> filled in
> >>> xstats API, and the stats API cannot see per queue stats, so stats
> >>> API in driver
> >>> shouldn't fill it(suggest that delete it from patch 17/18).
> >>>
> >>> I guess this flag can be removed if PMD does not support xstats.
> >>>>
> >> I don't understand your suggestion. An application can call
> >> rte_eth_stats_get() to get port stats, and this will call into
> >> stats_get() in the driver, as implemented in patch 17/18.
> >>
> >> When flag RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS  is set, an application
> >> can also use rte_eth_xstats_get() to get port stats even the driver
> >> doesn't implement xstats_get().
> >
> > I think new PMD should follow the announced switch Ferruh mentioned,
> > otherwise, the switch will never be completed.
> >
> > Suggest that mana driver can implement a simple xstats_get() to fill per
> > queue stats if you want to support per queue stats.
> >
> > @Ferruh, what do you think?
> >
> 
> Hi Huisong,
> 
> Thanks for the reminder; yes, it makes sense to implement the new method in
> new drivers.
> 
> 
> Long,
> 
> There is a long-term plan to move queue stats from the basic stats structure
> to xstats. The reason is that an increasing number of queues makes the basic
> stats struct too big; on the other hand, xstats is more flexible and needs no
> fixed-size array.
> 
> You can remove the 'RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS' flag, and the
> driver
> won't support queue stats in xstats by default. To have that
> support you will need to implement xstats, later or in this set.
> When queue stats are implemented in xstats, please remember to stop
> updating 'stats->q_*' in basic stats.
> 
> Also, in 17/18 the feature 'Stats per queue' seems to be added, but that is
> not correct; the feature name is misleading here. It is actually about queue
> stats mapping, please check 'doc/guides/nics/features.rst'.
> So can you please drop that feature.

Thank you for clearing up the confusion. I'm removing "Stats per queue" from doc/guides/nics/features/mana.ini. The other parts of patch 17/18 are still relevant, so I'm not changing them.

Also removing 'RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS' as suggested.
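For reference, a minimal per-queue xstats_get() along the lines suggested above could look roughly like the sketch below. This is an illustration only and not part of this series; it assumes the mana_txq/mana_rxq "stats" counters used elsewhere in the set, exports just one counter per TX queue, and would need a matching xstats_get_names() callback.

static int
mana_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
		unsigned int n)
{
	unsigned int i, count = dev->data->nb_tx_queues;

	/* ethdev may call with a too-small (or NULL) array to query the size */
	if (n < count)
		return count;

	/* Export one example counter (packets) per TX queue */
	for (i = 0; i < count; i++) {
		struct mana_txq *txq = dev->data->tx_queues[i];

		xstats[i].id = i;
		xstats[i].value = txq->stats.packets;
	}

	return count;
}

A complete callback would cover the RX queues and the remaining counters as well and be registered in mana_dev_ops together with xstats_get_names(); once that is in place, the 'stats->q_*' updates can be dropped from the basic stats path.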
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 18d9edaf88..b8bda48a33 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -837,6 +837,12 @@  F: buildtools/options-ibverbs-static.sh
 F: doc/guides/nics/mlx5.rst
 F: doc/guides/nics/features/mlx5.ini
 
+Microsoft mana
+M: Long Li <longli@microsoft.com>
+F: drivers/net/mana
+F: doc/guides/nics/mana.rst
+F: doc/guides/nics/features/mana.ini
+
 Microsoft vdev_netvsc - EXPERIMENTAL
 M: Matan Azrad <matan@nvidia.com>
 F: drivers/net/vdev_netvsc/
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
new file mode 100644
index 0000000000..b92a27374c
--- /dev/null
+++ b/doc/guides/nics/features/mana.ini
@@ -0,0 +1,10 @@ 
+;
+; Supported features of the 'mana' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Linux                = Y
+Multiprocess aware   = Y
+Usage doc            = Y
+x86-64               = Y
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 1c94caccea..2725d1d9f0 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -41,6 +41,7 @@  Network Interface Controller Drivers
     intel_vf
     kni
     liquidio
+    mana
     memif
     mlx4
     mlx5
diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
new file mode 100644
index 0000000000..40e18fe810
--- /dev/null
+++ b/doc/guides/nics/mana.rst
@@ -0,0 +1,66 @@ 
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright 2022 Microsoft Corporation
+
+MANA poll mode driver library
+=============================
+
+The MANA poll mode driver library (**librte_net_mana**) implements support
+for Microsoft Azure Network Adapter VF in SR-IOV context.
+
+Features
+--------
+
+Features of the MANA Ethdev PMD are:
+
+Prerequisites
+-------------
+
+This driver relies on external libraries and kernel drivers for resource
+allocation and initialization. The following dependencies are not part of
+DPDK and must be installed separately:
+
+- **libibverbs** (provided by rdma-core package)
+
+  User space verbs framework used by librte_net_mana. This library provides
+  a generic interface between the kernel and low-level user space drivers
+  such as libmana.
+
+  It allows slow and privileged operations (context initialization, hardware
+  resource allocation) to be managed by the kernel and fast operations to
+  never leave user space.
+
+- **libmana** (provided by rdma-core package)
+
+  Low-level user space driver library for Microsoft Azure Network Adapter
+  devices; it is automatically loaded by libibverbs.
+
+- **Kernel modules**
+
+  They provide the kernel-side verbs API and low-level device drivers that
+  manage actual hardware initialization and resource sharing with user
+  space processes.
+
+  Unlike most other PMDs, these modules must remain loaded and bound to
+  their devices:
+
+  - mana: Ethernet device driver that provides kernel network interfaces.
+  - mana_ib: InfiniBand device driver.
+  - ib_uverbs: user space driver for verbs (entry point for libibverbs).
+
+Driver compilation and testing
+------------------------------
+
+Refer to the document :ref:`compiling and testing a PMD for a NIC <pmd_build_and_test>`
+for details.
+
+MANA PMD arguments
+------------------
+
+The user can specify the below argument in devargs.
+
+#.  ``mac``:
+
+    Specify the MAC address for this device. If it is set, the driver
+    probes and loads the NIC with a matching MAC address. If it is not
+    set, the driver probes all the NICs on the PCI device. By default it
+    is not set, meaning all the NICs will be probed and loaded.
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
new file mode 100644
index 0000000000..cb59eb6882
--- /dev/null
+++ b/drivers/net/mana/mana.c
@@ -0,0 +1,704 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_kvargs.h>
+#include <rte_eal_paging.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include <assert.h>
+
+#include "mana.h"
+
+/* Shared memory between primary/secondary processes, per driver */
+struct mana_shared_data *mana_shared_data;
+const struct rte_memzone *mana_shared_mz;
+static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
+
+struct mana_shared_data mana_local_data;
+
+/* Spinlock for mana_shared_data */
+static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Allocate a buffer on the stack and fill it with a printf format string. */
+#define MKSTR(name, ...) \
+	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
+	char name[mkstr_size_##name + 1]; \
+	\
+	memset(name, 0, mkstr_size_##name + 1); \
+	snprintf(name, sizeof(name), "" __VA_ARGS__)
+
+int mana_logtype_driver;
+int mana_logtype_init;
+
+const struct eth_dev_ops mana_dev_ops = {
+};
+
+const struct eth_dev_ops mana_dev_sec_ops = {
+};
+
+uint16_t
+mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
+		      struct rte_mbuf **pkts __rte_unused,
+		      uint16_t pkts_n __rte_unused)
+{
+	rte_mb();
+	return 0;
+}
+
+uint16_t
+mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
+		      struct rte_mbuf **pkts __rte_unused,
+		      uint16_t pkts_n __rte_unused)
+{
+	rte_mb();
+	return 0;
+}
+
+static const char *mana_init_args[] = {
+	"mac",
+	NULL,
+};
+
+/* Support parsing up to 8 MAC addresses from the EAL command line */
+#define MAX_NUM_ADDRESS 8
+struct mana_conf {
+	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
+	unsigned int index;
+};
+
+static int mana_arg_parse_callback(const char *key, const char *val,
+				   void *private)
+{
+	struct mana_conf *conf = (struct mana_conf *)private;
+	int ret;
+
+	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
+
+	if (conf->index >= MAX_NUM_ADDRESS) {
+		DRV_LOG(ERR, "Exceeding max MAC address");
+		return 1;
+	}
+
+	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
+	if (ret) {
+		DRV_LOG(ERR, "Invalid MAC address %s", val);
+		return ret;
+	}
+
+	conf->index++;
+
+	return 0;
+}
+
+static int mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
+{
+	struct rte_kvargs *kvlist;
+	unsigned int arg_count;
+	int ret = 0;
+
+	kvlist = rte_kvargs_parse(devargs->args, mana_init_args);
+	if (!kvlist) {
+		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->args);
+		return -EINVAL;
+	}
+
+	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
+	if (arg_count > MAX_NUM_ADDRESS) {
+		ret = -EINVAL;
+		goto free_kvlist;
+	}
+	ret = rte_kvargs_process(kvlist, mana_init_args[0],
+				 mana_arg_parse_callback, conf);
+	if (ret) {
+		DRV_LOG(ERR, "error parsing args");
+		goto free_kvlist;
+	}
+
+free_kvlist:
+	rte_kvargs_free(kvlist);
+	return ret;
+}
+
+static int get_port_mac(struct ibv_device *device, unsigned int port,
+			struct rte_ether_addr *addr)
+{
+	FILE *file;
+	int ret = 0;
+	DIR *dir;
+	struct dirent *dent;
+	unsigned int dev_port;
+	char mac[20];
+
+	MKSTR(path, "%s/device/net", device->ibdev_path);
+
+	dir = opendir(path);
+	if (!dir)
+		return -ENOENT;
+
+	while ((dent = readdir(dir))) {
+		char *name = dent->d_name;
+
+		MKSTR(filepath, "%s/%s/dev_port", path, name);
+
+		/* Ignore . and .. */
+		if ((name[0] == '.') &&
+		    ((name[1] == '\0') ||
+		     ((name[1] == '.') && (name[2] == '\0'))))
+			continue;
+
+		file = fopen(filepath, "r");
+		if (!file)
+			continue;
+
+		ret = fscanf(file, "%u", &dev_port);
+		fclose(file);
+
+		if (ret != 1)
+			continue;
+
+		/* Ethernet ports start at 0, IB ports start at 1 */
+		if (dev_port == port - 1) {
+			MKSTR(filepath, "%s/%s/address", path, name);
+
+			file = fopen(filepath, "r");
+			if (!file)
+				continue;
+
+			ret = fscanf(file, "%s", mac);
+			fclose(file);
+
+			if (ret < 0)
+				break;
+
+			ret = rte_ether_unformat_addr(mac, addr);
+			if (ret)
+				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
+			break;
+		}
+	}
+
+	closedir(dir);
+	return ret;
+}
+
+static int mana_ibv_device_to_pci_addr(const struct ibv_device *device,
+				       struct rte_pci_addr *pci_addr)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t len = 0;
+
+	MKSTR(path, "%s/device/uevent", device->ibdev_path);
+
+	file = fopen(path, "r");
+	if (!file)
+		return -errno;
+
+	while (getline(&line, &len, file) != -1) {
+		/* Extract information. */
+		if (sscanf(line,
+			   "PCI_SLOT_NAME="
+			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
+			   &pci_addr->domain,
+			   &pci_addr->bus,
+			   &pci_addr->devid,
+			   &pci_addr->function) == 4) {
+			break;
+		}
+	}
+
+	free(line);
+	fclose(file);
+	return 0;
+}
+
+static int mana_proc_priv_init(struct rte_eth_dev *dev)
+{
+	struct mana_process_priv *priv;
+
+	priv = rte_zmalloc_socket("mana_proc_priv",
+				  sizeof(struct mana_process_priv),
+				  RTE_CACHE_LINE_SIZE,
+				  dev->device->numa_node);
+	if (!priv)
+		return -ENOMEM;
+
+	dev->process_private = priv;
+	return 0;
+}
+
+static int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
+{
+	struct mana_process_priv *priv = eth_dev->process_private;
+
+	void *addr;
+
+	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED) {
+		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
+			eth_dev->data->port_id);
+		return -ENOMEM;
+	}
+
+	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
+
+	priv->db_page = addr;
+
+	return 0;
+}
+
+/* Initialize shared data for the driver (all devices) */
+static int mana_init_shared_data(void)
+{
+	int ret = 0;
+	const struct rte_memzone *secondary_mz;
+
+	rte_spinlock_lock(&mana_shared_data_lock);
+
+	/* Skip if shared data is already initialized */
+	if (mana_shared_data)
+		goto exit;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
+						     sizeof(*mana_shared_data),
+						     SOCKET_ID_ANY, 0);
+		if (!mana_shared_mz) {
+			DRV_LOG(ERR, "Cannot allocate mana shared data");
+			ret = -rte_errno;
+			goto exit;
+		}
+
+		mana_shared_data = mana_shared_mz->addr;
+		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
+		rte_spinlock_init(&mana_shared_data->lock);
+	} else {
+		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
+		if (!secondary_mz) {
+			DRV_LOG(ERR, "Cannot attach mana shared data");
+			ret = -rte_errno;
+			goto exit;
+		}
+
+		mana_shared_data = secondary_mz->addr;
+		memset(&mana_local_data, 0, sizeof(mana_local_data));
+	}
+
+exit:
+	rte_spinlock_unlock(&mana_shared_data_lock);
+
+	return ret;
+}
+
+static int mana_init_once(void)
+{
+	int ret;
+
+	ret = mana_init_shared_data();
+	if (ret)
+		return ret;
+
+	rte_spinlock_lock(&mana_shared_data->lock);
+
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		if (mana_shared_data->init_done)
+			break;
+
+		ret = mana_mp_init_primary();
+		if (ret)
+			break;
+		DRV_LOG(ERR, "MP INIT PRIMARY");
+
+		mana_shared_data->init_done = 1;
+		break;
+
+	case RTE_PROC_SECONDARY:
+
+		if (mana_local_data.init_done)
+			break;
+
+		ret = mana_mp_init_secondary();
+		if (ret)
+			break;
+
+		DRV_LOG(ERR, "MP INIT SECONDARY");
+
+		mana_local_data.init_done = 1;
+		break;
+
+	default:
+		/* Impossible, internal error */
+		ret = -EPROTO;
+		break;
+	}
+
+	rte_spinlock_unlock(&mana_shared_data->lock);
+
+	return ret;
+}
+
+static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
+			      struct rte_pci_device *pci_dev,
+			      struct rte_ether_addr *mac_addr)
+{
+	struct ibv_device **ibv_list;
+	int ibv_idx;
+	struct ibv_context *ctx;
+	struct ibv_device_attr_ex dev_attr;
+	int num_devices;
+	int ret = 0;
+	uint8_t port;
+	struct mana_priv *priv = NULL;
+	struct rte_eth_dev *eth_dev = NULL;
+	bool found_port;
+
+	ibv_list = ibv_get_device_list(&num_devices);
+	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
+		struct ibv_device *ibdev = ibv_list[ibv_idx];
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
+			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
+
+		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
+			continue;
+
+		/* Ignore if this IB device is not this PCI device */
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+
+		ctx = ibv_open_device(ibdev);
+		if (!ctx) {
+			DRV_LOG(ERR, "Failed to open IB device %s",
+				ibdev->name);
+			continue;
+		}
+
+		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
+		DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
+			dev_attr.orig_attr.phys_port_cnt);
+		found_port = false;
+
+		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
+		     port++) {
+			struct ibv_parent_domain_init_attr attr = {};
+			struct rte_ether_addr addr;
+			char address[64];
+			char name[RTE_ETH_NAME_MAX_LEN];
+
+			ret = get_port_mac(ibdev, port, &addr);
+			if (ret)
+				continue;
+
+			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
+				continue;
+
+			rte_ether_format_addr(address, sizeof(address), &addr);
+			DRV_LOG(INFO, "device located port %u address %s",
+				port, address);
+			found_port = true;
+
+			priv = rte_zmalloc_socket(NULL, sizeof(*priv),
+						  RTE_CACHE_LINE_SIZE,
+						  SOCKET_ID_ANY);
+			if (!priv) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			snprintf(name, sizeof(name), "%s_port%d",
+				 pci_dev->device.name, port);
+
+			if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+				int fd;
+
+				eth_dev = rte_eth_dev_attach_secondary(name);
+				if (!eth_dev) {
+					DRV_LOG(ERR, "Can't attach to dev %s",
+						name);
+					ret = -ENOMEM;
+					goto failed;
+				}
+
+				eth_dev->device = &pci_dev->device;
+				eth_dev->dev_ops = &mana_dev_sec_ops;
+				ret = mana_proc_priv_init(eth_dev);
+				if (ret)
+					goto failed;
+				priv->process_priv = eth_dev->process_private;
+
+				/* Get the IB FD from the primary process */
+				fd = mana_mp_req_verbs_cmd_fd(eth_dev);
+				if (fd < 0) {
+					DRV_LOG(ERR, "Failed to get FD %d", fd);
+					ret = -ENODEV;
+					goto failed;
+				}
+
+				ret = mana_map_doorbell_secondary(eth_dev, fd);
+				if (ret) {
+					DRV_LOG(ERR, "Failed secondary map %d",
+						fd);
+					goto failed;
+				}
+
+				/* fd is not used after mapping the doorbell */
+				close(fd);
+
+				rte_spinlock_lock(&mana_shared_data->lock);
+				mana_shared_data->secondary_cnt++;
+				mana_local_data.secondary_cnt++;
+				rte_spinlock_unlock(&mana_shared_data->lock);
+
+				rte_eth_copy_pci_info(eth_dev, pci_dev);
+				rte_eth_dev_probing_finish(eth_dev);
+
+				/* Impossible to have more than one port
+				 * matching a MAC address
+				 */
+				continue;
+			}
+
+			eth_dev = rte_eth_dev_allocate(name);
+			if (!eth_dev) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			eth_dev->data->mac_addrs =
+				rte_calloc("mana_mac", 1,
+					   sizeof(struct rte_ether_addr), 0);
+			if (!eth_dev->data->mac_addrs) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs);
+
+			priv->ib_pd = ibv_alloc_pd(ctx);
+			if (!priv->ib_pd) {
+				DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			/* Create a parent domain with the port number */
+			attr.pd = priv->ib_pd;
+			attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
+			attr.pd_context = (void *)(uint64_t)port;
+			priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
+			if (!priv->ib_parent_pd) {
+				DRV_LOG(ERR,
+					"ibv_alloc_parent_domain failed port %d",
+					port);
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			priv->ib_ctx = ctx;
+			priv->port_id = eth_dev->data->port_id;
+			priv->dev_port = port;
+			eth_dev->data->dev_private = priv;
+			priv->dev_data = eth_dev->data;
+
+			priv->max_rx_queues = dev_attr.orig_attr.max_qp;
+			priv->max_tx_queues = dev_attr.orig_attr.max_qp;
+
+			priv->max_rx_desc =
+				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+					dev_attr.orig_attr.max_cqe);
+			priv->max_tx_desc =
+				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+					dev_attr.orig_attr.max_cqe);
+
+			priv->max_send_sge = dev_attr.orig_attr.max_sge;
+			priv->max_recv_sge = dev_attr.orig_attr.max_sge;
+
+			priv->max_mr = dev_attr.orig_attr.max_mr;
+			priv->max_mr_size = dev_attr.orig_attr.max_mr_size;
+
+			DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
+				name, priv->max_rx_queues, priv->max_rx_desc,
+				priv->max_send_sge);
+
+			rte_spinlock_lock(&mana_shared_data->lock);
+			mana_shared_data->primary_cnt++;
+			rte_spinlock_unlock(&mana_shared_data->lock);
+
+			eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV;
+
+			eth_dev->device = &pci_dev->device;
+			eth_dev->data->dev_flags |=
+				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
+
+			DRV_LOG(INFO, "device %s at port %u",
+				name, eth_dev->data->port_id);
+
+			eth_dev->rx_pkt_burst = mana_rx_burst_removed;
+			eth_dev->tx_pkt_burst = mana_tx_burst_removed;
+			eth_dev->dev_ops = &mana_dev_ops;
+
+			rte_eth_copy_pci_info(eth_dev, pci_dev);
+			rte_eth_dev_probing_finish(eth_dev);
+		}
+
+		/* The secondary process doesn't need an ibv_ctx. It maps the
+		 * doorbell pages using the IB cmd_fd passed from the primary
+		 * process and sends messages to the primary process for memory
+		 * registrations.
+		 */
+		if (!found_port || rte_eal_process_type() == RTE_PROC_SECONDARY)
+			ibv_close_device(ctx);
+	}
+
+	ibv_free_device_list(ibv_list);
+	return 0;
+
+failed:
+	/* Free the resources for the failed port */
+	if (priv) {
+		if (priv->ib_parent_pd)
+			ibv_dealloc_pd(priv->ib_parent_pd);
+
+		if (priv->ib_pd)
+			ibv_dealloc_pd(priv->ib_pd);
+	}
+
+	if (eth_dev)
+		rte_eth_dev_release_port(eth_dev);
+
+	rte_free(priv);
+
+	ibv_close_device(ctx);
+	ibv_free_device_list(ibv_list);
+
+	return ret;
+}
+
+static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+			  struct rte_pci_device *pci_dev)
+{
+	struct rte_devargs *args = pci_dev->device.devargs;
+	struct mana_conf conf = {};
+	unsigned int i;
+	int ret;
+
+	if (args && args->args) {
+		ret = mana_parse_args(args, &conf);
+		if (ret) {
+			DRV_LOG(ERR, "failed to parse parameters args = %s",
+				args->args);
+			return ret;
+		}
+	}
+
+	ret = mana_init_once();
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
+		return ret;
+	}
+
+	/* If there are no driver parameters, probe on all ports */
+	if (!conf.index)
+		return mana_pci_probe_mac(pci_drv, pci_dev, NULL);
+
+	for (i = 0; i < conf.index; i++) {
+		ret = mana_pci_probe_mac(pci_drv, pci_dev, &conf.mac_array[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mana_dev_uninit(struct rte_eth_dev *dev)
+{
+	RTE_SET_USED(dev);
+	return 0;
+}
+
+static int mana_pci_remove(struct rte_pci_device *pci_dev)
+{
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		rte_spinlock_lock(&mana_shared_data_lock);
+
+		rte_spinlock_lock(&mana_shared_data->lock);
+
+		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
+		mana_shared_data->primary_cnt--;
+		if (!mana_shared_data->primary_cnt) {
+			DRV_LOG(DEBUG, "mp uninit primary");
+			mana_mp_uninit_primary();
+		}
+
+		rte_spinlock_unlock(&mana_shared_data->lock);
+
+		/* Also free the shared memory if this is the last */
+		if (!mana_shared_data->primary_cnt) {
+			DRV_LOG(DEBUG, "free shared memzone data");
+			rte_memzone_free(mana_shared_mz);
+		}
+
+		rte_spinlock_unlock(&mana_shared_data_lock);
+	} else {
+		rte_spinlock_lock(&mana_shared_data_lock);
+
+		rte_spinlock_lock(&mana_shared_data->lock);
+		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
+		mana_shared_data->secondary_cnt--;
+		rte_spinlock_unlock(&mana_shared_data->lock);
+
+		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
+		mana_local_data.secondary_cnt--;
+		if (!mana_local_data.secondary_cnt) {
+			DRV_LOG(DEBUG, "mp uninit secondary");
+			mana_mp_uninit_secondary();
+		}
+
+		rte_spinlock_unlock(&mana_shared_data_lock);
+	}
+
+	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
+}
+
+static const struct rte_pci_id mana_pci_id_map[] = {
+	{
+		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
+			       PCI_DEVICE_ID_MICROSOFT_MANA)
+	},
+};
+
+static struct rte_pci_driver mana_pci_driver = {
+	.driver = {
+		.name = "mana_pci",
+	},
+	.id_table = mana_pci_id_map,
+	.probe = mana_pci_probe,
+	.remove = mana_pci_remove,
+	.drv_flags = RTE_PCI_DRV_INTR_RMV,
+};
+
+RTE_INIT(rte_mana_pmd_init)
+{
+	rte_pci_register(&mana_pci_driver);
+}
+
+RTE_PMD_EXPORT_NAME(net_mana, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
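A note on the parent domain created in mana_pci_probe_mac() above: verbs hands the stored pd_context back to any custom allocator callbacks registered on that parent domain, which is one way the port number can be recovered later. The callback below is a hypothetical sketch only and not part of this patch; the series may wire its buffer allocators differently (for example through manadv).

static void *
example_pd_buf_alloc(struct ibv_pd *pd, void *pd_context, size_t size,
		     size_t alignment, uint64_t resource_type)
{
	/* Recover the IB port number stashed in pd_context by the probe code */
	uint8_t port = (uint8_t)(uintptr_t)pd_context;

	RTE_SET_USED(pd);
	RTE_SET_USED(resource_type);
	DRV_LOG(DEBUG, "alloc %zu bytes (align %zu) for IB port %u",
		size, alignment, port);

	return rte_zmalloc_socket("example_buf", size, alignment,
				  SOCKET_ID_ANY);
}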
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
new file mode 100644
index 0000000000..e30c030b4e
--- /dev/null
+++ b/drivers/net/mana/mana.h
@@ -0,0 +1,210 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#ifndef __MANA_H__
+#define __MANA_H__
+
+enum {
+	PCI_VENDOR_ID_MICROSOFT = 0x1414,
+};
+
+enum {
+	PCI_DEVICE_ID_MICROSOFT_MANA = 0x00ba,
+};
+
+/* Shared data between primary/secondary processes */
+struct mana_shared_data {
+	rte_spinlock_t lock;
+	int init_done;
+	unsigned int primary_cnt;
+	unsigned int secondary_cnt;
+};
+
+#define MIN_RX_BUF_SIZE	1024
+#define MAX_FRAME_SIZE	RTE_ETHER_MAX_LEN
+#define BNIC_MAX_MAC_ADDR 1
+
+#define BNIC_DEV_RX_OFFLOAD_SUPPORT ( \
+		DEV_RX_OFFLOAD_CHECKSUM | \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
+#define BNIC_DEV_TX_OFFLOAD_SUPPORT ( \
+		RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \
+		RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \
+		RTE_ETH_TX_OFFLOAD_TCP_CKSUM | \
+		RTE_ETH_TX_OFFLOAD_UDP_CKSUM | \
+		RTE_ETH_TX_OFFLOAD_TCP_TSO)
+
+#define INDIRECTION_TABLE_NUM_ELEMENTS 64
+#define TOEPLITZ_HASH_KEY_SIZE_IN_BYTES 40
+#define BNIC_ETH_RSS_SUPPORT ( \
+	ETH_RSS_IPV4 |	     \
+	ETH_RSS_NONFRAG_IPV4_TCP | \
+	ETH_RSS_NONFRAG_IPV4_UDP | \
+	ETH_RSS_IPV6 |	     \
+	ETH_RSS_NONFRAG_IPV6_TCP | \
+	ETH_RSS_NONFRAG_IPV6_UDP)
+
+#define MIN_BUFFERS_PER_QUEUE		64
+#define MAX_RECEIVE_BUFFERS_PER_QUEUE	256
+#define MAX_SEND_BUFFERS_PER_QUEUE	256
+
+struct mana_process_priv {
+	void *db_page;
+};
+
+struct mana_priv {
+	struct rte_eth_dev_data *dev_data;
+	struct mana_process_priv *process_priv;
+	int num_queues;
+
+	/* DPDK port */
+	uint16_t port_id;
+
+	/* IB device port */
+	uint8_t dev_port;
+
+	struct ibv_context *ib_ctx;
+	struct ibv_pd *ib_pd;
+	struct ibv_pd *ib_parent_pd;
+	struct ibv_rwq_ind_table *ind_table;
+	uint8_t ind_table_key[40];
+	struct ibv_qp *rwq_qp;
+	void *db_page;
+	int max_rx_queues;
+	int max_tx_queues;
+	int max_rx_desc;
+	int max_tx_desc;
+	int max_send_sge;
+	int max_recv_sge;
+	int max_mr;
+	uint64_t max_mr_size;
+};
+
+struct mana_txq_desc {
+	struct rte_mbuf *pkt;
+	uint32_t wqe_size_in_bu;
+};
+
+struct mana_rxq_desc {
+	struct rte_mbuf *pkt;
+	uint32_t wqe_size_in_bu;
+};
+
+struct mana_gdma_queue {
+	void *buffer;
+	uint32_t count;	/* in entries */
+	uint32_t size;	/* in bytes */
+	uint32_t id;
+	uint32_t head;
+	uint32_t tail;
+};
+
+struct mana_stats {
+	uint64_t packets;
+	uint64_t bytes;
+	uint64_t errors;
+	uint64_t nombuf;
+};
+
+#define MANA_MR_BTREE_PER_QUEUE_N	64
+struct mana_txq {
+	struct mana_priv *priv;
+	uint32_t num_desc;
+	struct ibv_cq *cq;
+	struct ibv_qp *qp;
+
+	struct mana_gdma_queue gdma_sq;
+	struct mana_gdma_queue gdma_cq;
+
+	uint32_t tx_vp_offset;
+
+	/* For storing pending requests */
+	struct mana_txq_desc *desc_ring;
+
+	/* desc_ring_head is where pending requests are put into the ring;
+	 * completions are pulled off at desc_ring_tail.
+	 */
+	uint32_t desc_ring_head, desc_ring_tail;
+
+	struct mana_stats stats;
+	unsigned int socket;
+};
+
+struct mana_rxq {
+	struct mana_priv *priv;
+	uint32_t num_desc;
+	struct rte_mempool *mp;
+	struct ibv_cq *cq;
+	struct ibv_wq *wq;
+
+	/* For storing pending requests */
+	struct mana_rxq_desc *desc_ring;
+
+	/* desc_ring_head is where pending requests are put into the ring;
+	 * completions are pulled off at desc_ring_tail.
+	 */
+	uint32_t desc_ring_head, desc_ring_tail;
+
+	struct mana_gdma_queue gdma_rq;
+	struct mana_gdma_queue gdma_cq;
+
+	struct mana_stats stats;
+
+	unsigned int socket;
+};
+
+extern int mana_logtype_driver;
+extern int mana_logtype_init;
+
+#define DRV_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, mana_logtype_driver, "%s(): " fmt "\n", \
+		__func__, ## args)
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, mana_logtype_init, "%s(): " fmt "\n",\
+		__func__, ## args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
+
+uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
+uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
+/** Request timeout for IPC. */
+#define MANA_MP_REQ_TIMEOUT_SEC 5
+
+/* Request types for IPC. */
+enum mana_mp_req_type {
+	MANA_MP_REQ_VERBS_CMD_FD = 1,
+	MANA_MP_REQ_CREATE_MR,
+	MANA_MP_REQ_START_RXTX,
+	MANA_MP_REQ_STOP_RXTX,
+};
+
+/* Parameters for IPC. */
+struct mana_mp_param {
+	enum mana_mp_req_type type;
+	int port_id;
+	int result;
+
+	/* MANA_MP_REQ_CREATE_MR */
+	uintptr_t addr;
+	uint32_t len;
+};
+
+#define MANA_MP_NAME	"net_mana_mp"
+int mana_mp_init_primary(void);
+int mana_mp_init_secondary(void);
+void mana_mp_uninit_primary(void);
+void mana_mp_uninit_secondary(void);
+int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+
+void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
+
+#endif
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
new file mode 100644
index 0000000000..81c4118f53
--- /dev/null
+++ b/drivers/net/mana/meson.build
@@ -0,0 +1,44 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2022 Microsoft Corporation
+
+if not is_linux or not dpdk_conf.has('RTE_ARCH_X86_64')
+    build = false
+    reason = 'mana is only supported on Linux x86_64'
+    subdir_done()
+endif
+
+deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
+
+sources += files(
+	'mana.c',
+	'mp.c',
+)
+
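+# The driver requires both libibverbs and the MANA provider library from rdma-core.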
+libnames = ['ibverbs', 'mana']
+foreach libname:libnames
+    lib = cc.find_library(libname, required:false)
+    if lib.found()
+        ext_deps += lib
+    else
+        build = false
+        reason = 'missing dependency, "' + libname + '"'
+        subdir_done()
+    endif
+endforeach
+
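+# Make sure the installed rdma-core headers expose the MANA direct verbs (manadv) API.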
+required_symbols = [
+    ['infiniband/manadv.h', 'manadv_set_context_attr'],
+    ['infiniband/manadv.h', 'manadv_init_obj'],
+    ['infiniband/manadv.h', 'MANADV_CTX_ATTR_BUF_ALLOCATORS'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_QP'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_CQ'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_RWQ'],
+]
+
+foreach arg:required_symbols
+    if not cc.has_header_symbol(arg[0], arg[1])
+        build = false
+        reason = 'missing symbol "' + arg[1] + '" in "' + arg[0] + '"'
+        subdir_done()
+    endif
+endforeach
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
new file mode 100644
index 0000000000..d7580e8a28
--- /dev/null
+++ b/drivers/net/mana/mp.c
@@ -0,0 +1,235 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_log.h>
+
+#include <infiniband/verbs.h>
+
+#include "mana.h"
+
+extern struct mana_shared_data *mana_shared_data;
+
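+/* Fill in the common fields of an IPC message. */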
+static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type,
+			int port_id)
+{
+	struct mana_mp_param *param;
+
+	strlcpy(msg->name, MANA_MP_NAME, sizeof(msg->name));
+	msg->len_param = sizeof(*param);
+
+	param = (struct mana_mp_param *)msg->param;
+	param->type = type;
+	param->port_id = port_id;
+}
+
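+/* Handle IPC requests sent to the primary process by secondary processes. */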
+static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
+				  const void *peer)
+{
+	struct rte_eth_dev *dev;
+	const struct mana_mp_param *param =
+		(const struct mana_mp_param *)mp_msg->param;
+	struct rte_mp_msg mp_res = { 0 };
+	struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param;
+	int ret;
+	struct mana_priv *priv;
+
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[param->port_id];
+	priv = dev->data->dev_private;
+
+	mp_init_msg(&mp_res, param->type, param->port_id);
+
+	switch (param->type) {
+	case MANA_MP_REQ_VERBS_CMD_FD:
+		mp_res.num_fds = 1;
+		mp_res.fds[0] = priv->ib_ctx->cmd_fd;
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	default:
+		DRV_LOG(ERR, "Port %u unknown primary MP type %u",
+			param->port_id, param->type);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
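+/* Handle IPC requests sent by the primary process to this secondary process. */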
+static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
+				    const void *peer)
+{
+	struct rte_mp_msg mp_res = { 0 };
+	struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param;
+	const struct mana_mp_param *param =
+		(const struct mana_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev;
+	int ret;
+
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[param->port_id];
+
+	mp_init_msg(&mp_res, param->type, param->port_id);
+
+	switch (param->type) {
+	case MANA_MP_REQ_START_RXTX:
+		DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
+
+		rte_mb();
+
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	case MANA_MP_REQ_STOP_RXTX:
+		DRV_LOG(INFO, "Port %u stopping datapath", dev->data->port_id);
+
+		dev->tx_pkt_burst = mana_tx_burst_removed;
+		dev->rx_pkt_burst = mana_rx_burst_removed;
+
+		rte_mb();
+
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	default:
+		DRV_LOG(ERR, "Port %u unknown secondary MP type %u",
+			param->port_id, param->type);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
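+/* Register the IPC message handler for the primary process. */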
+int mana_mp_init_primary(void)
+{
+	int ret;
+
+	ret = rte_mp_action_register(MANA_MP_NAME, mana_mp_primary_handle);
+	if (ret && rte_errno != ENOTSUP) {
+		DRV_LOG(ERR, "Failed to register primary handler %d %d",
+			ret, rte_errno);
+		return -1;
+	}
+
+	return 0;
+}
+
+void mana_mp_uninit_primary(void)
+{
+	rte_mp_action_unregister(MANA_MP_NAME);
+}
+
+int mana_mp_init_secondary(void)
+{
+	return rte_mp_action_register(MANA_MP_NAME, mana_mp_secondary_handle);
+}
+
+void mana_mp_uninit_secondary(void)
+{
+	rte_mp_action_unregister(MANA_MP_NAME);
+}
+
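+/* Ask the primary process for the verbs command FD; returns the FD on
+ * success or a negative value on failure.
+ */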
+int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
+{
+	struct rte_mp_msg mp_req = { 0 };
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mana_mp_param *res;
+	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int ret;
+
+	mp_init_msg(&mp_req, MANA_MP_REQ_VERBS_CMD_FD, dev->data->port_id);
+
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		DRV_LOG(ERR, "port %u request to primary process failed",
+			dev->data->port_id);
+		return ret;
+	}
+
+	if (mp_rep.nb_received != 1) {
+		DRV_LOG(ERR, "primary replied %u messages", mp_rep.nb_received);
+		ret = -EPROTO;
+		goto exit;
+	}
+
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mana_mp_param *)mp_res->param;
+	if (res->result) {
+		DRV_LOG(ERR, "failed to get CMD FD, port %u",
+			dev->data->port_id);
+		ret = res->result;
+		goto exit;
+	}
+
+	if (mp_res->num_fds != 1) {
+		DRV_LOG(ERR, "primary replied %d FDs, expected 1", mp_res->num_fds);
+		ret = -EPROTO;
+		goto exit;
+	}
+
+	ret = mp_res->fds[0];
+	DRV_LOG(DEBUG, "port %u command FD from primary is %d",
+		dev->data->port_id, ret);
+exit:
+	free(mp_rep.msgs);
+	return ret;
+}
+
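+/* Ask all secondary processes to start or stop their Rx/Tx datapath. */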
+void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
+{
+	struct rte_mp_msg mp_req = { 0 };
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mana_mp_param *res;
+	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int i, ret;
+
+	if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX) {
+		DRV_LOG(ERR, "port %u unknown request (req_type %d)",
+			dev->data->port_id, type);
+		return;
+	}
+
+	if (!mana_shared_data->secondary_cnt)
+		return;
+
+	mp_init_msg(&mp_req, type, dev->data->port_id);
+
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		if (rte_errno != ENOTSUP)
+			DRV_LOG(ERR, "port %u failed to request Rx/Tx (%d)",
+				dev->data->port_id, type);
+		goto exit;
+	}
+	if (mp_rep.nb_sent != mp_rep.nb_received) {
+		DRV_LOG(ERR, "port %u not all secondaries responded (%d)",
+			dev->data->port_id, type);
+		goto exit;
+	}
+	for (i = 0; i < mp_rep.nb_received; i++) {
+		mp_res = &mp_rep.msgs[i];
+		res = (struct mana_mp_param *)mp_res->param;
+		if (res->result) {
+			DRV_LOG(ERR, "port %u request failed on secondary %d",
+				dev->data->port_id, i);
+			goto exit;
+		}
+	}
+exit:
+	free(mp_rep.msgs);
+}
diff --git a/drivers/net/mana/version.map b/drivers/net/mana/version.map
new file mode 100644
index 0000000000..c2e0723b4c
--- /dev/null
+++ b/drivers/net/mana/version.map
@@ -0,0 +1,3 @@ 
+DPDK_22 {
+	local: *;
+};
diff --git a/drivers/net/meson.build b/drivers/net/meson.build
index 2355d1cde8..0b111a6ebb 100644
--- a/drivers/net/meson.build
+++ b/drivers/net/meson.build
@@ -34,6 +34,7 @@  drivers = [
         'ixgbe',
         'kni',
         'liquidio',
+        'mana',
         'memif',
         'mlx4',
         'mlx5',