[dpdk-dev,v7,1/5] vfio: extend data structure for multi container

Message ID 20180415153349.62105-2-xiao.w.wang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply patch file failure

Commit Message

Xiao Wang April 15, 2018, 3:33 p.m. UTC
Currently the EAL VFIO framework binds a VFIO group fd to the default
container fd during rte_vfio_setup_device(), while in some cases,
e.g. vDPA (vhost data path acceleration), we want to put a VFIO group
into a separate container and program the IOMMU via that container.

This patch extends the vfio_config structure to contain per-container
user_mem_maps and defines an array of vfio_config. The next patch will
build on this to add the container API.

Signed-off-by: Junjie Chen <junjie.j.chen@intel.com>
Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 config/common_base                     |   1 +
 lib/librte_eal/linuxapp/eal/eal_vfio.c | 407 ++++++++++++++++++++++-----------
 lib/librte_eal/linuxapp/eal/eal_vfio.h |  19 +-
 3 files changed, 275 insertions(+), 152 deletions(-)
  

Comments

Anatoly Burakov April 16, 2018, 10:02 a.m. UTC | #1
On 15-Apr-18 4:33 PM, Xiao Wang wrote:
> Currently the EAL VFIO framework binds a VFIO group fd to the default
> container fd during rte_vfio_setup_device(), while in some cases,
> e.g. vDPA (vhost data path acceleration), we want to put a VFIO group
> into a separate container and program the IOMMU via that container.
> 
> This patch extends the vfio_config structure to contain per-container
> user_mem_maps and defines an array of vfio_config. The next patch will
> build on this to add the container API.
> 
> Signed-off-by: Junjie Chen <junjie.j.chen@intel.com>
> Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
> ---
>   config/common_base                     |   1 +
>   lib/librte_eal/linuxapp/eal/eal_vfio.c | 407 ++++++++++++++++++++++-----------
>   lib/librte_eal/linuxapp/eal/eal_vfio.h |  19 +-
>   3 files changed, 275 insertions(+), 152 deletions(-)
> 
> diff --git a/config/common_base b/config/common_base
> index c4236fd1f..4a76d2f14 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -87,6 +87,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>   CONFIG_RTE_EAL_IGB_UIO=n
>   CONFIG_RTE_EAL_VFIO=n
>   CONFIG_RTE_MAX_VFIO_GROUPS=64
> +CONFIG_RTE_MAX_VFIO_CONTAINERS=64
>   CONFIG_RTE_MALLOC_DEBUG=n
>   CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>   CONFIG_RTE_USE_LIBBSD=n
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 589d7d478..46fba2d8d 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -22,8 +22,46 @@
>   
>   #define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
>   
> +/*
> + * we don't need to store device fd's anywhere since they can be obtained from
> + * the group fd via an ioctl() call.
> + */
> +struct vfio_group {
> +	int group_no;
> +	int fd;
> +	int devices;
> +};

What is the purpose of moving this into the .c file? Seems like an 
unnecessary change.

> +
> +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
> + * recreate the mappings for DPDK segments, but we cannot do so for memory that
> + * was registered by the user themselves, so we need to store the user mappings
> + * somewhere, to recreate them later.
> + */
> +#define VFIO_MAX_USER_MEM_MAPS 256
> +struct user_mem_map {
> +	uint64_t addr;
> +	uint64_t iova;
> +	uint64_t len;
> +};
> +

<...>

> +static struct vfio_config *
> +get_vfio_cfg_by_group_no(int iommu_group_no)
> +{
> +	struct vfio_config *vfio_cfg;
> +	int i, j;
> +
> +	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
> +		vfio_cfg = &vfio_cfgs[i];
> +		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
> +			if (vfio_cfg->vfio_groups[j].group_no ==
> +					iommu_group_no)
> +				return vfio_cfg;
> +		}
> +	}
> +
> +	return default_vfio_cfg;

Here and in other places: I'm not sure returning the default vfio config 
if the group is not found is such a good idea. It would be better if the 
calling code explicitly handled the case of the group not existing yet.
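
Something along these lines (an untested sketch, the call-site code is
illustrative) would make the "not found" case explicit:

static struct vfio_config *
get_vfio_cfg_by_group_no(int iommu_group_no)
{
	/* ... search all containers as above ... */

	return NULL; /* not found - let the caller decide */
}

/* at a call site */
vfio_cfg = get_vfio_cfg_by_group_no(iommu_group_no);
if (vfio_cfg == NULL) {
	RTE_LOG(ERR, EAL, "  no vfio_config for group %d\n", iommu_group_no);
	return -1;
}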

> +}
> +
> +static struct vfio_config *
> +get_vfio_cfg_by_group_fd(int vfio_group_fd)
> +{
> +	struct vfio_config *vfio_cfg;
> +	int i, j;
> +
> +	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
> +		vfio_cfg = &vfio_cfgs[i];
> +		for (j = 0; j < VFIO_MAX_GROUPS; j++)
> +			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
> +				return vfio_cfg;
> +	}
>   

<...>

> -	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
> -		vfio_cfg.vfio_groups[i].fd = -1;
> -		vfio_cfg.vfio_groups[i].group_no = -1;
> -		vfio_cfg.vfio_groups[i].devices = 0;
> +	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
> +
> +	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
> +		vfio_cfgs[i].vfio_container_fd = -1;
> +		vfio_cfgs[i].vfio_active_groups = 0;
> +		vfio_cfgs[i].vfio_iommu_type = NULL;
> +		vfio_cfgs[i].mem_maps.lock = lock;

Nitpick - why copy, instead of straight up initializing with 
RTE_SPINLOCK_RECURSIVE_INITIALIZER?

> +
> +		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
> +			vfio_cfgs[i].vfio_groups[j].fd = -1;
> +			vfio_cfgs[i].vfio_groups[j].group_no = -1;
> +			vfio_cfgs[i].vfio_groups[j].devices = 0;
> +		}
>   	}
>   
>   	/* inform the user that we are probing for VFIO */
> @@ -841,12 +971,12 @@ rte_vfio_enable(const char *modname)
>   		return 0;
>   	}

<...>
  
Xiao Wang April 16, 2018, 12:22 p.m. UTC | #2
Hi Anatoly,

> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Monday, April 16, 2018 6:03 PM
> To: Wang, Xiao W <xiao.w.wang@intel.com>; Yigit, Ferruh
> <ferruh.yigit@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com; Wang, Zhihong
> <zhihong.wang@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Tan, Jianfeng
> <jianfeng.tan@intel.com>; Liang, Cunming <cunming.liang@intel.com>; Daly,
> Dan <dan.daly@intel.com>; thomas@monjalon.net; Chen, Junjie J
> <junjie.j.chen@intel.com>
> Subject: Re: [PATCH v7 1/5] vfio: extend data structure for multi container
> 
> On 15-Apr-18 4:33 PM, Xiao Wang wrote:
> > Currently the EAL VFIO framework binds a VFIO group fd to the default
> > container fd during rte_vfio_setup_device(), while in some cases,
> > e.g. vDPA (vhost data path acceleration), we want to put a VFIO group
> > into a separate container and program the IOMMU via that container.
> >
> > This patch extends the vfio_config structure to contain per-container
> > user_mem_maps and defines an array of vfio_config. The next patch will
> > build on this to add the container API.
> >
> > Signed-off-by: Junjie Chen <junjie.j.chen@intel.com>
> > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
> > ---
> >   config/common_base                     |   1 +
> >   lib/librte_eal/linuxapp/eal/eal_vfio.c | 407 ++++++++++++++++++++++-----------
> >   lib/librte_eal/linuxapp/eal/eal_vfio.h |  19 +-
> >   3 files changed, 275 insertions(+), 152 deletions(-)
> >
> > diff --git a/config/common_base b/config/common_base
> > index c4236fd1f..4a76d2f14 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -87,6 +87,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
> >   CONFIG_RTE_EAL_IGB_UIO=n
> >   CONFIG_RTE_EAL_VFIO=n
> >   CONFIG_RTE_MAX_VFIO_GROUPS=64
> > +CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> >   CONFIG_RTE_MALLOC_DEBUG=n
> >   CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> >   CONFIG_RTE_USE_LIBBSD=n
> > diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > index 589d7d478..46fba2d8d 100644
> > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > @@ -22,8 +22,46 @@
> >
> >   #define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
> >
> > +/*
> > + * we don't need to store device fd's anywhere since they can be obtained from
> > + * the group fd via an ioctl() call.
> > + */
> > +struct vfio_group {
> > +	int group_no;
> > +	int fd;
> > +	int devices;
> > +};
> 
> What is the purpose of moving this into the .c file? Seems like an
> unnecessary change.

Yes, we can let vfio_group stay in .h, and move vfio_config into .c.

> 
> > +
> > +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
> > + * recreate the mappings for DPDK segments, but we cannot do so for memory that
> > + * was registered by the user themselves, so we need to store the user mappings
> > + * somewhere, to recreate them later.
> > + */
> > +#define VFIO_MAX_USER_MEM_MAPS 256
> > +struct user_mem_map {
> > +	uint64_t addr;
> > +	uint64_t iova;
> > +	uint64_t len;
> > +};
> > +
> 
> <...>
> 
> > +static struct vfio_config *
> > +get_vfio_cfg_by_group_no(int iommu_group_no)
> > +{
> > +	struct vfio_config *vfio_cfg;
> > +	int i, j;
> > +
> > +	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
> > +		vfio_cfg = &vfio_cfgs[i];
> > +		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
> > +			if (vfio_cfg->vfio_groups[j].group_no ==
> > +					iommu_group_no)
> > +				return vfio_cfg;
> > +		}
> > +	}
> > +
> > +	return default_vfio_cfg;
> 
> Here and in other places: I'm not sure returning the default vfio config
> if the group is not found is such a good idea. It would be better if the
> calling code explicitly handled the case of the group not existing yet.

Agreed, that would be more explicit.

> 
> > +}
> > +
> > +static struct vfio_config *
> > +get_vfio_cfg_by_group_fd(int vfio_group_fd)
> > +{
> > +	struct vfio_config *vfio_cfg;
> > +	int i, j;
> > +
> > +	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
> > +		vfio_cfg = &vfio_cfgs[i];
> > +		for (j = 0; j < VFIO_MAX_GROUPS; j++)
> > +			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
> > +				return vfio_cfg;
> > +	}
> >
> 
> <...>
> 
> > -	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
> > -		vfio_cfg.vfio_groups[i].fd = -1;
> > -		vfio_cfg.vfio_groups[i].group_no = -1;
> > -		vfio_cfg.vfio_groups[i].devices = 0;
> > +	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
> > +
> > +	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
> > +		vfio_cfgs[i].vfio_container_fd = -1;
> > +		vfio_cfgs[i].vfio_active_groups = 0;
> > +		vfio_cfgs[i].vfio_iommu_type = NULL;
> > +		vfio_cfgs[i].mem_maps.lock = lock;
> 
> Nitpick - why copy, instead of straight up initializing with
> RTE_SPINLOCK_RECURSIVE_INITIALIZER?

I tried, but the compiler doesn't allow this assignment;
RTE_SPINLOCK_RECURSIVE_INITIALIZER can only be used for initialization.
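
To illustrate (a minimal sketch, not from the patch): the macro expands to a
brace-enclosed initializer, which is not an expression in C, so it is only
valid at the point of definition:

/* OK: initialization at the point of definition */
rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;

/* does not compile: a brace list is not an assignable expression */
/* vfio_cfgs[i].mem_maps.lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; */

/* what the patch does instead: copy from an initialized local */
vfio_cfgs[i].mem_maps.lock = lock;

/* a C99 compound literal would presumably work too:
 * vfio_cfgs[i].mem_maps.lock =
 *	(rte_spinlock_recursive_t)RTE_SPINLOCK_RECURSIVE_INITIALIZER;
 */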

Thanks for the comments,
Xiao
  
Xiao Wang April 16, 2018, 3:34 p.m. UTC | #3
IFCVF driver
============
The IFCVF vDPA (vhost data path acceleration) driver provides support for the
Intel FPGA 100G VF (IFCVF). IFCVF's datapath is virtio ring compatible: it
works as a HW vhost backend that can send/receive packets to/from virtio
directly by DMA. Besides, it supports dirty page logging and device state
report/restore, so the driver enables its vDPA functionality with the live
migration feature.

vDPA mode
=========
IFCVF's vendor ID and device ID are the same as those of the virtio net PCI
device, but it has its own subsystem vendor ID and device ID. To let the
device be probed by the IFCVF driver, the "vdpa=1" devarg specifies that this
device is to be used in vDPA mode rather than polling mode; the virtio PMD
will skip the device when it detects this devarg.
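
For example, the devarg can be appended to an EAL PCI whitelist entry
(the BDF below is just an illustration):

testpmd -c 0x2 -n 4 --socket-mem 1024,1024 -w 06:00.3,vdpa=1 -- -i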

Container per device
====================
vDPA needs to create a separate container for each device, so this patch set
adds some APIs in eal/vfio to support multiple containers, e.g.
- rte_vfio_create_container
- rte_vfio_destroy_container
- rte_vfio_bind_group
- rte_vfio_unbind_group

With this extension, a device can be put into a new, specific container,
rather than the previous default container.
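
For illustration, a rough usage sketch with the APIs listed above (exact
signatures assumed, error handling omitted):

int container_fd, group_fd;

/* create a new container and bind the device's IOMMU group to it */
container_fd = rte_vfio_create_container();
group_fd = rte_vfio_bind_group(container_fd, iommu_group_no);

/* ... program the IOMMU for this container, set up the device ... */

/* teardown */
rte_vfio_unbind_group(container_fd, iommu_group_no);
rte_vfio_destroy_container(container_fd);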

IFCVF vDPA details
==================
Key vDPA driver ops implemented:
- ifcvf_dev_config:
  Enable the VF data path with the virtio information provided by the vhost
  lib, including: IOMMU programming to enable VF DMA to the VM's memory,
  VFIO interrupt setup to route HW interrupts to the virtio driver, creation
  of a notify relay thread to translate the virtio driver's kicks into MMIO
  writes onto the HW (see the sketch after this list), and HW queue
  configuration.

  This function gets called to set up the HW data path backend when the
  virtio driver in the VM gets ready.

- ifcvf_dev_close:
  Revoke all the setup done in ifcvf_dev_config.

  This function gets called when the virtio driver stops the device in the VM.
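
A conceptual sketch of the notify relay loop mentioned above (simplified;
epfd, kickfd[] and notify_addr[] are assumed to be an epoll fd over the
vrings' kick eventfds and the per-queue doorbell addresses mapped from the
VF's BAR - this is not the actual driver code):

#include <sys/epoll.h>
#include <unistd.h>

static void *
notify_relay(void *arg __rte_unused)
{
	struct epoll_event events[IFCVF_MAX_QUEUES];
	uint64_t buf;
	int i, nfds, qid;

	for (;;) {
		nfds = epoll_wait(epfd, events, IFCVF_MAX_QUEUES, -1);
		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			/* drain the virtio driver's kick... */
			if (read(kickfd[qid], &buf, sizeof(buf)) < 0)
				continue;
			/* ...and relay it as an MMIO doorbell write */
			*(volatile uint16_t *)notify_addr[qid] = qid;
		}
	}
	return NULL;
}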

Change log
==========
v8:
- Rebase on HEAD.
- Move vfio_group definition back to eal_vfio.h.
- Return NULL when the vfio group num/fd is not found, and let the caller
  handle that.
- Fix a wrong API name in the commit log.
- Rename the bind/unbind functions to rte_vfio_container_group_bind/unbind
  for consistency.
- Add notes for rte_vfio_container_create and rte_vfio_dma_map and fix a
  typo in a comment.
- Extract the code shared by rte_vfio_dma_map and rte_vfio_container_dma_map
  to avoid duplication; do the same for the unmap.

v7:
- Rebase on HEAD.
- Split the vfio patch into 2 parts, one for data structure extension, one for
  adding new API.
- Use a static vfio_config array instead of dynamic allocation.
- Change rte_vfio_container_dma_map/unmap's parameters to use (va, iova, len).

v6:
- Rebase on master branch.
- Document "vdpa" devarg in virtio documentation.
- Rename ifcvf config option to CONFIG_RTE_LIBRTE_IFCVF_VDPA_PMD for
  consistensy, and add it into driver documentation.
- Add comments for ifcvf device ID.
- Minor code cleaning.

v5:
- Fix compilation on BSD, remove the rte_vfio.h include on BSD.

v4:
- Rebase on Zhihong's latest vDPA lib patch, with the vDPA ops name changes.
- Remove the API "rte_vfio_get_group_fd"; "rte_vfio_bind_group" will return
  the fd.
- Align the naming of the internal vfio_cfg search APIs.

v3:
- Add doc and release note for the new driver.
- Remove the vdev concept and make the driver a PCI driver; it will get
  probed by the PCI bus driver.
- Rebase on the v4 vDPA lib patch, registering a vDPA device instead of an
  engine.
- Remove the PCI API exposure accordingly.
- Move the MAX_VFIO_CONTAINERS definition to the config file.
- Let the virtio PMD skip a virtio device that needs to work in vDPA mode.

v2:
- Rename the function pci_get_kernel_driver_by_path to
  rte_pci_device_kdriver_name to make the API generic across Linux and BSD,
  and mark it as EXPERIMENTAL.
- Rebase on Zhihong's vDPA v3 patch set.
- Minor code cleanup on the vfio extension.


Xiao Wang (5):
  vfio: extend data structure for multi container
  vfio: add multi container support
  net/virtio: skip device probe in vdpa mode
  net/ifcvf: add ifcvf vdpa driver
  doc: add ifcvf driver document and release note

 config/common_base                       |   8 +
 config/common_linuxapp                   |   1 +
 doc/guides/nics/features/ifcvf.ini       |   8 +
 doc/guides/nics/ifcvf.rst                |  98 ++++
 doc/guides/nics/index.rst                |   1 +
 doc/guides/nics/virtio.rst               |  13 +
 doc/guides/rel_notes/release_18_05.rst   |   9 +
 drivers/net/Makefile                     |   3 +
 drivers/net/ifc/Makefile                 |  35 ++
 drivers/net/ifc/base/ifcvf.c             | 329 ++++++++++++
 drivers/net/ifc/base/ifcvf.h             | 160 ++++++
 drivers/net/ifc/base/ifcvf_osdep.h       |  52 ++
 drivers/net/ifc/ifcvf_vdpa.c             | 842 +++++++++++++++++++++++++++++++
 drivers/net/ifc/rte_ifcvf_version.map    |   4 +
 drivers/net/virtio/virtio_ethdev.c       |  43 ++
 lib/librte_eal/bsdapp/eal/eal.c          |  52 ++
 lib/librte_eal/common/include/rte_vfio.h | 128 ++++-
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 681 +++++++++++++++++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h   |   9 +-
 lib/librte_eal/rte_eal_version.map       |   6 +
 mk/rte.app.mk                            |   3 +
 21 files changed, 2329 insertions(+), 156 deletions(-)
 create mode 100644 doc/guides/nics/features/ifcvf.ini
 create mode 100644 doc/guides/nics/ifcvf.rst
 create mode 100644 drivers/net/ifc/Makefile
 create mode 100644 drivers/net/ifc/base/ifcvf.c
 create mode 100644 drivers/net/ifc/base/ifcvf.h
 create mode 100644 drivers/net/ifc/base/ifcvf_osdep.h
 create mode 100644 drivers/net/ifc/ifcvf_vdpa.c
 create mode 100644 drivers/net/ifc/rte_ifcvf_version.map
  
Ferruh Yigit April 16, 2018, 4:36 p.m. UTC | #4
On 4/16/2018 4:34 PM, Xiao Wang wrote:
<...>
> 
> Xiao Wang (5):
>   vfio: extend data structure for multi container
>   vfio: add multi container support
>   net/virtio: skip device probe in vdpa mode
>   net/ifcvf: add ifcvf vdpa driver
>   doc: add ifcvf driver document and release note


Hi Xiao,

Getting the following build errors for 32-bit [1], can you please check them?

[1]
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c: In function ‘ifcvf_dma_map’:
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:24:3: error: format ‘%lx’ expects argument
of type ‘long unsigned int’, but argument 6 has type ‘uint64_t {aka long long
unsigned int}’ [-Werror=format=]
   "%s(): " fmt "\n", __func__, ##args)
   ^
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:178:22:
    "size 0x%lx.", i, reg->host_user_addr,
                      ~~~~~~
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:177:3: note: in expansion of macro ‘DRV_LOG’
   DRV_LOG(INFO, "region %u: HVA 0x%lx, GPA 0x%lx, "
   ^~~~~~~
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:177:37: note: format string is defined here
   DRV_LOG(INFO, "region %u: HVA 0x%lx, GPA 0x%lx, "
                                   ~~^
                                   %llx
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:24:3: error: format ‘%lx’ expects argument
of type ‘long unsigned int’, but argument 7 has type ‘uint64_t {aka long long
unsigned int}’ [-Werror=format=]
   "%s(): " fmt "\n", __func__, ##args)
   ^
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:179:4:
    reg->guest_phys_addr, reg->size);
    ~~~~~~
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:177:3: note: in expansion of macro ‘DRV_LOG’
   DRV_LOG(INFO, "region %u: HVA 0x%lx, GPA 0x%lx, "
   ^~~~~~~
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:177:48: note: format string is defined here
   DRV_LOG(INFO, "region %u: HVA 0x%lx, GPA 0x%lx, "
                                              ~~^
                                              %llx
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:24:3: error: format ‘%lx’ expects argument
of type ‘long unsigned int’, but argument 8 has type ‘uint64_t {aka long long
unsigned int}’ [-Werror=format=]
   "%s(): " fmt "\n", __func__, ##args)
   ^
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:179:26:
    reg->guest_phys_addr, reg->size);
                          ~~~~~~
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:177:3: note: in expansion of macro ‘DRV_LOG’
   DRV_LOG(INFO, "region %u: HVA 0x%lx, GPA 0x%lx, "
   ^~~~~~~
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:178:14: note: format string is defined here
    "size 0x%lx.", i, reg->host_user_addr,
            ~~^
            %llx
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c: In function ‘vdpa_ifcvf_start’:
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:266:39: error: cast from pointer to
integer of different size [-Werror=pointer-to-int-cast]
   hw->vring[i].desc = qva_to_gpa(vid, (uint64_t)vq.desc);
                                       ^
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:267:40: error: cast from pointer to
integer of different size [-Werror=pointer-to-int-cast]
   hw->vring[i].avail = qva_to_gpa(vid, (uint64_t)vq.avail);
                                        ^
.../dpdk/drivers/net/ifc/ifcvf_vdpa.c:268:39: error: cast from pointer to
integer of different size [-Werror=pointer-to-int-cast]
   hw->vring[i].used = qva_to_gpa(vid, (uint64_t)vq.used);
                                       ^
  
Thomas Monjalon April 16, 2018, 6:07 p.m. UTC | #5
16/04/2018 18:36, Ferruh Yigit:
> Hi Xiao,
> 
> Getting the following build errors for 32-bit [1], can you please check them?
> 
> [1]
> .../dpdk/drivers/net/ifc/ifcvf_vdpa.c: In function ‘ifcvf_dma_map’:
> .../dpdk/drivers/net/ifc/ifcvf_vdpa.c:24:3: error: format ‘%lx’ expects argument
> of type ‘long unsigned int’, but argument 6 has type ‘uint64_t {aka long long
> unsigned int}’ [-Werror=format=]

Reminder from this recent post:
	http://dpdk.org/ml/archives/dev/2018-February/090882.html
"
Most of the times, using %l is wrong (except when printing a long).
So next time you write %l, please think "I am probably wrong".
"
  
Xiao Wang April 17, 2018, 5:36 a.m. UTC | #6
Thanks for the reminder. Will fix it.
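
For the record, a sketch of the portable form (based on the DRV_LOG call
shown in the error log above; not the final patch):

#include <inttypes.h>

DRV_LOG(INFO, "region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64 ", "
	"size 0x%" PRIx64 ".", i, reg->host_user_addr,
	reg->guest_phys_addr, reg->size);

/* and for the pointer-to-integer casts, widen via uintptr_t */
hw->vring[i].desc = qva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);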

BRs,
Xiao

> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Tuesday, April 17, 2018 2:07 AM
> To: Wang, Xiao W <xiao.w.wang@intel.com>
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; Burakov, Anatoly
> <anatoly.burakov@intel.com>; dev@dpdk.org; maxime.coquelin@redhat.com;
> Wang, Zhihong <zhihong.wang@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> Tan, Jianfeng <jianfeng.tan@intel.com>; Liang, Cunming
> <cunming.liang@intel.com>; Daly, Dan <dan.daly@intel.com>
> Subject: Re: [PATCH v8 0/5] add ifcvf vdpa driver
> 
> 16/04/2018 18:36, Ferruh Yigit:
> > Hi Xiao,
> >
> > Getting the following build errors for 32-bit [1], can you please check them?
> >
> > [1]
> > .../dpdk/drivers/net/ifc/ifcvf_vdpa.c: In function ‘ifcvf_dma_map’:
> > .../dpdk/drivers/net/ifc/ifcvf_vdpa.c:24:3: error: format ‘%lx’ expects argument
> > of type ‘long unsigned int’, but argument 6 has type ‘uint64_t {aka long long
> > unsigned int}’ [-Werror=format=]
> 
> Reminder from this recent post:
> 	http://dpdk.org/ml/archives/dev/2018-February/090882.html
> "
> Most of the times, using %l is wrong (except when printing a long).
> So next time you write %l, please think "I am probably wrong".
> "
  

Patch

diff --git a/config/common_base b/config/common_base
index c4236fd1f..4a76d2f14 100644
--- a/config/common_base
+++ b/config/common_base
@@ -87,6 +87,7 @@  CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
 CONFIG_RTE_EAL_IGB_UIO=n
 CONFIG_RTE_EAL_VFIO=n
 CONFIG_RTE_MAX_VFIO_GROUPS=64
+CONFIG_RTE_MAX_VFIO_CONTAINERS=64
 CONFIG_RTE_MALLOC_DEBUG=n
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
 CONFIG_RTE_USE_LIBBSD=n
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 589d7d478..46fba2d8d 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -22,8 +22,46 @@ 
 
 #define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
 
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+	int group_no;
+	int fd;
+	int devices;
+};
+
+/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
+ * recreate the mappings for DPDK segments, but we cannot do so for memory that
+ * was registered by the user themselves, so we need to store the user mappings
+ * somewhere, to recreate them later.
+ */
+#define VFIO_MAX_USER_MEM_MAPS 256
+struct user_mem_map {
+	uint64_t addr;
+	uint64_t iova;
+	uint64_t len;
+};
+
+struct user_mem_maps {
+	rte_spinlock_recursive_t lock;
+	int n_maps;
+	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
+};
+
+struct vfio_config {
+	int vfio_enabled;
+	int vfio_container_fd;
+	int vfio_active_groups;
+	const struct vfio_iommu_type *vfio_iommu_type;
+	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+	struct user_mem_maps mem_maps;
+};
+
 /* per-process VFIO config */
-static struct vfio_config vfio_cfg;
+static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
+static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
 
 static int vfio_type1_dma_map(int);
 static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
@@ -31,8 +69,8 @@  static int vfio_spapr_dma_map(int);
 static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_noiommu_dma_map(int);
 static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len,
-		int do_map);
+static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
+		uint64_t iova, uint64_t len, int do_map);
 
 /* IOMMU types we support */
 static const struct vfio_iommu_type iommu_types[] = {
@@ -59,25 +97,6 @@  static const struct vfio_iommu_type iommu_types[] = {
 	},
 };
 
-/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
- * recreate the mappings for DPDK segments, but we cannot do so for memory that
- * was registered by the user themselves, so we need to store the user mappings
- * somewhere, to recreate them later.
- */
-#define VFIO_MAX_USER_MEM_MAPS 256
-struct user_mem_map {
-	uint64_t addr;
-	uint64_t iova;
-	uint64_t len;
-};
-static struct {
-	rte_spinlock_recursive_t lock;
-	int n_maps;
-	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
-} user_mem_maps = {
-	.lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER
-};
-
 /* for sPAPR IOMMU, we will need to walk memseg list, but we cannot use
  * rte_memseg_walk() because by the time we enter callback we will be holding a
  * write lock, so regular rte-memseg_walk will deadlock. copying the same
@@ -206,14 +225,15 @@  merge_map(struct user_mem_map *left, struct user_mem_map *right)
 }
 
 static struct user_mem_map *
-find_user_mem_map(uint64_t addr, uint64_t iova, uint64_t len)
+find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
+		uint64_t iova, uint64_t len)
 {
 	uint64_t va_end = addr + len;
 	uint64_t iova_end = iova + len;
 	int i;
 
-	for (i = 0; i < user_mem_maps.n_maps; i++) {
-		struct user_mem_map *map = &user_mem_maps.maps[i];
+	for (i = 0; i < user_mem_maps->n_maps; i++) {
+		struct user_mem_map *map = &user_mem_maps->maps[i];
 		uint64_t map_va_end = map->addr + map->len;
 		uint64_t map_iova_end = map->iova + map->len;
 
@@ -239,20 +259,20 @@  find_user_mem_map(uint64_t addr, uint64_t iova, uint64_t len)
 
 /* this will sort all user maps, and merge/compact any adjacent maps */
 static void
-compact_user_maps(void)
+compact_user_maps(struct user_mem_maps *user_mem_maps)
 {
 	int i, n_merged, cur_idx;
 
-	qsort(user_mem_maps.maps, user_mem_maps.n_maps,
-			sizeof(user_mem_maps.maps[0]), user_mem_map_cmp);
+	qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
 
 	/* we'll go over the list backwards when merging */
 	n_merged = 0;
-	for (i = user_mem_maps.n_maps - 2; i >= 0; i--) {
+	for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
 		struct user_mem_map *l, *r;
 
-		l = &user_mem_maps.maps[i];
-		r = &user_mem_maps.maps[i + 1];
+		l = &user_mem_maps->maps[i];
+		r = &user_mem_maps->maps[i + 1];
 
 		if (is_null_map(l) || is_null_map(r))
 			continue;
@@ -266,12 +286,12 @@  compact_user_maps(void)
 	 */
 	if (n_merged > 0) {
 		cur_idx = 0;
-		for (i = 0; i < user_mem_maps.n_maps; i++) {
-			if (!is_null_map(&user_mem_maps.maps[i])) {
+		for (i = 0; i < user_mem_maps->n_maps; i++) {
+			if (!is_null_map(&user_mem_maps->maps[i])) {
 				struct user_mem_map *src, *dst;
 
-				src = &user_mem_maps.maps[i];
-				dst = &user_mem_maps.maps[cur_idx++];
+				src = &user_mem_maps->maps[i];
+				dst = &user_mem_maps->maps[cur_idx++];
 
 				if (src != dst) {
 					memcpy(dst, src, sizeof(*src));
@@ -279,41 +299,16 @@  compact_user_maps(void)
 				}
 			}
 		}
-		user_mem_maps.n_maps = cur_idx;
+		user_mem_maps->n_maps = cur_idx;
 	}
 }
 
-int
-vfio_get_group_fd(int iommu_group_no)
+static int
+vfio_open_group_fd(int iommu_group_no)
 {
-	int i;
 	int vfio_group_fd;
 	char filename[PATH_MAX];
-	struct vfio_group *cur_grp;
-
-	/* check if we already have the group descriptor open */
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
-			return vfio_cfg.vfio_groups[i].fd;
-
-	/* Lets see first if there is room for a new group */
-	if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
-		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
-		return -1;
-	}
-
-	/* Now lets get an index for the new group */
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].group_no == -1) {
-			cur_grp = &vfio_cfg.vfio_groups[i];
-			break;
-		}
 
-	/* This should not happen */
-	if (i == VFIO_MAX_GROUPS) {
-		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
-		return -1;
-	}
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 		/* try regular group format */
@@ -343,9 +338,6 @@  vfio_get_group_fd(int iommu_group_no)
 			/* noiommu group found */
 		}
 
-		cur_grp->group_no = iommu_group_no;
-		cur_grp->fd = vfio_group_fd;
-		vfio_cfg.vfio_active_groups++;
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
@@ -380,9 +372,6 @@  vfio_get_group_fd(int iommu_group_no)
 			/* if we got the fd, store it and return it */
 			if (vfio_group_fd > 0) {
 				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
 				return vfio_group_fd;
 			}
 			/* fall-through on error */
@@ -392,56 +381,164 @@  vfio_get_group_fd(int iommu_group_no)
 			return -1;
 		}
 	}
-	return -1;
 }
 
+static struct vfio_config *
+get_vfio_cfg_by_group_no(int iommu_group_no)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfg = &vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+			if (vfio_cfg->vfio_groups[j].group_no ==
+					iommu_group_no)
+				return vfio_cfg;
+		}
+	}
+
+	return default_vfio_cfg;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfg = &vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++)
+			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+				return vfio_cfg;
+	}
 
-static int
-get_vfio_group_idx(int vfio_group_fd)
+	return default_vfio_cfg;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
 {
 	int i;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		if (vfio_cfgs[i].vfio_container_fd == container_fd)
+			return &vfio_cfgs[i];
+	}
+
+	return NULL;
+}
+
+int
+vfio_get_group_fd(int iommu_group_no)
+{
+	int i;
+	int vfio_group_fd;
+	struct vfio_group *cur_grp;
+	struct vfio_config *vfio_cfg;
+
+	/* get the vfio_config it belongs to */
+	vfio_cfg = get_vfio_cfg_by_group_no(iommu_group_no);
+
+	/* check if we already have the group descriptor open */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no == iommu_group_no)
+			return vfio_cfg->vfio_groups[i].fd;
+
+	/* Lets see first if there is room for a new group */
+	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Now lets get an index for the new group */
 	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd)
-			return i;
+		if (vfio_cfg->vfio_groups[i].group_no == -1) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_no);
+		return -1;
+	}
+
+	cur_grp->group_no = iommu_group_no;
+	cur_grp->fd = vfio_group_fd;
+	vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+static int
+get_vfio_group_idx(int vfio_group_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfg = &vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++)
+			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+				return j;
+	}
+
 	return -1;
 }
 
 static void
 vfio_group_device_get(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 	else
-		vfio_cfg.vfio_groups[i].devices++;
+		vfio_cfg->vfio_groups[i].devices++;
 }
 
 static void
 vfio_group_device_put(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 	else
-		vfio_cfg.vfio_groups[i].devices--;
+		vfio_cfg->vfio_groups[i].devices--;
 }
 
 static int
 vfio_group_device_count(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 		return -1;
 	}
 
-	return vfio_cfg.vfio_groups[i].devices;
+	return vfio_cfg->vfio_groups[i].devices;
 }
 
 static void
@@ -457,9 +554,11 @@  vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
 	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
 		uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
 		if (type == RTE_MEM_EVENT_ALLOC)
-			vfio_dma_mem_map(vfio_va, vfio_va, len, 1);
+			vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+					len, 1);
 		else
-			vfio_dma_mem_map(vfio_va, vfio_va, len, 0);
+			vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+					len, 0);
 		return;
 	}
 
@@ -467,9 +566,11 @@  vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
 	ms = rte_mem_virt2memseg(addr, msl);
 	while (cur_len < len) {
 		if (type == RTE_MEM_EVENT_ALLOC)
-			vfio_dma_mem_map(ms->addr_64, ms->iova, ms->len, 1);
+			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
+					ms->iova, ms->len, 1);
 		else
-			vfio_dma_mem_map(ms->addr_64, ms->iova, ms->len, 0);
+			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
+					ms->iova, ms->len, 0);
 
 		cur_len += ms->len;
 		++ms;
@@ -481,16 +582,19 @@  rte_vfio_clear_group(int vfio_group_fd)
 {
 	int i;
 	int socket_fd, ret;
+	struct vfio_config *vfio_cfg;
+
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
 
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
 		i = get_vfio_group_idx(vfio_group_fd);
 		if (i < 0)
 			return -1;
-		vfio_cfg.vfio_groups[i].group_no = -1;
-		vfio_cfg.vfio_groups[i].fd = -1;
-		vfio_cfg.vfio_groups[i].devices = 0;
-		vfio_cfg.vfio_active_groups--;
+		vfio_cfg->vfio_groups[i].group_no = -1;
+		vfio_cfg->vfio_groups[i].fd = -1;
+		vfio_cfg->vfio_groups[i].devices = 0;
+		vfio_cfg->vfio_active_groups--;
 		return 0;
 	}
 
@@ -543,6 +647,9 @@  rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 	struct vfio_group_status group_status = {
 			.argsz = sizeof(group_status)
 	};
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int vfio_container_fd;
 	int vfio_group_fd;
 	int iommu_group_no;
 	int i, ret;
@@ -591,12 +698,17 @@  rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		return -1;
 	}
 
+	/* get the vfio_config it belongs to */
+	vfio_cfg = get_vfio_cfg_by_group_no(iommu_group_no);
+	vfio_container_fd = vfio_cfg->vfio_container_fd;
+	user_mem_maps = &vfio_cfg->mem_maps;
+
 	/* check if group does not have a container yet */
 	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
 
 		/* add group to a container */
 		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
-				&vfio_cfg.vfio_container_fd);
+				&vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
 					"error %i (%s)\n", dev_addr, errno, strerror(errno));
@@ -614,11 +726,11 @@  rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		 * functionality.
 		 */
 		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_cfg.vfio_active_groups == 1) {
+				vfio_cfg->vfio_active_groups == 1) {
 			const struct vfio_iommu_type *t;
 
 			/* select an IOMMU type which we will be using */
-			t = vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+			t = vfio_set_iommu_type(vfio_container_fd);
 			if (!t) {
 				RTE_LOG(ERR, EAL,
 					"  %s failed to select IOMMU type\n",
@@ -631,7 +743,10 @@  rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 			 * after registering callback, to prevent races
 			 */
 			rte_rwlock_read_lock(mem_lock);
-			ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
+			if (vfio_cfg == default_vfio_cfg)
+				ret = t->dma_map_func(vfio_container_fd);
+			else
+				ret = 0;
 			if (ret) {
 				RTE_LOG(ERR, EAL,
 					"  %s DMA remapping failed, error %i (%s)\n",
@@ -642,22 +757,22 @@  rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 				return -1;
 			}
 
-			vfio_cfg.vfio_iommu_type = t;
+			vfio_cfg->vfio_iommu_type = t;
 
 			/* re-map all user-mapped segments */
-			rte_spinlock_recursive_lock(&user_mem_maps.lock);
+			rte_spinlock_recursive_lock(&user_mem_maps->lock);
 
 			/* this IOMMU type may not support DMA mapping, but
 			 * if we have mappings in the list - that means we have
 			 * previously mapped something successfully, so we can
 			 * be sure that DMA mapping is supported.
 			 */
-			for (i = 0; i < user_mem_maps.n_maps; i++) {
+			for (i = 0; i < user_mem_maps->n_maps; i++) {
 				struct user_mem_map *map;
-				map = &user_mem_maps.maps[i];
+				map = &user_mem_maps->maps[i];
 
 				ret = t->dma_user_map_func(
-						vfio_cfg.vfio_container_fd,
+						vfio_container_fd,
 						map->addr, map->iova, map->len,
 						1);
 				if (ret) {
@@ -668,17 +783,20 @@  rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 							map->addr, map->iova,
 							map->len);
 					rte_spinlock_recursive_unlock(
-							&user_mem_maps.lock);
+							&user_mem_maps->lock);
 					rte_rwlock_read_unlock(mem_lock);
 					return -1;
 				}
 			}
-			rte_spinlock_recursive_unlock(&user_mem_maps.lock);
+			rte_spinlock_recursive_unlock(&user_mem_maps->lock);
 
 			/* register callback for mem events */
-			ret = rte_mem_event_callback_register(
+			if (vfio_cfg == default_vfio_cfg)
+				ret = rte_mem_event_callback_register(
 					VFIO_MEM_EVENT_CLB_NAME,
 					vfio_mem_event_callback);
+			else
+				ret = 0;
 			/* unlock memory hotplug */
 			rte_rwlock_read_unlock(mem_lock);
 
@@ -732,6 +850,7 @@  rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 	struct vfio_group_status group_status = {
 			.argsz = sizeof(group_status)
 	};
+	struct vfio_config *vfio_cfg;
 	int vfio_group_fd;
 	int iommu_group_no;
 	int ret;
@@ -761,6 +880,9 @@  rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 		goto out;
 	}
 
+	/* get the vfio_config it belongs to */
+	vfio_cfg = get_vfio_cfg_by_group_no(iommu_group_no);
+
 	/* At this point we got an active group. Closing it will make the
 	 * container detachment. If this is the last active group, VFIO kernel
 	 * code will unset the container and the IOMMU mappings.
@@ -798,7 +920,7 @@  rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 	/* if there are no active device groups, unregister the callback to
 	 * avoid spurious attempts to map/unmap memory from VFIO.
 	 */
-	if (vfio_cfg.vfio_active_groups == 0)
+	if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0)
 		rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME);
 
 	/* success */
@@ -813,13 +935,21 @@  int
 rte_vfio_enable(const char *modname)
 {
 	/* initialize group list */
-	int i;
+	int i, j;
 	int vfio_available;
-
-	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
-		vfio_cfg.vfio_groups[i].fd = -1;
-		vfio_cfg.vfio_groups[i].group_no = -1;
-		vfio_cfg.vfio_groups[i].devices = 0;
+	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		vfio_cfgs[i].vfio_container_fd = -1;
+		vfio_cfgs[i].vfio_active_groups = 0;
+		vfio_cfgs[i].vfio_iommu_type = NULL;
+		vfio_cfgs[i].mem_maps.lock = lock;
+
+		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+			vfio_cfgs[i].vfio_groups[j].fd = -1;
+			vfio_cfgs[i].vfio_groups[j].group_no = -1;
+			vfio_cfgs[i].vfio_groups[j].devices = 0;
+		}
 	}
 
 	/* inform the user that we are probing for VFIO */
@@ -841,12 +971,12 @@  rte_vfio_enable(const char *modname)
 		return 0;
 	}
 
-	vfio_cfg.vfio_container_fd = vfio_get_container_fd();
+	default_vfio_cfg->vfio_container_fd = vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1) {
+	if (default_vfio_cfg->vfio_container_fd != -1) {
 		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
-		vfio_cfg.vfio_enabled = 1;
+		default_vfio_cfg->vfio_enabled = 1;
 	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
 	}
@@ -858,7 +988,7 @@  int
 rte_vfio_is_enabled(const char *modname)
 {
 	const int mod_available = rte_eal_check_module(modname) > 0;
-	return vfio_cfg.vfio_enabled && mod_available;
+	return default_vfio_cfg->vfio_enabled && mod_available;
 }
 
 const struct vfio_iommu_type *
@@ -1220,9 +1350,13 @@  vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 	struct vfio_iommu_spapr_tce_create create = {
 		.argsz = sizeof(create),
 	};
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
 	int i, ret = 0;
 
-	rte_spinlock_recursive_lock(&user_mem_maps.lock);
+	vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
 
 	/* check if window size needs to be adjusted */
 	memset(&param, 0, sizeof(param));
@@ -1235,9 +1369,9 @@  vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 	}
 
 	/* also check user maps */
-	for (i = 0; i < user_mem_maps.n_maps; i++) {
-		uint64_t max = user_mem_maps.maps[i].iova +
-				user_mem_maps.maps[i].len;
+	for (i = 0; i < user_mem_maps->n_maps; i++) {
+		uint64_t max = user_mem_maps->maps[i].iova +
+				user_mem_maps->maps[i].len;
 		create.window_size = RTE_MAX(create.window_size, max);
 	}
 
@@ -1263,9 +1397,9 @@  vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 				goto out;
 			}
 			/* remap all user maps */
-			for (i = 0; i < user_mem_maps.n_maps; i++) {
+			for (i = 0; i < user_mem_maps->n_maps; i++) {
 				struct user_mem_map *map =
-						&user_mem_maps.maps[i];
+						&user_mem_maps->maps[i];
 				if (vfio_spapr_dma_do_map(vfio_container_fd,
 						map->addr, map->iova, map->len,
 						1)) {
@@ -1306,7 +1440,7 @@  vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 		vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
 	}
 out:
-	rte_spinlock_recursive_unlock(&user_mem_maps.lock);
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
 	return ret;
 }
 
@@ -1358,9 +1492,10 @@  vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
 }
 
 static int
-vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map)
+vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
 {
-	const struct vfio_iommu_type *t = vfio_cfg.vfio_iommu_type;
+	const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
 
 	if (!t) {
 		RTE_LOG(ERR, EAL, "  VFIO support not initialized\n");
@@ -1376,7 +1511,7 @@  vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map)
 		return -1;
 	}
 
-	return t->dma_user_map_func(vfio_cfg.vfio_container_fd, vaddr, iova,
+	return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
 			len, do_map);
 }
 
@@ -1384,6 +1519,7 @@  int __rte_experimental
 rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
 {
 	struct user_mem_map *new_map;
+	struct user_mem_maps *user_mem_maps;
 	int ret = 0;
 
 	if (len == 0) {
@@ -1391,15 +1527,16 @@  rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
 		return -1;
 	}
 
-	rte_spinlock_recursive_lock(&user_mem_maps.lock);
-	if (user_mem_maps.n_maps == VFIO_MAX_USER_MEM_MAPS) {
+	user_mem_maps = &default_vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
 		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
 		rte_errno = ENOMEM;
 		ret = -1;
 		goto out;
 	}
 	/* map the entry */
-	if (vfio_dma_mem_map(vaddr, iova, len, 1)) {
+	if (vfio_dma_mem_map(default_vfio_cfg, vaddr, iova, len, 1)) {
 		/* technically, this will fail if there are currently no devices
 		 * plugged in, even if a device were added later, this mapping
 		 * might have succeeded. however, since we cannot verify if this
@@ -1412,14 +1549,14 @@  rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
 		goto out;
 	}
 	/* create new user mem map entry */
-	new_map = &user_mem_maps.maps[user_mem_maps.n_maps++];
+	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
 	new_map->addr = vaddr;
 	new_map->iova = iova;
 	new_map->len = len;
 
-	compact_user_maps();
+	compact_user_maps(user_mem_maps);
 out:
-	rte_spinlock_recursive_unlock(&user_mem_maps.lock);
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
 	return ret;
 }
 
@@ -1427,6 +1564,7 @@  int __rte_experimental
 rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
 {
 	struct user_mem_map *map, *new_map = NULL;
+	struct user_mem_maps *user_mem_maps;
 	int ret = 0;
 
 	if (len == 0) {
@@ -1434,10 +1572,11 @@  rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
 		return -1;
 	}
 
-	rte_spinlock_recursive_lock(&user_mem_maps.lock);
+	user_mem_maps = &default_vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
 
 	/* find our mapping */
-	map = find_user_mem_map(vaddr, iova, len);
+	map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
 	if (!map) {
 		RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
 		rte_errno = EINVAL;
@@ -1448,17 +1587,17 @@  rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
 		/* we're partially unmapping a previously mapped region, so we
 		 * need to split entry into two.
 		 */
-		if (user_mem_maps.n_maps == VFIO_MAX_USER_MEM_MAPS) {
+		if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
 			RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
 			rte_errno = ENOMEM;
 			ret = -1;
 			goto out;
 		}
-		new_map = &user_mem_maps.maps[user_mem_maps.n_maps++];
+		new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
 	}
 
 	/* unmap the entry */
-	if (vfio_dma_mem_map(vaddr, iova, len, 0)) {
+	if (vfio_dma_mem_map(default_vfio_cfg, vaddr, iova, len, 0)) {
 		/* there may not be any devices plugged in, so unmapping will
 		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
 		 * stop us from removing the mapping, as the assumption is we
@@ -1481,19 +1620,19 @@  rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
 
 		/* if we've created a new map by splitting, sort everything */
 		if (!is_null_map(new_map)) {
-			compact_user_maps();
+			compact_user_maps(user_mem_maps);
 		} else {
 			/* we've created a new mapping, but it was unused */
-			user_mem_maps.n_maps--;
+			user_mem_maps->n_maps--;
 		}
 	} else {
 		memset(map, 0, sizeof(*map));
-		compact_user_maps();
-		user_mem_maps.n_maps--;
+		compact_user_maps(user_mem_maps);
+		user_mem_maps->n_maps--;
 	}
 
 out:
-	rte_spinlock_recursive_unlock(&user_mem_maps.lock);
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
 	return ret;
 }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 549f4427e..e14d5be99 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,6 +88,7 @@  struct vfio_iommu_spapr_tce_info {
 #endif
 
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
+#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
 
 /*
  * Function prototypes for VFIO multiprocess sync functions
@@ -98,24 +99,6 @@  int vfio_mp_sync_send_fd(int socket, int fd);
 int vfio_mp_sync_receive_fd(int socket);
 int vfio_mp_sync_connect_to_primary(void);
 
-/*
- * we don't need to store device fd's anywhere since they can be obtained from
- * the group fd via an ioctl() call.
- */
-struct vfio_group {
-	int group_no;
-	int fd;
-	int devices;
-};
-
-struct vfio_config {
-	int vfio_enabled;
-	int vfio_container_fd;
-	int vfio_active_groups;
-	const struct vfio_iommu_type *vfio_iommu_type;
-	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
-};
-
 /* DMA mapping function prototype.
  * Takes VFIO container fd as a parameter.
  * Returns 0 on success, -1 on error.