[dpdk-dev,v7,2/5] vfio: add multi container support

Message ID 20180415153349.62105-3-xiao.w.wang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit

Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  fail      apply patch file failure

Commit Message

Xiao Wang April 15, 2018, 3:33 p.m. UTC
This patch adds APIs to support container create/destroy and device
bind/unbind with a container. It also provides an API for IOMMU programming
on a specified container.

A driver could use "rte_vfio_create_container" helper to create a
new container from eal, use "rte_vfio_bind_group" to bind a device
to the newly created container. During rte_vfio_setup_device the
container bound with the device will be used for IOMMU setup.
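For illustration, the intended flow looks like this (sketch only; the
group number and the vaddr/iova/len arguments are placeholders, and
error handling is elided):

    int iommu_group_no = 42;    /* e.g. parsed from sysfs */
    int cfd, gfd;

    cfd = rte_vfio_container_create();
    gfd = rte_vfio_bind_group(cfd, iommu_group_no);
    /* rte_vfio_setup_device() will now use the bound container */
    /* new containers start with no IOMMU mappings; map explicitly: */
    rte_vfio_container_dma_map(cfd, vaddr, iova, len);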

Signed-off-by: Junjie Chen <junjie.j.chen@intel.com>
Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 lib/librte_eal/bsdapp/eal/eal.c          |  52 +++++
 lib/librte_eal/common/include/rte_vfio.h | 119 ++++++++++++
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 316 +++++++++++++++++++++++++++++++
 lib/librte_eal/rte_eal_version.map       |   6 +
 4 files changed, 493 insertions(+)
  

Comments

Anatoly Burakov April 16, 2018, 10:03 a.m. UTC | #1
On 15-Apr-18 4:33 PM, Xiao Wang wrote:
> This patch adds APIs to support container create/destroy and device
> bind/unbind with a container. It also provides an API for IOMMU programming
> on a specified container.
> 
> A driver could use "rte_vfio_create_container" helper to create a

^^ wrong API name in commit message :)

> new container from eal, use "rte_vfio_bind_group" to bind a device
> to the newly created container. During rte_vfio_setup_device the
> container bound with the device will be used for IOMMU setup.
> 
> Signed-off-by: Junjie Chen <junjie.j.chen@intel.com>
> Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
> ---
>   lib/librte_eal/bsdapp/eal/eal.c          |  52 +++++
>   lib/librte_eal/common/include/rte_vfio.h | 119 ++++++++++++
>   lib/librte_eal/linuxapp/eal/eal_vfio.c   | 316 +++++++++++++++++++++++++++++++
>   lib/librte_eal/rte_eal_version.map       |   6 +
>   4 files changed, 493 insertions(+)
> 
> diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
> index 727adc5d2..c5106d0d6 100644
> --- a/lib/librte_eal/bsdapp/eal/eal.c
> +++ b/lib/librte_eal/bsdapp/eal/eal.c
> @@ -769,6 +769,14 @@ int rte_vfio_noiommu_is_enabled(void);
>   int rte_vfio_clear_group(int vfio_group_fd);
>   int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
>   int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
> +int rte_vfio_container_create(void);
> +int rte_vfio_container_destroy(int container_fd);
> +int rte_vfio_bind_group(int container_fd, int iommu_group_no);
> +int rte_vfio_unbind_group(int container_fd, int iommu_group_no);

Maybe have these under "container" too? e.g. 
rte_vfio_container_group_bind/unbind? Seems like it would be more 
consistent that way - anything to do with custom containers would be 
under rte_vfio_container_* namespace.
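I.e., hypothetical prototypes under that naming:

    int rte_vfio_container_group_bind(int container_fd, int iommu_group_no);
    int rte_vfio_container_group_unbind(int container_fd, int iommu_group_no);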

> +int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
> +		uint64_t iova, uint64_t len);
> +int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
> +		uint64_t iova, uint64_t len);
>   
>   int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
>   		      __rte_unused const char *dev_addr,
> @@ -818,3 +826,47 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
>   {
>   	return -1;
>   }
> +

<...>

> diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
> index d26ab01cb..0c1509b29 100644
> --- a/lib/librte_eal/common/include/rte_vfio.h
> +++ b/lib/librte_eal/common/include/rte_vfio.h
> @@ -168,6 +168,125 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
>   int __rte_experimental
>   rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
>   
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Create a new container for device binding.

I would add a note that any newly allocated DPDK memory will not be 
mapped into these containers by default.
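E.g., a caller would be expected to do something along these lines
(sketch; rte_malloc_virt2iova() used here just to look up the IOVA):

    void *va = rte_malloc(NULL, len, 0);
    rte_iova_t iova = rte_malloc_virt2iova(va);

    /* new containers start empty - map the buffer explicitly */
    rte_vfio_container_dma_map(container_fd, (uint64_t)va, iova, len);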

> + *
> + * @return
> + *   the container fd if successful
> + *   <0 if failed
> + */
> +int __rte_experimental
> +rte_vfio_container_create(void);
> +

<...>

> + *    0 if successful
> + *   <0 if failed
> + */
> +int __rte_experimental
> +rte_vfio_unbind_group(int container_fd, int iommu_group_no);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Perform dma mapping for devices in a conainer.

Here and in other places: "dma" should be DMA, and typo: "conainer" :)

I think you should also add a note to the original API (not this one, 
but the old one) that DMA maps done via that API will only apply to 
default container and will not apply to any of the containers created 
via container_create(). IOW, documentation should make it clear that if 
you use this functionality, you're on your own and you have to manage 
your own DMA mappings for any containers you create.
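I.e., the two calls would end up targeting different containers:

    rte_vfio_dma_map(vaddr, iova, len);               /* default container only */
    rte_vfio_container_dma_map(cfd, vaddr, iova, len); /* container cfd only */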

> + *
> + * @param container_fd
> + *   the specified container fd
> + *
> + * @param vaddr
> + *   Starting virtual address of memory to be mapped.
> + *

<...>

> +
> +int __rte_experimental
> +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
> +		uint64_t len)
> +{
> +	struct user_mem_map *new_map;
> +	struct vfio_config *vfio_cfg;
> +	struct user_mem_maps *user_mem_maps;
> +	int ret = 0;
> +
> +	if (len == 0) {
> +		rte_errno = EINVAL;
> +		return -1;
> +	}
> +
> +	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
> +	if (vfio_cfg == NULL) {
> +		RTE_LOG(ERR, EAL, "Invalid container fd\n");
> +		return -1;
> +	}
> +
> +	user_mem_maps = &vfio_cfg->mem_maps;
> +	rte_spinlock_recursive_lock(&user_mem_maps->lock);
> +	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
> +		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
> +		rte_errno = ENOMEM;
> +		ret = -1;
> +		goto out;
> +	}
> +	/* map the entry */
> +	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
> +		/* technically, this will fail if there are currently no devices
> +		 * plugged in, even if a device were added later, this mapping
> +		 * might have succeeded. however, since we cannot verify if this
> +		 * is a valid mapping without having a device attached, consider
> +		 * this to be unsupported, because we can't just store any old
> +		 * mapping and pollute list of active mappings willy-nilly.
> +		 */
> +		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
> +		ret = -1;
> +		goto out;
> +	}
> +	/* create new user mem map entry */
> +	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
> +	new_map->addr = vaddr;
> +	new_map->iova = iova;
> +	new_map->len = len;
> +
> +	compact_user_maps(user_mem_maps);
> +out:
> +	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
> +	return ret;

Please correct me if i'm wrong, but it looks like you've just duplicated 
the code for rte_vfio_dma_map() here and made a few small changes. It 
would be better if you moved most of this into a static function (e.g. 
static int container_dma_map(vfio_cfg, vaddr, iova, len)) and called it 
with either default vfio_cfg from rte_vfio_dma_map, or found vfio_cfg 
from rte_vfio_container_dma_map. Same applies to function below.
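Roughly (sketch only - "default_vfio_cfg" stands in for whatever the
default config pointer is called):

    static int
    container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
            uint64_t iova, uint64_t len)
    {
        /* common body: take the user_mem_maps lock, check map count,
         * call vfio_dma_mem_map(vfio_cfg, ...), record the entry */
    }

    int __rte_experimental
    rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
    {
        return container_dma_map(default_vfio_cfg, vaddr, iova, len);
    }

    int __rte_experimental
    rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
            uint64_t iova, uint64_t len)
    {
        struct vfio_config *vfio_cfg;

        vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
        if (vfio_cfg == NULL) {
            RTE_LOG(ERR, EAL, "Invalid container fd\n");
            return -1;
        }
        return container_dma_map(vfio_cfg, vaddr, iova, len);
    }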

> +}
> +
> +int __rte_experimental
> +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
> +		uint64_t len)
> +{
> +	struct user_mem_map *map, *new_map = NULL;
> +	struct vfio_config *vfio_cfg;
> +	struct user_mem_maps *user_mem_maps;
> +	int ret = 0;
> +
> +	if (len == 0) {
> +		rte_errno = EINVAL;
> +		return -1;
> +	}
> +

<...>
  
Xiao Wang April 16, 2018, 12:44 p.m. UTC | #2
Hi Anatoly,

> -----Original Message-----
> From: Burakov, Anatoly
> Sent: Monday, April 16, 2018 6:03 PM
> To: Wang, Xiao W <xiao.w.wang@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com; Wang, Zhihong <zhihong.wang@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Tan, Jianfeng <jianfeng.tan@intel.com>; Liang, Cunming <cunming.liang@intel.com>; Daly, Dan <dan.daly@intel.com>; thomas@monjalon.net; Chen, Junjie J <junjie.j.chen@intel.com>
> Subject: Re: [PATCH v7 2/5] vfio: add multi container support
>
> On 15-Apr-18 4:33 PM, Xiao Wang wrote:
> > This patch adds APIs to support container create/destroy and device
> > bind/unbind with a container. It also provides an API for IOMMU programming
> > on a specified container.
> >
> > A driver could use "rte_vfio_create_container" helper to create a
>
> ^^ wrong API name in commit message :)

Thanks for the catch. Will fix it.

> > new container from eal, use "rte_vfio_bind_group" to bind a device
> > to the newly created container. During rte_vfio_setup_device the
> > container bound with the device will be used for IOMMU setup.
> >
> > Signed-off-by: Junjie Chen <junjie.j.chen@intel.com>
> > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
> > ---
> >   lib/librte_eal/bsdapp/eal/eal.c          |  52 +++++
> >   lib/librte_eal/common/include/rte_vfio.h | 119 ++++++++++++
> >   lib/librte_eal/linuxapp/eal/eal_vfio.c   | 316 +++++++++++++++++++++++++++++++
> >   lib/librte_eal/rte_eal_version.map       |   6 +
> >   4 files changed, 493 insertions(+)
> >
> > diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
> > index 727adc5d2..c5106d0d6 100644
> > --- a/lib/librte_eal/bsdapp/eal/eal.c
> > +++ b/lib/librte_eal/bsdapp/eal/eal.c
> > @@ -769,6 +769,14 @@ int rte_vfio_noiommu_is_enabled(void);
> >   int rte_vfio_clear_group(int vfio_group_fd);
> >   int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
> >   int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
> > +int rte_vfio_container_create(void);
> > +int rte_vfio_container_destroy(int container_fd);
> > +int rte_vfio_bind_group(int container_fd, int iommu_group_no);
> > +int rte_vfio_unbind_group(int container_fd, int iommu_group_no);
>
> Maybe have these under "container" too? e.g.
> rte_vfio_container_group_bind/unbind? Seems like it would be more
> consistent that way - anything to do with custom containers would be
> under rte_vfio_container_* namespace.

Agree.

> > +int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
> > +		uint64_t iova, uint64_t len);
> > +int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
> > +		uint64_t iova, uint64_t len);
> >
> >   int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
> >   		      __rte_unused const char *dev_addr,
> > @@ -818,3 +826,47 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
> >   {
> >   	return -1;
> >   }
> > +
>
> <...>
>
> > diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
> > index d26ab01cb..0c1509b29 100644
> > --- a/lib/librte_eal/common/include/rte_vfio.h
> > +++ b/lib/librte_eal/common/include/rte_vfio.h
> > @@ -168,6 +168,125 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
> >   int __rte_experimental
> >   rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> > + *
> > + * Create a new container for device binding.
>
> I would add a note that any newly allocated DPDK memory will not be
> mapped into these containers by default.

Will add it.

> > + *
> > + * @return
> > + *   the container fd if successful
> > + *   <0 if failed
> > + */
> > +int __rte_experimental
> > +rte_vfio_container_create(void);
> > +
>
> <...>
>
> > + *    0 if successful
> > + *   <0 if failed
> > + */
> > +int __rte_experimental
> > +rte_vfio_unbind_group(int container_fd, int iommu_group_no);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> > + *
> > + * Perform dma mapping for devices in a conainer.
>
> Here and in other places: "dma" should be DMA, and typo: "conainer" :)
>
> I think you should also add a note to the original API (not this one,
> but the old one) that DMA maps done via that API will only apply to
> default container and will not apply to any of the containers created
> via container_create(). IOW, documentation should make it clear that if
> you use this functionality, you're on your own and you have to manage
> your own DMA mappings for any containers you create.

OK, will add note to clearly describe it.

> > + *
> > + * @param container_fd
> > + *   the specified container fd
> > + *
> > + * @param vaddr
> > + *   Starting virtual address of memory to be mapped.
> > + *
>
> <...>
>
> > +
> > +int __rte_experimental
> > +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
> > +		uint64_t len)
> > +{
> > +	struct user_mem_map *new_map;
> > +	struct vfio_config *vfio_cfg;
> > +	struct user_mem_maps *user_mem_maps;
> > +	int ret = 0;
> > +
> > +	if (len == 0) {
> > +		rte_errno = EINVAL;
> > +		return -1;
> > +	}
> > +
> > +	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
> > +	if (vfio_cfg == NULL) {
> > +		RTE_LOG(ERR, EAL, "Invalid container fd\n");
> > +		return -1;
> > +	}
> > +
> > +	user_mem_maps = &vfio_cfg->mem_maps;
> > +	rte_spinlock_recursive_lock(&user_mem_maps->lock);
> > +	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
> > +		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
> > +		rte_errno = ENOMEM;
> > +		ret = -1;
> > +		goto out;
> > +	}
> > +	/* map the entry */
> > +	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
> > +		/* technically, this will fail if there are currently no devices
> > +		 * plugged in, even if a device were added later, this mapping
> > +		 * might have succeeded. however, since we cannot verify if this
> > +		 * is a valid mapping without having a device attached, consider
> > +		 * this to be unsupported, because we can't just store any old
> > +		 * mapping and pollute list of active mappings willy-nilly.
> > +		 */
> > +		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
> > +		ret = -1;
> > +		goto out;
> > +	}
> > +	/* create new user mem map entry */
> > +	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
> > +	new_map->addr = vaddr;
> > +	new_map->iova = iova;
> > +	new_map->len = len;
> > +
> > +	compact_user_maps(user_mem_maps);
> > +out:
> > +	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
> > +	return ret;
>
> Please correct me if i'm wrong, but it looks like you've just duplicated
> the code for rte_vfio_dma_map() here and made a few small changes. It
> would be better if you moved most of this into a static function (e.g.
> static int container_dma_map(vfio_cfg, vaddr, iova, len)) and called it
> with either default vfio_cfg from rte_vfio_dma_map, or found vfio_cfg
> from rte_vfio_container_dma_map. Same applies to function below.

Agree, will do it in v8.

BRs,
Xiao

> > +}
> > +
> > +int __rte_experimental
> > +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
> > +		uint64_t len)
> > +{
> > +	struct user_mem_map *map, *new_map = NULL;
> > +	struct vfio_config *vfio_cfg;
> > +	struct user_mem_maps *user_mem_maps;
> > +	int ret = 0;
> > +
> > +	if (len == 0) {
> > +		rte_errno = EINVAL;
> > +		return -1;
> > +	}
> > +
>
> <...>
>
> --
> Thanks,
> Anatoly
  

Patch

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 727adc5d2..c5106d0d6 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -769,6 +769,14 @@ int rte_vfio_noiommu_is_enabled(void);
 int rte_vfio_clear_group(int vfio_group_fd);
 int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
 int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_container_create(void);
+int rte_vfio_container_destroy(int container_fd);
+int rte_vfio_bind_group(int container_fd, int iommu_group_no);
+int rte_vfio_unbind_group(int container_fd, int iommu_group_no);
+int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
+int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
 
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
 		      __rte_unused const char *dev_addr,
@@ -818,3 +826,47 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
 {
 	return -1;
 }
+
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+			__rte_unused uint64_t vaddr,
+			__rte_unused uint64_t iova,
+			__rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+			__rte_unused uint64_t vaddr,
+			__rte_unused uint64_t iova,
+			__rte_unused uint64_t len)
+{
+	return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index d26ab01cb..0c1509b29 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -168,6 +168,125 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
 int __rte_experimental
 rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Create a new container for device binding.
+ *
+ * @return
+ *   the container fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_create(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Destroy the container, unbind all vfio groups within it.
+ *
+ * @param container_fd
+ *   the container fd to destroy
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Bind an IOMMU group to a container.
+ *
+ * @param container_fd
+ *   the container's fd
+ *
+ * @param iommu_group_no
+ *   the iommu_group_no to bind to container
+ *
+ * @return
+ *   group fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_bind_group(int container_fd, int iommu_group_no);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Unbind an IOMMU group from a container.
+ *
+ * @param container_fd
+ *   the container fd
+ *
+ * @param iommu_group_no
+ *   the iommu_group_no to delete from container
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_unbind_group(int container_fd, int iommu_group_no);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform dma mapping for devices in a conainer.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ *   Length of memory segment being mapped.
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform dma unmapping for devices in a conainer.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ *   Length of memory segment being unmapped.
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 46fba2d8d..2f566a621 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1668,6 +1668,278 @@ rte_vfio_noiommu_is_enabled(void)
 	return c == 'Y';
 }
 
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	int i;
+
+	/* Find an empty slot to store new vfio config */
+	for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+		if (vfio_cfgs[i].vfio_container_fd == -1)
+			break;
+	}
+
+	if (i == VFIO_MAX_CONTAINERS) {
+		RTE_LOG(ERR, EAL, "exceed max vfio container limit\n");
+		return -1;
+	}
+
+	vfio_cfgs[i].vfio_container_fd = vfio_get_container_fd();
+	if (vfio_cfgs[i].vfio_container_fd < 0) {
+		RTE_LOG(NOTICE, EAL, "fail to create a new container\n");
+		return -1;
+	}
+
+	return vfio_cfgs[i].vfio_container_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no != -1)
+			rte_vfio_unbind_group(container_fd,
+				vfio_cfg->vfio_groups[i].group_no);
+
+	close(container_fd);
+	vfio_cfg->vfio_container_fd = -1;
+	vfio_cfg->vfio_active_groups = 0;
+	vfio_cfg->vfio_iommu_type = NULL;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(int container_fd, int iommu_group_no)
+{
+	struct vfio_config *vfio_cfg;
+	struct vfio_group *cur_grp;
+	int vfio_group_fd;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	/* Check room for new group */
+	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no == -1) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_no);
+		return -1;
+	}
+	cur_grp->group_no = iommu_group_no;
+	cur_grp->fd = vfio_group_fd;
+	cur_grp->devices = 0;
+	vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(int container_fd, int iommu_group_no)
+{
+	struct vfio_config *vfio_cfg;
+	struct vfio_group *cur_grp;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+		if (vfio_cfg->vfio_groups[i].group_no == iommu_group_no) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+	}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Specified group number not found\n");
+		return -1;
+	}
+
+	if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+		RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+			" iommu_group_no %d\n", iommu_group_no);
+		return -1;
+	}
+	cur_grp->group_no = -1;
+	cur_grp->fd = -1;
+	cur_grp->devices = 0;
+	vfio_cfg->vfio_active_groups--;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct user_mem_map *new_map;
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int ret = 0;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
+		rte_errno = ENOMEM;
+		ret = -1;
+		goto out;
+	}
+	/* map the entry */
+	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
+		/* technically, this will fail if there are currently no devices
+		 * plugged in, even if a device were added later, this mapping
+		 * might have succeeded. however, since we cannot verify if this
+		 * is a valid mapping without having a device attached, consider
+		 * this to be unsupported, because we can't just store any old
+		 * mapping and pollute list of active mappings willy-nilly.
+		 */
+		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
+		ret = -1;
+		goto out;
+	}
+	/* create new user mem map entry */
+	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+	new_map->addr = vaddr;
+	new_map->iova = iova;
+	new_map->len = len;
+
+	compact_user_maps(user_mem_maps);
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct user_mem_map *map, *new_map = NULL;
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int ret = 0;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+	/* find our mapping */
+	map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
+	if (!map) {
+		RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
+		rte_errno = EINVAL;
+		ret = -1;
+		goto out;
+	}
+	if (map->addr != vaddr || map->iova != iova || map->len != len) {
+		/* we're partially unmapping a previously mapped region, so we
+		 * need to split entry into two.
+		 */
+		if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+			RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+			rte_errno = ENOMEM;
+			ret = -1;
+			goto out;
+		}
+		new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+	}
+
+	/* unmap the entry */
+	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
+		/* there may not be any devices plugged in, so unmapping will
+		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
+		 * stop us from removing the mapping, as the assumption is we
+		 * won't be needing this memory any more and thus will want to
+		 * prevent it from being remapped again on hotplug. so, only
+		 * fail if we indeed failed to unmap (e.g. if the mapping was
+		 * within our mapped range but had invalid alignment).
+		 */
+		if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
+			ret = -1;
+			goto out;
+		} else {
+			RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
+		}
+	}
+	/* remove map from the list of active mappings */
+	if (new_map != NULL) {
+		adjust_map(map, new_map, vaddr, len);
+
+		/* if we've created a new map by splitting, sort everything */
+		if (!is_null_map(new_map)) {
+			compact_user_maps(user_mem_maps);
+		} else {
+			/* we've created a new mapping, but it was unused */
+			user_mem_maps->n_maps--;
+		}
+	} else {
+		memset(map, 0, sizeof(*map));
+		compact_user_maps(user_mem_maps);
+		user_mem_maps->n_maps--;
+	}
+
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
 #else
 
 int __rte_experimental
@@ -1684,4 +1956,48 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
 	return -1;
 }
 
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+		__rte_unused uint64_t vaddr,
+		__rte_unused uint64_t iova,
+		__rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+		__rte_unused uint64_t vaddr,
+		__rte_unused uint64_t iova,
+		__rte_unused uint64_t len)
+{
+	return -1;
+}
+
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 2b5b1dcf5..c5eff065e 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -284,7 +284,13 @@ EXPERIMENTAL {
 	rte_service_start_with_defaults;
 	rte_socket_count;
 	rte_socket_id_by_idx;
+	rte_vfio_bind_group;
+	rte_vfio_container_create;
+	rte_vfio_container_destroy;
+	rte_vfio_container_dma_map;
+	rte_vfio_container_dma_unmap;
 	rte_vfio_dma_map;
 	rte_vfio_dma_unmap;
+	rte_vfio_unbind_group;
 
 } DPDK_18.02;
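
For reference, a minimal teardown sketch mirroring the setup flow in the
commit message (fds and placeholders as in the earlier sketch; error
handling elided). Note that rte_vfio_container_destroy() already unbinds
any groups still attached, so the explicit unbind is optional:

    rte_vfio_container_dma_unmap(cfd, vaddr, iova, len);
    rte_vfio_unbind_group(cfd, iommu_group_no);
    rte_vfio_container_destroy(cfd);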