> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Saturday, December 23, 2023 1:18 AM
> To: Xing, Beilei <beilei.xing@intel.com>
> Cc: Burakov, Anatoly <anatoly.burakov@intel.com>; dev@dpdk.org;
> thomas@monjalon.net; ferruh.yigit@amd.com; Richardson, Bruce
> <bruce.richardson@intel.com>; chenbox@nvidia.com; Cao, Yahui
> <yahui.cao@intel.com>
> Subject: Re: [PATCH 4/4] eal: add new args to choose VFIO mode
>
> On Fri, 22 Dec 2023 19:44:53 +0000
> beilei.xing@intel.com wrote:
>
> > From: Beilei Xing <beilei.xing@intel.com>
> >
> > Since now Linux has both of VFIO Container/GROUP & VFIO IOMMUFD/CDEV
> > support, user can determine how to probe the PCI device by the new
> > args "--vfio-mode".
> >
> > Use "--vfio-mode=container" to choose VFIO Container/GROUP, and use
> > "--vfio-mode=iommufd" to choose VFIO IOMMUFD/CDEV.
> >
> > Signed-off-by: Beilei Xing <beilei.xing@intel.com>
> > Signed-off-by: Yahui Cao <yahui.cao@intel.com>
>
> Can't this be automatic, users don't need more EAL options.
Thanks for your review. Since Linux currently supports both VFIO Container/GROUP
and VFIO IOMMUFD/CDEV, I think users should be able to choose which mode they
want. New IOMMU features (e.g. PASID/SSID) may only be available through the
VFIO IOMMUFD/CDEV interface. VFIO Container/GROUP may be deprecated in the
future, at which point DPDK will use iommufd mode automatically.
.
@@ -226,6 +226,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
struct rte_pci_device_internal *pdev;
struct rte_pci_device *dev;
char driver[PATH_MAX];
+ enum rte_vfio_mode vfio_mode;
int ret;
pdev = malloc(sizeof(*pdev));
@@ -317,6 +318,8 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
return -1;
}
+ vfio_mode = rte_eal_vfio_mode();
+
/* parse driver */
snprintf(filename, sizeof(filename), "%s/driver", dirname);
ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver));
@@ -327,8 +330,10 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
}
if (!ret) {
- if (!strcmp(driver, "vfio-pci"))
+ if (!strcmp(driver, "vfio-pci") && vfio_mode == RTE_VFIO_CONTAINER)
dev->kdrv = RTE_PCI_KDRV_VFIO;
+ else if (!strcmp(driver, "vfio-pci") && vfio_mode == RTE_VFIO_IOMMUFD)
+ dev->kdrv = RTE_PCI_KDRV_VFIO_IOMMUFD;
else if (!strcmp(driver, "igb_uio"))
dev->kdrv = RTE_PCI_KDRV_IGB_UIO;
else if (!strcmp(driver, "uio_pci_generic"))
@@ -58,6 +58,12 @@ rte_eal_iova_mode(void)
return rte_eal_get_configuration()->iova_mode;
}
+enum rte_vfio_mode
+rte_eal_vfio_mode(void)
+{
+ return internal_config.vfio_mode;
+}
+
/* Get the EAL base address */
uint64_t
rte_eal_get_baseaddr(void)
@@ -35,6 +35,7 @@
#include <rte_telemetry.h>
#endif
#include <rte_vect.h>
+#include <rte_vfio.h>
#include "eal_internal_cfg.h"
#include "eal_options.h"
@@ -96,6 +97,7 @@ eal_long_options[] = {
{OPT_SYSLOG, 1, NULL, OPT_SYSLOG_NUM },
{OPT_VDEV, 1, NULL, OPT_VDEV_NUM },
{OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM },
+ {OPT_VFIO_MODE, 1, NULL, OPT_VFIO_MODE_NUM },
{OPT_VFIO_VF_TOKEN, 1, NULL, OPT_VFIO_VF_TOKEN_NUM },
{OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM },
{OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM },
@@ -1598,6 +1600,42 @@ available_cores(void)
return str;
}
+static int
+eal_parse_vfio_mode(const char *name)
+{
+ int mode;
+ struct internal_config *internal_conf =
+ eal_get_internal_configuration();
+#ifdef VFIO_IOMMUFD_PRESENT
+ char dirname[PATH_MAX] = VFIO_CDEV_CLASS_DIR;
+#endif
+
+ if (name == NULL)
+ return -1;
+
+ if (!strcmp("container", name)) {
+ mode = RTE_VFIO_CONTAINER;
+ } else if (!strcmp("iommufd", name)) {
+#ifdef VFIO_IOMMUFD_PRESENT
+ if (opendir(dirname) == NULL) {
+ RTE_LOG(WARNING, EAL, "vfio cdev isn't supported, change to vfio container mode\n");
+ mode = RTE_VFIO_CONTAINER;
+ } else {
+ mode = RTE_VFIO_IOMMUFD;
+ }
+#else
+ RTE_LOG(WARNING, EAL, "vfio cdev isn't supported, change to vfio container mode\n");
+ mode = RTE_VFIO_CONTAINER;
+#endif
+ } else {
+ RTE_LOG(ERR, EAL, "unsupported vfio mode\n");
+ return -1;
+ }
+
+ internal_conf->vfio_mode = mode;
+ return 0;
+}
+
#define HUGE_UNLINK_NEVER "never"
static int
@@ -1922,7 +1960,13 @@ eal_parse_common_option(int opt, const char *optarg,
return -1;
}
break;
-
+ case OPT_VFIO_MODE_NUM:
+ if (eal_parse_vfio_mode(optarg) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_VFIO_MODE "\n");
+ return -1;
+ }
+ break;
/* don't know what to do, leave this to caller */
default:
return 1;
@@ -2189,6 +2233,8 @@ eal_common_usage(void)
" (ex: --vdev=net_pcap0,iface=eth2).\n"
" --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n"
" 'va' for IOVA_VA\n"
+ " --"OPT_VFIO_MODE" Set VFIO mode. 'container' for VFIO_CONTAINER\n"
+ " 'iommufd' for VFIO_IOMMUFD\n"
" -d LIB.so|DIR Add a driver or driver directory\n"
" (can be used multiple times)\n"
" --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n"
@@ -103,6 +103,7 @@ struct internal_config {
struct simd_bitwidth max_simd_bitwidth;
/**< max simd bitwidth path to use */
size_t huge_worker_stack_size; /**< worker thread stack size */
+ enum rte_vfio_mode vfio_mode; /**< Set VFIO mode */
};
void eal_reset_internal_config(struct internal_config *internal_cfg);
@@ -89,6 +89,8 @@ enum {
OPT_FORCE_MAX_SIMD_BITWIDTH_NUM,
#define OPT_HUGE_WORKER_STACK "huge-worker-stack"
OPT_HUGE_WORKER_STACK_NUM,
+#define OPT_VFIO_MODE "vfio-mode"
+ OPT_VFIO_MODE_NUM,
OPT_LONG_MAX_NUM
};
@@ -472,6 +472,24 @@ enum rte_iova_mode {
*/
enum rte_iova_mode rte_eal_iova_mode(void);
+/**
+ * VFIO mode.
+ */
+enum rte_vfio_mode {
+ RTE_VFIO_CONTAINER = 0, /* vfio container mode */
+ RTE_VFIO_IOMMUFD = 1 /* vfio iommufd mode */
+};
+
+/**
+ * Get the vfio mode
+ *
+ * @return
+ * enum rte_vfio_mode value.
+ */
+
+__rte_experimental
+enum rte_vfio_mode rte_eal_vfio_mode(void);
+
/**
* Get user provided pool ops name for mbuf
*
@@ -394,6 +394,7 @@ EXPERIMENTAL {
rte_memzone_max_get;
rte_memzone_max_set;
+ rte_eal_vfio_mode; # WINDOWS_NO_EXPORT
rte_iommufd_enable; # WINDOWS_NO_EXPORT
rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
rte_vfio_iommufd_release_device; # WINDOWS_NO_EXPORT