@@ -32,6 +32,56 @@ target platform is x86-based. No additional compilation steps are necessary.
Device Setup
-------------
+Intel\ |reg| DSA devices can use the IDXD kernel driver or DPDK-supported drivers,
+such as ``vfio-pci``. Both are supported by the IDXD PMD.
+
+Intel\ |reg| DSA devices using IDXD kernel driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To use an Intel\ |reg| DSA device bound to the IDXD kernel driver, the device must first be configured.
+The `accel-config <https://github.com/intel/idxd-config>`_ utility library can be used for configuration.
+
+.. note::
+ The device configuration can also be done by directly interacting with the sysfs nodes.
+ An example of how this may be done can be seen in the script ``dpdk_idxd_cfg.py``
+ included in the driver source directory.
+
+There are some mandatory configuration steps before being able to use a device with an application.
+The internal engines, which do the copies or other operations,
+and the work-queues, which are used by applications to assign work to the device,
+need to be assigned to groups, and the various other configuration options,
+such as priority or queue depth, need to be set for each queue.
+
+To assign an engine to a group::
+
+ $ accel-config config-engine dsa0/engine0.0 --group-id=0
+ $ accel-config config-engine dsa0/engine0.1 --group-id=1
+
+To assign work queues to groups for passing descriptors to the engines, a similar accel-config command can be used.
+However, the work queues also need to be configured depending on the use case.
+Some configuration options include:
+
+* mode (Dedicated/Shared): Indicates whether a WQ may accept jobs from multiple queues simultaneously.
+* priority: WQ priority between 1 and 15. Larger value means higher priority.
+* wq-size: the size of the WQ. Sum of all WQ sizes must be less than the total-size defined by the device.
+* type: WQ type (kernel/mdev/user). Determines how the device is presented.
+* name: identifier given to the WQ.
+
+Example configuration for a work queue::
+
+ $ accel-config config-wq dsa0/wq0.0 --group-id=0 \
+ --mode=dedicated --priority=10 --wq-size=8 \
+ --type=user --name=dpdk_app1
+
+Once the devices have been configured, they need to be enabled::
+
+ $ accel-config enable-device dsa0
+ $ accel-config enable-wq dsa0/wq0.0
+
+Check the device configuration::
+
+ $ accel-config list
+
Devices using VFIO/UIO drivers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -56,3 +106,17 @@ If fewer workqueues are required, then the ``max_queues`` parameter may be passe
the device driver on the EAL commandline, via the ``allowlist`` or ``-a`` flag e.g.::
$ dpdk-test -a <b:d:f>,max_queues=4
+
+For devices bound to the IDXD kernel driver,
+the DPDK IDXD driver will automatically perform a scan for available workqueues
+to use. Any workqueues found listed in ``/dev/dsa`` on the system will be checked
+in ``/sys``, and any which have a ``dpdk_`` prefix in their name will be automatically
+probed by the driver to make them available to the application.
+Alternatively, to support use by multiple DPDK processes simultaneously,
+the value used as the DPDK ``--file-prefix`` parameter may be used as a workqueue
+name prefix, instead of ``dpdk_``, allowing each DPDK application instance to only
+use a subset of configured queues.
+
+Once probed successfully, irrespective of kernel driver, the device will appear as a ``dmadev``,
+that is a "DMA device type" inside DPDK, and can be accessed using APIs from the
+``rte_dmadev`` library.
new file mode 100644
@@ -0,0 +1,351 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <libgen.h>
+
+#include <rte_bus.h>
+#include <rte_log.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_string_fns.h>
+
+#include "idxd_internal.h"
+
+/*
+ * Default locations of the DSA device nodes and of the DSA sysfs entries.
+ * When the DSA_DEV_PATH or DSA_SYSFS_PATH environment variable is set, its
+ * value is used instead of the corresponding default (see dsa_get_dev_path()
+ * and dsa_get_sysfs_path()), which allows other directories to be used for
+ * testing.
+ */
+#define DSA_DEV_PATH "/dev/dsa"
+#define DSA_SYSFS_PATH "/sys/bus/dsa/devices"
+
+/* number of WQ devices added to the bus device list by dsa_scan() */
+static unsigned int devcount;
+
+/**
+ * unique identifier for a DSA device/WQ instance
+ *
+ * Both fields are filled in by dsa_addr_parse() from a name of the form
+ * "wq<device_id>.<wq_id>".
+ */
+struct dsa_wq_addr {
+ uint16_t device_id; /**< first number in the WQ name; selects the "dsa<N>" sysfs directory */
+ uint16_t wq_id; /**< second number in the WQ name */
+};
+
+/**
+ * a DSA device instance
+ *
+ * One of these is allocated by dsa_scan() for every work-queue node found.
+ * The embedded rte_device must stay the first member: dsa_find_device()
+ * casts a struct rte_device pointer back to this type and enforces the
+ * zero offset at build time.
+ */
+struct rte_dsa_device {
+ struct rte_device device; /**< Inherit core device (must be first member) */
+ TAILQ_ENTRY(rte_dsa_device) next; /**< next dev in list */
+
+ char wq_name[32]; /**< the workqueue name/number e.g. wq0.1; also used as the device name */
+ struct dsa_wq_addr addr; /**< Identifies the specific WQ */
+};
+
+/*
+ * forward prototypes: the bus callbacks are referenced by the dsa_bus
+ * initializer below, ahead of their definitions later in this file
+ */
+struct dsa_bus;
+static int dsa_scan(void);
+static int dsa_probe(void);
+static struct rte_device *dsa_find_device(const struct rte_device *start,
+ rte_dev_cmp_t cmp, const void *data);
+static enum rte_iova_mode dsa_get_iommu_class(void);
+static int dsa_addr_parse(const char *name, void *addr);
+
+/** List of devices: tail queue head type linking rte_dsa_device entries */
+TAILQ_HEAD(dsa_device_list, rte_dsa_device);
+
+/**
+ * Structure describing the DSA bus
+ *
+ * Bundles the generic bus object registered with EAL, the single driver
+ * object that probed devices are pointed at, and the list of work-queue
+ * devices populated by dsa_scan().
+ */
+struct dsa_bus {
+ struct rte_bus bus; /**< Inherit the generic class */
+ struct rte_driver driver; /**< Driver struct for devices to point to */
+ struct dsa_device_list device_list; /**< List of DSA WQ devices found by scan */
+};
+
+/*
+ * The one DSA bus instance: wires up the bus callbacks implemented in this
+ * file, names the driver assigned to probed work queues, and starts with an
+ * empty device list.
+ */
+struct dsa_bus dsa_bus = {
+ .bus = {
+ .scan = dsa_scan,
+ .probe = dsa_probe,
+ .find_device = dsa_find_device,
+ .get_iommu_class = dsa_get_iommu_class,
+ .parse = dsa_addr_parse,
+ },
+ .driver = {
+ .name = "dmadev_idxd"
+ },
+ .device_list = TAILQ_HEAD_INITIALIZER(dsa_bus.device_list),
+};
+
+/**
+ * Get the directory that holds the DSA work-queue device nodes.
+ *
+ * @return
+ *   The value of the DSA_DEV_PATH environment variable when it is set,
+ *   otherwise the built-in default DSA_DEV_PATH.
+ */
+static inline const char *
+dsa_get_dev_path(void)
+{
+	const char *env_override = getenv("DSA_DEV_PATH");
+
+	if (env_override != NULL)
+		return env_override;
+	return DSA_DEV_PATH;
+}
+
+/**
+ * Get the sysfs directory that holds the DSA device and work-queue entries.
+ *
+ * @return
+ *   The value of the DSA_SYSFS_PATH environment variable when it is set,
+ *   otherwise the built-in default DSA_SYSFS_PATH.
+ */
+static inline const char *
+dsa_get_sysfs_path(void)
+{
+	const char *env_override = getenv("DSA_SYSFS_PATH");
+
+	if (env_override != NULL)
+		return env_override;
+	return DSA_SYSFS_PATH;
+}
+
+/**
+ * Map the first 0x1000 bytes of a work queue's device node, write-only and shared.
+ *
+ * The node path is "<dev path>/<wq_name>". The file descriptor is closed
+ * straight after the mmap() call, whether or not the mapping succeeded.
+ *
+ * @param dev
+ *   Device whose wq_name selects the node to open.
+ * @return
+ *   Address of the mapping, or NULL if the node could not be opened or mapped.
+ */
+static void *
+idxd_bus_mmap_wq(struct rte_dsa_device *dev)
+{
+	char devnode[PATH_MAX];
+	void *portal;
+	int devfd;
+
+	snprintf(devnode, sizeof(devnode), "%s/%s", dsa_get_dev_path(), dev->wq_name);
+
+	devfd = open(devnode, O_RDWR);
+	if (devfd < 0) {
+		IDXD_PMD_ERR("Failed to open device path: %s", devnode);
+		return NULL;
+	}
+
+	portal = mmap(NULL, 0x1000, PROT_WRITE, MAP_SHARED, devfd, 0);
+	close(devfd);
+	if (portal != MAP_FAILED)
+		return portal;
+
+	IDXD_PMD_ERR("Failed to mmap device %s", devnode);
+	return NULL;
+}
+
+/**
+ * Read a work queue's sysfs attribute as a NUL-terminated string.
+ *
+ * The attribute file is "<sysfs path>/<wq_name>/<filename>". At most
+ * valuelen - 1 bytes are read with a single read() call and the result is
+ * always NUL-terminated; the contents are stored as read, with no trimming.
+ *
+ * @param dev
+ *   Device whose wq_name selects the sysfs directory.
+ * @param filename
+ *   Name of the attribute file inside that directory.
+ * @param value
+ *   Output buffer.
+ * @param valuelen
+ *   Size of the output buffer in bytes; must be non-zero.
+ * @return
+ *   0 on success, -1 if the buffer is empty or the file cannot be opened or read.
+ */
+static int
+read_wq_string(struct rte_dsa_device *dev, const char *filename,
+		char *value, size_t valuelen)
+{
+	char sysfs_node[PATH_MAX];
+	ssize_t len;
+	int saved_errno;
+	int fd;
+
+	/* valuelen - 1 would wrap to SIZE_MAX and there is no room for the NUL */
+	if (valuelen == 0)
+		return -1;
+
+	snprintf(sysfs_node, sizeof(sysfs_node), "%s/%s/%s",
+			dsa_get_sysfs_path(), dev->wq_name, filename);
+	fd = open(sysfs_node, O_RDONLY);
+	if (fd < 0) {
+		IDXD_PMD_ERR("%s(): opening file '%s' failed: %s",
+				__func__, sysfs_node, strerror(errno));
+		return -1;
+	}
+
+	len = read(fd, value, valuelen - 1);
+	/* close() may overwrite errno, so keep the read() error for the log */
+	saved_errno = errno;
+	close(fd);
+	if (len < 0) {
+		IDXD_PMD_ERR("%s(): error reading file '%s': %s",
+				__func__, sysfs_node, strerror(saved_errno));
+		return -1;
+	}
+	value[len] = '\0';
+	return 0;
+}
+
+/**
+ * Read a work queue's sysfs attribute as a decimal integer.
+ *
+ * The attribute file is "<sysfs path>/<wq_name>/<filename>".
+ *
+ * @param dev
+ *   Device whose wq_name selects the sysfs directory.
+ * @param filename
+ *   Name of the attribute file inside that directory.
+ * @param value
+ *   Where the parsed integer is stored.
+ * @return
+ *   0 on success, -1 if the file cannot be opened or no integer could be parsed.
+ */
+static int
+read_wq_int(struct rte_dsa_device *dev, const char *filename,
+		int *value)
+{
+	char node_path[PATH_MAX];
+	FILE *fp;
+	int nmatched;
+
+	snprintf(node_path, sizeof(node_path), "%s/%s/%s",
+			dsa_get_sysfs_path(), dev->wq_name, filename);
+
+	fp = fopen(node_path, "r");
+	if (fp == NULL) {
+		IDXD_PMD_ERR("%s(): opening file '%s' failed: %s",
+				__func__, node_path, strerror(errno));
+		return -1;
+	}
+
+	nmatched = fscanf(fp, "%d", value);
+	if (nmatched != 1) {
+		IDXD_PMD_ERR("%s(): error reading file '%s': %s",
+				__func__, node_path, strerror(errno));
+	}
+
+	fclose(fp);
+	return (nmatched == 1) ? 0 : -1;
+}
+
+/**
+ * Read a sysfs attribute of the parent DSA device as a decimal integer.
+ *
+ * The attribute file is "<sysfs path>/dsa<device_id>/<filename>", where
+ * device_id comes from the work queue's parsed address.
+ *
+ * @param dev
+ *   Device whose addr.device_id selects the "dsa<N>" sysfs directory.
+ * @param filename
+ *   Name of the attribute file inside that directory.
+ * @param value
+ *   Where the parsed integer is stored.
+ * @return
+ *   0 on success, -1 if the file cannot be opened or no integer could be parsed.
+ */
+static int
+read_device_int(struct rte_dsa_device *dev, const char *filename,
+		int *value)
+{
+	char node_path[PATH_MAX];
+	FILE *fp;
+	int nmatched;
+
+	snprintf(node_path, sizeof(node_path), "%s/dsa%d/%s",
+			dsa_get_sysfs_path(), dev->addr.device_id, filename);
+
+	fp = fopen(node_path, "r");
+	if (fp == NULL) {
+		IDXD_PMD_ERR("%s(): opening file '%s' failed: %s",
+				__func__, node_path, strerror(errno));
+		return -1;
+	}
+
+	nmatched = fscanf(fp, "%d", value);
+	if (nmatched != 1) {
+		IDXD_PMD_ERR("%s(): error reading file '%s': %s",
+				__func__, node_path, strerror(errno));
+	}
+
+	fclose(fp);
+	return (nmatched == 1) ? 0 : -1;
+}
+
+/**
+ * Probe one work queue: read its sizing attributes from sysfs and map its portal.
+ *
+ * The "size" attribute becomes max_batches and "max_batch_size" becomes
+ * max_batch_size of a local idxd_dmadev, whose qid is the WQ id and whose
+ * sva_support flag is set.
+ * NOTE(review): in the code visible here the filled-in structure and the
+ * mapped portal are not handed on to anything - presumably a later change
+ * consumes them; confirm.
+ *
+ * @param dev
+ *   Work-queue device to probe.
+ * @return
+ *   0 on success, -1 if a sysfs attribute cannot be read, -ENOENT if the
+ *   portal cannot be mapped.
+ */
+static int
+idxd_probe_dsa(struct rte_dsa_device *dev)
+{
+	struct idxd_dmadev idxd = {0};
+	int wq_size = 0;
+	int batch_size = 0;
+
+	IDXD_PMD_INFO("Probing device %s on numa node %d",
+			dev->wq_name, dev->device.numa_node);
+
+	if (read_wq_int(dev, "size", &wq_size) < 0)
+		return -1;
+	if (read_wq_int(dev, "max_batch_size", &batch_size) < 0)
+		return -1;
+
+	idxd.max_batches = wq_size;
+	idxd.max_batch_size = batch_size;
+	idxd.qid = dev->addr.wq_id;
+	idxd.sva_support = 1;
+	idxd.portal = idxd_bus_mmap_wq(dev);
+
+	if (idxd.portal != NULL)
+		return 0;
+
+	IDXD_PMD_ERR("WQ mmap failed");
+	return -ENOENT;
+}
+
+/**
+ * Decide whether a work queue's configured name marks it for this process.
+ *
+ * A name qualifies when it starts with "dpdk_", or when it starts with the
+ * last path component of the EAL runtime directory followed by '_'.
+ *
+ * @param name
+ *   The work queue's "name" attribute as read from sysfs.
+ * @return
+ *   1 if the work queue should be used by this process, 0 otherwise.
+ */
+static int
+is_for_this_process_use(const char *name)
+{
+	char *runtime_dir;
+	const char *prefix;
+	size_t prefixlen;
+	int retval;
+
+	/* the generic prefix needs no allocation, so test it first */
+	if (strncmp(name, "dpdk_", 5) == 0)
+		return 1;
+
+	/* basename() may modify its argument, so work on a private copy */
+	runtime_dir = strdup(rte_eal_get_runtime_dir());
+	if (runtime_dir == NULL)
+		return 0; /* out of memory: cannot evaluate the per-process prefix */
+
+	prefix = basename(runtime_dir);
+	prefixlen = strlen(prefix);
+	retval = strncmp(name, prefix, prefixlen) == 0 && name[prefixlen] == '_';
+
+	free(runtime_dir);
+	return retval;
+}
+
+/**
+ * Bus probe callback: probe every scanned work queue that belongs to this process.
+ *
+ * For each device on the bus list the "type" and "name" sysfs attributes are
+ * read; devices whose attributes cannot be read are skipped silently. A work
+ * queue is probed when its type starts with "user" and its name passes
+ * is_for_this_process_use(). The result of the individual probe is not
+ * checked.
+ *
+ * @return
+ *   Always 0.
+ */
+static int
+dsa_probe(void)
+{
+	struct rte_dsa_device *dev;
+
+	TAILQ_FOREACH(dev, &dsa_bus.device_list, next) {
+		char wq_type[64], wq_label[64];
+
+		if (read_wq_string(dev, "type", wq_type, sizeof(wq_type)) < 0)
+			continue;
+		if (read_wq_string(dev, "name", wq_label, sizeof(wq_label)) < 0)
+			continue;
+
+		if (strncmp(wq_type, "user", 4) != 0 || !is_for_this_process_use(wq_label)) {
+			IDXD_PMD_DEBUG("WQ '%s', not allocated to DPDK", dev->wq_name);
+			continue;
+		}
+
+		dev->device.driver = &dsa_bus.driver;
+		idxd_probe_dsa(dev);
+	}
+
+	return 0;
+}
+
+/**
+ * Bus scan callback: build the device list from the work-queue device nodes.
+ *
+ * Every directory entry whose name starts with "wq", fits in wq_name and
+ * parses as "wq<dev>.<wq>" gets a zero-initialised rte_dsa_device appended
+ * to the bus device list. The NUMA node is taken from the parent device's
+ * "numa_node" sysfs attribute and stays -1 if that cannot be read.
+ *
+ * @return
+ *   0 on success or when the device directory does not exist, -1 if the
+ *   directory cannot be opened for another reason, -ENOMEM if a device
+ *   structure cannot be allocated.
+ */
+static int
+dsa_scan(void)
+{
+	const char *path = dsa_get_dev_path();
+	struct dirent *wq;
+	DIR *dev_dir;
+
+	dev_dir = opendir(path);
+	if (dev_dir == NULL) {
+		if (errno == ENOENT)
+			return 0; /* no bus, return without error */
+		IDXD_PMD_ERR("%s(): opendir '%s' failed: %s",
+				__func__, path, strerror(errno));
+		return -1;
+	}
+
+	while ((wq = readdir(dev_dir)) != NULL) {
+		struct rte_dsa_device *dev;
+		int numa_node = -1;
+
+		if (strncmp(wq->d_name, "wq", 2) != 0)
+			continue;
+		if (strnlen(wq->d_name, sizeof(dev->wq_name)) == sizeof(dev->wq_name)) {
+			IDXD_PMD_ERR("%s(): wq name too long: '%s', skipping",
+					__func__, wq->d_name);
+			continue;
+		}
+		IDXD_PMD_DEBUG("%s(): found %s/%s", __func__, path, wq->d_name);
+
+		/*
+		 * Zero the structure so that fields not set below (e.g. the
+		 * driver pointer, which stays unset until probe) never hold
+		 * indeterminate values.
+		 */
+		dev = calloc(1, sizeof(*dev));
+		if (dev == NULL) {
+			IDXD_PMD_ERR("%s(): failed to allocate memory for WQ '%s'",
+					__func__, wq->d_name);
+			closedir(dev_dir);
+			return -ENOMEM;
+		}
+		if (dsa_addr_parse(wq->d_name, &dev->addr) < 0) {
+			IDXD_PMD_ERR("Error parsing WQ name: %s", wq->d_name);
+			free(dev);
+			continue;
+		}
+		dev->device.bus = &dsa_bus.bus;
+		strlcpy(dev->wq_name, wq->d_name, sizeof(dev->wq_name));
+		TAILQ_INSERT_TAIL(&dsa_bus.device_list, dev, next);
+		devcount++;
+
+		/* best effort: numa_node keeps its -1 default if the read fails */
+		read_device_int(dev, "numa_node", &numa_node);
+		dev->device.numa_node = numa_node;
+		dev->device.name = dev->wq_name;
+	}
+
+	closedir(dev_dir);
+	return 0;
+}
+
+/**
+ * Bus find_device callback: return the first device accepted by a comparison function.
+ *
+ * @param start
+ *   Device after which to begin searching, or NULL to begin at the head of
+ *   the bus device list. It is cast to struct rte_dsa_device, which relies on
+ *   the rte_device being that structure's first member.
+ * @param cmp
+ *   Comparison function; a return value of 0 means "match".
+ * @param data
+ *   Opaque argument passed through to cmp.
+ * @return
+ *   The matching device, or NULL if the end of the list is reached.
+ */
+static struct rte_device *
+dsa_find_device(const struct rte_device *start, rte_dev_cmp_t cmp,
+		const void *data)
+{
+	const struct rte_dsa_device *prev = (const struct rte_dsa_device *)start;
+	struct rte_dsa_device *cur;
+
+	/* the rte_device struct must be at start of dsa structure */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_dsa_device, device) != 0);
+
+	/* resume after the given start point, else begin at the list head */
+	if (prev != NULL)
+		cur = TAILQ_NEXT(prev, next);
+	else
+		cur = TAILQ_FIRST(&dsa_bus.device_list);
+
+	for (; cur != NULL; cur = TAILQ_NEXT(cur, next)) {
+		if (cmp(&cur->device, data) == 0)
+			return &cur->device;
+	}
+	return NULL;
+}
+
+/**
+ * Bus get_iommu_class callback: report the IOVA mode wanted by this bus.
+ *
+ * @return
+ *   RTE_IOVA_VA when the scan added at least one device, RTE_IOVA_DC
+ *   (don't care) when there are none.
+ */
+static enum rte_iova_mode
+dsa_get_iommu_class(void)
+{
+	/* with no devices the bus has no preference */
+	if (devcount == 0)
+		return RTE_IOVA_DC;
+
+	return RTE_IOVA_VA;
+}
+
+/**
+ * Bus parse callback: parse a work-queue name of the form "wq<dev>.<wq>".
+ *
+ * @param name
+ *   Device name to parse.
+ * @param addr
+ *   Optional output, treated as a struct dsa_wq_addr. May be NULL when the
+ *   caller only wants to know whether the name is valid for this bus; in
+ *   that case nothing is written.
+ * @return
+ *   0 if the name is a valid work-queue name, -1 otherwise.
+ */
+static int
+dsa_addr_parse(const char *name, void *addr)
+{
+	struct dsa_wq_addr *wq = addr;
+	unsigned int device_id, wq_id;
+
+	if (sscanf(name, "wq%u.%u", &device_id, &wq_id) != 2) {
+		IDXD_PMD_DEBUG("Parsing WQ name failed: %s", name);
+		return -1;
+	}
+
+	/* the address fields are 16 bits wide: reject rather than truncate */
+	if (device_id > UINT16_MAX || wq_id > UINT16_MAX) {
+		IDXD_PMD_DEBUG("Parsing WQ name failed: %s", name);
+		return -1;
+	}
+
+	/* a NULL addr is a pure validity check, which is not an error */
+	if (wq != NULL) {
+		wq->device_id = device_id;
+		wq->wq_id = wq_id;
+	}
+	return 0;
+}
+
+/* register the generic bus object of dsa_bus with EAL under the name "dsa" */
+RTE_REGISTER_BUS(dsa, dsa_bus.bus);
@@ -7,5 +7,6 @@ endif
deps += ['bus_pci']
sources = files(
+ 'idxd_bus.c',
'idxd_pci.c'
)