@@ -791,6 +791,12 @@ CONFIG_RTE_LIBRTE_VHOST_DEBUG=n
#
CONFIG_RTE_LIBRTE_PMD_VHOST=n
+#
+# Compile IFCVF driver
+# To compile, CONFIG_RTE_LIBRTE_VHOST should be enabled.
+#
+CONFIG_RTE_LIBRTE_IFCVF=n
+
#
# Compile the test application
#
@@ -15,6 +15,7 @@ CONFIG_RTE_LIBRTE_PMD_KNI=y
CONFIG_RTE_LIBRTE_VHOST=y
CONFIG_RTE_LIBRTE_VHOST_NUMA=y
CONFIG_RTE_LIBRTE_PMD_VHOST=y
+CONFIG_RTE_LIBRTE_IFCVF=y
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
CONFIG_RTE_LIBRTE_PMD_TAP=y
CONFIG_RTE_LIBRTE_AVP_PMD=y
@@ -53,6 +53,7 @@ endif # $(CONFIG_RTE_LIBRTE_SCHED)
ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
+DIRS-$(CONFIG_RTE_LIBRTE_IFCVF) += ifcvf
endif # $(CONFIG_RTE_LIBRTE_VHOST)
ifeq ($(CONFIG_RTE_LIBRTE_MRVL_PMD),y)
new file mode 100644
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_ifcvf.a
+
+LDLIBS += -lpthread
+LDLIBS += -lrte_eal -lrte_mempool -lrte_pci
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_vhost
+LDLIBS += -lrte_bus_vdev -lrte_bus_pci
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/drivers/bus/pci/linux
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+#
+# Add extra flags for base driver source files to disable warnings in them
+#
+BASE_DRIVER_OBJS=$(sort $(patsubst %.c,%.o,$(notdir $(wildcard $(SRCDIR)/base/*.c))))
+$(foreach obj, $(BASE_DRIVER_OBJS), $(eval CFLAGS_$(obj)+=$(CFLAGS_BASE_DRIVER)))
+
+VPATH += $(SRCDIR)/base
+
+EXPORT_MAP := rte_ifcvf_version.map
+
+LIBABIVER := 1
+
+#
+# all sources are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_IFCVF) += ifcvf_ethdev.c
+SRCS-$(CONFIG_RTE_LIBRTE_IFCVF) += ifcvf.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
new file mode 100644
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include "ifcvf.h"
+#include "ifcvf_osdep.h"
+
+STATIC void *
+get_cap_addr(struct ifcvf_hw *hw, struct ifcvf_pci_cap *cap)
+{
+ u8 bar = cap->bar;
+ u32 length = cap->length;
+ u32 offset = cap->offset;
+
+ if (bar > IFCVF_PCI_MAX_RESOURCE - 1) {
+ DEBUGOUT("invalid bar: %u\n", bar);
+ return NULL;
+ }
+
+ if (offset + length < offset) {
+ DEBUGOUT("offset(%u) + length(%u) overflows\n",
+ offset, length);
+ return NULL;
+ }
+
+ if (offset + length > hw->mem_resource[bar].len) {
+ DEBUGOUT("offset(%u) + length(%u) overflows bar length(%u)\n",
+ offset, length, (u32)hw->mem_resource[bar].len);
+ return NULL;
+ }
+
+ return hw->mem_resource[bar].addr + offset;
+}
+
+int
+ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev)
+{
+ int ret;
+ u8 pos;
+ struct ifcvf_pci_cap cap;
+
+ ret = PCI_READ_CONFIG_BYTE(dev, &pos, PCI_CAPABILITY_LIST);
+ if (ret < 0) {
+ DEBUGOUT("failed to read pci capability list\n");
+ return -1;
+ }
+
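+ /* walk the PCI capability list, recording the virtio vendor-specific caps */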
+ while (pos) {
+ ret = PCI_READ_CONFIG_RANGE(dev, (u32 *)&cap,
+ sizeof(cap), pos);
+ if (ret < 0) {
+ DEBUGOUT("failed to read cap at pos: %x\n", pos);
+ break;
+ }
+
+ if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+ goto next;
+
+ DEBUGOUT("cfg type: %u, bar: %u, offset: %u, "
+ "len: %u\n", cap.cfg_type, cap.bar,
+ cap.offset, cap.length);
+
+ switch (cap.cfg_type) {
+ case IFCVF_PCI_CAP_COMMON_CFG:
+ hw->common_cfg = get_cap_addr(hw, &cap);
+ break;
+ case IFCVF_PCI_CAP_NOTIFY_CFG:
+ PCI_READ_CONFIG_DWORD(dev, &hw->notify_off_multiplier,
+ pos + sizeof(cap));
+ hw->notify_base = get_cap_addr(hw, &cap);
+ hw->notify_region = cap.bar;
+ break;
+ case IFCVF_PCI_CAP_ISR_CFG:
+ hw->isr = get_cap_addr(hw, &cap);
+ break;
+ case IFCVF_PCI_CAP_DEVICE_CFG:
+ hw->dev_cfg = get_cap_addr(hw, &cap);
+ break;
+ }
+next:
+ pos = cap.cap_next;
+ }
+
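+ /* the live migration (LM) config region is exposed in BAR4 */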
+ hw->lm_cfg = hw->mem_resource[4].addr;
+
+ if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+ hw->isr == NULL || hw->dev_cfg == NULL) {
+ DEBUGOUT("capability incomplete\n");
+ return -1;
+ }
+
+ DEBUGOUT("capability mapping:\ncommon cfg: %p\n"
+ "notify base: %p\nisr cfg: %p\ndevice cfg: %p\n"
+ "multiplier: %u\n",
+ hw->common_cfg, hw->notify_base,
+ hw->isr, hw->dev_cfg,
+ hw->notify_off_multiplier);
+
+ return 0;
+}
+
+STATIC u8
+ifcvf_get_status(struct ifcvf_hw *hw)
+{
+ return IFCVF_READ_REG8(&hw->common_cfg->device_status);
+}
+
+STATIC void
+ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
+{
+ IFCVF_WRITE_REG8(status, &hw->common_cfg->device_status);
+}
+
+STATIC void
+ifcvf_reset(struct ifcvf_hw *hw)
+{
+ ifcvf_set_status(hw, 0);
+
+ /* flush status write */
+ while (ifcvf_get_status(hw))
+ msec_delay(1);
+}
+
+STATIC void
+ifcvf_add_status(struct ifcvf_hw *hw, u8 status)
+{
+ if (status != 0)
+ status |= ifcvf_get_status(hw);
+
+ ifcvf_set_status(hw, status);
+ ifcvf_get_status(hw);
+}
+
+u64
+ifcvf_get_features(struct ifcvf_hw *hw)
+{
+ u32 features_lo, features_hi;
+ struct ifcvf_pci_common_cfg *cfg = hw->common_cfg;
+
+ IFCVF_WRITE_REG32(0, &cfg->device_feature_select);
+ features_lo = IFCVF_READ_REG32(&cfg->device_feature);
+
+ IFCVF_WRITE_REG32(1, &cfg->device_feature_select);
+ features_hi = IFCVF_READ_REG32(&cfg->device_feature);
+
+ return ((u64)features_hi << 32) | features_lo;
+}
+
+STATIC void
+ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
+{
+ struct ifcvf_pci_common_cfg *cfg = hw->common_cfg;
+
+ IFCVF_WRITE_REG32(0, &cfg->guest_feature_select);
+ IFCVF_WRITE_REG32(features & ((1ULL << 32) - 1), &cfg->guest_feature);
+
+ IFCVF_WRITE_REG32(1, &cfg->guest_feature_select);
+ IFCVF_WRITE_REG32(features >> 32, &cfg->guest_feature);
+}
+
+STATIC int
+ifcvf_config_features(struct ifcvf_hw *hw)
+{
+ u64 host_features;
+
+ host_features = ifcvf_get_features(hw);
+ hw->req_features &= host_features;
+
+ ifcvf_set_features(hw, hw->req_features);
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_FEATURES_OK);
+
+ if (!(ifcvf_get_status(hw) & IFCVF_CONFIG_STATUS_FEATURES_OK)) {
+ DEBUGOUT("failed to set FEATURES_OK status\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+STATIC void
+io_write64_twopart(u64 val, u32 *lo, u32 *hi)
+{
+ IFCVF_WRITE_REG32(val & ((1ULL << 32) - 1), lo);
+ IFCVF_WRITE_REG32(val >> 32, hi);
+}
+
+STATIC int
+ifcvf_hw_enable(struct ifcvf_hw *hw)
+{
+ struct ifcvf_pci_common_cfg *cfg;
+ u8 *lm_cfg;
+ u32 i;
+ u16 notify_off;
+
+ cfg = hw->common_cfg;
+ lm_cfg = hw->lm_cfg;
+
+ IFCVF_WRITE_REG16(0, &cfg->msix_config);
+ if (IFCVF_READ_REG16(&cfg->msix_config) == IFCVF_MSI_NO_VECTOR) {
+ DEBUGOUT("msix vec alloc failed for device config\n");
+ return -1;
+ }
+
+ for (i = 0; i < hw->nr_vring; i++) {
+ IFCVF_WRITE_REG16(i, &cfg->queue_select);
+ io_write64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
+ &cfg->queue_desc_hi);
+ io_write64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
+ &cfg->queue_avail_hi);
+ io_write64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
+ &cfg->queue_used_hi);
+ IFCVF_WRITE_REG16(hw->vring[i].size, &cfg->queue_size);
+
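+ /*
+ * Ring state lives in the LM region: each IFCVF_LM_CFG_SIZE block
+ * holds two rings (4 bytes each) at IFCVF_LM_RING_STATE_OFFSET,
+ * last_avail_idx in the low 16 bits, last_used_idx in the high 16.
+ */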
+ *(u32 *)(lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
+ (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4) =
+ (u32)hw->vring[i].last_avail_idx |
+ ((u32)hw->vring[i].last_used_idx << 16);
+
+ IFCVF_WRITE_REG16(i + 1, &cfg->queue_msix_vector);
+ if (IFCVF_READ_REG16(&cfg->queue_msix_vector) ==
+ IFCVF_MSI_NO_VECTOR) {
+ DEBUGOUT("queue %u, msix vec alloc failed\n",
+ i);
+ return -1;
+ }
+
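+ /*
+ * Per-queue notify address = notify base +
+ * queue_notify_off * notify_off_multiplier, as defined by virtio 1.0.
+ */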
+ notify_off = IFCVF_READ_REG16(&cfg->queue_notify_off);
+ hw->notify_addr[i] = (void *)((u8 *)hw->notify_base +
+ notify_off * hw->notify_off_multiplier);
+ IFCVF_WRITE_REG16(1, &cfg->queue_enable);
+ }
+
+ return 0;
+}
+
+STATIC void
+ifcvf_hw_disable(struct ifcvf_hw *hw)
+{
+ u32 i;
+ struct ifcvf_pci_common_cfg *cfg;
+ u32 ring_state;
+
+ cfg = hw->common_cfg;
+
+ IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->msix_config);
+ for (i = 0; i < hw->nr_vring; i++) {
+ IFCVF_WRITE_REG16(i, &cfg->queue_select);
+ IFCVF_WRITE_REG16(0, &cfg->queue_enable);
+ IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->queue_msix_vector);
+ ring_state = *(u32 *)(hw->lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
+ (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4);
+ hw->vring[i].last_avail_idx = (u16)ring_state;
+ hw->vring[i].last_used_idx = (u16)(ring_state >> 16);
+ }
+}
+
+int
+ifcvf_start_hw(struct ifcvf_hw *hw)
+{
+ ifcvf_reset(hw);
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_ACK);
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_DRIVER);
+
+ if (ifcvf_config_features(hw) < 0)
+ return -1;
+
+ if (ifcvf_hw_enable(hw) < 0)
+ return -1;
+
+ ifcvf_add_status(hw, IFCVF_CONFIG_STATUS_DRIVER_OK);
+ return 0;
+}
+
+void
+ifcvf_stop_hw(struct ifcvf_hw *hw)
+{
+ ifcvf_hw_disable(hw);
+ ifcvf_reset(hw);
+}
+
+void
+ifcvf_enable_logging(struct ifcvf_hw *hw, u64 log_base, u64 log_size)
+{
+ u8 *lm_cfg;
+
+ lm_cfg = hw->lm_cfg;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_BASE_ADDR_LOW) =
+ log_base & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_BASE_ADDR_HIGH) =
+ (log_base >> 32) & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_END_ADDR_LOW) =
+ (log_base + log_size) & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_END_ADDR_HIGH) =
+ ((log_base + log_size) >> 32) & IFCVF_32_BIT_MASK;
+
+ *(u32 *)(lm_cfg + IFCVF_LM_LOGGING_CTRL) = IFCVF_LM_ENABLE_PF;
+}
+
+void
+ifcvf_disable_logging(struct ifcvf_hw *hw)
+{
+ u8 *lm_cfg;
+
+ lm_cfg = hw->lm_cfg;
+ *(u32 *)(lm_cfg + IFCVF_LM_LOGGING_CTRL) = IFCVF_LM_DISABLE;
+}
+
+void
+ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
+{
+ IFCVF_WRITE_REG16(qid, hw->notify_addr[qid]);
+}
+
+u8
+ifcvf_get_notify_region(struct ifcvf_hw *hw)
+{
+ return hw->notify_region;
+}
+
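+/* offset of a queue's notify address within the notify BAR */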
+u64
+ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid)
+{
+ return (u8 *)hw->notify_addr[qid] -
+ (u8 *)hw->mem_resource[hw->notify_region].addr;
+}
new file mode 100644
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _IFCVF_H_
+#define _IFCVF_H_
+
+#include "ifcvf_osdep.h"
+
+#define IFCVF_MAX_QUEUES 1
+#define IFCVF_MAX_DEVICES 64
+#define VIRTIO_F_IOMMU_PLATFORM 33
+
+/* Common configuration */
+#define IFCVF_PCI_CAP_COMMON_CFG 1
+/* Notifications */
+#define IFCVF_PCI_CAP_NOTIFY_CFG 2
+/* ISR Status */
+#define IFCVF_PCI_CAP_ISR_CFG 3
+/* Device specific configuration */
+#define IFCVF_PCI_CAP_DEVICE_CFG 4
+/* PCI configuration access */
+#define IFCVF_PCI_CAP_PCI_CFG 5
+
+#define IFCVF_CONFIG_STATUS_RESET 0x00
+#define IFCVF_CONFIG_STATUS_ACK 0x01
+#define IFCVF_CONFIG_STATUS_DRIVER 0x02
+#define IFCVF_CONFIG_STATUS_DRIVER_OK 0x04
+#define IFCVF_CONFIG_STATUS_FEATURES_OK 0x08
+#define IFCVF_CONFIG_STATUS_FAILED 0x80
+
+#define IFCVF_MSI_NO_VECTOR 0xffff
+#define IFCVF_PCI_MAX_RESOURCE 6
+
+#define IFCVF_LM_CFG_SIZE 0x40
+#define IFCVF_LM_RING_STATE_OFFSET 0x20
+
+#define IFCVF_LM_LOGGING_CTRL 0x0
+
+#define IFCVF_LM_BASE_ADDR_LOW 0x10
+#define IFCVF_LM_BASE_ADDR_HIGH 0x14
+#define IFCVF_LM_END_ADDR_LOW 0x18
+#define IFCVF_LM_END_ADDR_HIGH 0x1c
+
+#define IFCVF_LM_DISABLE 0x0
+#define IFCVF_LM_ENABLE_VF 0x1
+#define IFCVF_LM_ENABLE_PF 0x3
+
+#define IFCVF_32_BIT_MASK 0xffffffff
+
+
+struct ifcvf_pci_cap {
+ u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */
+ u8 cap_next; /* Generic PCI field: next ptr. */
+ u8 cap_len; /* Generic PCI field: capability length */
+ u8 cfg_type; /* Identifies the structure. */
+ u8 bar; /* Where to find it. */
+ u8 padding[3]; /* Pad to full dword. */
+ u32 offset; /* Offset within bar. */
+ u32 length; /* Length of the structure, in bytes. */
+};
+
+struct ifcvf_pci_notify_cap {
+ struct ifcvf_pci_cap cap;
+ u32 notify_off_multiplier; /* Multiplier for queue_notify_off. */
+};
+
+struct ifcvf_pci_common_cfg {
+ /* About the whole device. */
+ u32 device_feature_select;
+ u32 device_feature;
+ u32 guest_feature_select;
+ u32 guest_feature;
+ u16 msix_config;
+ u16 num_queues;
+ u8 device_status;
+ u8 config_generation;
+
+ /* About a specific virtqueue. */
+ u16 queue_select;
+ u16 queue_size;
+ u16 queue_msix_vector;
+ u16 queue_enable;
+ u16 queue_notify_off;
+ u32 queue_desc_lo;
+ u32 queue_desc_hi;
+ u32 queue_avail_lo;
+ u32 queue_avail_hi;
+ u32 queue_used_lo;
+ u32 queue_used_hi;
+};
+
+struct ifcvf_net_config {
+ u8 mac[6];
+ u16 status;
+ u16 max_virtqueue_pairs;
+} __attribute__((packed));
+
+struct ifcvf_pci_mem_resource {
+ u64 phys_addr; /**< Physical address, 0 if not resource. */
+ u64 len; /**< Length of the resource. */
+ u8 *addr; /**< Virtual address, NULL when not mapped. */
+};
+
+struct vring_info {
+ u64 desc;
+ u64 avail;
+ u64 used;
+ u16 size;
+ u16 last_avail_idx;
+ u16 last_used_idx;
+};
+
+struct ifcvf_hw {
+ u64 req_features;
+ u8 notify_region;
+ u32 notify_off_multiplier;
+ struct ifcvf_pci_common_cfg *common_cfg;
+ struct ifcvf_net_config *dev_cfg;
+ u8 *isr;
+ u16 *notify_base;
+ u16 *notify_addr[IFCVF_MAX_QUEUES * 2];
+ u8 *lm_cfg;
+ struct vring_info vring[IFCVF_MAX_QUEUES * 2];
+ u8 nr_vring;
+ struct ifcvf_pci_mem_resource mem_resource[IFCVF_PCI_MAX_RESOURCE];
+};
+
+int
+ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev);
+
+u64
+ifcvf_get_features(struct ifcvf_hw *hw);
+
+int
+ifcvf_start_hw(struct ifcvf_hw *hw);
+
+void
+ifcvf_stop_hw(struct ifcvf_hw *hw);
+
+void
+ifcvf_enable_logging(struct ifcvf_hw *hw, u64 log_base, u64 log_size);
+
+void
+ifcvf_disable_logging(struct ifcvf_hw *hw);
+
+void
+ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid);
+
+u8
+ifcvf_get_notify_region(struct ifcvf_hw *hw);
+
+u64
+ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid);
+
+#endif /* _IFCVF_H_ */
new file mode 100644
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _IFCVF_OSDEP_H_
+#define _IFCVF_OSDEP_H_
+
+#include <stdint.h>
+#include <linux/pci_regs.h>
+
+#include <rte_cycles.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_log.h>
+#include <rte_io.h>
+
+#define DEBUGOUT(S, args...) RTE_LOG(DEBUG, PMD, S, ##args)
+#define STATIC static
+
+#define msec_delay rte_delay_ms
+
+#define IFCVF_READ_REG8(reg) rte_read8(reg)
+#define IFCVF_WRITE_REG8(val, reg) rte_write8((val), (reg))
+#define IFCVF_READ_REG16(reg) rte_read16(reg)
+#define IFCVF_WRITE_REG16(val, reg) rte_write16((val), (reg))
+#define IFCVF_READ_REG32(reg) rte_read32(reg)
+#define IFCVF_WRITE_REG32(val, reg) rte_write32((val), (reg))
+
+typedef struct rte_pci_device PCI_DEV;
+
+#define PCI_READ_CONFIG_BYTE(dev, val, where) \
+ rte_pci_read_config(dev, val, 1, where)
+
+#define PCI_READ_CONFIG_DWORD(dev, val, where) \
+ rte_pci_read_config(dev, val, 4, where)
+
+typedef uint8_t u8;
+typedef int8_t s8;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef int64_t s64;
+typedef uint64_t u64;
+
+static inline int
+PCI_READ_CONFIG_RANGE(PCI_DEV *dev, uint32_t *val, int size, int where)
+{
+ return rte_pci_read_config(dev, val, size, where);
+}
+
+#endif /* _IFCVF_OSDEP_H_ */
new file mode 100644
@@ -0,0 +1,1241 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <pthread.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+
+#include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#include <rte_memcpy.h>
+#include <rte_bus_vdev.h>
+#include <rte_bus_pci.h>
+#include <rte_kvargs.h>
+#include <rte_vhost.h>
+#include <rte_vdpa.h>
+#include <rte_vfio.h>
+#include <rte_spinlock.h>
+#include <eal_vfio.h>
+#include <pci_init.h>
+
+#include "base/ifcvf.h"
+
+#define ETH_IFCVF_BDF_ARG "bdf"
+#define ETH_IFCVF_DEVICES_ARG "devices"
+
+static const char *const valid_arguments[] = {
+ ETH_IFCVF_BDF_ARG,
+ ETH_IFCVF_DEVICES_ARG,
+ NULL
+};
+
+static struct ether_addr base_eth_addr = {
+ .addr_bytes = {
+ 0x56 /* V */,
+ 0x44 /* D */,
+ 0x50 /* P */,
+ 0x41 /* A */,
+ 0x00,
+ 0x00
+ }
+};
+
+struct ifcvf_info {
+ struct ifcvf_hw hw;
+ struct rte_pci_device pdev;
+ int vfio_container_fd;
+ int vfio_group_fd;
+ int vfio_dev_fd;
+ pthread_t tid; /* thread for notify relay */
+ int epfd;
+ int vid;
+ rte_atomic32_t started;
+ rte_atomic32_t dev_attached;
+ rte_atomic32_t running;
+ rte_spinlock_t lock;
+};
+
+struct ifcvf_internal {
+ char *dev_name;
+ uint16_t max_queues;
+ uint16_t max_devices;
+ uint64_t features;
+ struct rte_vdpa_eng_addr eng_addr;
+ int eid;
+ struct ifcvf_info vf_info[IFCVF_MAX_DEVICES];
+};
+
+struct internal_list {
+ TAILQ_ENTRY(internal_list) next;
+ struct rte_eth_dev *eth_dev;
+};
+
+TAILQ_HEAD(internal_list_head, internal_list);
+static struct internal_list_head internal_list =
+ TAILQ_HEAD_INITIALIZER(internal_list);
+
+static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static struct rte_eth_link vdpa_link = {
+ .link_speed = 10000,
+ .link_duplex = ETH_LINK_FULL_DUPLEX,
+ .link_status = ETH_LINK_DOWN
+};
+
+static struct internal_list *
+find_internal_resource_by_eid(int eid)
+{
+ int found = 0;
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+
+ pthread_mutex_lock(&internal_list_lock);
+
+ TAILQ_FOREACH(list, &internal_list, next) {
+ internal = list->eth_dev->data->dev_private;
+ if (eid == internal->eid) {
+ found = 1;
+ break;
+ }
+ }
+
+ pthread_mutex_unlock(&internal_list_lock);
+
+ if (!found)
+ return NULL;
+
+ return list;
+}
+
+static struct internal_list *
+find_internal_resource_by_eng_addr(struct rte_vdpa_eng_addr *addr)
+{
+ int found = 0;
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+
+ pthread_mutex_lock(&internal_list_lock);
+
+ TAILQ_FOREACH(list, &internal_list, next) {
+ internal = list->eth_dev->data->dev_private;
+ if (addr == &internal->eng_addr) {
+ found = 1;
+ break;
+ }
+ }
+
+ pthread_mutex_unlock(&internal_list_lock);
+
+ if (!found)
+ return NULL;
+
+ return list;
+}
+
+static int
+check_pci_dev(struct rte_pci_device *dev)
+{
+ char filename[PATH_MAX];
+ char dev_dir[PATH_MAX];
+ char driver[PATH_MAX];
+ int ret;
+
+ snprintf(dev_dir, sizeof(dev_dir), "%s/" PCI_PRI_FMT,
+ rte_pci_get_sysfs_path(),
+ dev->addr.domain, dev->addr.bus,
+ dev->addr.devid, dev->addr.function);
+ if (access(dev_dir, R_OK) != 0) {
+ RTE_LOG(ERR, PMD, "%s does not exist\n", dev_dir);
+ return -1;
+ }
+
+ /* parse resources */
+ snprintf(filename, sizeof(filename), "%s/resource", dev_dir);
+ if (rte_pci_parse_sysfs_resource(filename, dev) < 0) {
+ RTE_LOG(ERR, PMD, "cannot parse resource: %s\n", filename);
+ return -1;
+ }
+
+ /* parse driver */
+ snprintf(filename, sizeof(filename), "%s/driver", dev_dir);
+ ret = pci_get_kernel_driver_by_path(filename, driver);
+ if (ret != 0) {
+ RTE_LOG(ERR, PMD, "Fail to get kernel driver: %s\n", filename);
+ return -1;
+ }
+
+ if (strcmp(driver, "vfio-pci") != 0) {
+ RTE_LOG(ERR, PMD, "kernel driver %s is not vfio-pci\n", driver);
+ return -1;
+ }
+ dev->kdrv = RTE_KDRV_VFIO;
+ return 0;
+}
+
+static int
+ifcvf_vfio_setup(struct ifcvf_info *vf_info)
+{
+ struct rte_pci_device *dev = &vf_info->pdev;
+ char devname[RTE_DEV_NAME_MAX_LEN] = {0};
+ int iommu_group_no;
+ int ret = 0;
+ int i;
+
+ rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
+ vfio_get_group_no(rte_pci_get_sysfs_path(), devname, &iommu_group_no);
+
+ vf_info->vfio_container_fd = rte_vfio_create_container();
+ if (vf_info->vfio_container_fd < 0)
+ return -1;
+
+ ret = rte_vfio_bind_group_no(vf_info->vfio_container_fd,
+ iommu_group_no);
+ if (ret)
+ goto err;
+
+ if (rte_pci_map_device(dev))
+ goto err;
+
+ vf_info->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
+ vf_info->vfio_group_fd = rte_vfio_get_group_fd(iommu_group_no);
+ if (vf_info->vfio_group_fd < 0)
+ goto err;
+
+ for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
+ i++) {
+ vf_info->hw.mem_resource[i].addr =
+ vf_info->pdev.mem_resource[i].addr;
+ vf_info->hw.mem_resource[i].phys_addr =
+ vf_info->pdev.mem_resource[i].phys_addr;
+ vf_info->hw.mem_resource[i].len =
+ vf_info->pdev.mem_resource[i].len;
+ }
+ ret = ifcvf_init_hw(&vf_info->hw, &vf_info->pdev);
+
+ return ret;
+
+err:
+ rte_vfio_destroy_container(vf_info->vfio_container_fd);
+ return -1;
+}
+
+static int
+ifcvf_dma_map(struct ifcvf_info *vf_info)
+{
+ uint32_t i;
+ int ret;
+ struct rte_vhost_memory *mem = NULL;
+ int vfio_container_fd;
+
+ ret = rte_vhost_get_mem_table(vf_info->vid, &mem);
+ if (ret < 0) {
+ RTE_LOG(ERR, PMD, "failed to get VM memory layout\n");
+ goto exit;
+ }
+
+ vfio_container_fd = vf_info->vfio_container_fd;
+
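+ /* map each guest memory region for DMA: HVA as address, GPA as IOVA */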
+ for (i = 0; i < mem->nregions; i++) {
+ struct rte_vhost_mem_region *reg;
+ struct rte_memseg ms;
+
+ reg = &mem->regions[i];
+ RTE_LOG(INFO, PMD, "region %u: HVA 0x%lx, GPA 0x%lx, "
+ "size 0x%lx\n", i, reg->host_user_addr,
+ reg->guest_phys_addr, reg->size);
+
+ ms.addr_64 = reg->host_user_addr;
+ ms.iova = reg->guest_phys_addr;
+ ms.len = reg->size;
+ rte_vfio_dma_map(vfio_container_fd, VFIO_TYPE1_IOMMU, &ms);
+ }
+
+exit:
+ if (mem)
+ free(mem);
+ return ret;
+}
+
+static int
+ifcvf_dma_unmap(struct ifcvf_info *vf_info)
+{
+ uint32_t i;
+ int ret = 0;
+ struct rte_vhost_memory *mem = NULL;
+ int vfio_container_fd;
+
+ ret = rte_vhost_get_mem_table(vf_info->vid, &mem);
+ if (ret < 0) {
+ RTE_LOG(ERR, PMD, "failed to get VM memory layout\n");
+ goto exit;
+ }
+
+ vfio_container_fd = vf_info->vfio_container_fd;
+
+ for (i = 0; i < mem->nregions; i++) {
+ struct rte_vhost_mem_region *reg;
+ struct rte_memseg ms;
+
+ reg = &mem->regions[i];
+ ms.addr_64 = reg->host_user_addr;
+ ms.iova = reg->guest_phys_addr;
+ ms.len = reg->size;
+ rte_vfio_dma_unmap(vfio_container_fd, VFIO_TYPE1_IOMMU, &ms);
+ }
+
+exit:
+ if (mem)
+ free(mem);
+ return ret;
+}
+
+static uint64_t
+qva_to_gpa(int vid, uint64_t qva)
+{
+ struct rte_vhost_memory *mem = NULL;
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+ uint64_t gpa = 0;
+
+ if (rte_vhost_get_mem_table(vid, &mem) < 0)
+ goto exit;
+
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+
+ if (qva >= reg->host_user_addr &&
+ qva < reg->host_user_addr + reg->size) {
+ gpa = qva - reg->host_user_addr + reg->guest_phys_addr;
+ break;
+ }
+ }
+
+exit:
+ if (gpa == 0)
+ rte_panic("failed to get gpa\n");
+ if (mem)
+ free(mem);
+ return gpa;
+}
+
+static int
+vdpa_ifcvf_start(struct ifcvf_info *vf_info)
+{
+ struct ifcvf_hw *hw = &vf_info->hw;
+ int i, nr_vring;
+ int vid;
+ struct rte_vhost_vring vq;
+
+ vid = vf_info->vid;
+ nr_vring = rte_vhost_get_vring_num(vid);
+ rte_vhost_get_negotiated_features(vid, &hw->req_features);
+
+ for (i = 0; i < nr_vring; i++) {
+ rte_vhost_get_vhost_vring(vid, i, &vq);
+ hw->vring[i].desc = qva_to_gpa(vid, (uint64_t)vq.desc);
+ hw->vring[i].avail = qva_to_gpa(vid, (uint64_t)vq.avail);
+ hw->vring[i].used = qva_to_gpa(vid, (uint64_t)vq.used);
+ hw->vring[i].size = vq.size;
+ rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
+ &hw->vring[i].last_used_idx);
+ }
+ hw->nr_vring = i;
+
+ return ifcvf_start_hw(&vf_info->hw);
+}
+
+static void
+vdpa_ifcvf_stop(struct ifcvf_info *vf_info)
+{
+ struct ifcvf_hw *hw = &vf_info->hw;
+ int i, j;
+ int vid;
+ uint64_t features, pfn;
+ uint64_t log_base, log_size;
+ uint8_t *log_buf;
+
+ vid = vf_info->vid;
+ ifcvf_stop_hw(hw);
+
+ for (i = 0; i < hw->nr_vring; i++)
+ rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
+ hw->vring[i].last_used_idx);
+
+ rte_vhost_get_negotiated_features(vid, &features);
+ if (RTE_VHOST_NEED_LOG(features)) {
+ ifcvf_disable_logging(hw);
+ rte_vhost_get_log_base(vf_info->vid, &log_base, &log_size);
+ /*
+ * IFCVF marks dirty pages only for the packet buffers; software
+ * marks the used rings dirty on its behalf: one log bit covers
+ * one 4K page, and a used ring of `size' entries spans about
+ * size * 8 bytes.
+ */
+ log_buf = (uint8_t *)(uintptr_t)log_base;
+ for (i = 0; i < hw->nr_vring; i++) {
+ pfn = hw->vring[i].used / 4096;
+ for (j = 0; j <= hw->vring[i].size * 8 / 4096; j++)
+ __sync_fetch_and_or_8(&log_buf[(pfn + j) / 8],
+ 1 << ((pfn + j) % 8));
+ }
+ }
+}
+
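+/* room for one eventfd per vring plus one for device config changes */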
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+ sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
+static int
+vdpa_enable_vfio_intr(struct ifcvf_info *vf_info)
+{
+ int ret;
+ uint32_t i, nr_vring;
+ char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+ struct rte_vhost_vring vring;
+
+ nr_vring = rte_vhost_get_vring_num(vf_info->vid);
+
+ irq_set = (struct vfio_irq_set *)irq_set_buf;
+ irq_set->argsz = sizeof(irq_set_buf);
+ irq_set->count = nr_vring + 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *)&irq_set->data;
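+ /* vector 0 carries device config interrupts, vectors 1..n the callfds */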
+ fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = vf_info->pdev.intr_handle.fd;
+
+ for (i = 0; i < nr_vring; i++) {
+ rte_vhost_get_vhost_vring(vf_info->vid, i, &vring);
+ fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
+ }
+
+ ret = ioctl(vf_info->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ if (ret) {
+ RTE_LOG(ERR, PMD, "Error enabling MSI-X interrupts: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vdpa_disable_vfio_intr(struct ifcvf_info *vf_info)
+{
+ int ret;
+ char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+
+ irq_set = (struct vfio_irq_set *)irq_set_buf;
+ irq_set->argsz = sizeof(irq_set_buf);
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(vf_info->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+ if (ret) {
+ RTE_LOG(ERR, PMD, "Error disabling MSI-X interrupts: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+static void *
+notify_relay(void *arg)
+{
+ int i, kickfd, epfd, nfds = 0;
+ uint32_t qid, q_num;
+ struct epoll_event events[IFCVF_MAX_QUEUES * 2];
+ struct epoll_event ev;
+ uint64_t buf;
+ int nbytes;
+ struct rte_vhost_vring vring;
+ struct ifcvf_info *vf_info = (struct ifcvf_info *)arg;
+ struct ifcvf_hw *hw = &vf_info->hw;
+
+ q_num = rte_vhost_get_vring_num(vf_info->vid);
+
+ epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
+ if (epfd < 0) {
+ RTE_LOG(ERR, PMD, "failed to create epoll instance\n");
+ return NULL;
+ }
+ vf_info->epfd = epfd;
+
+ for (qid = 0; qid < q_num; qid++) {
+ ev.events = EPOLLIN | EPOLLPRI;
+ rte_vhost_get_vhost_vring(vf_info->vid, qid, &vring);
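+ /* pack qid into the low 32 bits and kickfd into the high 32 bits */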
+ ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
+ RTE_LOG(ERR, PMD, "epoll add error, %s\n",
+ strerror(errno));
+ return NULL;
+ }
+ }
+
+ for (;;) {
+ nfds = epoll_wait(epfd, events, q_num, -1);
+ if (nfds < 0) {
+ if (errno == EINTR)
+ continue;
+ RTE_LOG(ERR, PMD, "epoll_wait failed: %s\n",
+ strerror(errno));
+ return NULL;
+ }
+
+ for (i = 0; i < nfds; i++) {
+ qid = events[i].data.u32;
+ kickfd = (uint32_t)(events[i].data.u64 >> 32);
+ do {
+ nbytes = read(kickfd, &buf, 8);
+ if (nbytes < 0) {
+ if (errno == EINTR ||
+ errno == EWOULDBLOCK ||
+ errno == EAGAIN)
+ continue;
+ RTE_LOG(INFO, PMD, "Error reading "
+ "kickfd: %s\n",
+ strerror(errno));
+ }
+ break;
+ } while (1);
+
+ ifcvf_notify_queue(hw, qid);
+ }
+ }
+
+ return NULL;
+}
+
+static int
+setup_notify_relay(struct ifcvf_info *vf_info)
+{
+ int ret;
+
+ ret = pthread_create(&vf_info->tid, NULL, notify_relay,
+ (void *)vf_info);
+ if (ret) {
+ RTE_LOG(ERR, PMD, "failed to create notify relay pthread\n");
+ return -1;
+ }
+ return 0;
+}
+
+static int
+unset_notify_relay(struct ifcvf_info *vf_info)
+{
+ void *status;
+
+ if (vf_info->tid) {
+ pthread_cancel(vf_info->tid);
+ pthread_join(vf_info->tid, &status);
+ }
+ vf_info->tid = 0;
+
+ if (vf_info->epfd >= 0)
+ close(vf_info->epfd);
+ vf_info->epfd = -1;
+
+ return 0;
+}
+
+static int
+update_datapath(struct ifcvf_info *vf_info)
+{
+ int ret;
+
+ rte_spinlock_lock(&vf_info->lock);
+
+ if (!rte_atomic32_read(&vf_info->running) &&
+ (rte_atomic32_read(&vf_info->started) &&
+ rte_atomic32_read(&vf_info->dev_attached))) {
+ ret = ifcvf_dma_map(vf_info);
+ if (ret)
+ goto err;
+
+ ret = vdpa_enable_vfio_intr(vf_info);
+ if (ret)
+ goto err;
+
+ ret = setup_notify_relay(vf_info);
+ if (ret)
+ goto err;
+
+ ret = vdpa_ifcvf_start(vf_info);
+ if (ret)
+ goto err;
+
+ rte_atomic32_set(&vf_info->running, 1);
+ } else if (rte_atomic32_read(&vf_info->running) &&
+ (!rte_atomic32_read(&vf_info->started) ||
+ !rte_atomic32_read(&vf_info->dev_attached))) {
+ vdpa_ifcvf_stop(vf_info);
+
+ ret = unset_notify_relay(vf_info);
+ if (ret)
+ goto err;
+
+ ret = vdpa_disable_vfio_intr(vf_info);
+ if (ret)
+ goto err;
+
+ ret = ifcvf_dma_unmap(vf_info);
+ if (ret)
+ goto err;
+
+ rte_atomic32_set(&vf_info->running, 0);
+ }
+
+ rte_spinlock_unlock(&vf_info->lock);
+ return 0;
+err:
+ rte_spinlock_unlock(&vf_info->lock);
+ return ret;
+}
+
+static int
+ifcvf_dev_config(int vid)
+{
+ int eid, did;
+ struct internal_list *list;
+ struct rte_eth_dev *eth_dev;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+
+ eid = rte_vhost_get_vdpa_eid(vid);
+ did = rte_vhost_get_vdpa_did(vid);
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ eth_dev = list->eth_dev;
+ internal = eth_dev->data->dev_private;
+ vf_info = &internal->vf_info[did];
+ vf_info->vid = vid;
+
+ eth_dev->data->dev_link.link_status = ETH_LINK_UP;
+
+ rte_atomic32_set(&vf_info->dev_attached, 1);
+ update_datapath(vf_info);
+
+ return 0;
+}
+
+static int
+ifcvf_dev_close(int vid)
+{
+ int eid, did;
+ struct internal_list *list;
+ struct rte_eth_dev *eth_dev;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+
+ eid = rte_vhost_get_vdpa_eid(vid);
+ did = rte_vhost_get_vdpa_did(vid);
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ eth_dev = list->eth_dev;
+ internal = eth_dev->data->dev_private;
+ vf_info = &internal->vf_info[did];
+
+ eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
+
+ rte_atomic32_set(&vf_info->dev_attached, 0);
+ update_datapath(vf_info);
+ vf_info->vid = -1;
+
+ return 0;
+}
+
+static int
+ifcvf_feature_set(int vid)
+{
+ uint64_t features;
+ int eid, did;
+ struct internal_list *list;
+ struct rte_eth_dev *eth_dev;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+ uint64_t log_base, log_size;
+
+ eid = rte_vhost_get_vdpa_eid(vid);
+ did = rte_vhost_get_vdpa_did(vid);
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ eth_dev = list->eth_dev;
+ internal = eth_dev->data->dev_private;
+ vf_info = &internal->vf_info[did];
+
+ rte_vhost_get_negotiated_features(vf_info->vid, &features);
+
+ if (RTE_VHOST_NEED_LOG(features)) {
+ rte_vhost_get_log_base(vf_info->vid, &log_base, &log_size);
+ log_base = rte_mem_virt2phy((void *)(uintptr_t)log_base);
+ ifcvf_enable_logging(&vf_info->hw, log_base, log_size);
+ }
+
+ return 0;
+}
+
+static int
+ifcvf_get_vfio_group_fd(int vid)
+{
+ int eid, did;
+ struct internal_list *list;
+ struct rte_eth_dev *eth_dev;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+
+ eid = rte_vhost_get_vdpa_eid(vid);
+ did = rte_vhost_get_vdpa_did(vid);
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ eth_dev = list->eth_dev;
+ internal = eth_dev->data->dev_private;
+ vf_info = &internal->vf_info[did];
+ return vf_info->vfio_group_fd;
+}
+
+static int
+ifcvf_get_vfio_device_fd(int vid)
+{
+ int eid, did;
+ struct internal_list *list;
+ struct rte_eth_dev *eth_dev;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+
+ eid = rte_vhost_get_vdpa_eid(vid);
+ did = rte_vhost_get_vdpa_did(vid);
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ eth_dev = list->eth_dev;
+ internal = eth_dev->data->dev_private;
+ vf_info = &internal->vf_info[did];
+ return vf_info->vfio_dev_fd;
+}
+
+static int
+ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
+{
+ int eid, did;
+ struct internal_list *list;
+ struct rte_eth_dev *eth_dev;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+ struct vfio_region_info reg = { .argsz = sizeof(reg) };
+ int ret;
+
+ eid = rte_vhost_get_vdpa_eid(vid);
+ did = rte_vhost_get_vdpa_did(vid);
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ eth_dev = list->eth_dev;
+ internal = eth_dev->data->dev_private;
+ vf_info = &internal->vf_info[did];
+
+ reg.index = ifcvf_get_notify_region(&vf_info->hw);
+ ret = ioctl(vf_info->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+ if (ret) {
+ RTE_LOG(ERR, PMD, "Cannot get device region info: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ *offset = ifcvf_get_queue_notify_off(&vf_info->hw, qid) + reg.offset;
+ *size = 0x1000;
+
+ return 0;
+}
+
+static int
+vdpa_eng_init(int eid, struct rte_vdpa_eng_addr *addr)
+{
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+ uint64_t features;
+ int i;
+
+ list = find_internal_resource_by_eng_addr(addr);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine addr\n");
+ return -1;
+ }
+
+ internal = list->eth_dev->data->dev_private;
+
+ for (i = 0; i < internal->max_devices; i++) {
+ vf_info = &internal->vf_info[i];
+ vf_info->vfio_dev_fd = -1;
+ vf_info->vfio_group_fd = -1;
+ vf_info->vfio_container_fd = -1;
+
+ if (check_pci_dev(&vf_info->pdev) < 0)
+ return -1;
+
+ if (ifcvf_vfio_setup(vf_info) < 0)
+ return -1;
+ }
+
+ internal->eid = eid;
+ internal->max_queues = IFCVF_MAX_QUEUES;
+ features = ifcvf_get_features(&internal->vf_info[0].hw);
+ internal->features = (features & ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
+ (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES);
+
+ return 0;
+}
+
+static int
+vdpa_eng_uninit(int eid)
+{
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+ int i;
+
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id %d\n", eid);
+ return -1;
+ }
+
+ internal = list->eth_dev->data->dev_private;
+ for (i = 0; i < internal->max_devices; i++) {
+ vf_info = &internal->vf_info[i];
+ rte_pci_unmap_device(&vf_info->pdev);
+ rte_vfio_destroy_container(vf_info->vfio_container_fd);
+ }
+ return 0;
+}
+
+#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
+ (1ULL << RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK)
+static int
+vdpa_info_query(int eid, struct rte_vdpa_eng_attr *attr)
+{
+ struct internal_list *list;
+ struct ifcvf_internal *internal;
+
+ list = find_internal_resource_by_eid(eid);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+ return -1;
+ }
+
+ internal = list->eth_dev->data->dev_private;
+ attr->dev_num = internal->max_devices;
+ attr->queue_num = internal->max_queues;
+ attr->features = internal->features;
+ attr->protocol_features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
+
+ return 0;
+}
+
+struct rte_vdpa_eng_driver vdpa_ifcvf_driver = {
+ .name = "ifcvf",
+ .eng_ops = {
+ .eng_init = vdpa_eng_init,
+ .eng_uninit = vdpa_eng_uninit,
+ .info_query = vdpa_info_query,
+ },
+ .dev_ops = {
+ .dev_conf = ifcvf_dev_config,
+ .dev_close = ifcvf_dev_close,
+ .vring_state_set = NULL,
+ .feature_set = ifcvf_feature_set,
+ .migration_done = NULL,
+ .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
+ .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
+ .get_notify_area = ifcvf_get_notify_area,
+ },
+};
+
+RTE_VDPA_REGISTER_DRIVER(ifcvf, vdpa_ifcvf_driver);
+
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+ int i;
+
+ internal = dev->data->dev_private;
+ for (i = 0; i < internal->max_devices; i++) {
+ vf_info = &internal->vf_info[i];
+ rte_atomic32_set(&vf_info->started, 1);
+ update_datapath(vf_info);
+ }
+
+ return 0;
+}
+
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+ struct ifcvf_internal *internal;
+ struct ifcvf_info *vf_info;
+ int i;
+
+ internal = dev->data->dev_private;
+ for (i = 0; i < internal->max_devices; i++) {
+ vf_info = &internal->vf_info[i];
+ rte_atomic32_set(&vf_info->started, 0);
+ update_datapath(vf_info);
+ }
+}
+
+static void
+eth_dev_close(struct rte_eth_dev *dev)
+{
+ struct ifcvf_internal *internal;
+ struct internal_list *list;
+
+ internal = dev->data->dev_private;
+ eth_dev_stop(dev);
+
+ list = find_internal_resource_by_eng_addr(&internal->eng_addr);
+ if (list == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid engine addr\n");
+ return;
+ }
+
+ rte_vdpa_unregister_engine(internal->eid);
+
+ pthread_mutex_lock(&internal_list_lock);
+ TAILQ_REMOVE(&internal_list, list, next);
+ pthread_mutex_unlock(&internal_list_lock);
+ rte_free(list);
+
+ rte_free(dev->data->mac_addrs);
+ free(internal->dev_name);
+ rte_free(internal);
+
+ dev->data->dev_private = NULL;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+ return 0;
+}
+
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+ struct ifcvf_internal *internal;
+
+ internal = dev->data->dev_private;
+ if (internal == NULL) {
+ RTE_LOG(ERR, PMD, "Invalid device specified\n");
+ return;
+ }
+
+ dev_info->max_mac_addrs = 1;
+ dev_info->max_rx_pktlen = (uint32_t)-1;
+ dev_info->max_rx_queues = internal->max_queues;
+ dev_info->max_tx_queues = internal->max_queues;
+ dev_info->min_rx_bufsize = 0;
+}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev __rte_unused,
+ uint16_t rx_queue_id __rte_unused,
+ uint16_t nb_rx_desc __rte_unused,
+ unsigned int socket_id __rte_unused,
+ const struct rte_eth_rxconf *rx_conf __rte_unused,
+ struct rte_mempool *mb_pool __rte_unused)
+{
+ return 0;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev __rte_unused,
+ uint16_t tx_queue_id __rte_unused,
+ uint16_t nb_tx_desc __rte_unused,
+ unsigned int socket_id __rte_unused,
+ const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+ return 0;
+}
+
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+static uint16_t
+eth_ifcvf_rx(void *q __rte_unused, struct rte_mbuf **bufs __rte_unused,
+ uint16_t nb_bufs __rte_unused)
+{
+ return 0;
+}
+
+static uint16_t
+eth_ifcvf_tx(void *q __rte_unused, struct rte_mbuf **bufs __rte_unused,
+ uint16_t nb_bufs __rte_unused)
+{
+ return 0;
+}
+
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+ int wait_to_complete __rte_unused)
+{
+ return 0;
+}
+
+static const struct eth_dev_ops ops = {
+ .dev_start = eth_dev_start,
+ .dev_stop = eth_dev_stop,
+ .dev_close = eth_dev_close,
+ .dev_configure = eth_dev_configure,
+ .dev_infos_get = eth_dev_info,
+ .rx_queue_setup = eth_rx_queue_setup,
+ .tx_queue_setup = eth_tx_queue_setup,
+ .rx_queue_release = eth_queue_release,
+ .tx_queue_release = eth_queue_release,
+ .link_update = eth_link_update,
+};
+
+static int
+eth_dev_ifcvf_create(struct rte_vdev_device *dev,
+ struct rte_pci_addr *pci_addr, int devices)
+{
+ const char *name = rte_vdev_device_name(dev);
+ struct rte_eth_dev *eth_dev = NULL;
+ struct ether_addr *eth_addr = NULL;
+ struct ifcvf_internal *internal = NULL;
+ struct internal_list *list = NULL;
+ struct rte_eth_dev_data *data = NULL;
+ struct rte_pci_addr pf_addr = *pci_addr;
+ int i;
+
+ list = rte_zmalloc_socket(name, sizeof(*list), 0,
+ dev->device.numa_node);
+ if (list == NULL)
+ goto error;
+
+ /* reserve an ethdev entry */
+ eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
+ if (eth_dev == NULL)
+ goto error;
+
+ eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0,
+ dev->device.numa_node);
+ if (eth_addr == NULL)
+ goto error;
+
+ *eth_addr = base_eth_addr;
+ eth_addr->addr_bytes[5] = eth_dev->data->port_id;
+
+ internal = eth_dev->data->dev_private;
+ internal->dev_name = strdup(name);
+ if (internal->dev_name == NULL)
+ goto error;
+
+ internal->eng_addr.pci_addr = *pci_addr;
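+ /*
+ * Each VF is assumed to sit right after the engine address: VF i
+ * is at PF address + (i + 1), with 8 functions per devid.
+ */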
+ for (i = 0; i < devices; i++) {
+ pf_addr.domain = pci_addr->domain;
+ pf_addr.bus = pci_addr->bus;
+ pf_addr.devid = pci_addr->devid + (i + 1) / 8;
+ pf_addr.function = pci_addr->function + (i + 1) % 8;
+ internal->vf_info[i].pdev.addr = pf_addr;
+ rte_spinlock_init(&internal->vf_info[i].lock);
+ }
+ internal->max_devices = devices;
+
+ list->eth_dev = eth_dev;
+ pthread_mutex_lock(&internal_list_lock);
+ TAILQ_INSERT_TAIL(&internal_list, list, next);
+ pthread_mutex_unlock(&internal_list_lock);
+
+ data = eth_dev->data;
+ data->nb_rx_queues = IFCVF_MAX_QUEUES;
+ data->nb_tx_queues = IFCVF_MAX_QUEUES;
+ data->dev_link = vdpa_link;
+ data->mac_addrs = eth_addr;
+ data->dev_flags = RTE_ETH_DEV_INTR_LSC;
+ eth_dev->dev_ops = &ops;
+
+ /* assign rx and tx ops, could be used as vDPA fallback */
+ eth_dev->rx_pkt_burst = eth_ifcvf_rx;
+ eth_dev->tx_pkt_burst = eth_ifcvf_tx;
+
+ if (rte_vdpa_register_engine(vdpa_ifcvf_driver.name,
+ &internal->eng_addr) < 0)
+ goto error;
+
+ return 0;
+
+error:
+ rte_free(list);
+ rte_free(eth_addr);
+ if (internal && internal->dev_name)
+ free(internal->dev_name);
+ rte_free(internal);
+ if (eth_dev)
+ rte_eth_dev_release_port(eth_dev);
+
+ return -1;
+}
+
+static int
+get_pci_addr(const char *key __rte_unused, const char *value, void *extra_args)
+{
+ if (value == NULL || extra_args == NULL)
+ return -1;
+
+ return rte_pci_addr_parse(value, extra_args);
+}
+
+static inline int
+open_int(const char *key __rte_unused, const char *value, void *extra_args)
+{
+ uint16_t *n = extra_args;
+
+ if (value == NULL || extra_args == NULL)
+ return -EINVAL;
+
+ *n = (uint16_t)strtoul(value, NULL, 0);
+ if (*n == USHRT_MAX && errno == ERANGE)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * If this vdev is created by the user, the underlying ifcvf devices are
+ * taken over: any ethdev port already probed on them is closed and
+ * detached.
+ */
+static int
+ifcvf_take_over(struct rte_pci_addr *pci_addr, int num)
+{
+ uint16_t port_id;
+ int i, ret;
+ char devname[RTE_DEV_NAME_MAX_LEN];
+ struct rte_pci_addr vf_addr = *pci_addr;
+
+ for (i = 0; i < num; i++) {
+ vf_addr.function = pci_addr->function + i % 8;
+ vf_addr.devid = pci_addr->devid + i / 8;
+ rte_pci_device_name(&vf_addr, devname, RTE_DEV_NAME_MAX_LEN);
+ ret = rte_eth_dev_get_port_by_name(devname, &port_id);
+ if (ret == 0) {
+ rte_eth_dev_close(port_id);
+ if (rte_eth_dev_detach(port_id, devname) < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
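+/*
+ * A hypothetical invocation (BDF is illustrative only):
+ * --vdev 'net_ifcvf,bdf=0000:06:00.2,devices=2'
+ */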
+static int
+rte_ifcvf_probe(struct rte_vdev_device *dev)
+{
+ struct rte_kvargs *kvlist = NULL;
+ int ret = 0;
+ struct rte_pci_addr pci_addr;
+ uint16_t devices;
+
+ RTE_LOG(INFO, PMD, "Initializing ifcvf for %s\n",
+ rte_vdev_device_name(dev));
+
+ kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+ if (kvlist == NULL)
+ return -1;
+
+ if (rte_kvargs_count(kvlist, ETH_IFCVF_BDF_ARG) == 1) {
+ ret = rte_kvargs_process(kvlist, ETH_IFCVF_BDF_ARG,
+ &get_pci_addr, &pci_addr);
+ if (ret < 0)
+ goto out_free;
+
+ } else {
+ ret = -1;
+ goto out_free;
+ }
+
+ if (rte_kvargs_count(kvlist, ETH_IFCVF_DEVICES_ARG) == 1) {
+ ret = rte_kvargs_process(kvlist, ETH_IFCVF_DEVICES_ARG,
+ &open_int, &devices);
+ if (ret < 0)
+ goto out_free;
+ if (devices > IFCVF_MAX_DEVICES) {
+ ret = -1;
+ goto out_free;
+ }
+ } else {
+ devices = 1;
+ }
+
+ ret = ifcvf_take_over(&pci_addr, devices);
+ if (ret < 0)
+ goto out_free;
+
+ ret = eth_dev_ifcvf_create(dev, &pci_addr, devices);
+
+out_free:
+ rte_kvargs_free(kvlist);
+ return ret;
+}
+
+static int
+rte_ifcvf_remove(struct rte_vdev_device *dev)
+{
+ const char *name;
+ struct rte_eth_dev *eth_dev = NULL;
+
+ name = rte_vdev_device_name(dev);
+ RTE_LOG(INFO, PMD, "Un-Initializing ifcvf for %s\n", name);
+
+ /* find an ethdev entry */
+ eth_dev = rte_eth_dev_allocated(name);
+ if (eth_dev == NULL)
+ return -ENODEV;
+
+ eth_dev_close(eth_dev);
+ rte_free(eth_dev->data);
+ rte_eth_dev_release_port(eth_dev);
+
+ return 0;
+}
+
+static struct rte_vdev_driver ifcvf_drv = {
+ .probe = rte_ifcvf_probe,
+ .remove = rte_ifcvf_remove,
+};
+
+RTE_PMD_REGISTER_VDEV(net_ifcvf, ifcvf_drv);
+RTE_PMD_REGISTER_ALIAS(net_ifcvf, eth_ifcvf);
+RTE_PMD_REGISTER_PARAM_STRING(net_ifcvf,
+ "bdf=<bdf> "
+ "devices=<int>");
new file mode 100644
@@ -0,0 +1,4 @@
+EXPERIMENTAL {
+
+ local: *;
+};
@@ -171,6 +171,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD) += -lrte_pmd_vdev_netvsc
_LDLIBS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += -lrte_pmd_virtio
ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost
+_LDLIBS-$(CONFIG_RTE_LIBRTE_IFCVF) += -lrte_ifcvf
endif # $(CONFIG_RTE_LIBRTE_VHOST)
_LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio