@@ -36,15 +36,41 @@
#include <string.h>
#include <inttypes.h>
#include <sys/queue.h>
-
+#include <errno.h>
+#include <sys/signalfd.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
#include <rte_bus.h>
#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_debug.h>
#include <rte_log.h>
+#include <rte_spinlock.h>
#include "eal_private.h"
+/* spinlock for uevent callbacks */
+static rte_spinlock_t rte_eal_uev_cb_lock = RTE_SPINLOCK_INITIALIZER;
+
+/**
+ * The user application callback description.
+ *
+ * It contains the callback address to be registered by the user
+ * application, the pointer to the parameters for the callback,
+ * and the uevent type the callback subscribes to.
+ */
+struct rte_eal_uev_callback {
+ TAILQ_ENTRY(rte_eal_uev_callback) next; /**< Callbacks list */
+ rte_eal_uev_cb_fn cb_fn; /**< Callback address */
+ void *cb_arg; /**< Parameter for callback */
+ void *ret_param; /**< Return parameter */
+ enum rte_eal_uevent_type event; /**< Uevent type subscribed to */
+ uint32_t active; /**< Callback is executing */
+};
+
static int cmp_detached_dev_name(const struct rte_device *dev,
const void *_name)
{
@@ -244,3 +270,223 @@ int rte_eal_hotplug_remove(const char *busname, const char *devname)
rte_eal_devargs_remove(busname, devname);
return ret;
}
+
+/* Open a netlink socket subscribed to kernel object uevents. */
+int
+rte_eal_uev_fd_new(void)
+{
+ int fd;
+
+ fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
+
+ return fd < 0 ? -1 : fd;
+}
+
+/*
+ * Prepare the netlink fd for uevent reception: make it non-blocking,
+ * enlarge its receive buffer and bind it to all multicast groups.
+ * NOTE: on failure the fd is closed; the caller must treat it as
+ * invalid afterwards.
+ */
+int
+rte_eal_uev_enable(int netlink_fd)
+{
+ struct sockaddr_nl addr;
+ int ret;
+ int size = 64 * 1024;
+ int nonblock = 1;
+
+ memset(&addr, 0, sizeof(addr));
+ addr.nl_family = AF_NETLINK;
+ addr.nl_pid = 0;
+ addr.nl_groups = 0xffffffff;
+
+ /* best effort: a small buffer only risks dropped uevents */
+ if (setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE,
+ &size, sizeof(size)) < 0)
+ RTE_LOG(WARNING, EAL,
+ "setsockopt(SO_RCVBUFFORCE) failed\n");
+
+ ret = ioctl(netlink_fd, FIONBIO, &nonblock);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL,
+ "ioctl(FIONBIO) failed\n");
+ close(netlink_fd);
+ return -1;
+ }
+
+ if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+ RTE_LOG(ERR, EAL,
+ "netlink bind failed: %s\n", strerror(errno));
+ close(netlink_fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Parse one netlink uevent message (a sequence of NUL-terminated
+ * "KEY=value" strings) into *event.
+ * Returns 0 when the event belongs to the uio/pci subsystem,
+ * -1 otherwise.
+ */
+static int
+rte_eal_uev_parse(const char *buf, struct rte_eal_uevent *event)
+{
+ char action[RTE_EAL_UEVENT_MSG_LEN];
+ char subsystem[RTE_EAL_UEVENT_MSG_LEN];
+ char dev_path[RTE_EAL_UEVENT_MSG_LEN];
+ int i = 0;
+
+ memset(action, 0, RTE_EAL_UEVENT_MSG_LEN);
+ memset(subsystem, 0, RTE_EAL_UEVENT_MSG_LEN);
+ memset(dev_path, 0, RTE_EAL_UEVENT_MSG_LEN);
+
+ while (i < RTE_EAL_UEVENT_MSG_LEN) {
+ /* skip the NUL separators between "KEY=value" strings */
+ for (; i < RTE_EAL_UEVENT_MSG_LEN; i++) {
+ if (*buf)
+ break;
+ buf++;
+ }
+ /* copy the value part of the keys we care about */
+ if (!strncmp(buf, "ACTION=", 7)) {
+ buf += 7;
+ i += 7;
+ snprintf(action, sizeof(action), "%s", buf);
+ } else if (!strncmp(buf, "DEVPATH=", 8)) {
+ buf += 8;
+ i += 8;
+ snprintf(dev_path, sizeof(dev_path), "%s", buf);
+ } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+ buf += 10;
+ i += 10;
+ snprintf(subsystem, sizeof(subsystem), "%s", buf);
+ }
+ /* advance to the end of the current string */
+ for (; i < RTE_EAL_UEVENT_MSG_LEN; i++) {
+ if (*buf == '\0')
+ break;
+ buf++;
+ }
+ }
+
+ /* NOTE(review): "pci" events are also reported as SUBSYSTEM_UIO
+ * here — confirm this aliasing is intended.
+ */
+ if ((!strncmp(subsystem, "uio", 3)) ||
+ (!strncmp(subsystem, "pci", 3))) {
+ event->subsystem = RTE_EAL_UEVENT_SUBSYSTEM_UIO;
+ if (!strncmp(action, "add", 3))
+ event->type = RTE_EAL_UEVENT_ADD;
+ if (!strncmp(action, "remove", 6))
+ event->type = RTE_EAL_UEVENT_REMOVE;
+ return 0;
+ }
+
+ return -1;
+}
+
+/*
+ * Read one uevent from the (non-blocking) netlink fd and parse it
+ * into *uevent. Returns 0 on success, -1 when there is no event,
+ * on read error or when the connection is closed.
+ */
+int
+rte_eal_uev_receive(int fd, struct rte_eal_uevent *uevent)
+{
+ int ret;
+ char buf[RTE_EAL_UEVENT_MSG_LEN];
+
+ memset(uevent, 0, sizeof(struct rte_eal_uevent));
+ memset(buf, 0, RTE_EAL_UEVENT_MSG_LEN);
+
+ ret = recv(fd, buf, RTE_EAL_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
+ if (ret > 0)
+ return rte_eal_uev_parse(buf, uevent);
+ if (ret < 0) {
+ /* no pending data on a non-blocking fd is not an error */
+ if (errno != EAGAIN && errno != EWOULDBLOCK &&
+ errno != EINTR)
+ RTE_LOG(ERR, EAL,
+ "Socket read error(%d): %s\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ /* connection closed */
+ return -1;
+}
+
+/*
+ * Register a user callback for @event on @dev.
+ * Registering the same (cb_fn, cb_arg, event) tuple twice is a
+ * no-op reported as success.
+ * Returns 0 on success, -EINVAL if cb_fn is NULL, -ENOMEM when a
+ * new callback entry cannot be allocated.
+ */
+int
+rte_eal_uev_callback_register(struct rte_device *dev,
+ enum rte_eal_uevent_type event,
+ rte_eal_uev_cb_fn cb_fn, void *cb_arg)
+{
+ struct rte_eal_uev_callback *user_cb;
+
+ if (!cb_fn)
+ return -EINVAL;
+
+ rte_spinlock_lock(&rte_eal_uev_cb_lock);
+
+ /* look for an existing registration of the same tuple */
+ TAILQ_FOREACH(user_cb, &(dev->uev_cbs), next) {
+ if (user_cb->cb_fn == cb_fn &&
+ user_cb->cb_arg == cb_arg &&
+ user_cb->event == event) {
+ break;
+ }
+ }
+
+ /* create a new callback. */
+ if (user_cb == NULL) {
+ user_cb = rte_zmalloc("EAL_UEV_CALLBACK",
+ sizeof(struct rte_eal_uev_callback), 0);
+ if (user_cb != NULL) {
+ user_cb->cb_fn = cb_fn;
+ user_cb->cb_arg = cb_arg;
+ user_cb->event = event;
+ TAILQ_INSERT_TAIL(&(dev->uev_cbs), user_cb, next);
+ }
+ }
+
+ rte_spinlock_unlock(&rte_eal_uev_cb_lock);
+ return (user_cb == NULL) ? -ENOMEM : 0;
+}
+
+/*
+ * Unregister callbacks matching (cb_fn, event) on @dev.
+ * A cb_arg of (void *)-1 acts as a wildcard and removes all
+ * registrations of that function/event pair.
+ * Returns 0 on success, -EINVAL if cb_fn is NULL, -EAGAIN when a
+ * matching callback is currently executing and cannot be removed.
+ */
+int
+rte_eal_uev_callback_unregister(struct rte_device *dev,
+ enum rte_eal_uevent_type event,
+ rte_eal_uev_cb_fn cb_fn, void *cb_arg)
+{
+ int ret;
+ struct rte_eal_uev_callback *cb, *next;
+
+ if (!cb_fn)
+ return -EINVAL;
+
+ rte_spinlock_lock(&rte_eal_uev_cb_lock);
+
+ ret = 0;
+ /* safe traversal: 'next' is cached before a possible removal */
+ for (cb = TAILQ_FIRST(&dev->uev_cbs); cb != NULL; cb = next) {
+
+ next = TAILQ_NEXT(cb, next);
+
+ if (cb->cb_fn != cb_fn || cb->event != event ||
+ (cb->cb_arg != (void *)-1 &&
+ cb->cb_arg != cb_arg))
+ continue;
+
+ /*
+ * if this callback is not executing right now,
+ * then remove it.
+ */
+ if (cb->active == 0) {
+ TAILQ_REMOVE(&(dev->uev_cbs), cb, next);
+ rte_free(cb);
+ } else {
+ ret = -EAGAIN;
+ }
+ }
+
+ rte_spinlock_unlock(&rte_eal_uev_cb_lock);
+ return ret;
+}
+
+/*
+ * @internal Invoke every callback registered on @dev for @event.
+ * The list lock is dropped around each invocation; the 'active'
+ * flag keeps rte_eal_uev_callback_unregister() from freeing an
+ * entry that is currently executing.
+ * Returns the return value of the last callback run (0 if none).
+ */
+int
+_rte_eal_uev_callback_process(struct rte_device *dev,
+ enum rte_eal_uevent_type event, void *cb_arg, void *ret_param)
+{
+ struct rte_eal_uev_callback *cb_lst;
+ struct rte_eal_uev_callback dev_cb;
+ int rc = 0;
+
+ rte_spinlock_lock(&rte_eal_uev_cb_lock);
+ TAILQ_FOREACH(cb_lst, &(dev->uev_cbs), next) {
+ if (cb_lst->cb_fn == NULL || cb_lst->event != event)
+ continue;
+ /* snapshot the entry so it can be used unlocked */
+ dev_cb = *cb_lst;
+ cb_lst->active = 1;
+ if (cb_arg != NULL)
+ dev_cb.cb_arg = cb_arg;
+ if (ret_param != NULL)
+ dev_cb.ret_param = ret_param;
+
+ /* drop the lock while in user code so the callback may
+ * itself call register/unregister without deadlocking.
+ */
+ rte_spinlock_unlock(&rte_eal_uev_cb_lock);
+ rc = dev_cb.cb_fn(dev, dev_cb.event,
+ dev_cb.cb_arg, dev_cb.ret_param);
+ rte_spinlock_lock(&rte_eal_uev_cb_lock);
+ cb_lst->active = 0;
+ }
+ rte_spinlock_unlock(&rte_eal_uev_cb_lock);
+ return rc;
+}
@@ -110,6 +110,26 @@ pci_name_set(struct rte_pci_device *dev)
dev->device.name = dev->name;
}
+/* map a private (anonymous) resource at a fixed address.
+ * NOTE(review): 'offset' is only used in the error log; an
+ * anonymous mapping has no backing file, so no offset applies.
+ */
+void *
+pci_map_private_resource(void *requested_addr, off_t offset, size_t size)
+{
+ void *mapaddr;
+
+ /* MAP_FIXED silently replaces any existing mapping at
+ * requested_addr — intended here, to keep the BAR virtual
+ * address valid after the device disappears.
+ */
+ mapaddr = mmap(requested_addr, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (mapaddr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "%s(): cannot mmap(%p, 0x%lx, 0x%lx): %s (%p)\n",
+ __func__, requested_addr,
+ (unsigned long)size, (unsigned long)offset,
+ strerror(errno), mapaddr);
+ } else
+ RTE_LOG(DEBUG, EAL, "  PCI memory mapped at %p\n", mapaddr);
+
+ return mapaddr;
+}
+
/* map a particular resource from a file */
void *
pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
@@ -192,6 +192,8 @@ int pci_uio_map_resource(struct rte_pci_device *dev);
*/
void pci_uio_unmap_resource(struct rte_pci_device *dev);
+void pci_uio_uev_handler(void *parm);
+
/**
* Allocate uio resource for PCI device
*
@@ -222,6 +224,18 @@ void pci_uio_free_resource(struct rte_pci_device *dev,
struct mapped_pci_resource *uio_res);
/**
+ * Remap the PCI UIO resource.
+ *
+ * @param dev
+ *   Pointer to the struct rte_pci_device.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+pci_uio_remap_resource(struct rte_pci_device *dev);
+
+/**
* Map device memory to uio resource
*
* This function is private to EAL.
@@ -52,6 +52,13 @@ extern "C" {
#include <rte_config.h>
#include <rte_log.h>
+struct rte_device;
+
+struct rte_eal_uev_callback;
+/** @internal Structure to keep track of registered callbacks */
+TAILQ_HEAD(rte_eal_uev_cb_list, rte_eal_uev_callback);
+
+
__attribute__((format(printf, 2, 0)))
static inline void
rte_pmd_debug_trace(const char *func_name, const char *fmt, ...)
@@ -163,6 +170,8 @@ struct rte_device {
const struct rte_driver *driver;/**< Associated driver */
int numa_node; /**< NUMA node connection */
struct rte_devargs *devargs; /**< Device user arguments */
+ /** User application callbacks for device uevent monitoring */
+ struct rte_eal_uev_cb_list uev_cbs;
};
/**
@@ -246,6 +255,133 @@ int rte_eal_hotplug_add(const char *busname, const char *devname,
*/
int rte_eal_hotplug_remove(const char *busname, const char *devname);
+#define RTE_EAL_UEVENT_MSG_LEN 4096
+#define RTE_EAL_UEVENT_SUBSYSTEM_UIO 1
+#define RTE_EAL_UEVENT_SUBSYSTEM_VFIO 2
+
+/**
+ * The device uevent type.
+ */
+enum rte_eal_uevent_type {
+	RTE_EAL_UEVENT_UNKNOWN, /**< unknown uevent type */
+	RTE_EAL_UEVENT_ADD, /**< device add event */
+	RTE_EAL_UEVENT_REMOVE,
+	/**< device remove event */
+	RTE_EAL_UEVENT_CHANGE,
+	/**< device state change event */
+	RTE_EAL_UEVENT_MOVE, /**< device path move event */
+	RTE_EAL_UEVENT_ONLINE, /**< device online event */
+	RTE_EAL_UEVENT_OFFLINE, /**< device offline event */
+	RTE_EAL_UEVENT_MAX /**< max value of this enum */
+};
+
+/** A parsed kernel uevent. */
+struct rte_eal_uevent {
+	enum rte_eal_uevent_type type; /**< uevent action type */
+	int subsystem; /**< subsystem id (RTE_EAL_UEVENT_SUBSYSTEM_*) */
+};
+
+/**
+ * create the device uevent file descriptor.
+ * @return
+ * - On success, the device uevent fd.
+ * - On failure, a negative value.
+ */
+int
+rte_eal_uev_fd_new(void);
+
+/**
+ * Bind the netlink to enable uevent receiving.
+ *
+ * @param fd
+ * The fd which the uevent associated to
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int
+rte_eal_uev_enable(int fd);
+
+/**
+ * Read a uevent from the given file descriptor.
+ *
+ * @param fd
+ * The fd which the uevent associated to
+ * @param uevent
+ * Pointer to the uevent which read from the monitoring fd.
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int
+rte_eal_uev_receive(int fd, struct rte_eal_uevent *uevent);
+
+typedef int (*rte_eal_uev_cb_fn)(struct rte_device *dev,
+ enum rte_eal_uevent_type event, void *cb_arg, void *ret_param);
+/**< user application callback to be registered for device uevents */
+
+/**
+ * Register a callback function for a specific device.
+ *
+ * @param dev
+ * Pointer to struct rte_device.
+ * @param event
+ * Uevent interested.
+ * @param cb_fn
+ * User supplied callback function to be called.
+ * @param cb_arg
+ * Pointer to the parameters for the registered callback.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int rte_eal_uev_callback_register(struct rte_device *dev,
+ enum rte_eal_uevent_type event,
+ rte_eal_uev_cb_fn cb_fn, void *cb_arg);
+
+/**
+ * Unregister a callback function for specific device.
+ *
+ * @param dev
+ *   Pointer to struct rte_device.
+ * @param event
+ * Uevent interested.
+ * @param cb_fn
+ * User supplied callback function to be called.
+ * @param cb_arg
+ * Pointer to the parameters for the registered callback. -1 means to
+ * remove all for the same callback address and same event.
+ *
+ * @return
+ * - On success, zero.
+ * - On failure, a negative value.
+ */
+int rte_eal_uev_callback_unregister(struct rte_device *dev,
+ enum rte_eal_uevent_type event,
+ rte_eal_uev_cb_fn cb_fn, void *cb_arg);
+
+/**
+ * @internal Executes all the user application registered callbacks for
+ * the specific device. It is for DPDK internal user only. User
+ * application should not call it directly.
+ *
+ * @param dev
+ * Pointer to struct rte_device.
+ * @param event
+ * rte device uevent type.
+ * @param cb_arg
+ * callback parameter.
+ * @param ret_param
+ * To pass data back to user application.
+ * This allows the user application to decide if a particular function
+ * is permitted or not.
+ *
+ * @return
+ * int
+ */
+int _rte_eal_uev_callback_process(struct rte_device *dev,
+ enum rte_eal_uevent_type event, void *cb_arg, void *ret_param);
+
/**
* Device comparison function.
*
@@ -394,6 +394,23 @@ void rte_pci_unmap_device(struct rte_pci_device *dev);
/**
* @internal
+ * Map to a particular private resource.
+ *
+ * @param requested_addr
+ * The starting address for the new mapping range.
+ * @param offset
+ * The offset for the mapping range.
+ * @param size
+ * The size for the mapping range.
+ * @return
+ * - On success, the function returns a pointer to the mapped area.
+ * - On error, the value MAP_FAILED is returned.
+ */
+void *pci_map_private_resource(void *requested_addr, off_t offset,
+ size_t size);
+
+/**
+ * @internal
* Map a particular resource from a file.
*
* @param requested_addr
@@ -670,11 +670,16 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
RTE_SET_USED(r);
return -1;
}
+
rte_spinlock_lock(&intr_lock);
- TAILQ_FOREACH(src, &intr_sources, next)
+ TAILQ_FOREACH(src, &intr_sources, next) {
if (src->intr_handle.fd ==
events[n].data.fd)
break;
+ else if (src->intr_handle.uevent_fd ==
+ events[n].data.fd)
+ break;
+ }
if (src == NULL){
rte_spinlock_unlock(&intr_lock);
continue;
@@ -736,17 +741,13 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
rte_spinlock_lock(&intr_lock);
if (call) {
-
/* Finally, call all callbacks. */
TAILQ_FOREACH(cb, &src->callbacks, next) {
-
/* make a copy and unlock. */
active_cb = *cb;
rte_spinlock_unlock(&intr_lock);
-
/* call the actual callback */
active_cb.cb_fn(active_cb.cb_arg);
-
/*get the lock back. */
rte_spinlock_lock(&intr_lock);
}
@@ -859,7 +860,24 @@ eal_intr_thread_main(__rte_unused void *arg)
}
else
numfds++;
+
+		/**
+		 * add device uevent file descriptor
+		 * into wait list for uevent monitoring;
+		 * skip sources that have no uevent fd.
+		 */
+		if (src->intr_handle.uevent_fd >= 0) {
+			ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
+			ev.data.fd = src->intr_handle.uevent_fd;
+			if (epoll_ctl(pfd, EPOLL_CTL_ADD,
+				src->intr_handle.uevent_fd, &ev) < 0){
+				rte_panic("Error adding uevent_fd %d epoll_ctl"
+					", %s\n",
+					src->intr_handle.uevent_fd,
+					strerror(errno));
+			} else
+				numfds++;
+		}
}
+
+
rte_spinlock_unlock(&intr_lock);
/* serve the interrupt */
eal_intr_handle_interrupts(pfd, numfds);
@@ -52,10 +52,14 @@ void *pci_find_max_end_va(void);
int pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr,
uint64_t *end_addr, uint64_t *flags);
+void pci_uio_uev_handler(void *param);
int pci_uio_alloc_resource(struct rte_pci_device *dev,
struct mapped_pci_resource **uio_res);
void pci_uio_free_resource(struct rte_pci_device *dev,
struct mapped_pci_resource *uio_res);
+
+int pci_uio_remap_resource(struct rte_pci_device *dev);
+
int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
struct mapped_pci_resource *uio_res, int map_idx);
@@ -231,6 +231,10 @@ pci_uio_free_resource(struct rte_pci_device *dev,
close(dev->intr_handle.uio_cfg_fd);
dev->intr_handle.uio_cfg_fd = -1;
}
+ if (dev->intr_handle.uevent_fd >= 0) {
+ close(dev->intr_handle.uevent_fd);
+ dev->intr_handle.uevent_fd = -1;
+ }
if (dev->intr_handle.fd >= 0) {
close(dev->intr_handle.fd);
dev->intr_handle.fd = -1;
@@ -239,6 +243,53 @@ pci_uio_free_resource(struct rte_pci_device *dev,
}
int
+pci_uio_remap_resource(struct rte_pci_device *dev)
+{
+	int i;
+	void *map_address;
+
+	/* Remap all BARs onto private anonymous memory so accesses
+	 * after hot-unplug hit dummy memory instead of faulting.
+	 */
+	for (i = 0; i != PCI_MAX_RESOURCE; i++) {
+		/* skip empty BAR */
+		if (dev->mem_resource[i].phys_addr == 0)
+			continue;
+		map_address = pci_map_private_resource(
+				dev->mem_resource[i].addr, 0,
+				(size_t)dev->mem_resource[i].len);
+		if (map_address == MAP_FAILED)
+			return -1;
+		/* mimic the all-ones pattern a surprise-removed PCI
+		 * device returns on reads.
+		 */
+		memset(map_address, 0xFF, (size_t)dev->mem_resource[i].len);
+		dev->mem_resource[i].addr = map_address;
+	}
+
+	return 0;
+}
+
+/*
+ * Interrupt-thread handler for device uevents: on a removal event
+ * of a uio device, remap its BARs to dummy memory and then run the
+ * user callbacks registered for RTE_EAL_UEVENT_REMOVE.
+ */
+void
+pci_uio_uev_handler(void *param)
+{
+	struct rte_pci_device *dev = (struct rte_pci_device *)param;
+	struct rte_eal_uevent event;
+
+	/* check device uevent */
+	if (rte_eal_uev_receive(dev->intr_handle.uevent_fd, &event) != 0)
+		return;
+	if (event.subsystem != RTE_EAL_UEVENT_SUBSYSTEM_UIO)
+		return;
+	if (event.type != RTE_EAL_UEVENT_REMOVE)
+		return;
+
+	/* remap the resource to dummy memory before removal processing */
+	if (pci_uio_remap_resource(dev) != 0) {
+		RTE_LOG(ERR, EAL,
+			"failed to remap resources on removal of %s\n",
+			dev->name);
+		return;
+	}
+	_rte_eal_uev_callback_process(&dev->device,
+			RTE_EAL_UEVENT_REMOVE, NULL, NULL);
+}
+
+int
pci_uio_alloc_resource(struct rte_pci_device *dev,
struct mapped_pci_resource **uio_res)
{
@@ -246,6 +297,7 @@ pci_uio_alloc_resource(struct rte_pci_device *dev,
char cfgname[PATH_MAX];
char devname[PATH_MAX]; /* contains the /dev/uioX */
int uio_num;
+ struct rte_intr_handle *intr_handle;
struct rte_pci_addr *loc;
loc = &dev->addr;
@@ -276,6 +328,16 @@ pci_uio_alloc_resource(struct rte_pci_device *dev,
goto error;
}
+	TAILQ_INIT(&(dev->device.uev_cbs));
+
+	dev->intr_handle.uevent_fd = rte_eal_uev_fd_new();
+	intr_handle = &dev->intr_handle;
+
+	if (intr_handle->uevent_fd >= 0 &&
+			rte_eal_uev_enable(intr_handle->uevent_fd) == 0) {
+		/* register callback func to eal lib */
+		rte_intr_callback_register(intr_handle,
+				pci_uio_uev_handler, dev);
+	} else {
+		/* rte_eal_uev_enable() closed the fd on failure */
+		intr_handle->uevent_fd = -1;
+		RTE_LOG(WARNING, EAL,
+			"device uevent monitoring is disabled\n");
+	}
+
if (dev->kdrv == RTE_KDRV_IGB_UIO)
dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
else {
@@ -90,6 +90,7 @@ struct rte_intr_handle {
for uio_pci_generic */
};
int fd; /**< interrupt event file descriptor */
+ int uevent_fd; /**< uevent file descriptor */
enum rte_intr_handle_type type; /**< handle type */
uint32_t max_intr; /**< max interrupt requested */
uint32_t nb_efd; /**< number of available efd(event fd) */
@@ -235,5 +236,4 @@ rte_intr_allow_others(struct rte_intr_handle *intr_handle);
*/
int
rte_intr_cap_multiple(struct rte_intr_handle *intr_handle);
-
#endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */