[dpdk-dev,V9,4/5] pci_uio: add uevent hotplug failure handler in pci

Message ID 1515575544-2141-5-git-send-email-jia.guo@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues

Commit Message

Guo, Jia Jan. 10, 2018, 9:12 a.m. UTC
when detect hot removal uevent of device, the device resource become invalid,
in order to avoid unexpected usage of this resource, remap the device resource
to be a fake memory, that would lead the application keep running well but not
encounter system core dump.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
v9->v8:
split the patch set into small and explicit patch
---
 drivers/bus/pci/bsd/pci.c                 | 23 ++++++++++++++++++++
 drivers/bus/pci/linux/pci.c               | 34 ++++++++++++++++++++++++++++++
 drivers/bus/pci/pci_common.c              | 22 +++++++++++++++++++
 drivers/bus/pci/pci_common_uio.c          | 28 +++++++++++++++++++++++++
 drivers/bus/pci/private.h                 | 12 +++++++++++
 drivers/bus/pci/rte_bus_pci.h             | 11 ++++++++++
 drivers/bus/vdev/vdev.c                   |  9 +++++++-
 lib/librte_eal/common/eal_common_bus.c    |  1 +
 lib/librte_eal/common/include/rte_bus.h   | 17 +++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_dev.c     | 35 ++++++++++++++++++++++++++++---
 lib/librte_eal/linuxapp/igb_uio/igb_uio.c |  6 ++++++
 lib/librte_pci/rte_pci.c                  | 20 ++++++++++++++++++
 lib/librte_pci/rte_pci.h                  | 17 +++++++++++++++
 13 files changed, 231 insertions(+), 4 deletions(-)
  

Patch

diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c
index 655b34b..d7165b9 100644
--- a/drivers/bus/pci/bsd/pci.c
+++ b/drivers/bus/pci/bsd/pci.c
@@ -97,6 +97,29 @@  rte_pci_unmap_device(struct rte_pci_device *dev)
 	}
 }
 
+/* re-map pci device */
+int
+rte_pci_remap_device(struct rte_pci_device *dev)
+{
+	int ret;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	switch (dev->kdrv) {
+	case RTE_KDRV_NIC_UIO:
+		ret = pci_uio_remap_resource(dev);
+		break;
+	default:
+		RTE_LOG(DEBUG, EAL,
+			"  Not managed by a supported kernel driver, skipped\n");
+		ret = 1;
+		break;
+	}
+
+	return ret;
+}
+
 void
 pci_uio_free_resource(struct rte_pci_device *dev,
 		struct mapped_pci_resource *uio_res)
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 25f907e..7aa3079 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -116,6 +116,38 @@  rte_pci_unmap_device(struct rte_pci_device *dev)
 	}
 }
 
+/* Map pci device */
+int
+rte_pci_remap_device(struct rte_pci_device *dev)
+{
+	int ret = -1;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	switch (dev->kdrv) {
+	case RTE_KDRV_VFIO:
+#ifdef VFIO_PRESENT
+		/* no thing to do */
+#endif
+		break;
+	case RTE_KDRV_IGB_UIO:
+	case RTE_KDRV_UIO_GENERIC:
+		if (rte_eal_using_phys_addrs()) {
+			/* map resources for devices that use uio */
+			ret = pci_uio_remap_resource(dev);
+		}
+		break;
+	default:
+		RTE_LOG(DEBUG, EAL,
+			"  Not managed by a supported kernel driver, skipped\n");
+		ret = 1;
+		break;
+	}
+
+	return ret;
+}
+
 void *
 pci_find_max_end_va(void)
 {
@@ -357,6 +389,8 @@  pci_scan_one(const char *dirname, const struct rte_pci_addr *addr)
 		rte_pci_add_device(dev);
 	}
 
+	dev->device.state = RTE_DEV_PARSED;
+	TAILQ_INIT(&(dev->device.uev_cbs));
 	return 0;
 }
 
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index c4415a0..3fbe9d7 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -282,6 +282,7 @@  pci_probe_all_drivers(struct rte_pci_device *dev)
 		if (rc > 0)
 			/* positive value means driver doesn't support it */
 			continue;
+		dev->device.state = RTE_DEV_PROBED;
 		return 0;
 	}
 	return 1;
@@ -481,6 +482,7 @@  rte_pci_insert_device(struct rte_pci_device *exist_pci_dev,
 void
 rte_pci_remove_device(struct rte_pci_device *pci_dev)
 {
+	RTE_LOG(DEBUG, EAL, " rte_pci_remove_device for device list\n");
 	TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next);
 }
 
@@ -522,6 +524,25 @@  pci_find_device_by_name(const struct rte_device *start,
 }
 
 static int
+pci_remap_device(struct rte_device *dev)
+{
+	struct rte_pci_device *pdev;
+	int ret;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	pdev = RTE_DEV_TO_PCI(dev);
+
+	/* remap resources for devices that use igb_uio */
+	ret = rte_pci_remap_device(pdev);
+	if (ret != 0)
+		RTE_LOG(ERR, EAL, "failed to remap device %s",
+			dev->name);
+	return ret;
+}
+
+static int
 pci_plug(struct rte_device *dev)
 {
 	return pci_probe_all_drivers(RTE_DEV_TO_PCI(dev));
@@ -552,6 +573,7 @@  struct rte_pci_bus rte_pci_bus = {
 		.unplug = pci_unplug,
 		.parse = pci_parse,
 		.get_iommu_class = rte_pci_get_iommu_class,
+		.remap_device = pci_remap_device,
 	},
 	.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
 	.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
diff --git a/drivers/bus/pci/pci_common_uio.c b/drivers/bus/pci/pci_common_uio.c
index dd84ec8..a4bc473 100644
--- a/drivers/bus/pci/pci_common_uio.c
+++ b/drivers/bus/pci/pci_common_uio.c
@@ -147,6 +147,34 @@  pci_uio_unmap(struct mapped_pci_resource *uio_res)
 	}
 }
 
+/* remap the PCI resource of a PCI device in private virtual memory */
+int
+pci_uio_remap_resource(struct rte_pci_device *dev)
+{
+	int i;
+	uint64_t phaddr;
+	void *map_address;
+
+	/* Map all BARs */
+	for (i = 0; i != PCI_MAX_RESOURCE; i++) {
+		/* skip empty BAR */
+		phaddr = dev->mem_resource[i].phys_addr;
+		if (phaddr == 0)
+			continue;
+		map_address = pci_map_private_resource(
+				dev->mem_resource[i].addr, 0,
+				(size_t)dev->mem_resource[i].len);
+		if (map_address == MAP_FAILED)
+			goto error;
+		memset(map_address, 0xFF, (size_t)dev->mem_resource[i].len);
+		dev->mem_resource[i].addr = map_address;
+	}
+
+	return 0;
+error:
+	return -1;
+}
+
 static struct mapped_pci_resource *
 pci_uio_find_resource(struct rte_pci_device *dev)
 {
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 2283f09..10baa1a 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -202,6 +202,18 @@  void pci_uio_free_resource(struct rte_pci_device *dev,
 		struct mapped_pci_resource *uio_res);
 
 /**
+ * remap the pci uio resource..
+ *
+ * @param dev
+ *   Point to the struct rte pci device.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+pci_uio_remap_resource(struct rte_pci_device *dev);
+
+/**
  * Map device memory to uio resource
  *
  * This function is private to EAL.
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index d4a2996..65337eb 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -52,6 +52,8 @@  extern "C" {
 #include <sys/queue.h>
 #include <stdint.h>
 #include <inttypes.h>
+#include <unistd.h>
+#include <fcntl.h>
 
 #include <rte_debug.h>
 #include <rte_interrupts.h>
@@ -197,6 +199,15 @@  int rte_pci_map_device(struct rte_pci_device *dev);
 void rte_pci_unmap_device(struct rte_pci_device *dev);
 
 /**
+ * Remap this device
+ *
+ * @param dev
+ *   A pointer to a rte_pci_device structure describing the device
+ *   to use
+ */
+int rte_pci_remap_device(struct rte_pci_device *dev);
+
+/**
  * Dump the content of the PCI bus.
  *
  * @param f
diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
index cac2aa0..c9cd369 100644
--- a/drivers/bus/vdev/vdev.c
+++ b/drivers/bus/vdev/vdev.c
@@ -323,7 +323,6 @@  vdev_find_device(const struct rte_device *start, rte_dev_cmp_t cmp,
 	return NULL;
 }
 
-
 static struct rte_device *
 vdev_find_device_by_name(const struct rte_device *start,
 		rte_dev_cmp_name_t cmp_name,
@@ -343,6 +342,13 @@  vdev_find_device_by_name(const struct rte_device *start,
 }
 
 static int
+vdev_remap_device(struct rte_device *dev)
+{
+	RTE_SET_USED(dev);
+	return 0;
+}
+
+static int
 vdev_plug(struct rte_device *dev)
 {
 	return vdev_probe_all_drivers(RTE_DEV_TO_VDEV(dev));
@@ -362,6 +368,7 @@  static struct rte_bus rte_vdev_bus = {
 	.plug = vdev_plug,
 	.unplug = vdev_unplug,
 	.parse = vdev_parse,
+	.remap_device = vdev_remap_device,
 };
 
 RTE_REGISTER_BUS(vdev, rte_vdev_bus);
diff --git a/lib/librte_eal/common/eal_common_bus.c b/lib/librte_eal/common/eal_common_bus.c
index efd5539..bdb0e54 100644
--- a/lib/librte_eal/common/eal_common_bus.c
+++ b/lib/librte_eal/common/eal_common_bus.c
@@ -54,6 +54,7 @@  rte_bus_register(struct rte_bus *bus)
 	RTE_VERIFY(bus->find_device_by_name);
 	/* Buses supporting driver plug also require unplug. */
 	RTE_VERIFY(!bus->plug || bus->unplug);
+	RTE_VERIFY(bus->remap_device);
 
 	TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
 	RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name);
diff --git a/lib/librte_eal/common/include/rte_bus.h b/lib/librte_eal/common/include/rte_bus.h
index 6dcfdb3..78990bc 100644
--- a/lib/librte_eal/common/include/rte_bus.h
+++ b/lib/librte_eal/common/include/rte_bus.h
@@ -196,6 +196,22 @@  typedef int (*rte_bus_unplug_t)(struct rte_device *dev);
 typedef int (*rte_bus_parse_t)(const char *name, void *addr);
 
 /**
+ * Implementation specific remap function which is responsible for
+ * remmaping devices on that bus from original share memory resource
+ * to a private memory resource for the sake of device has been removal,
+ * when detect the device removal event invoke from the kernel side,
+ * prior to call this function before any operation for device hw.
+ *
+ * @param dev
+ *	Device pointer that was returned by a previous call to find_device.
+ *
+ * @return
+ *	0 on success.
+ *	!0 on error.
+ */
+typedef int (*rte_bus_remap_device_t)(struct rte_device *dev);
+
+/**
  * Bus scan policies
  */
 enum rte_bus_scan_mode {
@@ -239,6 +255,7 @@  struct rte_bus {
 	rte_bus_plug_t plug;         /**< Probe single device for drivers */
 	rte_bus_unplug_t unplug;     /**< Remove single device from driver */
 	rte_bus_parse_t parse;       /**< Parse a device name */
+	rte_bus_remap_device_t remap_device;       /**< remap a device */
 	struct rte_bus_conf conf;    /**< Bus configuration */
 	rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */
 };
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
index 9d347f2..d68ef9b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_dev.c
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -32,6 +32,12 @@  bool service_no_init = true;
 
 #define DEV_EV_MNT_SERVICE_NAME "device_event_monitor_service"
 
+static void sig_handler(int signum)
+{
+	if (signum == SIGINT || signum == SIGTERM)
+		rte_dev_monitor_stop();
+}
+
 static int
 dev_monitor_fd_new(void)
 {
@@ -168,6 +174,7 @@  dev_uev_process(struct epoll_event *events, int nfds)
 	struct rte_bus *bus;
 	struct rte_device *dev;
 	struct rte_eal_uevent uevent;
+	int ret;
 	int i;
 
 	for (i = 0; i < nfds; i++) {
@@ -187,9 +194,17 @@  dev_uev_process(struct epoll_event *events, int nfds)
 
 				if ((!dev) || dev->state == RTE_DEV_UNDEFINED)
 					return 0;
-				return(_rte_dev_callback_process(dev,
-				  RTE_DEV_EVENT_REMOVE,
-				  NULL));
+				dev->state = RTE_DEV_FAULT;
+
+				/**
+				 * remap the resource to be fake
+				 * before user's removal processing
+				 */
+				ret = bus->remap_device(dev);
+				if (!ret)
+					return(_rte_dev_callback_process(dev,
+					  RTE_DEV_EVENT_REMOVE,
+					  NULL));
 			} else if (uevent.type == RTE_DEV_EVENT_ADD) {
 				if (dev == NULL) {
 					return(_rte_dev_callback_process(NULL,
@@ -215,12 +230,26 @@  dev_uev_process(struct epoll_event *events, int nfds)
  */
 static int32_t dev_uev_monitoring(__rte_unused void *arg)
 {
+	struct sigaction act;
+	sigset_t mask;
 	int netlink_fd = -1;
 	struct epoll_event ep_kernel;
 	int fd_ep = -1;
 
 	service_exit = false;
 
+	/* set signal handlers */
+	memset(&act, 0x00, sizeof(struct sigaction));
+	act.sa_handler = sig_handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_RESTART;
+	sigaction(SIGINT, &act, NULL);
+	sigaction(SIGTERM, &act, NULL);
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGINT);
+	sigaddset(&mask, SIGTERM);
+	sigprocmask(SIG_UNBLOCK, &mask, NULL);
+
 	fd_ep = epoll_create1(EPOLL_CLOEXEC);
 	if (fd_ep < 0) {
 		RTE_LOG(ERR, EAL, "error creating epoll fd: %m\n");
diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
index a3a98c1..d0e07b4 100644
--- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
+++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
@@ -354,6 +354,12 @@  igbuio_pci_release(struct uio_info *info, struct inode *inode)
 	struct rte_uio_pci_dev *udev = info->priv;
 	struct pci_dev *dev = udev->pdev;
 
+	/* check if device have been remove before release */
+	if ((&dev->dev.kobj)->state_remove_uevent_sent == 1) {
+		pr_info("The device have been removed\n");
+		return -1;
+	}
+
 	/* disable interrupts */
 	igbuio_pci_disable_interrupts(udev);
 
diff --git a/lib/librte_pci/rte_pci.c b/lib/librte_pci/rte_pci.c
index 0160fc1..feb5fd7 100644
--- a/lib/librte_pci/rte_pci.c
+++ b/lib/librte_pci/rte_pci.c
@@ -172,6 +172,26 @@  rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr)
 	return -1;
 }
 
+/* map a private resource from an address*/
+void *
+pci_map_private_resource(void *requested_addr, off_t offset, size_t size)
+{
+	void *mapaddr;
+
+	mapaddr = mmap(requested_addr, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+	if (mapaddr == MAP_FAILED) {
+		RTE_LOG(ERR, EAL, "%s(): cannot mmap(%p, 0x%lx, 0x%lx): "
+			"%s (%p)\n",
+			__func__, requested_addr,
+			(unsigned long)size, (unsigned long)offset,
+			strerror(errno), mapaddr);
+	} else
+		RTE_LOG(DEBUG, EAL, "  PCI memory mapped at %p\n", mapaddr);
+
+	return mapaddr;
+}
 
 /* map a particular resource from a file */
 void *
diff --git a/lib/librte_pci/rte_pci.h b/lib/librte_pci/rte_pci.h
index 4f2cd18..f6091a6 100644
--- a/lib/librte_pci/rte_pci.h
+++ b/lib/librte_pci/rte_pci.h
@@ -227,6 +227,23 @@  int rte_pci_addr_cmp(const struct rte_pci_addr *addr,
 int rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr);
 
 /**
+ * @internal
+ * Map to a particular private resource.
+ *
+ * @param requested_addr
+ *      The starting address for the new mapping range.
+ * @param offset
+ *      The offset for the mapping range.
+ * @param size
+ *      The size for the mapping range.
+ * @return
+ *   - On success, the function returns a pointer to the mapped area.
+ *   - On error, the value MAP_FAILED is returned.
+ */
+void *pci_map_private_resource(void *requested_addr, off_t offset,
+		size_t size);
+
+/**
  * Map a particular resource from a file.
  *
  * @param requested_addr