diff mbox series

[v7,6/7] eal: add failure handle mechanism for hotplug

Message ID 1531137377-10157-7-git-send-email-jia.guo@intel.com (mailing list archive)
State Superseded, archived
Headers show
Series hotplug failure handle mechanism | expand

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Guo, Jia July 9, 2018, 11:56 a.m. UTC
This patch introduces a failure handle mechanism to handle device
hotplug removal event.

First it can register sigbus handler when enable device event monitor. Once
sigbus error be captured, it will check the failure address and accordingly
remap the invalid memory for the corresponding device. Besed on this
mechanism, it could guaranty the application not crash when the device be
hotplug out.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
---
v7->v6:
delete some unused part.
---
 lib/librte_eal/linuxapp/eal/eal_dev.c | 112 +++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
index 1cf6aeb..0de3fb7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_dev.c
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -4,6 +4,8 @@ 
 
 #include <string.h>
 #include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
 
@@ -14,6 +16,10 @@ 
 #include <rte_malloc.h>
 #include <rte_interrupts.h>
 #include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
 
 #include "eal_private.h"
 
@@ -23,6 +29,16 @@  static bool monitor_started;
 #define EAL_UEV_MSG_LEN 4096
 #define EAL_UEV_MSG_ELEM_LEN 128
 
+/*
+ * spinlock for device failure process, protect the bus and the device
+ * to avoid race condition.
+ */
+static rte_spinlock_t dev_failure_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
 static void dev_uev_handler(__rte_unused void *param);
 
 /* identify the system layer which reports this event. */
@@ -33,6 +49,49 @@  enum eal_dev_event_subsystem {
 	EAL_DEV_EVENT_SUBSYSTEM_MAX
 };
 
+static void
+sigbus_action_recover(void)
+{
+	if (sigbus_need_recover) {
+		sigaction(SIGBUS, &sigbus_action_old, NULL);
+		sigbus_need_recover = 0;
+	}
+}
+
+static void sigbus_handler(int signum, siginfo_t *info,
+				void *ctx __rte_unused)
+{
+	int ret;
+
+	RTE_LOG(INFO, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n",
+		(int)pthread_self(), info->si_addr);
+
+	rte_spinlock_lock(&dev_failure_lock);
+	ret = rte_bus_sigbus_handler(info->si_addr);
+	rte_spinlock_unlock(&dev_failure_lock);
+	if (ret == -1) {
+		rte_exit(EXIT_FAILURE,
+			 "Failed to handle SIGBUS for hotplug, "
+			 "(rte_errno: %s)!", strerror(rte_errno));
+	} else if (ret == 1) {
+		if (sigbus_action_old.sa_handler)
+			(*(sigbus_action_old.sa_handler))(signum);
+		else
+			rte_exit(EXIT_FAILURE,
+				 "Failed to handle generic SIGBUS!");
+	}
+
+	RTE_LOG(INFO, EAL, "Success to handle SIGBUS for hotplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+	const void *_name)
+{
+	const char *name = _name;
+
+	return strcmp(dev->name, name);
+}
+
 static int
 dev_uev_socket_fd_create(void)
 {
@@ -147,6 +206,9 @@  dev_uev_handler(__rte_unused void *param)
 	struct rte_dev_event uevent;
 	int ret;
 	char buf[EAL_UEV_MSG_LEN];
+	struct rte_bus *bus;
+	struct rte_device *dev;
+	const char *busname = "";
 
 	memset(&uevent, 0, sizeof(struct rte_dev_event));
 	memset(buf, 0, EAL_UEV_MSG_LEN);
@@ -171,13 +233,50 @@  dev_uev_handler(__rte_unused void *param)
 	RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
 		uevent.devname, uevent.type, uevent.subsystem);
 
-	if (uevent.devname)
+	switch (uevent.subsystem) {
+	case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+	case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+		busname = "pci";
+		break;
+	default:
+		break;
+	}
+
+	if (uevent.devname) {
+		if (uevent.type == RTE_DEV_EVENT_REMOVE) {
+			rte_spinlock_lock(&dev_failure_lock);
+			bus = rte_bus_find_by_name(busname);
+			if (bus == NULL) {
+				RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+					busname);
+				return;
+			}
+
+			dev = bus->find_device(NULL, cmp_dev_name,
+					       uevent.devname);
+			if (dev == NULL) {
+				RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+					"bus (%s)\n", uevent.devname, busname);
+				return;
+			}
+
+			ret = bus->hotplug_failure_handler(dev);
+			rte_spinlock_unlock(&dev_failure_lock);
+			if (ret) {
+				RTE_LOG(ERR, EAL, "Can not handle hotplug for "
+					"device (%s)\n", dev->name);
+				return;
+			}
+		}
 		dev_callback_process(uevent.devname, uevent.type);
+	}
 }
 
 int __rte_experimental
 rte_dev_event_monitor_start(void)
 {
+	sigset_t mask;
+	struct sigaction action;
 	int ret;
 
 	if (monitor_started)
@@ -197,6 +296,14 @@  rte_dev_event_monitor_start(void)
 		return -1;
 	}
 
+	/* register sigbus handler */
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = SA_SIGINFO;
+	action.sa_mask = mask;
+	action.sa_sigaction = sigbus_handler;
+	sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
 	monitor_started = true;
 
 	return 0;
@@ -217,8 +324,11 @@  rte_dev_event_monitor_stop(void)
 		return ret;
 	}
 
+	sigbus_action_recover();
+
 	close(intr_handle.fd);
 	intr_handle.fd = -1;
 	monitor_started = false;
+
 	return 0;
 }