[2/2] net/netvsc: support VF device hot add/remove
diff mbox series

Message ID 1606809383-26660-2-git-send-email-longli@linuxonhyperv.com
State Superseded
Delegated to: Thomas Monjalon
Headers show
Series
  • [1/2] eal/hotplug: allow monitor to be setup by multiple places
Related show

Checks

Context Check Description
ci/travis-robot success Travis build: passed
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/checkpatch success coding style OK

Commit Message

Long Li Dec. 1, 2020, 7:56 a.m. UTC
From: Long Li <longli@microsoft.com>

When a VF device is present, netvsc can send or receive packets over the
VF device. The VF device driver communicates directly with the PCI device
via the PF from the host hypervisor. This is faster than exchanging data
with netvsp via vmbus, i.e. syntheic path.

In Azure and Hyper-v environments, VF device can be hot added or hot
removed at anytime while guest VM is running. This patch improves netvsc
to support VF device hot add/remove.

1. netvsc monitors all system hot add activities over the PCI bus. When it
detects a VF device is added to the system and is managed under this
netvsc device, it asks EAL to probe and start this VF device, then it
attaches and switches data path to the VF device.

2. After a VF device is attached to netvsc, netvsc monitors this device on
hot remove. When this VF device is hot removed, netvsc switches data path
to synthetic, stops this VF device and removes it from EAL.

3. If any failure happens during a VF device hot remove or add, the netvsc
falls back to synthetic path for all data traffic.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/net/netvsc/hn_ethdev.c | 162 ++++++++++++++++-
 drivers/net/netvsc/hn_nvs.c    |   4 +-
 drivers/net/netvsc/hn_nvs.h    |   2 +-
 drivers/net/netvsc/hn_rxtx.c   |  36 ++--
 drivers/net/netvsc/hn_var.h    |  52 ++++--
 drivers/net/netvsc/hn_vf.c     | 318 +++++++++++++++++++++++++++------
 6 files changed, 485 insertions(+), 89 deletions(-)

Patch
diff mbox series

diff --git a/drivers/net/netvsc/hn_ethdev.c b/drivers/net/netvsc/hn_ethdev.c
index 49f954305d..1c0a4b2985 100644
--- a/drivers/net/netvsc/hn_ethdev.c
+++ b/drivers/net/netvsc/hn_ethdev.c
@@ -9,6 +9,10 @@ 
 #include <stdio.h>
 #include <errno.h>
 #include <unistd.h>
+#include <dirent.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <sys/ioctl.h>
 
 #include <rte_ethdev.h>
 #include <rte_memcpy.h>
@@ -27,6 +31,7 @@ 
 #include <rte_eal.h>
 #include <rte_dev.h>
 #include <rte_bus_vmbus.h>
+#include <rte_alarm.h>
 
 #include "hn_logs.h"
 #include "hn_var.h"
@@ -50,6 +55,9 @@ 
 #define NETVSC_ARG_TXBREAK "tx_copybreak"
 #define NETVSC_ARG_RX_EXTMBUF_ENABLE "rx_extmbuf_enable"
 
+/* The max number of retry when hot adding a VF device */
+#define NETVSC_MAX_HOTADD_RETRY 5
+
 struct hn_xstats_name_off {
 	char name[RTE_ETH_XSTATS_NAME_SIZE];
 	unsigned int offset;
@@ -542,6 +550,124 @@  static int hn_subchan_configure(struct hn_data *hv,
 	return err;
 }
 
+static void netvsc_hotplug_retry(void *args)
+{
+	int ret;
+	struct hn_data *hv = args;
+	struct rte_eth_dev *dev = &rte_eth_devices[hv->port_id];
+	struct rte_devargs *d = &hv->devargs;
+	char buf[256];
+
+	DIR *di;
+	struct dirent *dir;
+	struct ifreq req;
+	struct rte_ether_addr eth_addr;
+	int s;
+
+	PMD_DRV_LOG(DEBUG, "%s: retry count %d\n",
+		    __func__, hv->eal_hot_plug_retry);
+
+	if (hv->eal_hot_plug_retry++ > NETVSC_MAX_HOTADD_RETRY)
+		return;
+
+	snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s/net", d->name);
+	di = opendir(buf);
+	if (!di) {
+		PMD_DRV_LOG(DEBUG, "%s: can't open directory %s, "
+			    "retrying in 1 second\n", __func__, buf);
+		goto retry;
+	}
+
+	while ((dir = readdir(di))) {
+		/* Skip . and .. directories */
+		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
+			continue;
+
+		/* trying to get mac address if this is a network device*/
+		s = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+		if (s == -1) {
+			PMD_DRV_LOG(ERR, "Failed to create socket errno %d\n",
+				    errno);
+			closedir(di);
+			return;
+		}
+		strlcpy(req.ifr_name, dir->d_name, sizeof(req.ifr_name));
+		ret = ioctl(s, SIOCGIFHWADDR, &req);
+		close(s);
+		if (ret == -1) {
+			PMD_DRV_LOG(ERR, "Failed to send SIOCGIFHWADDR for "
+				    "device %s\n", dir->d_name);
+			continue;
+		}
+		if (req.ifr_hwaddr.sa_family != ARPHRD_ETHER)
+			continue;
+		memcpy(eth_addr.addr_bytes, req.ifr_hwaddr.sa_data,
+		       RTE_DIM(eth_addr.addr_bytes));
+
+		if (rte_is_same_ether_addr(&eth_addr, dev->data->mac_addrs)) {
+			PMD_DRV_LOG(NOTICE, "Found matching MAC address, "
+				    "adding device %s network name %s\n",
+				    d->name, dir->d_name);
+			ret = rte_eal_hotplug_add(d->bus->name, d->name,
+						  d->args);
+			if (ret)
+				PMD_DRV_LOG(ERR,
+					    "Failed to add PCI device %s\n",
+					    d->name);
+		}
+		/* When the code reaches here, we either have already added
+		 * the device, or its MAC address did not match.
+		 */
+		closedir(di);
+		return;
+	}
+retry:
+	/* The device is still being initialized, retry after 1 second */
+	rte_eal_alarm_set(1000000, netvsc_hotplug_retry, hv);
+}
+
+static void
+netvsc_hotadd_callback(const char *device_name, enum rte_dev_event_type type,
+		       void *arg)
+{
+	struct hn_data *hv = arg;
+	struct rte_devargs *d = &hv->devargs;
+	int ret;
+
+	PMD_DRV_LOG(INFO, "Device notification type=%d device_name=%s\n",
+		    type, device_name);
+
+	switch (type) {
+	case RTE_DEV_EVENT_ADD:
+		/* if we already has a VF, don't check on hot add */
+		if (hv->vf_ctx.vf_state > vf_removed)
+			break;
+
+		ret = rte_devargs_parse(d, device_name);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "devargs parsing failed ret=%d\n", ret);
+			return;
+		}
+
+		if (!strcmp(d->bus->name, "pci")) {
+			/* Start the process of figuring out if this
+			 * PCI device is a VF device
+			 */
+			hv->eal_hot_plug_retry = 0;
+			rte_eal_alarm_set(1000000, netvsc_hotplug_retry, hv);
+		}
+
+		/* We will switch to VF on RDNIS configure message
+		 * sent from VSP
+		 */
+
+		break;
+	default:
+		break;
+	}
+}
+
 static int hn_dev_configure(struct rte_eth_dev *dev)
 {
 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
@@ -617,7 +743,7 @@  static int hn_dev_configure(struct rte_eth_dev *dev)
 		}
 	}
 
-	return hn_vf_configure(dev, dev_conf);
+	return hn_vf_configure_locked(dev, dev_conf);
 }
 
 static int hn_dev_stats_get(struct rte_eth_dev *dev,
@@ -827,6 +953,14 @@  hn_dev_start(struct rte_eth_dev *dev)
 
 	PMD_INIT_FUNC_TRACE();
 
+	/* Register to monitor hot plug events */
+	error = rte_dev_event_callback_register(NULL, netvsc_hotadd_callback,
+						hv);
+	if (error) {
+		PMD_DRV_LOG(ERR, "failed to register device event callback\n");
+		return error;
+	}
+
 	error = hn_rndis_set_rxfilter(hv,
 				      NDIS_PACKET_TYPE_BROADCAST |
 				      NDIS_PACKET_TYPE_ALL_MULTICAST |
@@ -853,6 +987,7 @@  hn_dev_stop(struct rte_eth_dev *dev)
 	PMD_INIT_FUNC_TRACE();
 	dev->data->dev_started = 0;
 
+	rte_dev_event_callback_unregister(NULL, netvsc_hotadd_callback, hv);
 	hn_rndis_set_rxfilter(hv, 0);
 	return hn_vf_stop(dev);
 }
@@ -861,11 +996,14 @@  static int
 hn_dev_close(struct rte_eth_dev *dev)
 {
 	int ret;
+	struct hn_data *hv = dev->data->dev_private;
 
 	PMD_INIT_FUNC_TRACE();
 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
 		return 0;
 
+	rte_eal_alarm_cancel(netvsc_hotplug_retry, &hv->devargs);
+
 	ret = hn_vf_close(dev);
 	hn_dev_free_queues(dev);
 
@@ -990,7 +1128,10 @@  eth_hn_dev_init(struct rte_eth_dev *eth_dev)
 	hv->max_queues = 1;
 
 	rte_rwlock_init(&hv->vf_lock);
-	hv->vf_port = HN_INVALID_PORT;
+	hv->vf_ctx.vf_vsc_switched = false;
+	hv->vf_ctx.vf_vsp_reported = false;
+	hv->vf_ctx.vf_attached = false;
+	hv->vf_ctx.vf_state = vf_unknown;
 
 	err = hn_parse_args(eth_dev);
 	if (err)
@@ -1044,12 +1185,10 @@  eth_hn_dev_init(struct rte_eth_dev *eth_dev)
 	hv->max_queues = RTE_MIN(rxr_cnt, (unsigned int)max_chan);
 
 	/* If VF was reported but not added, do it now */
-	if (hv->vf_present && !hn_vf_attached(hv)) {
+	if (hv->vf_ctx.vf_vsp_reported && !hv->vf_ctx.vf_vsc_switched) {
 		PMD_INIT_LOG(DEBUG, "Adding VF device");
 
 		err = hn_vf_add(eth_dev, hv);
-		if (err)
-			hv->vf_present = 0;
 	}
 
 	return 0;
@@ -1095,15 +1234,23 @@  static int eth_hn_probe(struct rte_vmbus_driver *drv __rte_unused,
 
 	PMD_INIT_FUNC_TRACE();
 
+	ret = rte_dev_event_monitor_start();
+	if (ret) {
+		PMD_DRV_LOG(ERR, "Failed to start device event monitoring\n");
+		return ret;
+	}
+
 	eth_dev = eth_dev_vmbus_allocate(dev, sizeof(struct hn_data));
 	if (!eth_dev)
 		return -ENOMEM;
 
 	ret = eth_hn_dev_init(eth_dev);
-	if (ret)
+	if (ret) {
 		eth_dev_vmbus_release(eth_dev);
-	else
+		rte_dev_event_monitor_stop();
+	} else {
 		rte_eth_dev_probing_finish(eth_dev);
+	}
 
 	return ret;
 }
@@ -1124,6 +1271,7 @@  static int eth_hn_remove(struct rte_vmbus_device *dev)
 		return ret;
 
 	eth_dev_vmbus_release(eth_dev);
+	rte_dev_event_monitor_stop();
 	return 0;
 }
 
diff --git a/drivers/net/netvsc/hn_nvs.c b/drivers/net/netvsc/hn_nvs.c
index eeb82ab9ee..a0ee7d8bfa 100644
--- a/drivers/net/netvsc/hn_nvs.c
+++ b/drivers/net/netvsc/hn_nvs.c
@@ -569,7 +569,7 @@  hn_nvs_alloc_subchans(struct hn_data *hv, uint32_t *nsubch)
 	return 0;
 }
 
-void
+int
 hn_nvs_set_datapath(struct hn_data *hv, uint32_t path)
 {
 	struct hn_nvs_datapath dp;
@@ -588,4 +588,6 @@  hn_nvs_set_datapath(struct hn_data *hv, uint32_t path)
 			    "send set datapath failed: %d",
 			    error);
 	}
+
+	return error;
 }
diff --git a/drivers/net/netvsc/hn_nvs.h b/drivers/net/netvsc/hn_nvs.h
index 015839e364..3766d2ee34 100644
--- a/drivers/net/netvsc/hn_nvs.h
+++ b/drivers/net/netvsc/hn_nvs.h
@@ -212,7 +212,7 @@  int	hn_nvs_attach(struct hn_data *hv, unsigned int mtu);
 void	hn_nvs_detach(struct hn_data *hv);
 void	hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);
 int	hn_nvs_alloc_subchans(struct hn_data *hv, uint32_t *nsubch);
-void	hn_nvs_set_datapath(struct hn_data *hv, uint32_t path);
+int	hn_nvs_set_datapath(struct hn_data *hv, uint32_t path);
 void	hn_nvs_handle_vfassoc(struct rte_eth_dev *dev,
 			      const struct vmbus_chanpkt_hdr *hdr,
 			      const void *data);
diff --git a/drivers/net/netvsc/hn_rxtx.c b/drivers/net/netvsc/hn_rxtx.c
index 015662fdb4..0f4ef0100b 100644
--- a/drivers/net/netvsc/hn_rxtx.c
+++ b/drivers/net/netvsc/hn_rxtx.c
@@ -1492,16 +1492,20 @@  hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		hn_process_events(hv, txq->queue_id, 0);
 
 	/* Transmit over VF if present and up */
-	rte_rwlock_read_lock(&hv->vf_lock);
-	vf_dev = hn_get_vf_dev(hv);
-	if (vf_dev && vf_dev->data->dev_started) {
-		void *sub_q = vf_dev->data->tx_queues[queue_id];
-
-		nb_tx = (*vf_dev->tx_pkt_burst)(sub_q, tx_pkts, nb_pkts);
+	if (hv->vf_ctx.vf_vsc_switched) {
+		rte_rwlock_read_lock(&hv->vf_lock);
+		vf_dev = hn_get_vf_dev(hv);
+		if (hv->vf_ctx.vf_vsc_switched && vf_dev &&
+		    vf_dev->data->dev_started) {
+			void *sub_q = vf_dev->data->tx_queues[queue_id];
+
+			nb_tx = (*vf_dev->tx_pkt_burst)
+					(sub_q, tx_pkts, nb_pkts);
+			rte_rwlock_read_unlock(&hv->vf_lock);
+			return nb_tx;
+		}
 		rte_rwlock_read_unlock(&hv->vf_lock);
-		return nb_tx;
 	}
-	rte_rwlock_read_unlock(&hv->vf_lock);
 
 	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
 		struct rte_mbuf *m = tx_pkts[nb_tx];
@@ -1614,13 +1618,17 @@  hn_recv_pkts(void *prxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 					   (void **)rx_pkts, nb_pkts, NULL);
 
 	/* If VF is available, check that as well */
-	rte_rwlock_read_lock(&hv->vf_lock);
-	vf_dev = hn_get_vf_dev(hv);
-	if (vf_dev && vf_dev->data->dev_started)
-		nb_rcv += hn_recv_vf(vf_dev->data->port_id, rxq,
-				     rx_pkts + nb_rcv, nb_pkts - nb_rcv);
+	if (hv->vf_ctx.vf_vsc_switched) {
+		rte_rwlock_read_lock(&hv->vf_lock);
+		vf_dev = hn_get_vf_dev(hv);
+		if (hv->vf_ctx.vf_vsc_switched && vf_dev &&
+		    vf_dev->data->dev_started)
+			nb_rcv += hn_recv_vf(vf_dev->data->port_id, rxq,
+					     rx_pkts + nb_rcv,
+					     nb_pkts - nb_rcv);
 
-	rte_rwlock_read_unlock(&hv->vf_lock);
+		rte_rwlock_read_unlock(&hv->vf_lock);
+	}
 	return nb_rcv;
 }
 
diff --git a/drivers/net/netvsc/hn_var.h b/drivers/net/netvsc/hn_var.h
index bd874c6b4d..b7405ca726 100644
--- a/drivers/net/netvsc/hn_var.h
+++ b/drivers/net/netvsc/hn_var.h
@@ -105,14 +105,37 @@  struct hn_rx_bufinfo {
 
 #define HN_INVALID_PORT	UINT16_MAX
 
+enum vf_device_state {
+	vf_unknown = 0,
+	vf_removed,
+	vf_configured,
+	vf_started,
+	vf_stopped,
+};
+
+struct hn_vf_ctx {
+	uint16_t	vf_port;
+
+	/* We have taken ownership of this VF port from DPDK */
+	bool		vf_attached;
+
+	/* VSC has requested to switch data path to VF */
+	bool		vf_vsc_switched;
+
+	/* VSP has reported the VF is present for this NIC */
+	bool		vf_vsp_reported;
+
+	enum vf_device_state	vf_state;
+};
+
 struct hn_data {
 	struct rte_vmbus_device *vmbus;
 	struct hn_rx_queue *primary;
 	rte_rwlock_t    vf_lock;
 	uint16_t	port_id;
-	uint16_t	vf_port;
 
-	uint8_t		vf_present;
+	struct hn_vf_ctx	vf_ctx;
+
 	uint8_t		closed;
 	uint8_t		vlan_strip;
 
@@ -153,6 +176,9 @@  struct hn_data {
 	struct rte_eth_dev_owner owner;
 
 	struct vmbus_channel *channels[HN_MAX_CHANNELS];
+
+	struct rte_devargs devargs;
+	int		eal_hot_plug_retry;
 };
 
 static inline struct vmbus_channel *
@@ -196,13 +222,6 @@  uint32_t hn_dev_rx_queue_count(struct rte_eth_dev *dev, uint16_t queue_id);
 int	hn_dev_rx_queue_status(void *rxq, uint16_t offset);
 void	hn_dev_free_queues(struct rte_eth_dev *dev);
 
-/* Check if VF is attached */
-static inline bool
-hn_vf_attached(const struct hn_data *hv)
-{
-	return hv->vf_port != HN_INVALID_PORT;
-}
-
 /*
  * Get VF device for existing netvsc device
  * Assumes vf_lock is held.
@@ -210,19 +229,17 @@  hn_vf_attached(const struct hn_data *hv)
 static inline struct rte_eth_dev *
 hn_get_vf_dev(const struct hn_data *hv)
 {
-	uint16_t vf_port = hv->vf_port;
-
-	if (vf_port == HN_INVALID_PORT)
-		return NULL;
+	if (hv->vf_ctx.vf_attached)
+		return &rte_eth_devices[hv->vf_ctx.vf_port];
 	else
-		return &rte_eth_devices[vf_port];
+		return NULL;
 }
 
 int	hn_vf_info_get(struct hn_data *hv,
 		       struct rte_eth_dev_info *info);
 int	hn_vf_add(struct rte_eth_dev *dev, struct hn_data *hv);
-int	hn_vf_configure(struct rte_eth_dev *dev,
-			const struct rte_eth_conf *dev_conf);
+int	hn_vf_configure_locked(struct rte_eth_dev *dev,
+			       const struct rte_eth_conf *dev_conf);
 const uint32_t *hn_vf_supported_ptypes(struct rte_eth_dev *dev);
 int	hn_vf_start(struct rte_eth_dev *dev);
 void	hn_vf_reset(struct rte_eth_dev *dev);
@@ -265,3 +282,6 @@  int	hn_vf_rss_hash_update(struct rte_eth_dev *dev,
 int	hn_vf_reta_hash_update(struct rte_eth_dev *dev,
 			       struct rte_eth_rss_reta_entry64 *reta_conf,
 			       uint16_t reta_size);
+int	hn_eth_rmv_event_callback(uint16_t port_id,
+				  enum rte_eth_event_type event __rte_unused,
+				  void *cb_arg, void *out __rte_unused);
diff --git a/drivers/net/netvsc/hn_vf.c b/drivers/net/netvsc/hn_vf.c
index d43ebaa69f..86392917c5 100644
--- a/drivers/net/netvsc/hn_vf.c
+++ b/drivers/net/netvsc/hn_vf.c
@@ -24,6 +24,7 @@ 
 #include <rte_bus_pci.h>
 #include <rte_log.h>
 #include <rte_string_fns.h>
+#include <rte_alarm.h>
 
 #include "hn_logs.h"
 #include "hn_var.h"
@@ -52,73 +53,252 @@  static int hn_vf_match(const struct rte_eth_dev *dev)
 /*
  * Attach new PCI VF device and return the port_id
  */
-static int hn_vf_attach(struct hn_data *hv, uint16_t port_id)
+static int hn_vf_attach(struct rte_eth_dev *dev, struct hn_data *hv)
 {
 	struct rte_eth_dev_owner owner = { .id = RTE_ETH_DEV_NO_OWNER };
-	int ret;
+	int port, ret;
 
-	if (hn_vf_attached(hv)) {
+	if (hv->vf_ctx.vf_attached) {
 		PMD_DRV_LOG(ERR, "VF already attached");
-		return -EEXIST;
+		return 0;
 	}
 
-	ret = rte_eth_dev_owner_get(port_id, &owner);
+	port = hn_vf_match(dev);
+	if (port < 0) {
+		PMD_DRV_LOG(NOTICE, "Couldn't find port for VF");
+		return port;
+	}
+
+	PMD_DRV_LOG(NOTICE, "found matching VF port %d\n", port);
+	ret = rte_eth_dev_owner_get(port, &owner);
 	if (ret < 0) {
-		PMD_DRV_LOG(ERR, "Can not find owner for port %d", port_id);
+		PMD_DRV_LOG(ERR, "Can not find owner for port %d", port);
 		return ret;
 	}
 
 	if (owner.id != RTE_ETH_DEV_NO_OWNER) {
 		PMD_DRV_LOG(ERR, "Port %u already owned by other device %s",
-			    port_id, owner.name);
+			    port, owner.name);
 		return -EBUSY;
 	}
 
-	ret = rte_eth_dev_owner_set(port_id, &hv->owner);
+	ret = rte_eth_dev_owner_set(port, &hv->owner);
 	if (ret < 0) {
-		PMD_DRV_LOG(ERR, "Can set owner for port %d", port_id);
+		PMD_DRV_LOG(ERR, "Can set owner for port %d", port);
 		return ret;
 	}
 
-	PMD_DRV_LOG(DEBUG, "Attach VF device %u", port_id);
-	hv->vf_port = port_id;
+	PMD_DRV_LOG(DEBUG, "Attach VF device %u", port);
+	hv->vf_ctx.vf_attached = true;
+	hv->vf_ctx.vf_port = port;
+	return 0;
+}
+
+static void hn_vf_remove(struct hn_data *hv);
+
+static void hn_remove_delayed(void *args)
+{
+	struct hn_data *hv = args;
+	uint16_t port_id = hv->vf_ctx.vf_port;
+	struct rte_device *dev = rte_eth_devices[port_id].device;
+	int ret;
+
+	/* Tell VSP to switch data path to synthentic */
+	hn_vf_remove(hv);
+
+	PMD_DRV_LOG(NOTICE, "Start to remove port %d\n", port_id);
+	rte_rwlock_write_lock(&hv->vf_lock);
+
+	/* Give back ownership */
+	ret = rte_eth_dev_owner_unset(port_id, hv->owner.id);
+	if (ret)
+		PMD_DRV_LOG(ERR, "rte_eth_dev_owner_unset failed ret=%d\n",
+			    ret);
+	hv->vf_ctx.vf_attached = false;
+
+	ret = rte_eth_dev_callback_unregister(port_id, RTE_ETH_EVENT_INTR_RMV,
+					      hn_eth_rmv_event_callback, hv);
+	if (ret)
+		PMD_DRV_LOG(ERR,
+			    "rte_eth_dev_callback_unregister failed ret=%d\n",
+			    ret);
+
+	/* Detach and release port_id from system */
+	ret = rte_eth_dev_stop(port_id);
+	if (ret)
+		PMD_DRV_LOG(ERR, "rte_eth_dev_stop failed port_id=%u ret=%d\n",
+			    port_id, ret);
+
+	ret = rte_eth_dev_close(port_id);
+	if (ret)
+		PMD_DRV_LOG(ERR, "rte_eth_dev_close failed port_id=%u ret=%d\n",
+			    port_id, ret);
+
+	ret = rte_dev_remove(dev);
+	hv->vf_ctx.vf_state = vf_removed;
+
+	rte_rwlock_write_unlock(&hv->vf_lock);
+}
+
+int hn_eth_rmv_event_callback(uint16_t port_id,
+			      enum rte_eth_event_type event __rte_unused,
+			      void *cb_arg, void *out __rte_unused)
+{
+	struct hn_data *hv = cb_arg;
+
+	PMD_DRV_LOG(NOTICE, "Removing VF portid %d\n", port_id);
+	rte_eal_alarm_set(1, hn_remove_delayed, hv);
+
 	return 0;
 }
 
+static int hn_setup_vf_queues(int port, struct rte_eth_dev *dev)
+{
+	struct hn_rx_queue *rx_queue;
+	struct rte_eth_txq_info txinfo;
+	struct rte_eth_rxq_info rxinfo;
+	int i, ret = 0;
+
+	for (i = 0; i < dev->data->nb_tx_queues; i++) {
+		ret = rte_eth_tx_queue_info_get(dev->data->port_id, i, &txinfo);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "rte_eth_tx_queue_info_get failed ret=%d\n",
+				    ret);
+			return ret;
+		}
+
+		ret = rte_eth_tx_queue_setup(port, i, txinfo.nb_desc, 0,
+					     &txinfo.conf);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "rte_eth_tx_queue_setup failed ret=%d\n",
+				    ret);
+			return ret;
+		}
+	}
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		ret = rte_eth_rx_queue_info_get(dev->data->port_id, i, &rxinfo);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "rte_eth_rx_queue_info_get failed ret=%d\n",
+				    ret);
+			return ret;
+		}
+
+		rx_queue = dev->data->rx_queues[i];
+
+		ret = rte_eth_rx_queue_setup(port, i, rxinfo.nb_desc, 0,
+					     &rxinfo.conf, rx_queue->mb_pool);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "rte_eth_rx_queue_setup failed ret=%d\n",
+				    ret);
+			return ret;
+		}
+	}
+
+	return ret;
+}
+
+int hn_vf_add(struct rte_eth_dev *dev, struct hn_data *hv);
+
+static void hn_vf_add_retry(void *args)
+{
+	struct rte_eth_dev *dev = args;
+	struct hn_data *hv = dev->data->dev_private;
+
+	hn_vf_add(dev, hv);
+}
+
+int hn_vf_configure(struct rte_eth_dev *dev,
+		    const struct rte_eth_conf *dev_conf);
+
 /* Add new VF device to synthetic device */
 int hn_vf_add(struct rte_eth_dev *dev, struct hn_data *hv)
 {
-	int port, err;
+	int ret, port;
 
-	port = hn_vf_match(dev);
-	if (port < 0) {
-		PMD_DRV_LOG(NOTICE, "No matching MAC found");
-		return port;
+	if (!hv->vf_ctx.vf_vsp_reported || hv->vf_ctx.vf_vsc_switched)
+		return 0;
+
+	rte_rwlock_write_lock(&hv->vf_lock);
+
+	ret = hn_vf_attach(dev, hv);
+	if (ret) {
+		PMD_DRV_LOG(NOTICE,
+			    "RNDIS reports VF but device not found, retrying");
+		rte_eal_alarm_set(1000000, hn_vf_add_retry, dev);
+		goto exit;
 	}
 
-	err = hn_vf_attach(hv, port);
-	if (err == 0)
-		hn_nvs_set_datapath(hv, NVS_DATAPATH_VF);
+	port = hv->vf_ctx.vf_port;
 
-	return err;
+	/* If the primary device has started, this is a VF host add.
+	 * Configure and start VF device.
+	 */
+	if (dev->data->dev_started) {
+		if (rte_eth_devices[port].data->dev_started) {
+			PMD_DRV_LOG(ERR, "VF already started on hot add");
+			goto exit;
+		}
+
+		PMD_DRV_LOG(NOTICE, "configuring VF port %d\n", port);
+		ret = hn_vf_configure(dev, &dev->data->dev_conf);
+		if (ret) {
+			PMD_DRV_LOG(ERR, "Failed to configure VF port %d\n",
+				    port);
+			goto exit;
+		}
+
+		ret = hn_setup_vf_queues(port, dev);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "Failed to configure VF queues port %d\n",
+				    port);
+			goto exit;
+		}
+
+		PMD_DRV_LOG(NOTICE, "Starting VF port %d\n", port);
+		ret = rte_eth_dev_start(port);
+		if (ret) {
+			PMD_DRV_LOG(ERR, "rte_eth_dev_start failed ret=%d\n",
+				    ret);
+			goto exit;
+		}
+		hv->vf_ctx.vf_state = vf_started;
+	}
+
+	ret = hn_nvs_set_datapath(hv, NVS_DATAPATH_VF);
+	if (ret == 0)
+		hv->vf_ctx.vf_vsc_switched = true;
+
+exit:
+	rte_rwlock_write_unlock(&hv->vf_lock);
+	return ret;
 }
 
-/* Remove new VF device */
+/* Switch data path to VF device */
 static void hn_vf_remove(struct hn_data *hv)
 {
+	int ret;
 
-	if (!hn_vf_attached(hv)) {
+	if (!hv->vf_ctx.vf_vsc_switched) {
+		PMD_DRV_LOG(ERR, "VF path not active");
+		return;
+	}
+
+	rte_rwlock_write_lock(&hv->vf_lock);
+	if (!hv->vf_ctx.vf_vsc_switched) {
 		PMD_DRV_LOG(ERR, "VF path not active");
 	} else {
 		/* Stop incoming packets from arriving on VF */
-		hn_nvs_set_datapath(hv, NVS_DATAPATH_SYNTHETIC);
-
-		/* Give back ownership */
-		rte_eth_dev_owner_unset(hv->vf_port, hv->owner.id);
-
-		/* Stop transmission over VF */
-		hv->vf_port = HN_INVALID_PORT;
+		ret = hn_nvs_set_datapath(hv, NVS_DATAPATH_SYNTHETIC);
+		if (ret == 0)
+			hv->vf_ctx.vf_vsc_switched = false;
 	}
+	rte_rwlock_write_unlock(&hv->vf_lock);
 }
 
 /* Handle VF association message from host */
@@ -140,8 +320,7 @@  hn_nvs_handle_vfassoc(struct rte_eth_dev *dev,
 		    vf_assoc->allocated ? "add to" : "remove from",
 		    dev->data->port_id);
 
-	rte_rwlock_write_lock(&hv->vf_lock);
-	hv->vf_present = vf_assoc->allocated;
+	hv->vf_ctx.vf_vsp_reported = vf_assoc->allocated;
 
 	if (dev->state == RTE_ETH_DEV_ATTACHED) {
 		if (vf_assoc->allocated)
@@ -149,7 +328,6 @@  hn_nvs_handle_vfassoc(struct rte_eth_dev *dev,
 		else
 			hn_vf_remove(hv);
 	}
-	rte_rwlock_write_unlock(&hv->vf_lock);
 }
 
 static void
@@ -216,10 +394,6 @@  int hn_vf_info_get(struct hn_data *hv, struct rte_eth_dev_info *info)
 	return ret;
 }
 
-/*
- * Configure VF if present.
- * Force VF to have same number of queues as synthetic device
- */
 int hn_vf_configure(struct rte_eth_dev *dev,
 		    const struct rte_eth_conf *dev_conf)
 {
@@ -230,17 +404,56 @@  int hn_vf_configure(struct rte_eth_dev *dev,
 	/* link state interrupt does not matter here. */
 	vf_conf.intr_conf.lsc = 0;
 
-	rte_rwlock_read_lock(&hv->vf_lock);
-	if (hv->vf_port != HN_INVALID_PORT) {
-		ret = rte_eth_dev_configure(hv->vf_port,
+	/* need to monitor removal event */
+	vf_conf.intr_conf.rmv = 1;
+
+	if (hv->vf_ctx.vf_attached) {
+		ret = rte_eth_dev_callback_register(hv->vf_ctx.vf_port,
+						    RTE_ETH_EVENT_INTR_RMV,
+						    hn_eth_rmv_event_callback,
+						    hv);
+		if (ret) {
+			PMD_DRV_LOG(ERR,
+				    "Registering callback failed for "
+				    "vf port %d ret %d\n",
+				    hv->vf_ctx.vf_port, ret);
+			return ret;
+		}
+
+		ret = rte_eth_dev_configure(hv->vf_ctx.vf_port,
 					    dev->data->nb_rx_queues,
 					    dev->data->nb_tx_queues,
 					    &vf_conf);
-		if (ret != 0)
-			PMD_DRV_LOG(ERR,
-				    "VF configuration failed: %d", ret);
+		if (ret) {
+			PMD_DRV_LOG(ERR, "VF configuration failed: %d", ret);
+
+			rte_eth_dev_callback_unregister(hv->vf_ctx.vf_port,
+							RTE_ETH_EVENT_INTR_RMV,
+							hn_eth_rmv_event_callback,
+							hv);
+
+			return ret;
+		}
+
+		hv->vf_ctx.vf_state = vf_configured;
 	}
-	rte_rwlock_read_unlock(&hv->vf_lock);
+
+	return ret;
+}
+
+/* Configure VF if present.
+ * VF device will have the same number of queues as the synthetic device
+ */
+int hn_vf_configure_locked(struct rte_eth_dev *dev,
+			   const struct rte_eth_conf *dev_conf)
+{
+	struct hn_data *hv = dev->data->dev_private;
+	int ret = 0;
+
+	rte_rwlock_write_lock(&hv->vf_lock);
+	ret = hn_vf_configure(dev, dev_conf);
+	rte_rwlock_write_unlock(&hv->vf_lock);
+
 	return ret;
 }
 
@@ -325,16 +538,21 @@  void hn_vf_reset(struct rte_eth_dev *dev)
 
 int hn_vf_close(struct rte_eth_dev *dev)
 {
-	struct hn_data *hv = dev->data->dev_private;
-	uint16_t vf_port;
 	int ret = 0;
+	struct hn_data *hv = dev->data->dev_private;
 
-	rte_rwlock_read_lock(&hv->vf_lock);
-	vf_port = hv->vf_port;
-	if (vf_port != HN_INVALID_PORT)
-		ret = rte_eth_dev_close(vf_port);
+	rte_eal_alarm_cancel(hn_vf_add_retry, dev);
 
-	hv->vf_port = HN_INVALID_PORT;
+	rte_rwlock_read_lock(&hv->vf_lock);
+	if (hv->vf_ctx.vf_attached) {
+		rte_eth_dev_callback_unregister(hv->vf_ctx.vf_port,
+						RTE_ETH_EVENT_INTR_RMV,
+						hn_eth_rmv_event_callback,
+						hv);
+		rte_eal_alarm_cancel(hn_remove_delayed, hv);
+		ret = rte_eth_dev_close(hv->vf_ctx.vf_port);
+		hv->vf_ctx.vf_attached = false;
+	}
 	rte_rwlock_read_unlock(&hv->vf_lock);
 
 	return ret;