[v4,09/10] net/ifc: support SW assisted VDPA live migration

Message ID 20181214211612.167681-10-xiao.w.wang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series: support SW assisted VDPA live migration

Checks

Context               Check     Description
ci/checkpatch         success   coding style OK
ci/Intel-compilation  success   Compilation OK

Commit Message

Xiao Wang Dec. 14, 2018, 9:16 p.m. UTC
In SW assisted live migration mode, the driver stops the device and
sets up a mediate virtio ring to relay the communication between the
virtio driver and the VDPA device.

This data path intervention allows SW to help with guest dirty page
logging for live migration.

This SW fallback is an event-driven relay thread, so when the network
throughput is low the fallback takes little CPU resource, but when the
throughput goes up, the relay thread's CPU usage goes up accordingly.

Users need to take all the factors, including CPU usage, guest
performance degradation, etc., into consideration when selecting the
live migration support mode.

Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
---
 drivers/net/ifc/base/ifcvf.h |   1 +
 drivers/net/ifc/ifcvf_vdpa.c | 346 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 344 insertions(+), 3 deletions(-)
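
For orientation, the event-driven relay described above boils down to the
loop sketched below. This is an illustrative condensation of vring_relay(),
update_avail_ring() and update_used_ring() from the patch; the real code
additionally sets up epoll, drains the eventfds and handles errors.

/*
 * Illustrative sketch only -- condensed from the patch below; epoll setup,
 * eventfd draining and error handling are omitted.
 */
static void *
relay_loop_sketch(void *arg)
{
	struct ifcvf_internal *internal = arg;
	struct epoll_event ev[IFCVF_MAX_QUEUES * 2];
	int i, nfds;

	for (;;) {
		/* Block until the guest kicks a queue or the VF raises an
		 * interrupt; no CPU is consumed while the link is idle.
		 */
		nfds = epoll_wait(internal->epfd, ev, IFCVF_MAX_QUEUES * 2, -1);
		for (i = 0; i < nfds; i++) {
			uint16_t qid = ev[i].data.u32 >> 1;

			if (ev[i].data.u32 & 1) {
				/* device -> guest: copy completed entries back
				 * to the guest used ring (where guest dirty
				 * pages get logged) and interrupt the guest.
				 */
				rte_vdpa_relay_vring_used(internal->vid, qid,
						&internal->m_vring[qid]);
				rte_vhost_vring_call(internal->vid, qid);
			} else {
				/* guest -> device: mirror new avail entries
				 * into the mediate ring and kick the VF.
				 */
				rte_vdpa_relay_vring_avail(internal->vid, qid,
						&internal->m_vring[qid]);
				ifcvf_notify_queue(&internal->hw, qid);
			}
		}
	}
	return NULL;
}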
  

Comments

Maxime Coquelin Dec. 16, 2018, 9:35 a.m. UTC | #1
On 12/14/18 10:16 PM, Xiao Wang wrote:
> In SW assisted live migration mode, driver will stop the device and
> setup a mediate virtio ring to relay the communication between the
> virtio driver and the VDPA device.
> 
> This data path intervention will allow SW to help on guest dirty page
> logging for live migration.
> 
> This SW fallback is event driven relay thread, so when the network
> throughput is low, this SW fallback will take little CPU resource, but
> when the throughput goes up, the relay thread's CPU usage will goes up
> accordinly.

s/accordinly/accordingly/

> 
> User needs to take all the factors including CPU usage, guest perf
> degradation, etc. into consideration when selecting the live migration
> support mode.
> 
> Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> ---
>   drivers/net/ifc/base/ifcvf.h |   1 +
>   drivers/net/ifc/ifcvf_vdpa.c | 346 ++++++++++++++++++++++++++++++++++++++++++-
>   2 files changed, 344 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ifc/base/ifcvf.h b/drivers/net/ifc/base/ifcvf.h
> index c15c69107..e8a30d2c6 100644
> --- a/drivers/net/ifc/base/ifcvf.h
> +++ b/drivers/net/ifc/base/ifcvf.h
> @@ -50,6 +50,7 @@
>   #define IFCVF_LM_ENABLE_VF		0x1
>   #define IFCVF_LM_ENABLE_PF		0x3
>   #define IFCVF_LOG_BASE			0x100000000000
> +#define IFCVF_MEDIATE_VRING		0x200000000000

MEDIATED?

>   
>   #define IFCVF_32_BIT_MASK		0xffffffff
>   
> diff --git a/drivers/net/ifc/ifcvf_vdpa.c b/drivers/net/ifc/ifcvf_vdpa.c
> index f181c5a6e..61757d0b4 100644
> --- a/drivers/net/ifc/ifcvf_vdpa.c
> +++ b/drivers/net/ifc/ifcvf_vdpa.c
> @@ -63,6 +63,9 @@ struct ifcvf_internal {
>   	rte_atomic32_t running;
>   	rte_spinlock_t lock;
>   	bool sw_lm;
> +	bool sw_fallback_running;
> +	/* mediated vring for sw fallback */
> +	struct vring m_vring[IFCVF_MAX_QUEUES * 2];
>   };
>   
>   struct internal_list {
> @@ -308,6 +311,9 @@ vdpa_ifcvf_stop(struct ifcvf_internal *internal)
>   		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
>   				hw->vring[i].last_used_idx);
>   
> +	if (internal->sw_lm)
> +		return;
> +
>   	rte_vhost_get_negotiated_features(vid, &features);
>   	if (RTE_VHOST_NEED_LOG(features)) {
>   		ifcvf_disable_logging(hw);
> @@ -539,6 +545,318 @@ update_datapath(struct ifcvf_internal *internal)
>   	return ret;
>   }
>   
> +static int
> +m_ifcvf_start(struct ifcvf_internal *internal)
> +{
> +	struct ifcvf_hw *hw = &internal->hw;
> +	uint32_t i, nr_vring;
> +	int vid, ret;
> +	struct rte_vhost_vring vq;
> +	void *vring_buf;
> +	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
> +	uint64_t size;
> +	uint64_t gpa;
> +
> +	vid = internal->vid;
> +	nr_vring = rte_vhost_get_vring_num(vid);
> +	rte_vhost_get_negotiated_features(vid, &hw->req_features);
> +
> +	for (i = 0; i < nr_vring; i++) {
> +		rte_vhost_get_vhost_vring(vid, i, &vq);
> +
> +		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
> +				PAGE_SIZE);
> +		vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
> +		vring_init(&internal->m_vring[i], vq.size, vring_buf,
> +				PAGE_SIZE);
> +
> +		ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
> +			(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
> +		if (ret < 0) {
> +			DRV_LOG(ERR, "mediate vring DMA map failed.");
> +			goto error;
> +		}
> +
> +		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
> +		if (gpa == 0) {
> +			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
> +			return -1;
> +		}
> +		hw->vring[i].desc = gpa;
> +
> +		hw->vring[i].avail = m_vring_iova +
> +			(char *)internal->m_vring[i].avail -
> +			(char *)internal->m_vring[i].desc;
> +
> +		hw->vring[i].used = m_vring_iova +
> +			(char *)internal->m_vring[i].used -
> +			(char *)internal->m_vring[i].desc;
> +
> +		hw->vring[i].size = vq.size;
> +
> +		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
> +				&hw->vring[i].last_used_idx);
> +
> +		m_vring_iova += size;
> +	}
> +	hw->nr_vring = nr_vring;
> +
> +	return ifcvf_start_hw(&internal->hw);
> +
> +error:
> +	for (i = 0; i < nr_vring; i++)
> +		if (internal->m_vring[i].desc)
> +			rte_free(internal->m_vring[i].desc);
> +
> +	return -1;
> +}
> +
> +static int
> +m_ifcvf_stop(struct ifcvf_internal *internal)
> +{
> +	int vid;
> +	uint32_t i;
> +	struct rte_vhost_vring vq;
> +	struct ifcvf_hw *hw = &internal->hw;
> +	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
> +	uint64_t size, len;
> +
> +	vid = internal->vid;
> +	ifcvf_stop_hw(hw);
> +
> +	for (i = 0; i < hw->nr_vring; i++) {
> +		rte_vhost_get_vhost_vring(vid, i, &vq);
> +		len = IFCVF_USED_RING_LEN(vq.size);
> +		rte_vhost_log_used_vring(vid, i, 0, len);
> +
> +		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
> +				PAGE_SIZE);
> +		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
> +			(uint64_t)(uintptr_t)internal->m_vring[i].desc,
> +			m_vring_iova, size);
> +
> +		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
> +				hw->vring[i].last_used_idx);
> +		rte_free(internal->m_vring[i].desc);
> +		m_vring_iova += size;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +m_enable_vfio_intr(struct ifcvf_internal *internal)
> +{
> +	uint32_t nr_vring;
> +	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
> +	int ret;
> +
> +	nr_vring = rte_vhost_get_vring_num(internal->vid);
> +
> +	ret = rte_intr_efd_enable(intr_handle, nr_vring);
> +	if (ret)
> +		return -1;
> +
> +	ret = rte_intr_enable(intr_handle);
> +	if (ret)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static void
> +m_disable_vfio_intr(struct ifcvf_internal *internal)
> +{
> +	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
> +
> +	rte_intr_efd_disable(intr_handle);
> +	rte_intr_disable(intr_handle);
> +}
> +
> +static void
> +update_avail_ring(struct ifcvf_internal *internal, uint16_t qid)
> +{
> +	rte_vdpa_relay_vring_avail(internal->vid, qid, &internal->m_vring[qid]);
> +	ifcvf_notify_queue(&internal->hw, qid);
> +}
> +
> +static void
> +update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
> +{
> +	rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
> +	rte_vhost_vring_call(internal->vid, qid);
> +}
> +
> +static void *
> +vring_relay(void *arg)
> +{
> +	int i, vid, epfd, fd, nfds;
> +	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
> +	struct rte_vhost_vring vring;
> +	struct rte_intr_handle *intr_handle;
> +	uint16_t qid, q_num;
> +	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
> +	struct epoll_event ev;
> +	int nbytes;
> +	uint64_t buf;
> +
> +	vid = internal->vid;
> +	q_num = rte_vhost_get_vring_num(vid);
> +	/* prepare the mediate vring */
> +	for (qid = 0; qid < q_num; qid++) {
> +		rte_vhost_get_vring_base(vid, qid,
> +				&internal->m_vring[qid].avail->idx,
> +				&internal->m_vring[qid].used->idx);
> +		rte_vdpa_relay_vring_avail(vid, qid, &internal->m_vring[qid]);
> +	}
> +
> +	/* add notify fd and interrupt fd to epoll */
> +	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
> +	if (epfd < 0) {
> +		DRV_LOG(ERR, "failed to create epoll instance.");
> +		return NULL;
> +	}
> +	internal->epfd = epfd;
> +
> +	for (qid = 0; qid < q_num; qid++) {
> +		ev.events = EPOLLIN | EPOLLPRI;
> +		rte_vhost_get_vhost_vring(vid, qid, &vring);
> +		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
> +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> +			return NULL;
> +		}
> +	}
> +
> +	intr_handle = &internal->pdev->intr_handle;
> +	for (qid = 0; qid < q_num; qid++) {
> +		ev.events = EPOLLIN | EPOLLPRI;
> +		ev.data.u64 = 1 | qid << 1 |
> +			(uint64_t)intr_handle->efds[qid] << 32;
> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_handle->efds[qid], &ev)
> +				< 0) {
> +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> +			return NULL;
> +		}
> +	}
> +
> +	/* start relay with a first kick */
> +	for (qid = 0; qid < q_num; qid++)
> +		ifcvf_notify_queue(&internal->hw, qid);
> +
> +	/* listen to the events and react accordingly */
> +	for (;;) {
> +		nfds = epoll_wait(epfd, events, q_num * 2, -1);
> +		if (nfds < 0) {
> +			if (errno == EINTR)
> +				continue;
> +			DRV_LOG(ERR, "epoll_wait return fail\n");
> +			return NULL;
> +		}
> +
> +		for (i = 0; i < nfds; i++) {
> +			fd = (uint32_t)(events[i].data.u64 >> 32);
> +			do {
> +				nbytes = read(fd, &buf, 8);
> +				if (nbytes < 0) {
> +					if (errno == EINTR ||
> +					    errno == EWOULDBLOCK ||
> +					    errno == EAGAIN)
> +						continue;
> +					DRV_LOG(INFO, "Error reading "
> +						"kickfd: %s",
> +						strerror(errno));
> +				}
> +				break;
> +			} while (1);
> +
> +			qid = events[i].data.u32 >> 1;
> +
> +			if (events[i].data.u32 & 1)
> +				update_used_ring(internal, qid);
> +			else
> +				update_avail_ring(internal, qid);
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +setup_vring_relay(struct ifcvf_internal *internal)
> +{
> +	int ret;
> +
> +	ret = pthread_create(&internal->tid, NULL, vring_relay,
> +			(void *)internal);

So it will be scheduled without any affinity?
Shouldn't it use a pmd thread instead?

> +	if (ret) {
> +		DRV_LOG(ERR, "failed to create ring relay pthread.");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int
> +unset_vring_relay(struct ifcvf_internal *internal)
> +{
> +	void *status;
> +
> +	if (internal->tid) {
> +		pthread_cancel(internal->tid);
> +		pthread_join(internal->tid, &status);
> +	}
> +	internal->tid = 0;
> +
> +	if (internal->epfd >= 0)
> +		close(internal->epfd);
> +	internal->epfd = -1;
> +
> +	return 0;
> +}
> +
> +static int
> +ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
> +{
> +	int ret;
> +
> +	/* stop the direct IO data path */
> +	unset_notify_relay(internal);
> +	vdpa_ifcvf_stop(internal);
> +	vdpa_disable_vfio_intr(internal);
> +
> +	ret = rte_vhost_host_notifier_ctrl(internal->vid, false);
> +	if (ret && ret != -ENOTSUP)
> +		goto error;
> +
> +	/* set up interrupt for interrupt relay */
> +	ret = m_enable_vfio_intr(internal);
> +	if (ret)
> +		goto unmap;
> +
> +	/* config the VF */
> +	ret = m_ifcvf_start(internal);
> +	if (ret)
> +		goto unset_intr;
> +
> +	/* set up vring relay thread */
> +	ret = setup_vring_relay(internal);
> +	if (ret)
> +		goto stop_vf;
> +
> +	internal->sw_fallback_running = true;
> +
> +	return 0;
> +
> +stop_vf:
> +	m_ifcvf_stop(internal);
> +unset_intr:
> +	m_disable_vfio_intr(internal);
> +unmap:
> +	ifcvf_dma_map(internal, 0);
> +error:
> +	return -1;
> +}
> +
>   static int
>   ifcvf_dev_config(int vid)
>   {
> @@ -579,8 +897,25 @@ ifcvf_dev_close(int vid)
>   	}
>   
>   	internal = list->internal;
> -	rte_atomic32_set(&internal->dev_attached, 0);
> -	update_datapath(internal);
> +
> +	if (internal->sw_fallback_running) {
> +		/* unset ring relay */
> +		unset_vring_relay(internal);
> +
> +		/* reset VF */
> +		m_ifcvf_stop(internal);
> +
> +		/* remove interrupt setting */
> +		m_disable_vfio_intr(internal);
> +
> +		/* unset DMA map for guest memory */
> +		ifcvf_dma_map(internal, 0);
> +
> +		internal->sw_fallback_running = false;
> +	} else {
> +		rte_atomic32_set(&internal->dev_attached, 0);
> +		update_datapath(internal);
> +	}
>   
>   	return 0;
>   }
> @@ -604,7 +939,12 @@ ifcvf_set_features(int vid)
>   	internal = list->internal;
>   	rte_vhost_get_negotiated_features(vid, &features);
>   
> -	if (RTE_VHOST_NEED_LOG(features)) {
> +	if (!RTE_VHOST_NEED_LOG(features))
> +		return 0;
> +
> +	if (internal->sw_lm) {
> +		ifcvf_sw_fallback_switchover(internal);
> +	} else {
>   		rte_vhost_get_log_base(vid, &log_base, &log_size);
>   		rte_vfio_container_dma_map(internal->vfio_container_fd,
>   				log_base, IFCVF_LOG_BASE, log_size);
>
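
The 64-bit value that vring_relay() stores in epoll_event.data (quoted above)
packs three fields; spelled out as standalone helpers it looks like the
following. This is purely illustrative and not part of the patch, but the bit
layout matches what the patch builds inline.

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative helpers (not part of the patch): the packing used by
 * vring_relay().
 *   bit   0      : 0 = guest kick (avail ring), 1 = VF interrupt (used ring)
 *   bits  1..31  : queue id
 *   bits 32..63  : eventfd to drain before relaying
 */
static inline uint64_t
relay_ev_pack(int fd, uint16_t qid, bool is_intr)
{
	return (uint64_t)fd << 32 | (uint32_t)qid << 1 | (is_intr ? 1 : 0);
}

static inline void
relay_ev_unpack(uint64_t data, int *fd, uint16_t *qid, bool *is_intr)
{
	*fd = (int)(data >> 32);
	*qid = (uint16_t)((uint32_t)data >> 1);
	*is_intr = data & 1;
}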
  
Xiao Wang Dec. 17, 2018, 9:12 a.m. UTC | #2
Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Sunday, December 16, 2018 1:35 AM
> To: Wang, Xiao W <xiao.w.wang@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>
> Cc: alejandro.lucero@netronome.com; dev@dpdk.org; Wang, Zhihong
> <zhihong.wang@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>
> Subject: Re: [PATCH v4 09/10] net/ifc: support SW assisted VDPA live migration
> 
> 
> 
> On 12/14/18 10:16 PM, Xiao Wang wrote:
> > In SW assisted live migration mode, driver will stop the device and
> > setup a mediate virtio ring to relay the communication between the
> > virtio driver and the VDPA device.
> >
> > This data path intervention will allow SW to help on guest dirty page
> > logging for live migration.
> >
> > This SW fallback is event driven relay thread, so when the network
> > throughput is low, this SW fallback will take little CPU resource, but
> > when the throughput goes up, the relay thread's CPU usage will goes up
> > accordinly.
> 
> s/accordinly/accordingly/
> 

Will fix it in next version.

> >
> > User needs to take all the factors including CPU usage, guest perf
> > degradation, etc. into consideration when selecting the live migration
> > support mode.
> >
> > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> > ---
> >   drivers/net/ifc/base/ifcvf.h |   1 +
> >   drivers/net/ifc/ifcvf_vdpa.c | 346
> ++++++++++++++++++++++++++++++++++++++++++-
> >   2 files changed, 344 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/ifc/base/ifcvf.h b/drivers/net/ifc/base/ifcvf.h
> > index c15c69107..e8a30d2c6 100644
> > --- a/drivers/net/ifc/base/ifcvf.h
> > +++ b/drivers/net/ifc/base/ifcvf.h
> > @@ -50,6 +50,7 @@
> >   #define IFCVF_LM_ENABLE_VF		0x1
> >   #define IFCVF_LM_ENABLE_PF		0x3
> >   #define IFCVF_LOG_BASE			0x100000000000
> > +#define IFCVF_MEDIATE_VRING		0x200000000000
> 
> MEDIATED?

"mediate" is used as adjective here.

> 
> >
> >   #define IFCVF_32_BIT_MASK		0xffffffff
> >
> > diff --git a/drivers/net/ifc/ifcvf_vdpa.c b/drivers/net/ifc/ifcvf_vdpa.c
> > index f181c5a6e..61757d0b4 100644
> > --- a/drivers/net/ifc/ifcvf_vdpa.c
> > +++ b/drivers/net/ifc/ifcvf_vdpa.c
> > @@ -63,6 +63,9 @@ struct ifcvf_internal {
> >   	rte_atomic32_t running;
> >   	rte_spinlock_t lock;
> >   	bool sw_lm;

[...]

> > +static void *
> > +vring_relay(void *arg)
> > +{
> > +	int i, vid, epfd, fd, nfds;
> > +	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
> > +	struct rte_vhost_vring vring;
> > +	struct rte_intr_handle *intr_handle;
> > +	uint16_t qid, q_num;
> > +	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
> > +	struct epoll_event ev;
> > +	int nbytes;
> > +	uint64_t buf;
> > +
> > +	vid = internal->vid;
> > +	q_num = rte_vhost_get_vring_num(vid);
> > +	/* prepare the mediate vring */
> > +	for (qid = 0; qid < q_num; qid++) {
> > +		rte_vhost_get_vring_base(vid, qid,
> > +				&internal->m_vring[qid].avail->idx,
> > +				&internal->m_vring[qid].used->idx);
> > +		rte_vdpa_relay_vring_avail(vid, qid, &internal->m_vring[qid]);
> > +	}
> > +
> > +	/* add notify fd and interrupt fd to epoll */
> > +	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
> > +	if (epfd < 0) {
> > +		DRV_LOG(ERR, "failed to create epoll instance.");
> > +		return NULL;
> > +	}
> > +	internal->epfd = epfd;
> > +
> > +	for (qid = 0; qid < q_num; qid++) {
> > +		ev.events = EPOLLIN | EPOLLPRI;
> > +		rte_vhost_get_vhost_vring(vid, qid, &vring);
> > +		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
> > +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
> > +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> > +			return NULL;
> > +		}
> > +	}
> > +
> > +	intr_handle = &internal->pdev->intr_handle;
> > +	for (qid = 0; qid < q_num; qid++) {
> > +		ev.events = EPOLLIN | EPOLLPRI;
> > +		ev.data.u64 = 1 | qid << 1 |
> > +			(uint64_t)intr_handle->efds[qid] << 32;
> > +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_handle->efds[qid],
> &ev)
> > +				< 0) {
> > +			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
> > +			return NULL;
> > +		}
> > +	}
> > +
> > +	/* start relay with a first kick */
> > +	for (qid = 0; qid < q_num; qid++)
> > +		ifcvf_notify_queue(&internal->hw, qid);
> > +
> > +	/* listen to the events and react accordingly */
> > +	for (;;) {
> > +		nfds = epoll_wait(epfd, events, q_num * 2, -1);
> > +		if (nfds < 0) {
> > +			if (errno == EINTR)
> > +				continue;
> > +			DRV_LOG(ERR, "epoll_wait return fail\n");
> > +			return NULL;
> > +		}
> > +
> > +		for (i = 0; i < nfds; i++) {
> > +			fd = (uint32_t)(events[i].data.u64 >> 32);
> > +			do {
> > +				nbytes = read(fd, &buf, 8);
> > +				if (nbytes < 0) {
> > +					if (errno == EINTR ||
> > +					    errno == EWOULDBLOCK ||
> > +					    errno == EAGAIN)
> > +						continue;
> > +					DRV_LOG(INFO, "Error reading "
> > +						"kickfd: %s",
> > +						strerror(errno));
> > +				}
> > +				break;
> > +			} while (1);
> > +
> > +			qid = events[i].data.u32 >> 1;
> > +
> > +			if (events[i].data.u32 & 1)
> > +				update_used_ring(internal, qid);
> > +			else
> > +				update_avail_ring(internal, qid);
> > +		}
> > +	}
> > +
> > +	return NULL;
> > +}
> > +
> > +static int
> > +setup_vring_relay(struct ifcvf_internal *internal)
> > +{
> > +	int ret;
> > +
> > +	ret = pthread_create(&internal->tid, NULL, vring_relay,
> > +			(void *)internal);
> 
> So it will be scheduled without any affinity?
> Shouldn't it use a pmd thread instead?

The new thread will inherit the thread affinity from its parent thread. As you know, vdpa is trying to
 minimize CPU usage for virtio HW acceleration, and we assign just one core to vdpa daemon
 (doc/guides/sample_app_ug/vdpa.rst), so there's no dedicated pmd worker core.

Thanks,
Xiao
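
As noted above, a thread created with pthread_create() inherits the CPU
affinity mask of the thread that created it, so constraining the vdpa daemon
(for example via its EAL core mask) also constrains the relay thread. Should
a deployment want to pin the relay thread explicitly, a minimal helper along
these lines could be used; this is a hypothetical sketch, not part of the
patch.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/*
 * Hypothetical helper (not part of the patch): pin the relay thread to a
 * single core after setup_vring_relay() has created it.
 */
static int
pin_relay_thread(pthread_t tid, unsigned int core)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(core, &set);
	return pthread_setaffinity_np(tid, sizeof(set), &set);
}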
  
Maxime Coquelin Dec. 17, 2018, 11:08 a.m. UTC | #3
On 12/17/18 10:12 AM, Wang, Xiao W wrote:
> Hi Maxime,
> 
> [...]
>>
>> So it will be scheduled without any affinity?
>> Shouldn't it use a pmd thread instead?
> 
> The new thread will inherit the thread affinity from its parent thread. As you know, vdpa is trying to
>   minimize CPU usage for virtio HW acceleration, and we assign just one core to vdpa daemon
>   (doc/guides/sample_app_ug/vdpa.rst), so there's no dedicated pmd worker core.

OK, thanks for the clarification!

> Thanks,
> Xiao
>
  

Patch

diff --git a/drivers/net/ifc/base/ifcvf.h b/drivers/net/ifc/base/ifcvf.h
index c15c69107..e8a30d2c6 100644
--- a/drivers/net/ifc/base/ifcvf.h
+++ b/drivers/net/ifc/base/ifcvf.h
@@ -50,6 +50,7 @@ 
 #define IFCVF_LM_ENABLE_VF		0x1
 #define IFCVF_LM_ENABLE_PF		0x3
 #define IFCVF_LOG_BASE			0x100000000000
+#define IFCVF_MEDIATE_VRING		0x200000000000
 
 #define IFCVF_32_BIT_MASK		0xffffffff
 
diff --git a/drivers/net/ifc/ifcvf_vdpa.c b/drivers/net/ifc/ifcvf_vdpa.c
index f181c5a6e..61757d0b4 100644
--- a/drivers/net/ifc/ifcvf_vdpa.c
+++ b/drivers/net/ifc/ifcvf_vdpa.c
@@ -63,6 +63,9 @@  struct ifcvf_internal {
 	rte_atomic32_t running;
 	rte_spinlock_t lock;
 	bool sw_lm;
+	bool sw_fallback_running;
+	/* mediated vring for sw fallback */
+	struct vring m_vring[IFCVF_MAX_QUEUES * 2];
 };
 
 struct internal_list {
@@ -308,6 +311,9 @@  vdpa_ifcvf_stop(struct ifcvf_internal *internal)
 		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
 				hw->vring[i].last_used_idx);
 
+	if (internal->sw_lm)
+		return;
+
 	rte_vhost_get_negotiated_features(vid, &features);
 	if (RTE_VHOST_NEED_LOG(features)) {
 		ifcvf_disable_logging(hw);
@@ -539,6 +545,318 @@  update_datapath(struct ifcvf_internal *internal)
 	return ret;
 }
 
+static int
+m_ifcvf_start(struct ifcvf_internal *internal)
+{
+	struct ifcvf_hw *hw = &internal->hw;
+	uint32_t i, nr_vring;
+	int vid, ret;
+	struct rte_vhost_vring vq;
+	void *vring_buf;
+	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
+	uint64_t size;
+	uint64_t gpa;
+
+	vid = internal->vid;
+	nr_vring = rte_vhost_get_vring_num(vid);
+	rte_vhost_get_negotiated_features(vid, &hw->req_features);
+
+	for (i = 0; i < nr_vring; i++) {
+		rte_vhost_get_vhost_vring(vid, i, &vq);
+
+		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
+				PAGE_SIZE);
+		vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
+		vring_init(&internal->m_vring[i], vq.size, vring_buf,
+				PAGE_SIZE);
+
+		ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
+			(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
+		if (ret < 0) {
+			DRV_LOG(ERR, "mediate vring DMA map failed.");
+			goto error;
+		}
+
+		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
+		if (gpa == 0) {
+			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
+			return -1;
+		}
+		hw->vring[i].desc = gpa;
+
+		hw->vring[i].avail = m_vring_iova +
+			(char *)internal->m_vring[i].avail -
+			(char *)internal->m_vring[i].desc;
+
+		hw->vring[i].used = m_vring_iova +
+			(char *)internal->m_vring[i].used -
+			(char *)internal->m_vring[i].desc;
+
+		hw->vring[i].size = vq.size;
+
+		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
+				&hw->vring[i].last_used_idx);
+
+		m_vring_iova += size;
+	}
+	hw->nr_vring = nr_vring;
+
+	return ifcvf_start_hw(&internal->hw);
+
+error:
+	for (i = 0; i < nr_vring; i++)
+		if (internal->m_vring[i].desc)
+			rte_free(internal->m_vring[i].desc);
+
+	return -1;
+}
+
+static int
+m_ifcvf_stop(struct ifcvf_internal *internal)
+{
+	int vid;
+	uint32_t i;
+	struct rte_vhost_vring vq;
+	struct ifcvf_hw *hw = &internal->hw;
+	uint64_t m_vring_iova = IFCVF_MEDIATE_VRING;
+	uint64_t size, len;
+
+	vid = internal->vid;
+	ifcvf_stop_hw(hw);
+
+	for (i = 0; i < hw->nr_vring; i++) {
+		rte_vhost_get_vhost_vring(vid, i, &vq);
+		len = IFCVF_USED_RING_LEN(vq.size);
+		rte_vhost_log_used_vring(vid, i, 0, len);
+
+		size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
+				PAGE_SIZE);
+		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
+			(uint64_t)(uintptr_t)internal->m_vring[i].desc,
+			m_vring_iova, size);
+
+		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
+				hw->vring[i].last_used_idx);
+		rte_free(internal->m_vring[i].desc);
+		m_vring_iova += size;
+	}
+
+	return 0;
+}
+
+static int
+m_enable_vfio_intr(struct ifcvf_internal *internal)
+{
+	uint32_t nr_vring;
+	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
+	int ret;
+
+	nr_vring = rte_vhost_get_vring_num(internal->vid);
+
+	ret = rte_intr_efd_enable(intr_handle, nr_vring);
+	if (ret)
+		return -1;
+
+	ret = rte_intr_enable(intr_handle);
+	if (ret)
+		return -1;
+
+	return 0;
+}
+
+static void
+m_disable_vfio_intr(struct ifcvf_internal *internal)
+{
+	struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
+
+	rte_intr_efd_disable(intr_handle);
+	rte_intr_disable(intr_handle);
+}
+
+static void
+update_avail_ring(struct ifcvf_internal *internal, uint16_t qid)
+{
+	rte_vdpa_relay_vring_avail(internal->vid, qid, &internal->m_vring[qid]);
+	ifcvf_notify_queue(&internal->hw, qid);
+}
+
+static void
+update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
+{
+	rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
+	rte_vhost_vring_call(internal->vid, qid);
+}
+
+static void *
+vring_relay(void *arg)
+{
+	int i, vid, epfd, fd, nfds;
+	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
+	struct rte_vhost_vring vring;
+	struct rte_intr_handle *intr_handle;
+	uint16_t qid, q_num;
+	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
+	struct epoll_event ev;
+	int nbytes;
+	uint64_t buf;
+
+	vid = internal->vid;
+	q_num = rte_vhost_get_vring_num(vid);
+	/* prepare the mediate vring */
+	for (qid = 0; qid < q_num; qid++) {
+		rte_vhost_get_vring_base(vid, qid,
+				&internal->m_vring[qid].avail->idx,
+				&internal->m_vring[qid].used->idx);
+		rte_vdpa_relay_vring_avail(vid, qid, &internal->m_vring[qid]);
+	}
+
+	/* add notify fd and interrupt fd to epoll */
+	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
+	if (epfd < 0) {
+		DRV_LOG(ERR, "failed to create epoll instance.");
+		return NULL;
+	}
+	internal->epfd = epfd;
+
+	for (qid = 0; qid < q_num; qid++) {
+		ev.events = EPOLLIN | EPOLLPRI;
+		rte_vhost_get_vhost_vring(vid, qid, &vring);
+		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
+			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
+			return NULL;
+		}
+	}
+
+	intr_handle = &internal->pdev->intr_handle;
+	for (qid = 0; qid < q_num; qid++) {
+		ev.events = EPOLLIN | EPOLLPRI;
+		ev.data.u64 = 1 | qid << 1 |
+			(uint64_t)intr_handle->efds[qid] << 32;
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_handle->efds[qid], &ev)
+				< 0) {
+			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
+			return NULL;
+		}
+	}
+
+	/* start relay with a first kick */
+	for (qid = 0; qid < q_num; qid++)
+		ifcvf_notify_queue(&internal->hw, qid);
+
+	/* listen to the events and react accordingly */
+	for (;;) {
+		nfds = epoll_wait(epfd, events, q_num * 2, -1);
+		if (nfds < 0) {
+			if (errno == EINTR)
+				continue;
+			DRV_LOG(ERR, "epoll_wait return fail\n");
+			return NULL;
+		}
+
+		for (i = 0; i < nfds; i++) {
+			fd = (uint32_t)(events[i].data.u64 >> 32);
+			do {
+				nbytes = read(fd, &buf, 8);
+				if (nbytes < 0) {
+					if (errno == EINTR ||
+					    errno == EWOULDBLOCK ||
+					    errno == EAGAIN)
+						continue;
+					DRV_LOG(INFO, "Error reading "
+						"kickfd: %s",
+						strerror(errno));
+				}
+				break;
+			} while (1);
+
+			qid = events[i].data.u32 >> 1;
+
+			if (events[i].data.u32 & 1)
+				update_used_ring(internal, qid);
+			else
+				update_avail_ring(internal, qid);
+		}
+	}
+
+	return NULL;
+}
+
+static int
+setup_vring_relay(struct ifcvf_internal *internal)
+{
+	int ret;
+
+	ret = pthread_create(&internal->tid, NULL, vring_relay,
+			(void *)internal);
+	if (ret) {
+		DRV_LOG(ERR, "failed to create ring relay pthread.");
+		return -1;
+	}
+	return 0;
+}
+
+static int
+unset_vring_relay(struct ifcvf_internal *internal)
+{
+	void *status;
+
+	if (internal->tid) {
+		pthread_cancel(internal->tid);
+		pthread_join(internal->tid, &status);
+	}
+	internal->tid = 0;
+
+	if (internal->epfd >= 0)
+		close(internal->epfd);
+	internal->epfd = -1;
+
+	return 0;
+}
+
+static int
+ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
+{
+	int ret;
+
+	/* stop the direct IO data path */
+	unset_notify_relay(internal);
+	vdpa_ifcvf_stop(internal);
+	vdpa_disable_vfio_intr(internal);
+
+	ret = rte_vhost_host_notifier_ctrl(internal->vid, false);
+	if (ret && ret != -ENOTSUP)
+		goto error;
+
+	/* set up interrupt for interrupt relay */
+	ret = m_enable_vfio_intr(internal);
+	if (ret)
+		goto unmap;
+
+	/* config the VF */
+	ret = m_ifcvf_start(internal);
+	if (ret)
+		goto unset_intr;
+
+	/* set up vring relay thread */
+	ret = setup_vring_relay(internal);
+	if (ret)
+		goto stop_vf;
+
+	internal->sw_fallback_running = true;
+
+	return 0;
+
+stop_vf:
+	m_ifcvf_stop(internal);
+unset_intr:
+	m_disable_vfio_intr(internal);
+unmap:
+	ifcvf_dma_map(internal, 0);
+error:
+	return -1;
+}
+
 static int
 ifcvf_dev_config(int vid)
 {
@@ -579,8 +897,25 @@  ifcvf_dev_close(int vid)
 	}
 
 	internal = list->internal;
-	rte_atomic32_set(&internal->dev_attached, 0);
-	update_datapath(internal);
+
+	if (internal->sw_fallback_running) {
+		/* unset ring relay */
+		unset_vring_relay(internal);
+
+		/* reset VF */
+		m_ifcvf_stop(internal);
+
+		/* remove interrupt setting */
+		m_disable_vfio_intr(internal);
+
+		/* unset DMA map for guest memory */
+		ifcvf_dma_map(internal, 0);
+
+		internal->sw_fallback_running = false;
+	} else {
+		rte_atomic32_set(&internal->dev_attached, 0);
+		update_datapath(internal);
+	}
 
 	return 0;
 }
@@ -604,7 +939,12 @@  ifcvf_set_features(int vid)
 	internal = list->internal;
 	rte_vhost_get_negotiated_features(vid, &features);
 
-	if (RTE_VHOST_NEED_LOG(features)) {
+	if (!RTE_VHOST_NEED_LOG(features))
+		return 0;
+
+	if (internal->sw_lm) {
+		ifcvf_sw_fallback_switchover(internal);
+	} else {
 		rte_vhost_get_log_base(vid, &log_base, &log_size);
 		rte_vfio_container_dma_map(internal->vfio_container_fd,
 				log_base, IFCVF_LOG_BASE, log_size);