[dpdk-dev,2/2] net/virtio: support GUEST ANNOUNCE

Message ID 1511521440-57724-3-git-send-email-xiao.w.wang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Yuanhan Liu

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch warning coding style issues

Commit Message

Xiao Wang Nov. 24, 2017, 11:04 a.m. UTC
  When live migration is done, for the backup VM, either the virtio
frontend or the vhost backend needs to send out a gratuitous RARP packet
to announce its new network location.

This patch enables the VIRTIO_NET_F_GUEST_ANNOUNCE feature to support the
live migration scenario where the vhost backend doesn't have the ability
to generate the RARP packet itself.

Brief introduction of the work flow:
1. QEMU finishes live migration and pokes the backup VM with an interrupt.
2. The virtio interrupt handler reads out the interrupt status value and
   realizes it needs to send out a RARP packet to announce its location.
3. Pause the device to stop the worker thread from touching the queues.
4. Inject a RARP packet into a Tx queue.
5. Ack the interrupt via the control queue.
6. Resume the device to continue packet processing.
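
Condensed from the hunks below, the ANNOUNCE path in the interrupt handler
looks roughly like this (a sketch using the helpers this patch adds;
link-state handling and error paths elided):

void
virtio_interrupt_handler(void *param)
{
	struct rte_eth_dev *dev = param;
	struct virtio_hw *hw = dev->data->dev_private;
	uint8_t isr = vtpci_isr(hw);	/* step 2: read interrupt status */

	if (isr & VIRTIO_NET_S_ANNOUNCE) {
		rte_spinlock_lock(&hw->sl);
		generate_rarp(dev);	/* steps 3, 4, 6: pause, inject RARP, resume */
		virtnet_ack_link_announce(dev);	/* step 5: ack via control queue */
		rte_spinlock_unlock(&hw->sl);
	}
}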

Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
---
 drivers/net/virtio/virtio_ethdev.c | 131 ++++++++++++++++++++++++++++++++++++-
 drivers/net/virtio/virtio_ethdev.h |   4 ++
 drivers/net/virtio/virtio_pci.h    |   1 +
 drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
 drivers/net/virtio/virtqueue.h     |  11 ++++
 5 files changed, 226 insertions(+), 2 deletions(-)
  

Comments

Tiwei Bie Nov. 24, 2017, 6:04 a.m. UTC | #1
Hi,

Some quick comments. Will go through the whole patch later.

On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> When live migration is done, for the backup VM, either the virtio
> frontend or the vhost backend needs to send out gratuitous RARP packet
> to announce its new network location.
> 
> This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support live
> migration scenario where the vhost backend doesn't have the ability to
> generate RARP packet.
> 
> Brief introduction of the work flow:
> 1. QEMU finishes live migration, pokes the backup VM with an interrupt.
> 2. Virtio interrupt handler reads out the interrupt status value, and
>    realizes it needs to send out RARP packet to announce its location.
> 3. Pause device to stop worker thread touching the queues.
> 4. Inject a RARP packet into a Tx Queue.
> 5. Ack the interrupt via control queue.
> 6. Resume device to continue packet processing.
> 
> Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> ---
>  drivers/net/virtio/virtio_ethdev.c | 131 ++++++++++++++++++++++++++++++++++++-
>  drivers/net/virtio/virtio_ethdev.h |   4 ++
>  drivers/net/virtio/virtio_pci.h    |   1 +
>  drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
>  drivers/net/virtio/virtqueue.h     |  11 ++++
>  5 files changed, 226 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
> index 1959b11..6eaea0e 100644
> --- a/drivers/net/virtio/virtio_ethdev.c
> +++ b/drivers/net/virtio/virtio_ethdev.c
> @@ -48,6 +48,8 @@
>  #include <rte_pci.h>
>  #include <rte_bus_pci.h>
>  #include <rte_ether.h>
> +#include <rte_ip.h>
> +#include <rte_arp.h>
>  #include <rte_common.h>
>  #include <rte_errno.h>
>  #include <rte_cpuflags.h>
> @@ -55,6 +57,7 @@
>  #include <rte_memory.h>
>  #include <rte_eal.h>
>  #include <rte_dev.h>
> +#include <rte_cycles.h>
>  
>  #include "virtio_ethdev.h"
>  #include "virtio_pci.h"
> @@ -106,6 +109,13 @@ static int virtio_dev_queue_stats_mapping_set(
>  	uint8_t stat_idx,
>  	uint8_t is_rx);
>  
> +static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
> +		const struct ether_addr *mac);
> +static int virtio_dev_pause(struct rte_eth_dev *dev);
> +static void virtio_dev_resume(struct rte_eth_dev *dev);
> +static void generate_rarp(struct rte_eth_dev *dev);
> +static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
> +
>  /*
>   * The set of PCI devices this driver supports
>   */
> @@ -1249,9 +1259,116 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
>  	return 0;
>  }
>  
> +#define RARP_PKT_SIZE	64
> +
> +static int
> +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct arp_hdr  *rarp;
> +
> +	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
> +		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
> +				rarp_mbuf->buf_len, RARP_PKT_SIZE);
> +		return -1;
> +	}
> +
> +	/* Ethernet header. */
> +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);

You can use rte_pktmbuf_mtod() directly.
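
For reference, rte_mbuf.h defines the zero-offset form in terms of the
offset one, so the two are interchangeable here:

#define rte_pktmbuf_mtod(m, t) rte_pktmbuf_mtod_offset(m, t, 0)

	eth_hdr = rte_pktmbuf_mtod(rarp_mbuf, struct ether_hdr *);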

> +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> +	ether_addr_copy(mac, &eth_hdr->s_addr);
> +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> +
> +	/* RARP header. */
> +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> +	rarp->arp_hln = ETHER_ADDR_LEN;
> +	rarp->arp_pln = 4;
> +	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
> +
> +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> +
> +	rarp_mbuf->data_len = RARP_PKT_SIZE;
> +	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
> +
> +	return 0;
> +}
> +
> +static int
> +virtio_dev_pause(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	if (hw->started == 0)
> +		return -1;
> +	hw->started = 0;
> +	/*
> +	 * Prevent the worker thread from touching queues to avoid condition,
> +	 * 1 ms should be enough for the ongoing Tx function to finish.
> +	 */
> +	rte_delay_ms(1);
> +	return 0;
> +}
> +
> +static void
> +virtio_dev_resume(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	hw->started = 1;
> +}
> +
> +static void
> +generate_rarp(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +	struct rte_mbuf *rarp_mbuf = NULL;
> +	struct virtnet_tx *txvq = dev->data->tx_queues[0];
> +	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
> +
> +	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
> +	if (rarp_mbuf == NULL) {
> +		PMD_DRV_LOG(ERR, "mbuf allocate failed");
> +		return;
> +	}
> +
> +	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr)) {
> +		rte_pktmbuf_free(rarp_mbuf);
> +		rarp_mbuf = NULL;
> +		return;
> +	}
> +
> +	/* If virtio port just stopped, no need to send RARP */
> +	if (virtio_dev_pause(dev) < -1)

You mean < 0?
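
Indeed, virtio_dev_pause() returns only -1 or 0, so "< -1" can never be
true and the stopped-port check is dead code as written; the intended
guard is presumably:

	/* If virtio port just stopped, no need to send RARP */
	if (virtio_dev_pause(dev) < 0)
		return;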

> +		return;
> +
> +	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
> +	/* Recover the stored hw status to let worker thread continue */
> +	virtio_dev_resume(dev);
> +}
> +
> +static void
> +virtnet_ack_link_announce(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +	struct virtio_pmd_ctrl ctrl;
> +	int len;
> +
> +	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
> +	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
> +	len = 0;
> +
> +	virtio_send_command(hw->cvq, &ctrl, &len, 0);
> +}
> +
>  /*
> - * Process Virtio Config changed interrupt and call the callback
> - * if link state changed.
> + * Process virtio config changed interrupt. Call the callback
> + * if link state changed; generate gratuitous RARP packet if
> + * the status indicates an ANNOUNCE.
>   */
>  void
>  virtio_interrupt_handler(void *param)
> @@ -1274,6 +1391,12 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
>  						      NULL, NULL);
>  	}
>  
> +	if (isr & VIRTIO_NET_S_ANNOUNCE) {
> +		rte_spinlock_lock(&hw->sl);
> +		generate_rarp(dev);
> +		virtnet_ack_link_announce(dev);
> +		rte_spinlock_unlock(&hw->sl);
> +	}
>  }
>  
>  /* set rx and tx handlers according to what is supported */
> @@ -1786,6 +1909,8 @@ static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
>  			return -EBUSY;
>  		}
>  
> +	rte_spinlock_init(&hw->sl);
> +
>  	hw->use_simple_rx = 1;
>  	hw->use_simple_tx = 1;
>  
> @@ -1952,12 +2077,14 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
>  
>  	PMD_INIT_LOG(DEBUG, "stop");
>  
> +	rte_spinlock_lock(&hw->sl);
>  	if (intr_conf->lsc || intr_conf->rxq)
>  		virtio_intr_disable(dev);
>  
>  	hw->started = 0;
>  	memset(&link, 0, sizeof(link));
>  	virtio_dev_atomic_write_link_status(dev, &link);
> +	rte_spinlock_unlock(&hw->sl);
>  }
>  
>  static int
> diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> index 2039bc5..24271cb 100644
> --- a/drivers/net/virtio/virtio_ethdev.h
> +++ b/drivers/net/virtio/virtio_ethdev.h
> @@ -67,6 +67,7 @@
>  	 1u << VIRTIO_NET_F_HOST_TSO6	  |	\
>  	 1u << VIRTIO_NET_F_MRG_RXBUF	  |	\
>  	 1u << VIRTIO_NET_F_MTU	| \
> +	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE  | \
>  	 1u << VIRTIO_RING_F_INDIRECT_DESC |    \
>  	 1ULL << VIRTIO_F_VERSION_1       |	\
>  	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
> @@ -111,6 +112,9 @@ uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		uint16_t nb_pkts);
>  
> +uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts);
> +
>  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		uint16_t nb_pkts);
>  
> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> index 3c5ce66..3cd367e 100644
> --- a/drivers/net/virtio/virtio_pci.h
> +++ b/drivers/net/virtio/virtio_pci.h
> @@ -270,6 +270,7 @@ struct virtio_hw {
>  	struct virtio_pci_common_cfg *common_cfg;
>  	struct virtio_net_config *dev_cfg;
>  	void	    *virtio_user_dev;
> +	rte_spinlock_t sl;

Need to add some detailed comments to describe what's
protected by this lock.
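
Based on the race Xiao describes in his reply below, a comment along
these lines would cover it (wording illustrative, not from the patch):

	/*
	 * App management thread and interrupt handler thread can both
	 * change device state (hw->started). This lock serializes
	 * virtio_dev_stop() against the ANNOUNCE handling path so the
	 * handler cannot resume a port the app has already stopped.
	 */
	rte_spinlock_t sl;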

>  
>  	struct virtqueue **vqs;
>  };
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index 6a24fde..7313bdd 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -1100,3 +1100,84 @@
>  
>  	return nb_tx;
>  }
> +
> +uint16_t
> +virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> +{
> +	struct virtnet_tx *txvq = tx_queue;
> +	struct virtqueue *vq = txvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t hdr_size = hw->vtnet_hdr_size;
> +	uint16_t nb_used, nb_tx = 0;
> +
> +	if (unlikely(nb_pkts < 1))
> +		return nb_pkts;
> +
> +	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
> +	nb_used = VIRTQUEUE_NUSED(vq);
> +
> +	virtio_rmb();
> +	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
> +		virtio_xmit_cleanup(vq, nb_used);
> +
> +	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> +		struct rte_mbuf *txm = tx_pkts[nb_tx];
> +		int can_push = 0, use_indirect = 0, slots, need;
> +
> +		/* optimize ring usage */
> +		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
> +					vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
> +			rte_mbuf_refcnt_read(txm) == 1 &&
> +			RTE_MBUF_DIRECT(txm) &&
> +			txm->nb_segs == 1 &&
> +			rte_pktmbuf_headroom(txm) >= hdr_size &&
> +			rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> +				__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> +			can_push = 1;
> +		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> +			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> +			use_indirect = 1;
> +
> +		/* How many main ring entries are needed to this Tx?
> +		 * any_layout => number of segments
> +		 * indirect   => 1
> +		 * default    => number of segments + 1
> +		 */
> +		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
> +		need = slots - vq->vq_free_cnt;
> +
> +		/* Positive value indicates it need free vring descriptors */
> +		if (unlikely(need > 0)) {
> +			nb_used = VIRTQUEUE_NUSED(vq);
> +			virtio_rmb();
> +			need = RTE_MIN(need, (int)nb_used);
> +
> +			virtio_xmit_cleanup(vq, need);
> +			need = slots - vq->vq_free_cnt;
> +			if (unlikely(need > 0)) {
> +				PMD_TX_LOG(ERR,
> +						"No free tx descriptors to transmit");
> +				break;
> +			}
> +		}
> +
> +		/* Enqueue Packet buffers */
> +		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
> +
> +		txvq->stats.bytes += txm->pkt_len;
> +		virtio_update_packet_stats(&txvq->stats, txm);
> +	}
> +
> +	txvq->stats.packets += nb_tx;
> +
> +	if (likely(nb_tx)) {
> +		vq_update_avail_idx(vq);
> +
> +		if (unlikely(virtqueue_kick_prepare(vq))) {
> +			virtqueue_notify(vq);
> +			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
> +		}
> +	}
> +
> +	return nb_tx;
> +}

What's the difference between virtio_inject_pkts() and
virtio_xmit_pkts() except the latter will check hw->started?

Best regards,
Tiwei Bie
  
Yuanhan Liu Nov. 27, 2017, 12:48 p.m. UTC | #2
On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> When live migration is done, for the backup VM, either the virtio
> frontend or the vhost backend needs to send out gratuitous RARP packet
> to announce its new network location.
> 
> This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support live
> migration scenario where the vhost backend doesn't have the ability to
> generate RARP packet.

Yes, it's a feature good to have.

> +static int
> +virtio_dev_pause(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	if (hw->started == 0)
> +		return -1;
> +	hw->started = 0;
> +	/*
> +	 * Prevent the worker thread from touching queues to avoid condition,
> +	 * 1 ms should be enough for the ongoing Tx function to finish.
> +	 */
> +	rte_delay_ms(1);
> +	return 0;
> +}
> +
> +static void
> +virtio_dev_resume(struct rte_eth_dev *dev)
> +{
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +	hw->started = 1;
> +}

However, the implementation (stop first, pause for 1ms, duplicate another
Tx function, resume) doesn't seem elegant.

You probably could try something like DPDK vhost does:

- set a flag when S_ANNOUNCE is received
- inject a pkt when such a flag is set in the xmit function

You then should be able to get rid of all of the above stuff.

	--yliu
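
A minimal sketch of the suggested vhost-style approach, assuming an
illustrative announce_pending flag that is not in the patch:

/* Interrupt handler: only record that an announce is needed. */
if (isr & VIRTIO_NET_S_ANNOUNCE)
	hw->announce_pending = 1;	/* hypothetical field */

/* Tx path: inject the RARP packet before the normal burst. */
uint16_t
virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct virtnet_tx *txvq = tx_queue;
	struct virtio_hw *hw = txvq->vq->hw;

	if (unlikely(hw->announce_pending)) {
		/* build, enqueue and ack the RARP packet here */
		hw->announce_pending = 0;
	}

	/* ... regular transmit loop as in the existing function ... */
	return nb_pkts;
}

As Xiao notes in the replies below, the catch is that a port may only
receive and never enter the Tx path.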
  
Xiao Wang Nov. 30, 2017, 2:37 a.m. UTC | #3
> -----Original Message-----
> From: Bie, Tiwei
> Sent: Friday, November 24, 2017 2:05 PM
> To: Wang, Xiao W <xiao.w.wang@intel.com>
> Cc: dev@dpdk.org; yliu@fridaylinux.org
> Subject: Re: [dpdk-dev] [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
> 
> Hi,
> 
> Some quick comments. Will go through the whole patch later.
> 
> On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> > When live migration is done, for the backup VM, either the virtio
> > frontend or the vhost backend needs to send out gratuitous RARP packet
> > to announce its new network location.
> >
> > This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> live
> > migration scenario where the vhost backend doesn't have the ability to
> > generate RARP packet.
> >
> > Brief introduction of the work flow:
> > 1. QEMU finishes live migration, pokes the backup VM with an interrupt.
> > 2. Virtio interrupt handler reads out the interrupt status value, and
> >    realizes it needs to send out RARP packet to announce its location.
> > 3. Pause device to stop worker thread touching the queues.
> > 4. Inject a RARP packet into a Tx Queue.
> > 5. Ack the interrupt via control queue.
> > 6. Resume device to continue packet processing.
> >
> > Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
> > ---
> >  drivers/net/virtio/virtio_ethdev.c | 131
> ++++++++++++++++++++++++++++++++++++-
> >  drivers/net/virtio/virtio_ethdev.h |   4 ++
> >  drivers/net/virtio/virtio_pci.h    |   1 +
> >  drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
> >  drivers/net/virtio/virtqueue.h     |  11 ++++
> >  5 files changed, 226 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/virtio/virtio_ethdev.c
> b/drivers/net/virtio/virtio_ethdev.c
> > index 1959b11..6eaea0e 100644
> > --- a/drivers/net/virtio/virtio_ethdev.c
> > +++ b/drivers/net/virtio/virtio_ethdev.c
> > @@ -48,6 +48,8 @@
> >  #include <rte_pci.h>
> >  #include <rte_bus_pci.h>
> >  #include <rte_ether.h>
> > +#include <rte_ip.h>
> > +#include <rte_arp.h>
> >  #include <rte_common.h>
> >  #include <rte_errno.h>
> >  #include <rte_cpuflags.h>
> > @@ -55,6 +57,7 @@
> >  #include <rte_memory.h>
> >  #include <rte_eal.h>
> >  #include <rte_dev.h>
> > +#include <rte_cycles.h>
> >
> >  #include "virtio_ethdev.h"
> >  #include "virtio_pci.h"
> > @@ -106,6 +109,13 @@ static int virtio_dev_queue_stats_mapping_set(
> >  	uint8_t stat_idx,
> >  	uint8_t is_rx);
> >
> > +static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
> > +		const struct ether_addr *mac);
> > +static int virtio_dev_pause(struct rte_eth_dev *dev);
> > +static void virtio_dev_resume(struct rte_eth_dev *dev);
> > +static void generate_rarp(struct rte_eth_dev *dev);
> > +static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
> > +
> >  /*
> >   * The set of PCI devices this driver supports
> >   */
> > @@ -1249,9 +1259,116 @@ static int virtio_dev_xstats_get_names(struct
> rte_eth_dev *dev,
> >  	return 0;
> >  }
> >
> > +#define RARP_PKT_SIZE	64
> > +
> > +static int
> > +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr
> *mac)
> > +{
> > +	struct ether_hdr *eth_hdr;
> > +	struct arp_hdr  *rarp;
> > +
> > +	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
> > +		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
> > +				rarp_mbuf->buf_len, RARP_PKT_SIZE);
> > +		return -1;
> > +	}
> > +
> > +	/* Ethernet header. */
> > +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
> 
> You can use rte_pktmbuf_mtod() directly.

Looks better. Will change it in v2.

> 
> > +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> > +	ether_addr_copy(mac, &eth_hdr->s_addr);
> > +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> > +
> > +	/* RARP header. */
> > +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> > +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> > +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> > +	rarp->arp_hln = ETHER_ADDR_LEN;
> > +	rarp->arp_pln = 4;
> > +	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
> > +
> > +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> > +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> > +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> > +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> > +
> > +	rarp_mbuf->data_len = RARP_PKT_SIZE;
> > +	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +virtio_dev_pause(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	if (hw->started == 0)
> > +		return -1;
> > +	hw->started = 0;
> > +	/*
> > +	 * Prevent the worker thread from touching queues to avoid condition,
> > +	 * 1 ms should be enough for the ongoing Tx function to finish.
> > +	 */
> > +	rte_delay_ms(1);
> > +	return 0;
> > +}
> > +
> > +static void
> > +virtio_dev_resume(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	hw->started = 1;
> > +}
> > +
> > +static void
> > +generate_rarp(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +	struct rte_mbuf *rarp_mbuf = NULL;
> > +	struct virtnet_tx *txvq = dev->data->tx_queues[0];
> > +	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
> > +
> > +	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
> > +	if (rarp_mbuf == NULL) {
> > +		PMD_DRV_LOG(ERR, "mbuf allocate failed");
> > +		return;
> > +	}
> > +
> > +	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr))
> {
> > +		rte_pktmbuf_free(rarp_mbuf);
> > +		rarp_mbuf = NULL;
> > +		return;
> > +	}
> > +
> > +	/* If virtio port just stopped, no need to send RARP */
> > +	if (virtio_dev_pause(dev) < -1)
> 
> You mean < 0?

Yes, will fix it in v2.

> 
> > +		return;
> > +
> > +	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
> > +	/* Recover the stored hw status to let worker thread continue */
> > +	virtio_dev_resume(dev);
> > +}
> > +
> > +static void
> > +virtnet_ack_link_announce(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +	struct virtio_pmd_ctrl ctrl;
> > +	int len;
> > +
> > +	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
> > +	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
> > +	len = 0;
> > +
> > +	virtio_send_command(hw->cvq, &ctrl, &len, 0);
> > +}
> > +
> >  /*
> > - * Process Virtio Config changed interrupt and call the callback
> > - * if link state changed.
> > + * Process virtio config changed interrupt. Call the callback
> > + * if link state changed; generate gratuitous RARP packet if
> > + * the status indicates an ANNOUNCE.
> >   */
> >  void
> >  virtio_interrupt_handler(void *param)
> > @@ -1274,6 +1391,12 @@ static int virtio_dev_xstats_get_names(struct
> rte_eth_dev *dev,
> >  						      NULL, NULL);
> >  	}
> >
> > +	if (isr & VIRTIO_NET_S_ANNOUNCE) {
> > +		rte_spinlock_lock(&hw->sl);
> > +		generate_rarp(dev);
> > +		virtnet_ack_link_announce(dev);
> > +		rte_spinlock_unlock(&hw->sl);
> > +	}
> >  }
> >
> >  /* set rx and tx handlers according to what is supported */
> > @@ -1786,6 +1909,8 @@ static int eth_virtio_pci_remove(struct
> rte_pci_device *pci_dev)
> >  			return -EBUSY;
> >  		}
> >
> > +	rte_spinlock_init(&hw->sl);
> > +
> >  	hw->use_simple_rx = 1;
> >  	hw->use_simple_tx = 1;
> >
> > @@ -1952,12 +2077,14 @@ static void virtio_dev_free_mbufs(struct
> rte_eth_dev *dev)
> >
> >  	PMD_INIT_LOG(DEBUG, "stop");
> >
> > +	rte_spinlock_lock(&hw->sl);
> >  	if (intr_conf->lsc || intr_conf->rxq)
> >  		virtio_intr_disable(dev);
> >
> >  	hw->started = 0;
> >  	memset(&link, 0, sizeof(link));
> >  	virtio_dev_atomic_write_link_status(dev, &link);
> > +	rte_spinlock_unlock(&hw->sl);
> >  }
> >
> >  static int
> > diff --git a/drivers/net/virtio/virtio_ethdev.h
> b/drivers/net/virtio/virtio_ethdev.h
> > index 2039bc5..24271cb 100644
> > --- a/drivers/net/virtio/virtio_ethdev.h
> > +++ b/drivers/net/virtio/virtio_ethdev.h
> > @@ -67,6 +67,7 @@
> >  	 1u << VIRTIO_NET_F_HOST_TSO6	  |	\
> >  	 1u << VIRTIO_NET_F_MRG_RXBUF	  |	\
> >  	 1u << VIRTIO_NET_F_MTU	| \
> > +	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE  | \
> >  	 1u << VIRTIO_RING_F_INDIRECT_DESC |    \
> >  	 1ULL << VIRTIO_F_VERSION_1       |	\
> >  	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
> > @@ -111,6 +112,9 @@ uint16_t virtio_recv_mergeable_pkts(void *rx_queue,
> struct rte_mbuf **rx_pkts,
> >  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> >  		uint16_t nb_pkts);
> >
> > +uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> > +		uint16_t nb_pkts);
> > +
> >  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  		uint16_t nb_pkts);
> >
> > diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> > index 3c5ce66..3cd367e 100644
> > --- a/drivers/net/virtio/virtio_pci.h
> > +++ b/drivers/net/virtio/virtio_pci.h
> > @@ -270,6 +270,7 @@ struct virtio_hw {
> >  	struct virtio_pci_common_cfg *common_cfg;
> >  	struct virtio_net_config *dev_cfg;
> >  	void	    *virtio_user_dev;
> > +	rte_spinlock_t sl;
> 
> Need to add some detailed comments to describe what's
> protected by this lock.

With this feature, the hw->started flag can be changed by two threads:
the app management thread and the interrupt handler thread.
This lock can prevent a case like this:

1. When LM is done, the config change interrupt is triggered and the handler pauses the device: hw->started = 0;
2. The app stops the virtio port: hw->started = 0;
3. A RARP packet is injected into the Tx queue, the ack is sent on the control queue, and the device is resumed: hw->started = 1; (but the port is stopped already)

> 
> >
> >  	struct virtqueue **vqs;
> >  };
> > diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> > index 6a24fde..7313bdd 100644
> > --- a/drivers/net/virtio/virtio_rxtx.c
> > +++ b/drivers/net/virtio/virtio_rxtx.c
> > @@ -1100,3 +1100,84 @@
> >
> >  	return nb_tx;
> >  }
> > +
> > +uint16_t
> > +virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t
> nb_pkts)
> > +{
> > +	struct virtnet_tx *txvq = tx_queue;
> > +	struct virtqueue *vq = txvq->vq;
> > +	struct virtio_hw *hw = vq->hw;
> > +	uint16_t hdr_size = hw->vtnet_hdr_size;
> > +	uint16_t nb_used, nb_tx = 0;
> > +
> > +	if (unlikely(nb_pkts < 1))
> > +		return nb_pkts;
> > +
> > +	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
> > +	nb_used = VIRTQUEUE_NUSED(vq);
> > +
> > +	virtio_rmb();
> > +	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
> > +		virtio_xmit_cleanup(vq, nb_used);
> > +
> > +	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> > +		struct rte_mbuf *txm = tx_pkts[nb_tx];
> > +		int can_push = 0, use_indirect = 0, slots, need;
> > +
> > +		/* optimize ring usage */
> > +		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
> > +					vtpci_with_feature(hw,
> VIRTIO_F_VERSION_1)) &&
> > +			rte_mbuf_refcnt_read(txm) == 1 &&
> > +			RTE_MBUF_DIRECT(txm) &&
> > +			txm->nb_segs == 1 &&
> > +			rte_pktmbuf_headroom(txm) >= hdr_size &&
> > +			rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > +				__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > +			can_push = 1;
> > +		else if (vtpci_with_feature(hw,
> VIRTIO_RING_F_INDIRECT_DESC) &&
> > +			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > +			use_indirect = 1;
> > +
> > +		/* How many main ring entries are needed to this Tx?
> > +		 * any_layout => number of segments
> > +		 * indirect   => 1
> > +		 * default    => number of segments + 1
> > +		 */
> > +		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
> > +		need = slots - vq->vq_free_cnt;
> > +
> > +		/* Positive value indicates it need free vring descriptors */
> > +		if (unlikely(need > 0)) {
> > +			nb_used = VIRTQUEUE_NUSED(vq);
> > +			virtio_rmb();
> > +			need = RTE_MIN(need, (int)nb_used);
> > +
> > +			virtio_xmit_cleanup(vq, need);
> > +			need = slots - vq->vq_free_cnt;
> > +			if (unlikely(need > 0)) {
> > +				PMD_TX_LOG(ERR,
> > +						"No free tx descriptors to
> transmit");
> > +				break;
> > +			}
> > +		}
> > +
> > +		/* Enqueue Packet buffers */
> > +		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect,
> can_push);
> > +
> > +		txvq->stats.bytes += txm->pkt_len;
> > +		virtio_update_packet_stats(&txvq->stats, txm);
> > +	}
> > +
> > +	txvq->stats.packets += nb_tx;
> > +
> > +	if (likely(nb_tx)) {
> > +		vq_update_avail_idx(vq);
> > +
> > +		if (unlikely(virtqueue_kick_prepare(vq))) {
> > +			virtqueue_notify(vq);
> > +			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
> > +		}
> > +	}
> > +
> > +	return nb_tx;
> > +}
> 
> What's the difference between virtio_inject_pkts() and
> virtio_xmit_pkts() except the latter will check hw->started?

No VLAN tag insertion.
Actually, they both use virtqueue_enqueue_xmit() to enqueue packets.

Thanks for your comments.
Xiao
  
Xiao Wang Nov. 30, 2017, 2:41 a.m. UTC | #4
> -----Original Message-----
> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> Sent: Monday, November 27, 2017 8:49 PM
> To: Wang, Xiao W <xiao.w.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
> 
> On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> > When live migration is done, for the backup VM, either the virtio
> > frontend or the vhost backend needs to send out gratuitous RARP packet
> > to announce its new network location.
> >
> > This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> live
> > migration scenario where the vhost backend doesn't have the ability to
> > generate RARP packet.
> 
> Yes, it's a feature good to have.
> 
> > +static int
> > +virtio_dev_pause(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	if (hw->started == 0)
> > +		return -1;
> > +	hw->started = 0;
> > +	/*
> > +	 * Prevent the worker thread from touching queues to avoid condition,
> > +	 * 1 ms should be enough for the ongoing Tx function to finish.
> > +	 */
> > +	rte_delay_ms(1);
> > +	return 0;
> > +}
> > +
> > +static void
> > +virtio_dev_resume(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	hw->started = 1;
> > +}
> 
> However, the implementation (stop first, pause for 1ms, duplicate another
> Tx function, resume) doesn't seem elegant.
> 
> You probably could try something like DPDK vhost does:
> 
> - set a flag when S_ANNOUNCE is received
> - inject a pkt when such a flag is set in the xmit function
> 
> You then should be able to get rid of all of the above stuff.
> 
> 	--yliu

The difference is that the virtio port may just receive packets, without transmitting.

Thanks for the comments,
Xiao
  
Yuanhan Liu Dec. 5, 2017, 2:26 p.m. UTC | #5
On Thu, Nov 30, 2017 at 02:41:12AM +0000, Wang, Xiao W wrote:
> 
> 
> > -----Original Message-----
> > From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> > Sent: Monday, November 27, 2017 8:49 PM
> > To: Wang, Xiao W <xiao.w.wang@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
> > 
> > On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> > > When live migration is done, for the backup VM, either the virtio
> > > frontend or the vhost backend needs to send out gratuitous RARP packet
> > > to announce its new network location.
> > >
> > > This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> > live
> > > migration scenario where the vhost backend doesn't have the ability to
> > > generate RARP packet.
> > 
> > Yes, it's a feature good to have.
> > 
> > > +static int
> > > +virtio_dev_pause(struct rte_eth_dev *dev)
> > > +{
> > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > +
> > > +	if (hw->started == 0)
> > > +		return -1;
> > > +	hw->started = 0;
> > > +	/*
> > > +	 * Prevent the worker thread from touching queues to avoid condition,
> > > +	 * 1 ms should be enough for the ongoing Tx function to finish.
> > > +	 */
> > > +	rte_delay_ms(1);
> > > +	return 0;
> > > +}
> > > +
> > > +static void
> > > +virtio_dev_resume(struct rte_eth_dev *dev)
> > > +{
> > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > +
> > > +	hw->started = 1;
> > > +}
> > 
> > However, the implementation (stop first, pause for 1ms, duplicate another
> > Tx function, resume) doesn't seem elegant.
> > 
> > You probably could try something like DPDK vhost does:
> > 
> > - set a flag when S_ANNOUNCE is received
> > - inject a pkt when such a flag is set in the xmit function
> > 
> > You then should be able to get rid of all of the above stuff.
> > 
> > 	--yliu
> 
> The difference is that the virtio port may just receive packets, without transmitting.

Thanks, I missed that.

However, you really should not add a duplicate function. It adds more
maintenance effort. I think you probably could just invoke the tx_pkt_burst
callback directly. You have stopped the device after all. Why is it
necessary to duplicate it?

	--yliu
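
Calling the installed burst callback directly would look roughly like
this (a sketch; note that virtio_xmit_pkts() returns early while
hw->started == 0, which is the wrinkle Xiao's v2 plan below works around):

	struct virtnet_tx *txvq = dev->data->tx_queues[0];

	/* Reuse the regular Tx path instead of a duplicated function. */
	dev->tx_pkt_burst(txvq, &rarp_mbuf, 1);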
  
Xiao Wang Jan. 3, 2018, 1:43 a.m. UTC | #6
> -----Original Message-----
> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> Sent: Tuesday, December 5, 2017 10:26 PM
> To: Wang, Xiao W <xiao.w.wang@intel.com>
> Cc: dev@dpdk.org; Bie, Tiwei <tiwei.bie@intel.com>
> Subject: Re: [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
> 
> On Thu, Nov 30, 2017 at 02:41:12AM +0000, Wang, Xiao W wrote:
> >
> >
> > > -----Original Message-----
> > > From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> > > Sent: Monday, November 27, 2017 8:49 PM
> > > To: Wang, Xiao W <xiao.w.wang@intel.com>
> > > Cc: dev@dpdk.org
> > > Subject: Re: [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
> > >
> > > On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> > > > When live migration is done, for the backup VM, either the virtio
> > > > frontend or the vhost backend needs to send out gratuitous RARP packet
> > > > to announce its new network location.
> > > >
> > > > This patch enables VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> > > live
> > > > migration scenario where the vhost backend doesn't have the ability to
> > > > generate RARP packet.
> > >
> > > Yes, it's a feature good to have.
> > >
> > > > +static int
> > > > +virtio_dev_pause(struct rte_eth_dev *dev)
> > > > +{
> > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > > +
> > > > +	if (hw->started == 0)
> > > > +		return -1;
> > > > +	hw->started = 0;
> > > > +	/*
> > > > +	 * Prevent the worker thread from touching queues to avoid condition,
> > > > +	 * 1 ms should be enough for the ongoing Tx function to finish.
> > > > +	 */
> > > > +	rte_delay_ms(1);
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static void
> > > > +virtio_dev_resume(struct rte_eth_dev *dev)
> > > > +{
> > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > > +
> > > > +	hw->started = 1;
> > > > +}
> > >
> > > However, the implementation (stop first, pause for 1ms, duplicate another
> > > Tx function, resume) doesn't seem elegant.
> > >
> > > You probably could try something like DPDK vhost does:
> > >
> > > - set a flag when S_ANNOUNCE is received
> > > - inject a pkt when such a flag is set in the xmit function
> > >
> > > You then should be able to get rid of all of the above stuff.
> > >
> > > 	--yliu
> >
> > The difference is that the virtio port may just receive packets, without transmitting.
> 
> Thanks, I missed that.
> 
> However, you really should not add a duplicate function. It adds more
> maintenance effort. I think you probably could just invoke the tx_pkt_burst
> callback directly. You have stopped the device after all. Why is it
> necessary to duplicate it?
> 
> 	--yliu

Just invoking the tx_pkt_burst callback would be ideal.
I will have a static mbuf ** pointer for RARP packets in the next version; the tx_pkt_burst function will then let the RARP packet pass even if the queue is stopped.
This avoids code duplication.

BRs,
Xiao
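
A sketch of what that plan might look like; the static rarp_mbuf slot
and its handling here are illustrative, not the final v2 code:

/* Hypothetical: the interrupt handler parks the RARP packet here. */
static struct rte_mbuf *rarp_mbuf;

uint16_t
virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct virtnet_tx *txvq = tx_queue;
	struct virtio_hw *hw = txvq->vq->hw;

	if (unlikely(hw->started == 0)) {
		if (rarp_mbuf == NULL)
			return 0;
		/* Let the pending RARP packet pass the stopped check. */
		tx_pkts = &rarp_mbuf;
		nb_pkts = 1;
	}

	/* ... regular transmit loop as in the existing function ... */
	return nb_pkts;
}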
  

Patch

diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 1959b11..6eaea0e 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -48,6 +48,8 @@ 
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
 #include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_arp.h>
 #include <rte_common.h>
 #include <rte_errno.h>
 #include <rte_cpuflags.h>
@@ -55,6 +57,7 @@ 
 #include <rte_memory.h>
 #include <rte_eal.h>
 #include <rte_dev.h>
+#include <rte_cycles.h>
 
 #include "virtio_ethdev.h"
 #include "virtio_pci.h"
@@ -106,6 +109,13 @@  static int virtio_dev_queue_stats_mapping_set(
 	uint8_t stat_idx,
 	uint8_t is_rx);
 
+static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
+		const struct ether_addr *mac);
+static int virtio_dev_pause(struct rte_eth_dev *dev);
+static void virtio_dev_resume(struct rte_eth_dev *dev);
+static void generate_rarp(struct rte_eth_dev *dev);
+static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
+
 /*
  * The set of PCI devices this driver supports
  */
@@ -1249,9 +1259,116 @@  static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
 	return 0;
 }
 
+#define RARP_PKT_SIZE	64
+
+static int
+make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
+{
+	struct ether_hdr *eth_hdr;
+	struct arp_hdr  *rarp;
+
+	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
+		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
+				rarp_mbuf->buf_len, RARP_PKT_SIZE);
+		return -1;
+	}
+
+	/* Ethernet header. */
+	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
+	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
+	ether_addr_copy(mac, &eth_hdr->s_addr);
+	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
+
+	/* RARP header. */
+	rarp = (struct arp_hdr *)(eth_hdr + 1);
+	rarp->arp_hrd = htons(ARP_HRD_ETHER);
+	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
+	rarp->arp_hln = ETHER_ADDR_LEN;
+	rarp->arp_pln = 4;
+	rarp->arp_op  = htons(ARP_OP_REVREQUEST);
+
+	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
+	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
+	memset(&rarp->arp_data.arp_sip, 0x00, 4);
+	memset(&rarp->arp_data.arp_tip, 0x00, 4);
+
+	rarp_mbuf->data_len = RARP_PKT_SIZE;
+	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
+
+	return 0;
+}
+
+static int
+virtio_dev_pause(struct rte_eth_dev *dev)
+{
+	struct virtio_hw *hw = dev->data->dev_private;
+
+	if (hw->started == 0)
+		return -1;
+	hw->started = 0;
+	/*
+	 * Prevent the worker thread from touching queues to avoid condition,
+	 * 1 ms should be enough for the ongoing Tx function to finish.
+	 */
+	rte_delay_ms(1);
+	return 0;
+}
+
+static void
+virtio_dev_resume(struct rte_eth_dev *dev)
+{
+	struct virtio_hw *hw = dev->data->dev_private;
+
+	hw->started = 1;
+}
+
+static void
+generate_rarp(struct rte_eth_dev *dev)
+{
+	struct virtio_hw *hw = dev->data->dev_private;
+	struct rte_mbuf *rarp_mbuf = NULL;
+	struct virtnet_tx *txvq = dev->data->tx_queues[0];
+	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
+
+	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
+	if (rarp_mbuf == NULL) {
+		PMD_DRV_LOG(ERR, "mbuf allocate failed");
+		return;
+	}
+
+	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr)) {
+		rte_pktmbuf_free(rarp_mbuf);
+		rarp_mbuf = NULL;
+		return;
+	}
+
+	/* If virtio port just stopped, no need to send RARP */
+	if (virtio_dev_pause(dev) < -1)
+		return;
+
+	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
+	/* Recover the stored hw status to let worker thread continue */
+	virtio_dev_resume(dev);
+}
+
+static void
+virtnet_ack_link_announce(struct rte_eth_dev *dev)
+{
+	struct virtio_hw *hw = dev->data->dev_private;
+	struct virtio_pmd_ctrl ctrl;
+	int len;
+
+	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
+	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
+	len = 0;
+
+	virtio_send_command(hw->cvq, &ctrl, &len, 0);
+}
+
 /*
- * Process Virtio Config changed interrupt and call the callback
- * if link state changed.
+ * Process virtio config changed interrupt. Call the callback
+ * if link state changed; generate gratuitous RARP packet if
+ * the status indicates an ANNOUNCE.
  */
 void
 virtio_interrupt_handler(void *param)
@@ -1274,6 +1391,12 @@  static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
 						      NULL, NULL);
 	}
 
+	if (isr & VIRTIO_NET_S_ANNOUNCE) {
+		rte_spinlock_lock(&hw->sl);
+		generate_rarp(dev);
+		virtnet_ack_link_announce(dev);
+		rte_spinlock_unlock(&hw->sl);
+	}
 }
 
 /* set rx and tx handlers according to what is supported */
@@ -1786,6 +1909,8 @@  static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
 			return -EBUSY;
 		}
 
+	rte_spinlock_init(&hw->sl);
+
 	hw->use_simple_rx = 1;
 	hw->use_simple_tx = 1;
 
@@ -1952,12 +2077,14 @@  static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
 
 	PMD_INIT_LOG(DEBUG, "stop");
 
+	rte_spinlock_lock(&hw->sl);
 	if (intr_conf->lsc || intr_conf->rxq)
 		virtio_intr_disable(dev);
 
 	hw->started = 0;
 	memset(&link, 0, sizeof(link));
 	virtio_dev_atomic_write_link_status(dev, &link);
+	rte_spinlock_unlock(&hw->sl);
 }
 
 static int
diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
index 2039bc5..24271cb 100644
--- a/drivers/net/virtio/virtio_ethdev.h
+++ b/drivers/net/virtio/virtio_ethdev.h
@@ -67,6 +67,7 @@ 
 	 1u << VIRTIO_NET_F_HOST_TSO6	  |	\
 	 1u << VIRTIO_NET_F_MRG_RXBUF	  |	\
 	 1u << VIRTIO_NET_F_MTU	| \
+	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE  | \
 	 1u << VIRTIO_RING_F_INDIRECT_DESC |    \
 	 1ULL << VIRTIO_F_VERSION_1       |	\
 	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
@@ -111,6 +112,9 @@  uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
 
+uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
 uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts);
 
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index 3c5ce66..3cd367e 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -270,6 +270,7 @@  struct virtio_hw {
 	struct virtio_pci_common_cfg *common_cfg;
 	struct virtio_net_config *dev_cfg;
 	void	    *virtio_user_dev;
+	rte_spinlock_t sl;
 
 	struct virtqueue **vqs;
 };
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 6a24fde..7313bdd 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -1100,3 +1100,84 @@ 
 
 	return nb_tx;
 }
+
+uint16_t
+virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct virtnet_tx *txvq = tx_queue;
+	struct virtqueue *vq = txvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint16_t nb_used, nb_tx = 0;
+
+	if (unlikely(nb_pkts < 1))
+		return nb_pkts;
+
+	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	virtio_rmb();
+	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
+		virtio_xmit_cleanup(vq, nb_used);
+
+	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
+		struct rte_mbuf *txm = tx_pkts[nb_tx];
+		int can_push = 0, use_indirect = 0, slots, need;
+
+		/* optimize ring usage */
+		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
+					vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
+			rte_mbuf_refcnt_read(txm) == 1 &&
+			RTE_MBUF_DIRECT(txm) &&
+			txm->nb_segs == 1 &&
+			rte_pktmbuf_headroom(txm) >= hdr_size &&
+			rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
+				__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
+			can_push = 1;
+		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
+			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
+			use_indirect = 1;
+
+		/* How many main ring entries are needed to this Tx?
+		 * any_layout => number of segments
+		 * indirect   => 1
+		 * default    => number of segments + 1
+		 */
+		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
+		need = slots - vq->vq_free_cnt;
+
+		/* Positive value indicates it need free vring descriptors */
+		if (unlikely(need > 0)) {
+			nb_used = VIRTQUEUE_NUSED(vq);
+			virtio_rmb();
+			need = RTE_MIN(need, (int)nb_used);
+
+			virtio_xmit_cleanup(vq, need);
+			need = slots - vq->vq_free_cnt;
+			if (unlikely(need > 0)) {
+				PMD_TX_LOG(ERR,
+						"No free tx descriptors to transmit");
+				break;
+			}
+		}
+
+		/* Enqueue Packet buffers */
+		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
+
+		txvq->stats.bytes += txm->pkt_len;
+		virtio_update_packet_stats(&txvq->stats, txm);
+	}
+
+	txvq->stats.packets += nb_tx;
+
+	if (likely(nb_tx)) {
+		vq_update_avail_idx(vq);
+
+		if (unlikely(virtqueue_kick_prepare(vq))) {
+			virtqueue_notify(vq);
+			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
+		}
+	}
+
+	return nb_tx;
+}
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 2305d91..ed420e9 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -158,6 +158,17 @@  struct virtio_net_ctrl_mac {
 #define VIRTIO_NET_CTRL_VLAN_ADD 0
 #define VIRTIO_NET_CTRL_VLAN_DEL 1
 
+/*
+ * Control link announce acknowledgement
+ *
+ * The command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that
+ * driver has recevied the notification; device would clear the
+ * VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives
+ * this command.
+ */
+#define VIRTIO_NET_CTRL_ANNOUNCE       3
+#define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
+
 struct virtio_net_ctrl_hdr {
 	uint8_t class;
 	uint8_t cmd;