[dpdk-dev,v3] virtio: Support mergeable buffer in virtio pmd

Message ID 1408006475-17606-1-git-send-email-changchun.ouyang@intel.com (mailing list archive)
State Accepted, archived
Headers

Commit Message

Ouyang Changchun Aug. 14, 2014, 8:54 a.m. UTC
v3 change:
- Investigate the comments from Huawei and fix one potential issue of wrong offset to
  the number of descriptor in buffer; also fix other tiny comments.

v2 change:
- Resolve conflicts with the tip code;
- And resolve 2 issues:
   -- fix mbuf leak when discard an uncompleted packet.
   -- refine pkt.data to point to actual payload data start point.
 
v1 change:
- This patch supports mergeable buffer feature in DPDK based virtio PMD, which can
  receive jumbo frame with larger size, like 3K, 4K or even 9K.

Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
Acked-by: Huawei Xie <huawei.xie@intel.com>
---
 lib/librte_pmd_virtio/virtio_ethdev.c |  20 +--
 lib/librte_pmd_virtio/virtio_ethdev.h |   3 +
 lib/librte_pmd_virtio/virtio_rxtx.c   | 221 +++++++++++++++++++++++++++++-----
 3 files changed, 207 insertions(+), 37 deletions(-)
  

Comments

Ouyang Changchun Aug. 21, 2014, 1:23 a.m. UTC | #1
Hi all,

Any comments for this patch?
And what's the status for merging it into mainline?

Thanks in advance
Changchun

> -----Original Message-----
> From: Ouyang, Changchun
> Sent: Thursday, August 14, 2014 4:55 PM
> To: dev@dpdk.org
> Cc: Cao, Waterman; Ouyang, Changchun
> Subject: [PATCH v3] virtio: Support mergeable buffer in virtio pmd
> 
> v3 change:
> - Investigate the comments from Huawei and fix one potential issue of
> wrong offset to
>   the number of descriptor in buffer; also fix other tiny comments.
> 
> v2 change:
> - Resolve conflicts with the tip code;
> - And resolve 2 issues:
>    -- fix mbuf leak when discard an uncompleted packet.
>    -- refine pkt.data to point to actual payload data start point.
> 
> v1 change:
> - This patch supports mergeable buffer feature in DPDK based virtio PMD,
> which can
>   receive jumbo frame with larger size, like 3K, 4K or even 9K.
> 
> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
> Acked-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  lib/librte_pmd_virtio/virtio_ethdev.c |  20 +--
>  lib/librte_pmd_virtio/virtio_ethdev.h |   3 +
>  lib/librte_pmd_virtio/virtio_rxtx.c   | 221
> +++++++++++++++++++++++++++++-----
>  3 files changed, 207 insertions(+), 37 deletions(-)
> 
> diff --git a/lib/librte_pmd_virtio/virtio_ethdev.c
> b/lib/librte_pmd_virtio/virtio_ethdev.c
> index b9f5529..535d798 100644
> --- a/lib/librte_pmd_virtio/virtio_ethdev.c
> +++ b/lib/librte_pmd_virtio/virtio_ethdev.c
> @@ -337,7 +337,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
>  		snprintf(vq_name, sizeof(vq_name),
> "port%d_tvq%d_hdrzone",
>  			dev->data->port_id, queue_idx);
>  		vq->virtio_net_hdr_mz =
> rte_memzone_reserve_aligned(vq_name,
> -			vq_size * sizeof(struct virtio_net_hdr),
> +			vq_size * hw->vtnet_hdr_size,
>  			socket_id, 0, CACHE_LINE_SIZE);
>  		if (vq->virtio_net_hdr_mz == NULL) {
>  			rte_free(vq);
> @@ -346,7 +346,7 @@ int virtio_dev_queue_setup(struct rte_eth_dev *dev,
>  		vq->virtio_net_hdr_mem =
>  			vq->virtio_net_hdr_mz->phys_addr;
>  		memset(vq->virtio_net_hdr_mz->addr, 0,
> -			vq_size * sizeof(struct virtio_net_hdr));
> +			vq_size * hw->vtnet_hdr_size);
>  	} else if (queue_type == VTNET_CQ) {
>  		/* Allocate a page for control vq command, data and status
> */
>  		snprintf(vq_name, sizeof(vq_name), "port%d_cvq_hdrzone",
> @@ -571,9 +571,6 @@ virtio_negotiate_features(struct virtio_hw *hw)
>  	mask |= VIRTIO_NET_F_GUEST_TSO4 | VIRTIO_NET_F_GUEST_TSO6
> | VIRTIO_NET_F_GUEST_ECN;
>  	mask |= VTNET_LRO_FEATURES;
> 
> -	/* rx_mbuf should not be in multiple merged segments */
> -	mask |= VIRTIO_NET_F_MRG_RXBUF;
> -
>  	/* not negotiating INDIRECT descriptor table support */
>  	mask |= VIRTIO_RING_F_INDIRECT_DESC;
> 
> @@ -746,7 +743,6 @@ eth_virtio_dev_init(__rte_unused struct eth_driver
> *eth_drv,
>  	}
> 
>  	eth_dev->dev_ops = &virtio_eth_dev_ops;
> -	eth_dev->rx_pkt_burst = &virtio_recv_pkts;
>  	eth_dev->tx_pkt_burst = &virtio_xmit_pkts;
> 
>  	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
> @@ -801,10 +797,13 @@ eth_virtio_dev_init(__rte_unused struct
> eth_driver *eth_drv,
>  	virtio_negotiate_features(hw);
> 
>  	/* Setting up rx_header size for the device */
> -	if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF))
> +	if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> +		eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;
>  		hw->vtnet_hdr_size = sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> -	else
> +	} else {
> +		eth_dev->rx_pkt_burst = &virtio_recv_pkts;
>  		hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
> +	}
> 
>  	/* Allocate memory for storing MAC addresses */
>  	eth_dev->data->mac_addrs = rte_zmalloc("virtio",
> ETHER_ADDR_LEN, 0);
> @@ -1009,7 +1008,7 @@ static void virtio_dev_free_mbufs(struct
> rte_eth_dev *dev)
> 
>  		while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
>  					dev->data->rx_queues[i])) != NULL) {
> -			rte_pktmbuf_free_seg(buf);
> +			rte_pktmbuf_free(buf);
>  			mbuf_num++;
>  		}
> 
> @@ -1028,7 +1027,8 @@ static void virtio_dev_free_mbufs(struct
> rte_eth_dev *dev)
>  		mbuf_num = 0;
>  		while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
>  					dev->data->tx_queues[i])) != NULL) {
> -			rte_pktmbuf_free_seg(buf);
> +			rte_pktmbuf_free(buf);
> +
>  			mbuf_num++;
>  		}
> 
> diff --git a/lib/librte_pmd_virtio/virtio_ethdev.h
> b/lib/librte_pmd_virtio/virtio_ethdev.h
> index 858e644..d2e1eed 100644
> --- a/lib/librte_pmd_virtio/virtio_ethdev.h
> +++ b/lib/librte_pmd_virtio/virtio_ethdev.h
> @@ -104,6 +104,9 @@ int  virtio_dev_tx_queue_setup(struct rte_eth_dev
> *dev, uint16_t tx_queue_id,
>  uint16_t virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		uint16_t nb_pkts);
> 
> +uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts,
> +		uint16_t nb_pkts);
> +
>  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		uint16_t nb_pkts);
> 
> diff --git a/lib/librte_pmd_virtio/virtio_rxtx.c
> b/lib/librte_pmd_virtio/virtio_rxtx.c
> index fcd8bd1..0b10108 100644
> --- a/lib/librte_pmd_virtio/virtio_rxtx.c
> +++ b/lib/librte_pmd_virtio/virtio_rxtx.c
> @@ -146,6 +146,7 @@ static inline int
>  virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf
> *cookie)
>  {
>  	struct vq_desc_extra *dxp;
> +	struct virtio_hw *hw = vq->hw;
>  	struct vring_desc *start_dp;
>  	uint16_t needed = 1;
>  	uint16_t head_idx, idx;
> @@ -165,9 +166,11 @@ virtqueue_enqueue_recv_refill(struct virtqueue *vq,
> struct rte_mbuf *cookie)
>  	dxp->ndescs = needed;
> 
>  	start_dp = vq->vq_ring.desc;
> -	start_dp[idx].addr  =
> -		(uint64_t) (cookie->buf_physaddr +
> RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
> -	start_dp[idx].len   = cookie->buf_len - RTE_PKTMBUF_HEADROOM +
> sizeof(struct virtio_net_hdr);
> +	start_dp[idx].addr =
> +		(uint64_t)(cookie->buf_physaddr +
> RTE_PKTMBUF_HEADROOM
> +		- hw->vtnet_hdr_size);
> +	start_dp[idx].len =
> +		cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw-
> >vtnet_hdr_size;
>  	start_dp[idx].flags =  VRING_DESC_F_WRITE;
>  	idx = start_dp[idx].next;
>  	vq->vq_desc_head_idx = idx;
> @@ -184,8 +187,10 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq,
> struct rte_mbuf *cookie)
>  {
>  	struct vq_desc_extra *dxp;
>  	struct vring_desc *start_dp;
> -	uint16_t needed = 2;
> +	uint16_t seg_num = cookie->pkt.nb_segs;
> +	uint16_t needed = 1 + seg_num;
>  	uint16_t head_idx, idx;
> +	uint16_t head_size = txvq->hw->vtnet_hdr_size;
> 
>  	if (unlikely(txvq->vq_free_cnt == 0))
>  		return -ENOSPC;
> @@ -198,19 +203,25 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq,
> struct rte_mbuf *cookie)
>  	idx = head_idx;
>  	dxp = &txvq->vq_descx[idx];
>  	if (dxp->cookie != NULL)
> -		rte_pktmbuf_free_seg(dxp->cookie);
> +		rte_pktmbuf_free(dxp->cookie);
>  	dxp->cookie = (void *)cookie;
>  	dxp->ndescs = needed;
> 
>  	start_dp = txvq->vq_ring.desc;
> -	start_dp[idx].addr  =
> -		txvq->virtio_net_hdr_mem + idx * sizeof(struct
> virtio_net_hdr);
> -	start_dp[idx].len   = sizeof(struct virtio_net_hdr);
> +	start_dp[idx].addr =
> +		txvq->virtio_net_hdr_mem + idx * head_size;
> +	start_dp[idx].len = (uint32_t)head_size;
>  	start_dp[idx].flags = VRING_DESC_F_NEXT;
> -	idx = start_dp[idx].next;
> -	start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
> -	start_dp[idx].len   = cookie->pkt.data_len;
> -	start_dp[idx].flags = 0;
> +
> +	for (; ((seg_num > 0) && (cookie != NULL)); seg_num--) {
> +		idx = start_dp[idx].next;
> +		start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
> +		start_dp[idx].len   = cookie->pkt.data_len;
> +		start_dp[idx].flags = VRING_DESC_F_NEXT;
> +		cookie = cookie->pkt.next;
> +	}
> +
> +	start_dp[idx].flags &= ~VRING_DESC_F_NEXT;
>  	idx = start_dp[idx].next;
>  	txvq->vq_desc_head_idx = idx;
>  	if (txvq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
> @@ -284,7 +295,7 @@ virtio_dev_vring_start(struct virtqueue *vq, int
> queue_type)
>  			error = virtqueue_enqueue_recv_refill(vq, m);
> 
>  			if (error) {
> -				rte_pktmbuf_free_seg(m);
> +				rte_pktmbuf_free(m);
>  				break;
>  			}
>  			nbufs++;
> @@ -423,7 +434,7 @@ virtio_discard_rxbuf(struct virtqueue *vq, struct
> rte_mbuf *m)
>  	error = virtqueue_enqueue_recv_refill(vq, m);
>  	if (unlikely(error)) {
>  		RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
> -		rte_pktmbuf_free_seg(m);
> +		rte_pktmbuf_free(m);
>  	}
>  }
> 
> @@ -433,13 +444,13 @@ uint16_t
>  virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t
> nb_pkts)
>  {
>  	struct virtqueue *rxvq = rx_queue;
> -	struct virtio_hw *hw = rxvq->hw;
>  	struct rte_mbuf *rxm, *new_mbuf;
>  	uint16_t nb_used, num, nb_rx = 0;
>  	uint32_t len[VIRTIO_MBUF_BURST_SZ];
>  	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
>  	int error;
>  	uint32_t i, nb_enqueued = 0;
> +	const uint32_t hdr_size = sizeof(struct virtio_net_hdr);
> 
>  	nb_used = VIRTQUEUE_NUSED(rxvq);
> 
> @@ -460,8 +471,7 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> 
>  		PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
> 
> -		if (unlikely(len[i]
> -			     < (uint32_t)hw->vtnet_hdr_size +
> ETHER_HDR_LEN)) {
> +		if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
>  			PMD_RX_LOG(ERR, "Packet drop");
>  			nb_enqueued++;
>  			virtio_discard_rxbuf(rxvq, rxm);
> @@ -471,17 +481,16 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> 
>  		rxm->pkt.in_port = rxvq->port_id;
>  		rxm->pkt.data = (char *)rxm->buf_addr +
> RTE_PKTMBUF_HEADROOM;
> +
>  		rxm->pkt.nb_segs = 1;
>  		rxm->pkt.next = NULL;
> -		rxm->pkt.pkt_len  = (uint32_t)(len[i]
> -					       - sizeof(struct virtio_net_hdr));
> -		rxm->pkt.data_len = (uint16_t)(len[i]
> -					       - sizeof(struct virtio_net_hdr));
> +		rxm->pkt.pkt_len = (uint32_t)(len[i] - hdr_size);
> +		rxm->pkt.data_len = (uint16_t)(len[i] - hdr_size);
> 
>  		VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
> 
>  		rx_pkts[nb_rx++] = rxm;
> -		rxvq->bytes += len[i] - sizeof(struct virtio_net_hdr);
> +		rxvq->bytes += rx_pkts[nb_rx - 1]->pkt.pkt_len;
>  	}
> 
>  	rxvq->packets += nb_rx;
> @@ -498,11 +507,165 @@ virtio_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
>  		}
>  		error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
>  		if (unlikely(error)) {
> -			rte_pktmbuf_free_seg(new_mbuf);
> +			rte_pktmbuf_free(new_mbuf);
>  			break;
>  		}
>  		nb_enqueued++;
>  	}
> +
> +	if (likely(nb_enqueued)) {
> +		if (unlikely(virtqueue_kick_prepare(rxvq))) {
> +			virtqueue_notify(rxvq);
> +			PMD_RX_LOG(DEBUG, "Notified\n");
> +		}
> +	}
> +
> +	vq_update_avail_idx(rxvq);
> +
> +	return nb_rx;
> +}
> +
> +uint16_t
> +virtio_recv_mergeable_pkts(void *rx_queue,
> +			struct rte_mbuf **rx_pkts,
> +			uint16_t nb_pkts)
> +{
> +	struct virtqueue *rxvq = rx_queue;
> +	struct rte_mbuf *rxm, *new_mbuf;
> +	uint16_t nb_used, num, nb_rx = 0;
> +	uint32_t len[VIRTIO_MBUF_BURST_SZ];
> +	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
> +	struct rte_mbuf *prev;
> +	int error;
> +	uint32_t i = 0, nb_enqueued = 0;
> +	uint32_t seg_num = 0;
> +	uint16_t extra_idx = 0;
> +	uint32_t seg_res = 0;
> +	const uint32_t hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +
> +	nb_used = VIRTQUEUE_NUSED(rxvq);
> +
> +	rmb();
> +
> +	if (nb_used == 0)
> +		return 0;
> +
> +	PMD_RX_LOG(DEBUG, "used:%d\n", nb_used);
> +
> +	while (i < nb_used) {
> +		struct virtio_net_hdr_mrg_rxbuf *header;
> +
> +		if (nb_rx == nb_pkts)
> +			break;
> +
> +		num = virtqueue_dequeue_burst_rx(rxvq, rcv_pkts, len, 1);
> +		if (num != 1)
> +			continue;
> +
> +		i++;
> +
> +		PMD_RX_LOG(DEBUG, "dequeue:%d\n", num);
> +		PMD_RX_LOG(DEBUG, "packet len:%d\n", len[0]);
> +
> +		rxm = rcv_pkts[0];
> +
> +		if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
> +			PMD_RX_LOG(ERR, "Packet drop\n");
> +			nb_enqueued++;
> +			virtio_discard_rxbuf(rxvq, rxm);
> +			rxvq->errors++;
> +			continue;
> +		}
> +
> +		header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm-
> >buf_addr +
> +			RTE_PKTMBUF_HEADROOM - hdr_size);
> +		seg_num = header->num_buffers;
> +
> +		if (seg_num == 0)
> +			seg_num = 1;
> +
> +		rxm->pkt.data = (char *)rxm->buf_addr +
> RTE_PKTMBUF_HEADROOM;
> +		rxm->pkt.nb_segs = seg_num;
> +		rxm->pkt.next = NULL;
> +		rxm->pkt.pkt_len = (uint32_t)(len[0] - hdr_size);
> +		rxm->pkt.data_len = (uint16_t)(len[0] - hdr_size);
> +
> +		rxm->pkt.in_port = rxvq->port_id;
> +		rx_pkts[nb_rx] = rxm;
> +		prev = rxm;
> +
> +		seg_res = seg_num - 1;
> +
> +		while (seg_res != 0) {
> +			/*
> +			 * Get extra segments for current uncompleted
> packet.
> +			 */
> +			uint32_t  rcv_cnt =
> +				RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
> +			if (likely(VIRTQUEUE_NUSED(rxvq) >= rcv_cnt)) {
> +				uint32_t rx_num =
> +					virtqueue_dequeue_burst_rx(rxvq,
> +					rcv_pkts, len, rcv_cnt);
> +				i += rx_num;
> +				rcv_cnt = rx_num;
> +			} else {
> +				PMD_RX_LOG(ERR,
> +					"No enough segments for
> packet.\n");
> +				nb_enqueued++;
> +				virtio_discard_rxbuf(rxvq, rxm);
> +				rxvq->errors++;
> +				break;
> +			}
> +
> +			extra_idx = 0;
> +
> +			while (extra_idx < rcv_cnt) {
> +				rxm = rcv_pkts[extra_idx];
> +
> +				rxm->pkt.data =
> +					(char *)rxm->buf_addr +
> +					RTE_PKTMBUF_HEADROOM -
> hdr_size;
> +				rxm->pkt.next = NULL;
> +				rxm->pkt.pkt_len =
> (uint32_t)(len[extra_idx]);
> +				rxm->pkt.data_len =
> (uint16_t)(len[extra_idx]);
> +
> +				if (prev)
> +					prev->pkt.next = rxm;
> +
> +				prev = rxm;
> +				rx_pkts[nb_rx]->pkt.pkt_len += rxm-
> >pkt.pkt_len;
> +				extra_idx++;
> +			};
> +			seg_res -= rcv_cnt;
> +		}
> +
> +		VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
> +			rx_pkts[nb_rx]->pkt.data_len);
> +
> +		rxvq->bytes += rx_pkts[nb_rx]->pkt.pkt_len;
> +		nb_rx++;
> +	}
> +
> +	rxvq->packets += nb_rx;
> +
> +	/* Allocate new mbuf for the used descriptor */
> +	error = ENOSPC;
> +	while (likely(!virtqueue_full(rxvq))) {
> +		new_mbuf = rte_rxmbuf_alloc(rxvq->mpool);
> +		if (unlikely(new_mbuf == NULL)) {
> +			struct rte_eth_dev *dev
> +				= &rte_eth_devices[rxvq->port_id];
> +			dev->data->rx_mbuf_alloc_failed++;
> +			break;
> +		}
> +		error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
> +		if (unlikely(error)) {
> +			rte_pktmbuf_free(new_mbuf);
> +			break;
> +		}
> +		nb_enqueued++;
> +	}
> +
>  	if (likely(nb_enqueued)) {
>  		if (unlikely(virtqueue_kick_prepare(rxvq))) {
>  			virtqueue_notify(rxvq);
> @@ -536,12 +699,16 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>  	num = (uint16_t)(likely(nb_used < VIRTIO_MBUF_BURST_SZ) ?
> nb_used : VIRTIO_MBUF_BURST_SZ);
> 
>  	while (nb_tx < nb_pkts) {
> -		if (virtqueue_full(txvq) && num) {
> +		int need = tx_pkts[nb_tx]->pkt.nb_segs - txvq->vq_free_cnt;
> +		int deq_cnt = RTE_MIN(need, (int)num);
> +
> +		num -= (deq_cnt > 0) ? deq_cnt : 0;
> +		while (deq_cnt > 0) {
>  			virtqueue_dequeue_pkt_tx(txvq);
> -			num--;
> +			deq_cnt--;
>  		}
> 
> -		if (!virtqueue_full(txvq)) {
> +		if (tx_pkts[nb_tx]->pkt.nb_segs <= txvq->vq_free_cnt) {
>  			txm = tx_pkts[nb_tx];
>  			/* Enqueue Packet buffers */
>  			error = virtqueue_enqueue_xmit(txvq, txm);
> @@ -555,7 +722,7 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>  				break;
>  			}
>  			nb_tx++;
> -			txvq->bytes += txm->pkt.data_len;
> +			txvq->bytes += txm->pkt.pkt_len;
>  		} else {
>  			PMD_TX_LOG(ERR, "No free tx descriptors to
> transmit");
>  			break;
> --
> 1.8.4.2
  
Thomas Monjalon Aug. 25, 2014, 3:16 p.m. UTC | #2
2014-08-14 16:54, Ouyang Changchun:
> v3 change:
> - Investigate the comments from Huawei and fix one potential issue of wrong offset to
>   the number of descriptor in buffer; also fix other tiny comments.
> 
> v2 change:
> - Resolve conflicts with the tip code;
> - And resolve 2 issues:
>    -- fix mbuf leak when discard an uncompleted packet.
>    -- refine pkt.data to point to actual payload data start point.
>  
> v1 change:
> - This patch supports mergeable buffer feature in DPDK based virtio PMD, which can
>   receive jumbo frame with larger size, like 3K, 4K or even 9K.
> 
> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
> Acked-by: Huawei Xie <huawei.xie@intel.com>

Applied for version 1.7.1.

Thanks
  

Patch

diff --git a/lib/librte_pmd_virtio/virtio_ethdev.c b/lib/librte_pmd_virtio/virtio_ethdev.c
index b9f5529..535d798 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.c
+++ b/lib/librte_pmd_virtio/virtio_ethdev.c
@@ -337,7 +337,7 @@  int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 		snprintf(vq_name, sizeof(vq_name), "port%d_tvq%d_hdrzone",
 			dev->data->port_id, queue_idx);
 		vq->virtio_net_hdr_mz = rte_memzone_reserve_aligned(vq_name,
-			vq_size * sizeof(struct virtio_net_hdr),
+			vq_size * hw->vtnet_hdr_size,
 			socket_id, 0, CACHE_LINE_SIZE);
 		if (vq->virtio_net_hdr_mz == NULL) {
 			rte_free(vq);
@@ -346,7 +346,7 @@  int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 		vq->virtio_net_hdr_mem =
 			vq->virtio_net_hdr_mz->phys_addr;
 		memset(vq->virtio_net_hdr_mz->addr, 0,
-			vq_size * sizeof(struct virtio_net_hdr));
+			vq_size * hw->vtnet_hdr_size);
 	} else if (queue_type == VTNET_CQ) {
 		/* Allocate a page for control vq command, data and status */
 		snprintf(vq_name, sizeof(vq_name), "port%d_cvq_hdrzone",
@@ -571,9 +571,6 @@  virtio_negotiate_features(struct virtio_hw *hw)
 	mask |= VIRTIO_NET_F_GUEST_TSO4 | VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN;
 	mask |= VTNET_LRO_FEATURES;
 
-	/* rx_mbuf should not be in multiple merged segments */
-	mask |= VIRTIO_NET_F_MRG_RXBUF;
-
 	/* not negotiating INDIRECT descriptor table support */
 	mask |= VIRTIO_RING_F_INDIRECT_DESC;
 
@@ -746,7 +743,6 @@  eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
 	}
 
 	eth_dev->dev_ops = &virtio_eth_dev_ops;
-	eth_dev->rx_pkt_burst = &virtio_recv_pkts;
 	eth_dev->tx_pkt_burst = &virtio_xmit_pkts;
 
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
@@ -801,10 +797,13 @@  eth_virtio_dev_init(__rte_unused struct eth_driver *eth_drv,
 	virtio_negotiate_features(hw);
 
 	/* Setting up rx_header size for the device */
-	if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF))
+	if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+		eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;
 		hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-	else
+	} else {
+		eth_dev->rx_pkt_burst = &virtio_recv_pkts;
 		hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
+	}
 
 	/* Allocate memory for storing MAC addresses */
 	eth_dev->data->mac_addrs = rte_zmalloc("virtio", ETHER_ADDR_LEN, 0);
@@ -1009,7 +1008,7 @@  static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
 
 		while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
 					dev->data->rx_queues[i])) != NULL) {
-			rte_pktmbuf_free_seg(buf);
+			rte_pktmbuf_free(buf);
 			mbuf_num++;
 		}
 
@@ -1028,7 +1027,8 @@  static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
 		mbuf_num = 0;
 		while ((buf = (struct rte_mbuf *)virtqueue_detatch_unused(
 					dev->data->tx_queues[i])) != NULL) {
-			rte_pktmbuf_free_seg(buf);
+			rte_pktmbuf_free(buf);
+
 			mbuf_num++;
 		}
 
diff --git a/lib/librte_pmd_virtio/virtio_ethdev.h b/lib/librte_pmd_virtio/virtio_ethdev.h
index 858e644..d2e1eed 100644
--- a/lib/librte_pmd_virtio/virtio_ethdev.h
+++ b/lib/librte_pmd_virtio/virtio_ethdev.h
@@ -104,6 +104,9 @@  int  virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
 uint16_t virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts);
 
+uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
 
diff --git a/lib/librte_pmd_virtio/virtio_rxtx.c b/lib/librte_pmd_virtio/virtio_rxtx.c
index fcd8bd1..0b10108 100644
--- a/lib/librte_pmd_virtio/virtio_rxtx.c
+++ b/lib/librte_pmd_virtio/virtio_rxtx.c
@@ -146,6 +146,7 @@  static inline int
 virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
 {
 	struct vq_desc_extra *dxp;
+	struct virtio_hw *hw = vq->hw;
 	struct vring_desc *start_dp;
 	uint16_t needed = 1;
 	uint16_t head_idx, idx;
@@ -165,9 +166,11 @@  virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
 	dxp->ndescs = needed;
 
 	start_dp = vq->vq_ring.desc;
-	start_dp[idx].addr  =
-		(uint64_t) (cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
-	start_dp[idx].len   = cookie->buf_len - RTE_PKTMBUF_HEADROOM + sizeof(struct virtio_net_hdr);
+	start_dp[idx].addr =
+		(uint64_t)(cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM
+		- hw->vtnet_hdr_size);
+	start_dp[idx].len =
+		cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
 	start_dp[idx].flags =  VRING_DESC_F_WRITE;
 	idx = start_dp[idx].next;
 	vq->vq_desc_head_idx = idx;
@@ -184,8 +187,10 @@  virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
 {
 	struct vq_desc_extra *dxp;
 	struct vring_desc *start_dp;
-	uint16_t needed = 2;
+	uint16_t seg_num = cookie->pkt.nb_segs;
+	uint16_t needed = 1 + seg_num;
 	uint16_t head_idx, idx;
+	uint16_t head_size = txvq->hw->vtnet_hdr_size;
 
 	if (unlikely(txvq->vq_free_cnt == 0))
 		return -ENOSPC;
@@ -198,19 +203,25 @@  virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
 	idx = head_idx;
 	dxp = &txvq->vq_descx[idx];
 	if (dxp->cookie != NULL)
-		rte_pktmbuf_free_seg(dxp->cookie);
+		rte_pktmbuf_free(dxp->cookie);
 	dxp->cookie = (void *)cookie;
 	dxp->ndescs = needed;
 
 	start_dp = txvq->vq_ring.desc;
-	start_dp[idx].addr  =
-		txvq->virtio_net_hdr_mem + idx * sizeof(struct virtio_net_hdr);
-	start_dp[idx].len   = sizeof(struct virtio_net_hdr);
+	start_dp[idx].addr =
+		txvq->virtio_net_hdr_mem + idx * head_size;
+	start_dp[idx].len = (uint32_t)head_size;
 	start_dp[idx].flags = VRING_DESC_F_NEXT;
-	idx = start_dp[idx].next;
-	start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
-	start_dp[idx].len   = cookie->pkt.data_len;
-	start_dp[idx].flags = 0;
+
+	for (; ((seg_num > 0) && (cookie != NULL)); seg_num--) {
+		idx = start_dp[idx].next;
+		start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
+		start_dp[idx].len   = cookie->pkt.data_len;
+		start_dp[idx].flags = VRING_DESC_F_NEXT;
+		cookie = cookie->pkt.next;
+	}
+
+	start_dp[idx].flags &= ~VRING_DESC_F_NEXT;
 	idx = start_dp[idx].next;
 	txvq->vq_desc_head_idx = idx;
 	if (txvq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
@@ -284,7 +295,7 @@  virtio_dev_vring_start(struct virtqueue *vq, int queue_type)
 			error = virtqueue_enqueue_recv_refill(vq, m);
 
 			if (error) {
-				rte_pktmbuf_free_seg(m);
+				rte_pktmbuf_free(m);
 				break;
 			}
 			nbufs++;
@@ -423,7 +434,7 @@  virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
 	error = virtqueue_enqueue_recv_refill(vq, m);
 	if (unlikely(error)) {
 		RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
-		rte_pktmbuf_free_seg(m);
+		rte_pktmbuf_free(m);
 	}
 }
 
@@ -433,13 +444,13 @@  uint16_t
 virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	struct virtqueue *rxvq = rx_queue;
-	struct virtio_hw *hw = rxvq->hw;
 	struct rte_mbuf *rxm, *new_mbuf;
 	uint16_t nb_used, num, nb_rx = 0;
 	uint32_t len[VIRTIO_MBUF_BURST_SZ];
 	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
 	int error;
 	uint32_t i, nb_enqueued = 0;
+	const uint32_t hdr_size = sizeof(struct virtio_net_hdr);
 
 	nb_used = VIRTQUEUE_NUSED(rxvq);
 
@@ -460,8 +471,7 @@  virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
 
-		if (unlikely(len[i]
-			     < (uint32_t)hw->vtnet_hdr_size + ETHER_HDR_LEN)) {
+		if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
 			PMD_RX_LOG(ERR, "Packet drop");
 			nb_enqueued++;
 			virtio_discard_rxbuf(rxvq, rxm);
@@ -471,17 +481,16 @@  virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		rxm->pkt.in_port = rxvq->port_id;
 		rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+
 		rxm->pkt.nb_segs = 1;
 		rxm->pkt.next = NULL;
-		rxm->pkt.pkt_len  = (uint32_t)(len[i]
-					       - sizeof(struct virtio_net_hdr));
-		rxm->pkt.data_len = (uint16_t)(len[i]
-					       - sizeof(struct virtio_net_hdr));
+		rxm->pkt.pkt_len = (uint32_t)(len[i] - hdr_size);
+		rxm->pkt.data_len = (uint16_t)(len[i] - hdr_size);
 
 		VIRTIO_DUMP_PACKET(rxm, rxm->pkt.data_len);
 
 		rx_pkts[nb_rx++] = rxm;
-		rxvq->bytes += len[i] - sizeof(struct virtio_net_hdr);
+		rxvq->bytes += rx_pkts[nb_rx - 1]->pkt.pkt_len;
 	}
 
 	rxvq->packets += nb_rx;
@@ -498,11 +507,165 @@  virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		}
 		error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
 		if (unlikely(error)) {
-			rte_pktmbuf_free_seg(new_mbuf);
+			rte_pktmbuf_free(new_mbuf);
 			break;
 		}
 		nb_enqueued++;
 	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare(rxvq))) {
+			virtqueue_notify(rxvq);
+			PMD_RX_LOG(DEBUG, "Notified\n");
+		}
+	}
+
+	vq_update_avail_idx(rxvq);
+
+	return nb_rx;
+}
+
+uint16_t
+virtio_recv_mergeable_pkts(void *rx_queue,
+			struct rte_mbuf **rx_pkts,
+			uint16_t nb_pkts)
+{
+	struct virtqueue *rxvq = rx_queue;
+	struct rte_mbuf *rxm, *new_mbuf;
+	uint16_t nb_used, num, nb_rx = 0;
+	uint32_t len[VIRTIO_MBUF_BURST_SZ];
+	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
+	struct rte_mbuf *prev;
+	int error;
+	uint32_t i = 0, nb_enqueued = 0;
+	uint32_t seg_num = 0;
+	uint16_t extra_idx = 0;
+	uint32_t seg_res = 0;
+	const uint32_t hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+	nb_used = VIRTQUEUE_NUSED(rxvq);
+
+	rmb();
+
+	if (nb_used == 0)
+		return 0;
+
+	PMD_RX_LOG(DEBUG, "used:%d\n", nb_used);
+
+	while (i < nb_used) {
+		struct virtio_net_hdr_mrg_rxbuf *header;
+
+		if (nb_rx == nb_pkts)
+			break;
+
+		num = virtqueue_dequeue_burst_rx(rxvq, rcv_pkts, len, 1);
+		if (num != 1)
+			continue;
+
+		i++;
+
+		PMD_RX_LOG(DEBUG, "dequeue:%d\n", num);
+		PMD_RX_LOG(DEBUG, "packet len:%d\n", len[0]);
+
+		rxm = rcv_pkts[0];
+
+		if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
+			PMD_RX_LOG(ERR, "Packet drop\n");
+			nb_enqueued++;
+			virtio_discard_rxbuf(rxvq, rxm);
+			rxvq->errors++;
+			continue;
+		}
+
+		header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
+			RTE_PKTMBUF_HEADROOM - hdr_size);
+		seg_num = header->num_buffers;
+
+		if (seg_num == 0)
+			seg_num = 1;
+
+		rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+		rxm->pkt.nb_segs = seg_num;
+		rxm->pkt.next = NULL;
+		rxm->pkt.pkt_len = (uint32_t)(len[0] - hdr_size);
+		rxm->pkt.data_len = (uint16_t)(len[0] - hdr_size);
+
+		rxm->pkt.in_port = rxvq->port_id;
+		rx_pkts[nb_rx] = rxm;
+		prev = rxm;
+
+		seg_res = seg_num - 1;
+
+		while (seg_res != 0) {
+			/*
+			 * Get extra segments for current uncompleted packet.
+			 */
+			uint32_t  rcv_cnt =
+				RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
+			if (likely(VIRTQUEUE_NUSED(rxvq) >= rcv_cnt)) {
+				uint32_t rx_num =
+					virtqueue_dequeue_burst_rx(rxvq,
+					rcv_pkts, len, rcv_cnt);
+				i += rx_num;
+				rcv_cnt = rx_num;
+			} else {
+				PMD_RX_LOG(ERR,
+					"No enough segments for packet.\n");
+				nb_enqueued++;
+				virtio_discard_rxbuf(rxvq, rxm);
+				rxvq->errors++;
+				break;
+			}
+
+			extra_idx = 0;
+
+			while (extra_idx < rcv_cnt) {
+				rxm = rcv_pkts[extra_idx];
+
+				rxm->pkt.data =
+					(char *)rxm->buf_addr +
+					RTE_PKTMBUF_HEADROOM - hdr_size;
+				rxm->pkt.next = NULL;
+				rxm->pkt.pkt_len = (uint32_t)(len[extra_idx]);
+				rxm->pkt.data_len = (uint16_t)(len[extra_idx]);
+
+				if (prev)
+					prev->pkt.next = rxm;
+
+				prev = rxm;
+				rx_pkts[nb_rx]->pkt.pkt_len += rxm->pkt.pkt_len;
+				extra_idx++;
+			};
+			seg_res -= rcv_cnt;
+		}
+
+		VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
+			rx_pkts[nb_rx]->pkt.data_len);
+
+		rxvq->bytes += rx_pkts[nb_rx]->pkt.pkt_len;
+		nb_rx++;
+	}
+
+	rxvq->packets += nb_rx;
+
+	/* Allocate new mbuf for the used descriptor */
+	error = ENOSPC;
+	while (likely(!virtqueue_full(rxvq))) {
+		new_mbuf = rte_rxmbuf_alloc(rxvq->mpool);
+		if (unlikely(new_mbuf == NULL)) {
+			struct rte_eth_dev *dev
+				= &rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed++;
+			break;
+		}
+		error = virtqueue_enqueue_recv_refill(rxvq, new_mbuf);
+		if (unlikely(error)) {
+			rte_pktmbuf_free(new_mbuf);
+			break;
+		}
+		nb_enqueued++;
+	}
+
 	if (likely(nb_enqueued)) {
 		if (unlikely(virtqueue_kick_prepare(rxvq))) {
 			virtqueue_notify(rxvq);
@@ -536,12 +699,16 @@  virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	num = (uint16_t)(likely(nb_used < VIRTIO_MBUF_BURST_SZ) ? nb_used : VIRTIO_MBUF_BURST_SZ);
 
 	while (nb_tx < nb_pkts) {
-		if (virtqueue_full(txvq) && num) {
+		int need = tx_pkts[nb_tx]->pkt.nb_segs - txvq->vq_free_cnt;
+		int deq_cnt = RTE_MIN(need, (int)num);
+
+		num -= (deq_cnt > 0) ? deq_cnt : 0;
+		while (deq_cnt > 0) {
 			virtqueue_dequeue_pkt_tx(txvq);
-			num--;
+			deq_cnt--;
 		}
 
-		if (!virtqueue_full(txvq)) {
+		if (tx_pkts[nb_tx]->pkt.nb_segs <= txvq->vq_free_cnt) {
 			txm = tx_pkts[nb_tx];
 			/* Enqueue Packet buffers */
 			error = virtqueue_enqueue_xmit(txvq, txm);
@@ -555,7 +722,7 @@  virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 				break;
 			}
 			nb_tx++;
-			txvq->bytes += txm->pkt.data_len;
+			txvq->bytes += txm->pkt.pkt_len;
 		} else {
 			PMD_TX_LOG(ERR, "No free tx descriptors to transmit");
 			break;