[dpdk-dev,5/5] net/virtio: fix TSO when mbuf is shared

Message ID 1479977798-13417-6-git-send-email-olivier.matz@6wind.com (mailing list archive)
State Rejected, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
checkpatch/checkpatch success coding style OK

Commit Message

Olivier Matz Nov. 24, 2016, 8:56 a.m. UTC
  With virtio, doing TSO requires modifying the network
packet data:
- the dpdk API requires to set the l4 checksum to an
  Intel-Nic-like pseudo header checksum that does
  not include the ip length
- the virtio peer expects that the l4 checksum is
  a standard pseudo header checksum.

This is a problem with shared packets, because they
should not be modified.

This patch fixes this issue by copying the headers into
a linear buffer in that case. This buffer is located in
the virtio_tx_region, at the same place where the
virtio header is stored.

The size of this buffer is set to 256, which should
be enough in all cases:
  sizeof(ethernet) + sizeof(vlan) * 2 + sizeof(ip6)
    + sizeof(ip6-ext) + sizeof(tcp) + sizeof(tcp-opts)
  = 14 + 8 + 40 + sizeof(ip6-ext) + 40 + sizeof(tcp-opts)
  = 102 + sizeof(ip6-ext) + sizeof(tcp-opts)

Fixes: 696573046e9e ("net/virtio: support TSO")

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 drivers/net/virtio/virtio_rxtx.c | 119 +++++++++++++++++++++++++++------------
 drivers/net/virtio/virtqueue.h   |   2 +
 2 files changed, 85 insertions(+), 36 deletions(-)
  

Comments

Yuanhan Liu Dec. 14, 2016, 7:27 a.m. UTC | #1
Hi Olivier,

Firstly sorry for late response!

On Thu, Nov 24, 2016 at 09:56:38AM +0100, Olivier Matz wrote:
> With virtio, doing tso requires to modify the network
> packet data:

I thought more about it this time, and I'm wondering why it's needed.

> - the dpdk API requires to set the l4 checksum to an
>   Intel-Nic-like pseudo header checksum that does
>   not include the ip length

If the packet is for a NIC pmd driver in the end, then the NIC driver
(or application) would handle the checksum correctly.  You could check
the tx_prep patchset for example.

> - the virtio peer expects that the l4 checksum is
>   a standard pseudo header checksum.

For this case, the checksum is then not needed: we could assume the data
in a virtio-to-virtio transmission on the same host is always valid,
so that checksum validation is unnecessary.

So, in either case, it doesn't seem to me we have to generate the checksum
here. Or am I missing something?

OTOH, even if it does, I still see some issues (see below).

>  		/* TCP Segmentation Offload */
>  		if (cookie->ol_flags & PKT_TX_TCP_SEG) {
> -			virtio_tso_fix_cksum(cookie);
> +			offset = virtio_tso_fix_cksum(cookie,
> +				RTE_PTR_ADD(hdr, start_dp[hdr_idx].len),
> +				VIRTIO_MAX_HDR_SZ);
> +			if (offset > 0) {
> +				RTE_ASSERT(can_push != 0);

I think it's (can_push == 0) ?

> +				start_dp[hdr_idx].len += offset;

Actually, there is an assumption if you do this, that the backend driver
must support ANY_LAYOUT. Otherwise, it won't work: the driver
would expect the header and packet data is totally separated into two
desc buffers.

Though the assumption is most likely true in nowadays, I don't think
it's a guarantee.

	--yliu
  
Stephen Hemminger Jan. 9, 2017, 5:59 p.m. UTC | #2
On Thu, 24 Nov 2016 09:56:38 +0100
Olivier Matz <olivier.matz@6wind.com> wrote:

> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index 22d97a4..577c775 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -211,43 +211,73 @@ virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
>  
>  /* When doing TSO, the IP length is not included in the pseudo header
>   * checksum of the packet given to the PMD, but for virtio it is
> - * expected.
> + * expected. Fix the mbuf or a copy if the mbuf is shared.
>   */
> -static void
> -virtio_tso_fix_cksum(struct rte_mbuf *m)
> +static unsigned int
> +virtio_tso_fix_cksum(struct rte_mbuf *m, char *hdr, size_t hdr_sz)
>  {
> -	/* common case: header is not fragmented */
> -	if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
> -			m->l4_len)) {
> -		struct ipv4_hdr *iph;
> -		struct ipv6_hdr *ip6h;
> -		struct tcp_hdr *th;
> -		uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
> -		uint32_t tmp;
> -
> -		iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
> -		th = RTE_PTR_ADD(iph, m->l3_len);
> -		if ((iph->version_ihl >> 4) == 4) {
> -			iph->hdr_checksum = 0;
> -			iph->hdr_checksum = rte_ipv4_cksum(iph);
> -			ip_len = iph->total_length;
> -			ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
> -				m->l3_len);
> -		} else {
> -			ip6h = (struct ipv6_hdr *)iph;
> -			ip_paylen = ip6h->payload_len;
> +	struct ipv4_hdr *iph, iph_copy;
> +	struct ipv6_hdr *ip6h = NULL, ip6h_copy;
> +	struct tcp_hdr *th, th_copy;
> +	size_t hdrlen = m->l2_len + m->l3_len + m->l4_len;
> +	uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
> +	uint32_t tmp;
> +	int shared = 0;

Minor nit, use bool instead of int for shared?
  

Patch

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 22d97a4..577c775 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -211,43 +211,73 @@  virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
 
 /* When doing TSO, the IP length is not included in the pseudo header
  * checksum of the packet given to the PMD, but for virtio it is
- * expected.
+ * expected. Fix the mbuf or a copy if the mbuf is shared.
  */
-static void
-virtio_tso_fix_cksum(struct rte_mbuf *m)
+static unsigned int
+virtio_tso_fix_cksum(struct rte_mbuf *m, char *hdr, size_t hdr_sz)
 {
-	/* common case: header is not fragmented */
-	if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
-			m->l4_len)) {
-		struct ipv4_hdr *iph;
-		struct ipv6_hdr *ip6h;
-		struct tcp_hdr *th;
-		uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
-		uint32_t tmp;
-
-		iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
-		th = RTE_PTR_ADD(iph, m->l3_len);
-		if ((iph->version_ihl >> 4) == 4) {
-			iph->hdr_checksum = 0;
-			iph->hdr_checksum = rte_ipv4_cksum(iph);
-			ip_len = iph->total_length;
-			ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
-				m->l3_len);
-		} else {
-			ip6h = (struct ipv6_hdr *)iph;
-			ip_paylen = ip6h->payload_len;
+	struct ipv4_hdr *iph, iph_copy;
+	struct ipv6_hdr *ip6h = NULL, ip6h_copy;
+	struct tcp_hdr *th, th_copy;
+	size_t hdrlen = m->l2_len + m->l3_len + m->l4_len;
+	uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
+	uint32_t tmp;
+	int shared = 0;
+
+	/* mbuf is read-only, we need to copy the headers in a linear buffer */
+	if (unlikely(rte_pktmbuf_data_is_shared(m, 0, hdrlen))) {
+		shared = 1;
+
+		/* network headers are too big, there's nothing we can do */
+		if (hdrlen > hdr_sz)
+			return 0;
+
+		rte_pktmbuf_read_copy(m, 0, hdrlen, hdr);
+		iph = (struct ipv4_hdr *)(hdr + m->l2_len);
+		ip6h = (struct ipv6_hdr *)(hdr + m->l2_len);
+		th = (struct tcp_hdr *)(hdr + m->l2_len + m->l3_len);
+	} else {
+		iph = rte_pktmbuf_read(m, m->l2_len, sizeof(*iph), &iph_copy);
+		th = rte_pktmbuf_read(m, m->l2_len + m->l3_len, sizeof(*th),
+			&th_copy);
+	}
+
+	if ((iph->version_ihl >> 4) == 4) {
+		iph->hdr_checksum = 0;
+		iph->hdr_checksum = rte_ipv4_cksum(iph);
+		ip_len = iph->total_length;
+		ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
+			m->l3_len);
+	} else {
+		if (!shared) {
+			ip6h = rte_pktmbuf_read(m, m->l2_len, sizeof(*ip6h),
+				&ip6h_copy);
 		}
+		ip_paylen = ip6h->payload_len;
+	}
 
-		/* calculate the new phdr checksum not including ip_paylen */
-		prev_cksum = th->cksum;
-		tmp = prev_cksum;
-		tmp += ip_paylen;
-		tmp = (tmp & 0xffff) + (tmp >> 16);
-		new_cksum = tmp;
+	/* calculate the new phdr checksum not including ip_paylen */
+	prev_cksum = th->cksum;
+	tmp = prev_cksum;
+	tmp += ip_paylen;
+	tmp = (tmp & 0xffff) + (tmp >> 16);
+	new_cksum = tmp;
 
-		/* replace it in the packet */
-		th->cksum = new_cksum;
-	}
+	/* replace it in the header */
+	th->cksum = new_cksum;
+
+	/* the update was done in the linear buffer, return */
+	if (shared)
+		return hdrlen;
+
+	/* copy from local buffer into mbuf if required */
+	if ((iph->version_ihl >> 4) == 4)
+		rte_pktmbuf_write(m, m->l2_len, sizeof(*iph), iph);
+	else
+		rte_pktmbuf_write(m, m->l2_len, sizeof(*ip6h), ip6h);
+	rte_pktmbuf_write(m, m->l2_len + m->l3_len, sizeof(*th), th);
+
+	return 0;
 }
 
 static inline int
@@ -268,7 +298,9 @@  virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 	struct vring_desc *start_dp;
 	uint16_t seg_num = cookie->nb_segs;
 	uint16_t head_idx, idx;
+	uint16_t hdr_idx = 0;
 	uint16_t head_size = vq->hw->vtnet_hdr_size;
+	unsigned int offset = 0;
 	struct virtio_net_hdr *hdr;
 	int offload;
 
@@ -303,6 +335,8 @@  virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 
 		/* loop below will fill in rest of the indirect elements */
 		start_dp = txr[idx].tx_indir;
+		hdr_idx = 0;
+		start_dp[hdr_idx].len = vq->hw->vtnet_hdr_size;
 		idx = 1;
 	} else {
 		/* setup first tx ring slot to point to header
@@ -313,7 +347,7 @@  virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 		start_dp[idx].len   = vq->hw->vtnet_hdr_size;
 		start_dp[idx].flags = VRING_DESC_F_NEXT;
 		hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
-
+		hdr_idx = idx;
 		idx = start_dp[idx].next;
 	}
 
@@ -345,7 +379,14 @@  virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 
 		/* TCP Segmentation Offload */
 		if (cookie->ol_flags & PKT_TX_TCP_SEG) {
-			virtio_tso_fix_cksum(cookie);
+			offset = virtio_tso_fix_cksum(cookie,
+				RTE_PTR_ADD(hdr, start_dp[hdr_idx].len),
+				VIRTIO_MAX_HDR_SZ);
+			if (offset > 0) {
+				RTE_ASSERT(can_push != 0);
+				start_dp[hdr_idx].len += offset;
+			}
+
 			hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ?
 				VIRTIO_NET_HDR_GSO_TCPV6 :
 				VIRTIO_NET_HDR_GSO_TCPV4;
@@ -362,10 +403,16 @@  virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
 	}
 
 	do {
-		start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);
-		start_dp[idx].len   = cookie->data_len;
+		if (offset > cookie->data_len) {
+			offset -= cookie->data_len;
+			continue;
+		}
+		start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq) +
+			offset;
+		start_dp[idx].len   = cookie->data_len - offset;
 		start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
 		idx = start_dp[idx].next;
+		offset = 0;
 	} while ((cookie = cookie->next) != NULL);
 
 	if (use_indirect)
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index f0bb089..edfe0dd 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -254,8 +254,10 @@  struct virtio_net_hdr_mrg_rxbuf {
 
 /* Region reserved to allow for transmit header and indirect ring */
 #define VIRTIO_MAX_TX_INDIRECT 8
+#define VIRTIO_MAX_HDR_SZ 256
 struct virtio_tx_region {
 	struct virtio_net_hdr_mrg_rxbuf tx_hdr;
+	char net_headers[VIRTIO_MAX_HDR_SZ]; /* for offload if mbuf is RO */
 	struct vring_desc tx_indir[VIRTIO_MAX_TX_INDIRECT]
 			   __attribute__((__aligned__(16)));
 };