[v2] net/af_xdp: avoid to unnecessary allocation and free mbuf in rx path

Message ID: 1601016336-12233-1-git-send-email-lirongqing@baidu.com (mailing list archive)
State: Changes Requested, archived
Delegated to: Ferruh Yigit
Series: [v2] net/af_xdp: avoid to unnecessary allocation and free mbuf in rx path

Checks

Context                      | Check   | Description
ci/checkpatch                | warning | coding style issues
ci/iol-broadcom-Functional   | success | Functional Testing PASS
ci/iol-broadcom-Performance  | success | Performance Testing PASS
ci/iol-testing               | success | Testing PASS
ci/iol-intel-Functional      | success | Functional Testing PASS
ci/iol-intel-Performance     | success | Performance Testing PASS
ci/Intel-compilation         | success | Compilation OK
ci/iol-mellanox-Performance  | success | Performance Testing PASS

Commit Message

Li RongQing Sept. 25, 2020, 6:45 a.m. UTC
When receiving packets, the maximum burst of mbufs is allocated up front;
if the hardware delivers fewer packets than that, the surplus mbufs are
freed again, which hurts performance.

Optimize the RX path by allocating mbufs based on the result of
xsk_ring_cons__peek(), so that no surplus mbufs have to be allocated and
freed while receiving packets.

V2: roll back the RX ring's cached_cons if mbuf allocation fails

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Dongsheng Rong <rongdongsheng@baidu.com>
---
 drivers/net/af_xdp/rte_eth_af_xdp.c | 67 ++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 38 deletions(-)
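
Condensed, the reworked zero-copy receive path in the diff looks roughly like the sketch below. Queue setup, the need_wakeup poll and the per-descriptor copy loop are elided, and the local declarations only loosely follow the driver, so treat it as a paraphrase of the patch rather than the patch itself.

static uint16_t
af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct pkt_rx_queue *rxq = queue;        /* names loosely follow the driver */
        struct xsk_ring_cons *rx = &rxq->rx;
        struct xsk_umem_info *umem = rxq->umem;
        struct xsk_ring_prod *fq = &rxq->fq;
        struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
        uint32_t idx_rx = 0;

        /* Ask the RX ring first: how many descriptors are actually ready? */
        nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
        if (nb_pkts == 0)
                return 0;                /* nothing received, nothing allocated */

        /* Allocate exactly that many mbufs for fill queue replenishment. */
        if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
                /* Undo the peek: the descriptors stay on the ring for a later burst. */
                rx->cached_cons -= nb_pkts;
                return 0;
        }

        /* ... per-descriptor loop filling bufs[] from the umem, as in the diff ... */

        xsk_ring_cons__release(rx, nb_pkts);
        (void)reserve_fill_queue(umem, nb_pkts, fq_bufs, fq);
        return nb_pkts;
}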
  

Comments

Loftus, Ciara Oct. 1, 2020, 4:24 p.m. UTC | #1
> 
> when receive packets, the max bunch number of mbuf are allocated
> if hardware does not receive the max bunch number packets, it
> will free redundancy mbuf, that is low-performance
> 
> so optimize rx performance, by allocating number of mbuf based on
> result of xsk_ring_cons__peek, to avoid to redundancy allocation,
> and free mbuf when receive packets

Hi,

Thanks for the patch and fixing the issue I raised.
With my testing so far I haven't measured an improvement in performance with the patch.
Do you have data to share which shows the benefit of your patch?

I agree the potential excess allocation of mbufs for the fill ring is not optimal, but if it does not significantly impact performance I would be in favour of keeping that approach rather than touching the cached_cons outside of libbpf, which is unconventional.

If a benefit can be shown and we proceed with the approach, I would suggest creating a new function for the cached consumer rollback eg. xsk_ring_cons_cancel() or similar, and add a comment describing what it does.
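
For illustration, such a rollback helper could be as small as the sketch below; the name mirrors the suggestion above and is hypothetical, not an existing libbpf API here.

#include <stdint.h>
#include <bpf/xsk.h>    /* struct xsk_ring_cons */

/*
 * Hypothetical rollback helper: xsk_ring_cons__peek() advances
 * cons->cached_cons by the number of entries it hands out, so giving the
 * same count back (before xsk_ring_cons__release() is called) leaves the
 * ring exactly as it was before the peek.
 */
static inline void
xsk_ring_cons_cancel(struct xsk_ring_cons *cons, uint32_t nb)
{
        cons->cached_cons -= nb;
}

The RX paths would then call xsk_ring_cons_cancel(rx, nb_pkts) instead of open-coding the subtraction when rte_pktmbuf_alloc_bulk() fails.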

Thanks,
Ciara

> 
> V2: rollback rx cached_cons if mbuf failed to be allocated
> 
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
> Signed-off-by: Dongsheng Rong <rongdongsheng@baidu.com>
> ---
>  drivers/net/af_xdp/rte_eth_af_xdp.c | 67 ++++++++++++++++---------------
> ------
>  1 file changed, 29 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> b/drivers/net/af_xdp/rte_eth_af_xdp.c
> index 01f462b46..e04fa43f6 100644
> --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> @@ -251,28 +251,29 @@ af_xdp_rx_zc(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
>  	struct xsk_umem_info *umem = rxq->umem;
>  	uint32_t idx_rx = 0;
>  	unsigned long rx_bytes = 0;
> -	int rcvd, i;
> +	int i;
>  	struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
> 
> -	/* allocate bufs for fill queue replenishment after rx */
> -	if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
> -		AF_XDP_LOG(DEBUG,
> -			"Failed to get enough buffers for fq.\n");
> -		return 0;
> -	}
> +	nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
> 
> -	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
> -
> -	if (rcvd == 0) {
> +	if (nb_pkts == 0) {
>  #if defined(XDP_USE_NEED_WAKEUP)
>  		if (xsk_ring_prod__needs_wakeup(fq))
>  			(void)poll(rxq->fds, 1, 1000);
>  #endif
> 
> -		goto out;
> +		return 0;
> +	}
> +
> +	/* allocate bufs for fill queue replenishment after rx */
> +	if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
> +		AF_XDP_LOG(DEBUG,
> +			"Failed to get enough buffers for fq.\n");
> +		rx->cached_cons -= nb_pkts;
> +		return 0;
>  	}
> 
> -	for (i = 0; i < rcvd; i++) {
> +	for (i = 0; i < nb_pkts; i++) {
>  		const struct xdp_desc *desc;
>  		uint64_t addr;
>  		uint32_t len;
> @@ -297,20 +298,14 @@ af_xdp_rx_zc(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
>  		rx_bytes += len;
>  	}
> 
> -	xsk_ring_cons__release(rx, rcvd);
> -
> -	(void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);
> +	xsk_ring_cons__release(rx, nb_pkts);
> +	(void)reserve_fill_queue(umem, nb_pkts, fq_bufs, fq);
> 
>  	/* statistics */
> -	rxq->stats.rx_pkts += rcvd;
> +	rxq->stats.rx_pkts += nb_pkts;
>  	rxq->stats.rx_bytes += rx_bytes;
> 
> -out:
> -	if (rcvd != nb_pkts)
> -		rte_mempool_put_bulk(umem->mb_pool, (void
> **)&fq_bufs[rcvd],
> -				     nb_pkts - rcvd);
> -
> -	return rcvd;
> +	return nb_pkts;
>  }
>  #else
>  static uint16_t
> @@ -322,7 +317,7 @@ af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
>  	struct xsk_ring_prod *fq = &rxq->fq;
>  	uint32_t idx_rx = 0;
>  	unsigned long rx_bytes = 0;
> -	int rcvd, i;
> +	int i;
>  	uint32_t free_thresh = fq->size >> 1;
>  	struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
> 
> @@ -330,20 +325,21 @@ af_xdp_rx_cp(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
>  		(void)reserve_fill_queue(umem,
> ETH_AF_XDP_RX_BATCH_SIZE,
>  					 NULL, fq);
> 
> -	if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts)
> != 0))
> -		return 0;
> -
> -	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
> -	if (rcvd == 0) {
> +	nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
> +	if (nb_pkts == 0) {
>  #if defined(XDP_USE_NEED_WAKEUP)
>  		if (xsk_ring_prod__needs_wakeup(fq))
>  			(void)poll(rxq->fds, 1, 1000);
>  #endif
> +		return 0;
> +	}
> 
> -		goto out;
> +	if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs,
> nb_pkts))) {
> +		rx->cached_cons -= nb_pkts;
> +		return 0;
>  	}
> 
> -	for (i = 0; i < rcvd; i++) {
> +	for (i = 0; i < nb_pkts; i++) {
>  		const struct xdp_desc *desc;
>  		uint64_t addr;
>  		uint32_t len;
> @@ -362,18 +358,13 @@ af_xdp_rx_cp(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
>  		bufs[i] = mbufs[i];
>  	}
> 
> -	xsk_ring_cons__release(rx, rcvd);
> +	xsk_ring_cons__release(rx, nb_pkts);
> 
>  	/* statistics */
> -	rxq->stats.rx_pkts += rcvd;
> +	rxq->stats.rx_pkts += nb_pkts;
>  	rxq->stats.rx_bytes += rx_bytes;
> 
> -out:
> -	if (rcvd != nb_pkts)
> -		rte_mempool_put_bulk(rxq->mb_pool, (void
> **)&mbufs[rcvd],
> -				     nb_pkts - rcvd);
> -
> -	return rcvd;
> +	return nb_pkts;
>  }
>  #endif
> 
> --
> 2.16.2
  
Li RongQing Oct. 14, 2020, 12:15 p.m. UTC | #2
> -----Original Message-----
> From: Loftus, Ciara [mailto:ciara.loftus@intel.com]
> Sent: Friday, October 02, 2020 12:24 AM
> To: Li,Rongqing <lirongqing@baidu.com>
> Cc: dev@dpdk.org
> Subject: RE: [PATCH][v2] net/af_xdp: avoid to unnecessary allocation and free
> mbuf in rx path
> 
> >
> > when receive packets, the max bunch number of mbuf are allocated if
> > hardware does not receive the max bunch number packets, it will free
> > redundancy mbuf, that is low-performance
> >
> > so optimize rx performance, by allocating number of mbuf based on
> > result of xsk_ring_cons__peek, to avoid to redundancy allocation, and
> > free mbuf when receive packets
> 
> Hi,
> 
> Thanks for the patch and fixing the issue I raised.

Thanks for finding the issue.

> With my testing so far I haven't measured an improvement in performance
> with the patch.
> Do you have data to share which shows the benefit of your patch?
> 
> I agree the potential excess allocation of mbufs for the fill ring is not the most
> optimal, but if doing it does not significantly impact the performance I would be
> in favour of keeping that approach versus touching the cached_cons outside of
> libbpf which is unconventional.
> 
> If a benefit can be shown and we proceed with the approach, I would suggest
> creating a new function for the cached consumer rollback eg.
> xsk_ring_cons_cancel() or similar, and add a comment describing what it does.
> 

Thanks for testing.

Yes, there is a benefit.

We first saw this issue while doing some send-performance testing; the topology is like below:

Qemu with vhost-user ----->ovs------->xdp interface

Qemu sends UDP packets. The xdp interface has no packets to receive, but OVS still polls it, so the PMD keeps allocating and freeing mbufs for nothing. With this patch we see about a 5% benefit on the send side; the exact gain depends on the flow table complexity.


When running an RX benchmark, if the packets per batch reach about 32 the benefit is very small.
If the packets per batch are far below 32, the reduction in cycles per packet is clearly visible.


-Li
  
Ferruh Yigit Nov. 13, 2020, 5:40 p.m. UTC | #3
On 10/14/2020 1:15 PM, Li,Rongqing wrote:
> 
> 
>> -----Original Message-----
>> From: Loftus, Ciara [mailto:ciara.loftus@intel.com]
>> Sent: Friday, October 02, 2020 12:24 AM
>> To: Li,Rongqing <lirongqing@baidu.com>
>> Cc: dev@dpdk.org
>> Subject: RE: [PATCH][v2] net/af_xdp: avoid to unnecessary allocation and free
>> mbuf in rx path
>>
>>>
>>> when receive packets, the max bunch number of mbuf are allocated if
>>> hardware does not receive the max bunch number packets, it will free
>>> redundancy mbuf, that is low-performance
>>>
>>> so optimize rx performance, by allocating number of mbuf based on
>>> result of xsk_ring_cons__peek, to avoid to redundancy allocation, and
>>> free mbuf when receive packets
>>
>> Hi,
>>
>> Thanks for the patch and fixing the issue I raised.
> 
> Thanks for your finding
> 
>> With my testing so far I haven't measured an improvement in performance
>> with the patch.
>> Do you have data to share which shows the benefit of your patch?
>>
>> I agree the potential excess allocation of mbufs for the fill ring is not the most
>> optimal, but if doing it does not significantly impact the performance I would be
>> in favour of keeping that approach versus touching the cached_cons outside of
>> libbpf which is unconventional.
>>
>> If a benefit can be shown and we proceed with the approach, I would suggest
>> creating a new function for the cached consumer rollback eg.
>> xsk_ring_cons_cancel() or similar, and add a comment describing what it does.
>>
> 
> Thanks for your test.
> 
> Yes, it has benefit
> 
> We first see this issue when do some send performance, topo is like below
> 
> Qemu with vhost-user ----->ovs------->xdp interface
> 
> Qemu sends udp packets, xdp has not packets to receive, but it must be polled by ovs, and xdp must allocated/free mbuf unnecessary, with this packet, we has about 5% benefit for sending, this depends on flow table complexity
> 
> 
> When do rx benchmark, if packets per batch is reaching about 32, the benefit is very little.
> If packets per batch is far less than 32, we can see the cycle per packet is reduced obviously
> 

Hi Li, Ciara,

What is the status of this patch? Is the patch justified, and is a new version
requested/expected?
  
Loftus, Ciara Nov. 16, 2020, 7:04 a.m. UTC | #4
> 
> On 10/14/2020 1:15 PM, Li,Rongqing wrote:
> >
> >
> >> -----Original Message-----
> >> From: Loftus, Ciara [mailto:ciara.loftus@intel.com]
> >> Sent: Friday, October 02, 2020 12:24 AM
> >> To: Li,Rongqing <lirongqing@baidu.com>
> >> Cc: dev@dpdk.org
> >> Subject: RE: [PATCH][v2] net/af_xdp: avoid to unnecessary allocation and
> free
> >> mbuf in rx path
> >>
> >>>
> >>> when receive packets, the max bunch number of mbuf are allocated if
> >>> hardware does not receive the max bunch number packets, it will free
> >>> redundancy mbuf, that is low-performance
> >>>
> >>> so optimize rx performance, by allocating number of mbuf based on
> >>> result of xsk_ring_cons__peek, to avoid to redundancy allocation, and
> >>> free mbuf when receive packets
> >>
> >> Hi,
> >>
> >> Thanks for the patch and fixing the issue I raised.
> >
> > Thanks for your finding
> >
> >> With my testing so far I haven't measured an improvement in
> performance
> >> with the patch.
> >> Do you have data to share which shows the benefit of your patch?
> >>
> >> I agree the potential excess allocation of mbufs for the fill ring is not the
> most
> >> optimal, but if doing it does not significantly impact the performance I
> would be
> >> in favour of keeping that approach versus touching the cached_cons
> outside of
> >> libbpf which is unconventional.
> >>
> >> If a benefit can be shown and we proceed with the approach, I would
> suggest
> >> creating a new function for the cached consumer rollback eg.
> >> xsk_ring_cons_cancel() or similar, and add a comment describing what it
> does.
> >>
> >
> > Thanks for your test.
> >
> > Yes, it has benefit
> >
> > We first see this issue when do some send performance, topo is like below
> >
> > Qemu with vhost-user ----->ovs------->xdp interface
> >
> > Qemu sends udp packets, xdp has not packets to receive, but it must be
> polled by ovs, and xdp must allocated/free mbuf unnecessary, with this
> packet, we has about 5% benefit for sending, this depends on flow table
> complexity
> >
> >
> > When do rx benchmark, if packets per batch is reaching about 32, the
> benefit is very little.
> > If packets per batch is far less than 32, we can see the cycle per packet is
> reduced obviously
> >
> 
> Hi Li, Ciara,
> 
> What is the status of this patch, is the patch justified and is a new versions
> requested/expected?


Apologies for the delay, I missed your reply Li.
With the data you've provided I think the patch is justified.
I think the rollback requires some explanation in the code as it may not be immediately clear what is happening.
I suggest a v3 with either a comment above the rollback, or a new function as described in my previous mail, also with a comment.
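
As an illustration of the first option only (not the actual v3), the rollback could carry a comment along these lines:

        if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
                AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
                /*
                 * rte_pktmbuf_alloc_bulk() failed, so undo the earlier
                 * xsk_ring_cons__peek() by rolling back cached_cons: the
                 * descriptors stay on the RX ring and will be consumed by a
                 * later burst once mbufs can be allocated again.
                 */
                rx->cached_cons -= nb_pkts;
                return 0;
        }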

Thanks for the patch.

Ciara
  
Li RongQing Nov. 17, 2020, 12:05 a.m. UTC | #5
> 
> Apologies for the delay, I missed your reply Li.
> With the data you've provided I think the patch is justified.
> I think the rollback requires some explanation in the code as it may not be
> immediately clear what is happening.
> I suggest a v3 with either a comment above the rollback, or a new function as
> described in my previous mail, also with a comment.
> 

Ok, we will send V3

Thanks

-Li


> Thanks for the patch.
> 
> Ciara
  

Patch

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index 01f462b46..e04fa43f6 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -251,28 +251,29 @@  af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	struct xsk_umem_info *umem = rxq->umem;
 	uint32_t idx_rx = 0;
 	unsigned long rx_bytes = 0;
-	int rcvd, i;
+	int i;
 	struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
 
-	/* allocate bufs for fill queue replenishment after rx */
-	if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
-		AF_XDP_LOG(DEBUG,
-			"Failed to get enough buffers for fq.\n");
-		return 0;
-	}
+	nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
 
-	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
-
-	if (rcvd == 0) {
+	if (nb_pkts == 0) {
 #if defined(XDP_USE_NEED_WAKEUP)
 		if (xsk_ring_prod__needs_wakeup(fq))
 			(void)poll(rxq->fds, 1, 1000);
 #endif
 
-		goto out;
+		return 0;
+	}
+
+	/* allocate bufs for fill queue replenishment after rx */
+	if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
+		AF_XDP_LOG(DEBUG,
+			"Failed to get enough buffers for fq.\n");
+		rx->cached_cons -= nb_pkts;
+		return 0;
 	}
 
-	for (i = 0; i < rcvd; i++) {
+	for (i = 0; i < nb_pkts; i++) {
 		const struct xdp_desc *desc;
 		uint64_t addr;
 		uint32_t len;
@@ -297,20 +298,14 @@  af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		rx_bytes += len;
 	}
 
-	xsk_ring_cons__release(rx, rcvd);
-
-	(void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);
+	xsk_ring_cons__release(rx, nb_pkts);
+	(void)reserve_fill_queue(umem, nb_pkts, fq_bufs, fq);
 
 	/* statistics */
-	rxq->stats.rx_pkts += rcvd;
+	rxq->stats.rx_pkts += nb_pkts;
 	rxq->stats.rx_bytes += rx_bytes;
 
-out:
-	if (rcvd != nb_pkts)
-		rte_mempool_put_bulk(umem->mb_pool, (void **)&fq_bufs[rcvd],
-				     nb_pkts - rcvd);
-
-	return rcvd;
+	return nb_pkts;
 }
 #else
 static uint16_t
@@ -322,7 +317,7 @@  af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	struct xsk_ring_prod *fq = &rxq->fq;
 	uint32_t idx_rx = 0;
 	unsigned long rx_bytes = 0;
-	int rcvd, i;
+	int i;
 	uint32_t free_thresh = fq->size >> 1;
 	struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
 
@@ -330,20 +325,21 @@  af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		(void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
 					 NULL, fq);
 
-	if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
-		return 0;
-
-	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
-	if (rcvd == 0) {
+	nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
+	if (nb_pkts == 0) {
 #if defined(XDP_USE_NEED_WAKEUP)
 		if (xsk_ring_prod__needs_wakeup(fq))
 			(void)poll(rxq->fds, 1, 1000);
 #endif
+		return 0;
+	}
 
-		goto out;
+	if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts))) {
+		rx->cached_cons -= nb_pkts;
+		return 0;
 	}
 
-	for (i = 0; i < rcvd; i++) {
+	for (i = 0; i < nb_pkts; i++) {
 		const struct xdp_desc *desc;
 		uint64_t addr;
 		uint32_t len;
@@ -362,18 +358,13 @@  af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		bufs[i] = mbufs[i];
 	}
 
-	xsk_ring_cons__release(rx, rcvd);
+	xsk_ring_cons__release(rx, nb_pkts);
 
 	/* statistics */
-	rxq->stats.rx_pkts += rcvd;
+	rxq->stats.rx_pkts += nb_pkts;
 	rxq->stats.rx_bytes += rx_bytes;
 
-out:
-	if (rcvd != nb_pkts)
-		rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
-				     nb_pkts - rcvd);
-
-	return rcvd;
+	return nb_pkts;
 }
 #endif