[v4,1/2] app/testpmd: optimize testpmd txonly mode

Message ID 20190326130247.14360-1-pbhagavatula@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series [v4,1/2] app/testpmd: optimize testpmd txonly mode |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing fail Performance Testing issues

Commit Message

Pavan Nikhilesh Bhagavatula March 26, 2019, 1:03 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Optimize testpmd txonly mode by:
1. Moving the per-packet Ethernet header initialization above the loop.
2. Using bulk ops for allocating segments instead of having an inner loop
for every segment.

Also, move the packet prepare logic into a separate function so that it
can be reused later.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v4 Changes:
 - Fix packet len calculation.

 v3 Changes:
 - Split the patches for easier review. (Thomas)
 - Remove unnecessary assignments to 0. (Bernard)

 v2 Changes:
 - Use bulk ops for fetching segments. (Andrew Rybchenko)
 - Fallback to rte_mbuf_raw_alloc if bulk get fails. (Andrew Rybchenko)
 - Fix mbufs not being freed when there are no more mbufs available for
 segments. (Andrew Rybchenko)

 app/test-pmd/txonly.c | 141 +++++++++++++++++++++++-------------------
 1 file changed, 77 insertions(+), 64 deletions(-)

--
2.20.1
  

Comments

Iremonger, Bernard March 26, 2019, 4:13 p.m. UTC | #1
Hi Pavan,

> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula [mailto:pbhagavatula@marvell.com]
> Sent: Tuesday, March 26, 2019 1:03 PM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; thomas@monjalon.net;
> arybchenko@solarflare.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> Iremonger, Bernard <bernard.iremonger@intel.com>
> Cc: dev@dpdk.org; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>
> Subject: [dpdk-dev] [PATCH v4 1/2] app/testpmd: optimize testpmd txonly
> mode
> 
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Optimize testpmd txonly mode by
> 1. Moving per packet ethernet header copy above the loop.
> 2. Use bulk ops for allocating segments instead of having a inner loop for
> every segment.
> 
> Also, move the packet prepare logic into a separate function so that it can be
> reused later.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  v4 Changes:
>  - Fix packet len calculation.
> 
>  v3 Changes:
>  - Split the patches for easier review. (Thomas)
>  - Remove unnecessary assignments to 0. (Bernard)
> 
>  v2 Changes:
>  - Use bulk ops for fetching segments. (Andrew Rybchenko)
>  - Fallback to rte_mbuf_raw_alloc if bulk get fails. (Andrew Rybchenko)
>  - Fix mbufs not being freed when there is no more mbufs available for
> segments. (Andrew Rybchenko)
> 
>  app/test-pmd/txonly.c | 141 +++++++++++++++++++++++-------------------
>  1 file changed, 77 insertions(+), 64 deletions(-)
> 
> diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c index
> 1f08b6ed3..8d49e41b1 100644
> --- a/app/test-pmd/txonly.c
> +++ b/app/test-pmd/txonly.c
> @@ -147,6 +147,63 @@ setup_pkt_udp_ip_headers(struct ipv4_hdr
> *ip_hdr,
>  	ip_hdr->hdr_checksum = (uint16_t) ip_cksum;  }
> 
> +static inline bool
> +pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
> +		struct ether_hdr *eth_hdr, const uint16_t vlan_tci,
> +		const uint16_t vlan_tci_outer, const uint64_t ol_flags) {
> +	struct rte_mbuf *pkt_segs[RTE_MAX_SEGS_PER_PKT];
> +	struct rte_mbuf *pkt_seg;
> +	uint32_t nb_segs, pkt_len;
> +	uint8_t i;
> +
> +	if (unlikely(tx_pkt_split == TX_PKT_SPLIT_RND))
> +		nb_segs = random() % tx_pkt_nb_segs + 1;
> +	else
> +		nb_segs = tx_pkt_nb_segs;
> +
> +	if (nb_segs > 1) {
> +		if (rte_mempool_get_bulk(mbp, (void **)pkt_segs,
> nb_segs))
> +			return false;
> +	}
> +
> +	rte_pktmbuf_reset_headroom(pkt);
> +	pkt->data_len = tx_pkt_seg_lengths[0];
> +	pkt->ol_flags = ol_flags;
> +	pkt->vlan_tci = vlan_tci;
> +	pkt->vlan_tci_outer = vlan_tci_outer;
> +	pkt->l2_len = sizeof(struct ether_hdr);
> +	pkt->l3_len = sizeof(struct ipv4_hdr);
> +
> +	pkt_len = pkt->data_len;
> +	pkt_seg = pkt;
> +	for (i = 1; i < nb_segs; i++) {
> +		pkt_seg->next = pkt_segs[i - 1];
> +		pkt_seg = pkt_seg->next;
> +		pkt_seg->data_len = tx_pkt_seg_lengths[i];
> +		pkt_len += pkt_seg->data_len;
> +	}
> +	pkt_seg->next = NULL; /* Last segment of packet. */
> +	/*
> +	 * Copy headers in first packet segment(s).
> +	 */
> +	copy_buf_to_pkt(eth_hdr, sizeof(eth_hdr), pkt, 0);
> +	copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
> +			sizeof(struct ether_hdr));
> +	copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
> +			sizeof(struct ether_hdr) +
> +			sizeof(struct ipv4_hdr));
> +
> +	/*
> +	 * Complete first mbuf of packet and append it to the
> +	 * burst of packets to be transmitted.
> +	 */
> +	pkt->nb_segs = nb_segs;
> +	pkt->pkt_len = pkt_len;
> +
> +	return true;
> +}
> +
>  /*
>   * Transmit a burst of multi-segments packets.
>   */
> @@ -154,9 +211,8 @@ static void
>  pkt_burst_transmit(struct fwd_stream *fs)  {
>  	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> -	struct rte_port *txp;
>  	struct rte_mbuf *pkt;
> -	struct rte_mbuf *pkt_seg;
> +	struct rte_port *txp;

The unnecessary change to the position of `struct rte_port *txp` is still there.

>  	struct rte_mempool *mbp;
>  	struct ether_hdr eth_hdr;
>  	uint16_t nb_tx;
> @@ -164,14 +220,12 @@ pkt_burst_transmit(struct fwd_stream *fs)
>  	uint16_t vlan_tci, vlan_tci_outer;
>  	uint32_t retry;
>  	uint64_t ol_flags = 0;
> -	uint8_t  i;
>  	uint64_t tx_offloads;
>  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>  	uint64_t start_tsc;
>  	uint64_t end_tsc;
>  	uint64_t core_cycles;
>  #endif
> -	uint32_t nb_segs, pkt_len;
> 
>  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>  	start_tsc = rte_rdtsc();
> @@ -188,72 +242,31 @@ pkt_burst_transmit(struct fwd_stream *fs)
>  		ol_flags |= PKT_TX_QINQ_PKT;
>  	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
>  		ol_flags |= PKT_TX_MACSEC;
> +
> +	/*
> +	 * Initialize Ethernet header.
> +	 */
> +	ether_addr_copy(&peer_eth_addrs[fs->peer_addr],
> &eth_hdr.d_addr);
> +	ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
> +	eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> +
>  	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
>  		pkt = rte_mbuf_raw_alloc(mbp);
> -		if (pkt == NULL) {
> -		nomore_mbuf:
> -			if (nb_pkt == 0)
> -				return;
> +		if (pkt == NULL)
> +			break;
> +		if (unlikely(!pkt_burst_prepare(pkt, mbp,
> +						&eth_hdr, vlan_tci,
> +						vlan_tci_outer,
> +						ol_flags))) {
> +			rte_mempool_put(mbp, pkt);
>  			break;
>  		}
> -
> -		/*
> -		 * Using raw alloc is good to improve performance,
> -		 * but some consumers may use the headroom and so
> -		 * decrement data_off. We need to make sure it is
> -		 * reset to default value.
> -		 */
> -		rte_pktmbuf_reset_headroom(pkt);
> -		pkt->data_len = tx_pkt_seg_lengths[0];
> -		pkt_seg = pkt;
> -		if (tx_pkt_split == TX_PKT_SPLIT_RND)
> -			nb_segs = random() % tx_pkt_nb_segs + 1;
> -		else
> -			nb_segs = tx_pkt_nb_segs;
> -		pkt_len = pkt->data_len;
> -		for (i = 1; i < nb_segs; i++) {
> -			pkt_seg->next = rte_mbuf_raw_alloc(mbp);
> -			if (pkt_seg->next == NULL) {
> -				pkt->nb_segs = i;
> -				rte_pktmbuf_free(pkt);
> -				goto nomore_mbuf;
> -			}
> -			pkt_seg = pkt_seg->next;
> -			pkt_seg->data_len = tx_pkt_seg_lengths[i];
> -			pkt_len += pkt_seg->data_len;
> -		}
> -		pkt_seg->next = NULL; /* Last segment of packet. */
> -
> -		/*
> -		 * Initialize Ethernet header.
> -		 */
> -		ether_addr_copy(&peer_eth_addrs[fs-
> >peer_addr],&eth_hdr.d_addr);
> -		ether_addr_copy(&ports[fs->tx_port].eth_addr,
> &eth_hdr.s_addr);
> -		eth_hdr.ether_type =
> rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> -
> -		/*
> -		 * Copy headers in first packet segment(s).
> -		 */
> -		copy_buf_to_pkt(&eth_hdr, sizeof(eth_hdr), pkt, 0);
> -		copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
> -				sizeof(struct ether_hdr));
> -		copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
> -				sizeof(struct ether_hdr) +
> -				sizeof(struct ipv4_hdr));
> -
> -		/*
> -		 * Complete first mbuf of packet and append it to the
> -		 * burst of packets to be transmitted.
> -		 */
> -		pkt->nb_segs = nb_segs;
> -		pkt->pkt_len = pkt_len;
> -		pkt->ol_flags = ol_flags;
> -		pkt->vlan_tci = vlan_tci;
> -		pkt->vlan_tci_outer = vlan_tci_outer;
> -		pkt->l2_len = sizeof(struct ether_hdr);
> -		pkt->l3_len = sizeof(struct ipv4_hdr);
>  		pkts_burst[nb_pkt] = pkt;
>  	}
> +
> +	if (nb_pkt == 0)
> +		return;
> +
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> nb_pkt);
>  	/*
>  	 * Retry if necessary
> --
> 2.20.1

Regards,

Bernard.
  

Patch

diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 1f08b6ed3..8d49e41b1 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -147,6 +147,63 @@  setup_pkt_udp_ip_headers(struct ipv4_hdr *ip_hdr,
 	ip_hdr->hdr_checksum = (uint16_t) ip_cksum;
 }

+/*
+ * Build one TX packet in pkt: chain extra segments from mbp, set mbuf fields,
+ * copy headers. Returns false, pkt untouched, if the bulk get from mbp fails.
+ */
+static inline bool
+pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
+		struct ether_hdr *eth_hdr, const uint16_t vlan_tci,
+		const uint16_t vlan_tci_outer, const uint64_t ol_flags)
+{
+	struct rte_mbuf *pkt_segs[RTE_MAX_SEGS_PER_PKT];
+	struct rte_mbuf *pkt_seg;
+	uint32_t nb_segs, pkt_len;
+	uint8_t i;
+
+	if (unlikely(tx_pkt_split == TX_PKT_SPLIT_RND))
+		nb_segs = random() % tx_pkt_nb_segs + 1;
+	else
+		nb_segs = tx_pkt_nb_segs;
+
+	/* pkt is the first segment, so only nb_segs - 1 more are needed. */
+	if (nb_segs > 1 &&
+	    rte_mempool_get_bulk(mbp, (void **)pkt_segs, nb_segs - 1))
+		return false;
+
+	/* Earlier consumers may have moved data_off; restore the default. */
+	rte_pktmbuf_reset_headroom(pkt);
+	pkt->data_len = tx_pkt_seg_lengths[0];
+	pkt->ol_flags = ol_flags;
+	pkt->vlan_tci = vlan_tci;
+	pkt->vlan_tci_outer = vlan_tci_outer;
+	pkt->l2_len = sizeof(struct ether_hdr);
+	pkt->l3_len = sizeof(struct ipv4_hdr);
+
+	pkt_len = pkt->data_len;
+	pkt_seg = pkt;
+	for (i = 1; i < nb_segs; i++) {
+		pkt_seg->next = pkt_segs[i - 1];
+		pkt_seg = pkt_seg->next;
+		pkt_seg->data_len = tx_pkt_seg_lengths[i];
+		pkt_len += pkt_seg->data_len;
+	}
+	pkt_seg->next = NULL; /* Last segment of packet. */
+	/* Copy headers; eth_hdr is a pointer, so size the copy by *eth_hdr. */
+	copy_buf_to_pkt(eth_hdr, sizeof(*eth_hdr), pkt, 0);
+	copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
+			sizeof(struct ether_hdr));
+	copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
+			sizeof(struct ether_hdr) +
+			sizeof(struct ipv4_hdr));
+
+	/* Complete the first mbuf; the caller appends it to the burst. */
+	pkt->nb_segs = nb_segs;
+	pkt->pkt_len = pkt_len;
+
+	return true;
+}
+
 /*
  * Transmit a burst of multi-segments packets.
  */
@@ -154,9 +211,8 @@  static void
 pkt_burst_transmit(struct fwd_stream *fs)
 {
 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
-	struct rte_port *txp;
 	struct rte_mbuf *pkt;
-	struct rte_mbuf *pkt_seg;
+	struct rte_port *txp;
 	struct rte_mempool *mbp;
 	struct ether_hdr eth_hdr;
 	uint16_t nb_tx;
@@ -164,14 +220,12 @@  pkt_burst_transmit(struct fwd_stream *fs)
 	uint16_t vlan_tci, vlan_tci_outer;
 	uint32_t retry;
 	uint64_t ol_flags = 0;
-	uint8_t  i;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t start_tsc;
 	uint64_t end_tsc;
 	uint64_t core_cycles;
 #endif
-	uint32_t nb_segs, pkt_len;

 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	start_tsc = rte_rdtsc();
@@ -188,72 +242,31 @@  pkt_burst_transmit(struct fwd_stream *fs)
 		ol_flags |= PKT_TX_QINQ_PKT;
 	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
 		ol_flags |= PKT_TX_MACSEC;
+
+	/*
+	 * Initialize Ethernet header.
+	 */
+	ether_addr_copy(&peer_eth_addrs[fs->peer_addr], &eth_hdr.d_addr);
+	ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
+	eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
 	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
 		pkt = rte_mbuf_raw_alloc(mbp);
-		if (pkt == NULL) {
-		nomore_mbuf:
-			if (nb_pkt == 0)
-				return;
+		if (pkt == NULL)
+			break;
+		if (unlikely(!pkt_burst_prepare(pkt, mbp,
+						&eth_hdr, vlan_tci,
+						vlan_tci_outer,
+						ol_flags))) {
+			rte_mempool_put(mbp, pkt);
 			break;
 		}
-
-		/*
-		 * Using raw alloc is good to improve performance,
-		 * but some consumers may use the headroom and so
-		 * decrement data_off. We need to make sure it is
-		 * reset to default value.
-		 */
-		rte_pktmbuf_reset_headroom(pkt);
-		pkt->data_len = tx_pkt_seg_lengths[0];
-		pkt_seg = pkt;
-		if (tx_pkt_split == TX_PKT_SPLIT_RND)
-			nb_segs = random() % tx_pkt_nb_segs + 1;
-		else
-			nb_segs = tx_pkt_nb_segs;
-		pkt_len = pkt->data_len;
-		for (i = 1; i < nb_segs; i++) {
-			pkt_seg->next = rte_mbuf_raw_alloc(mbp);
-			if (pkt_seg->next == NULL) {
-				pkt->nb_segs = i;
-				rte_pktmbuf_free(pkt);
-				goto nomore_mbuf;
-			}
-			pkt_seg = pkt_seg->next;
-			pkt_seg->data_len = tx_pkt_seg_lengths[i];
-			pkt_len += pkt_seg->data_len;
-		}
-		pkt_seg->next = NULL; /* Last segment of packet. */
-
-		/*
-		 * Initialize Ethernet header.
-		 */
-		ether_addr_copy(&peer_eth_addrs[fs->peer_addr],&eth_hdr.d_addr);
-		ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
-		eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
-
-		/*
-		 * Copy headers in first packet segment(s).
-		 */
-		copy_buf_to_pkt(&eth_hdr, sizeof(eth_hdr), pkt, 0);
-		copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
-				sizeof(struct ether_hdr));
-		copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
-				sizeof(struct ether_hdr) +
-				sizeof(struct ipv4_hdr));
-
-		/*
-		 * Complete first mbuf of packet and append it to the
-		 * burst of packets to be transmitted.
-		 */
-		pkt->nb_segs = nb_segs;
-		pkt->pkt_len = pkt_len;
-		pkt->ol_flags = ol_flags;
-		pkt->vlan_tci = vlan_tci;
-		pkt->vlan_tci_outer = vlan_tci_outer;
-		pkt->l2_len = sizeof(struct ether_hdr);
-		pkt->l3_len = sizeof(struct ipv4_hdr);
 		pkts_burst[nb_pkt] = pkt;
 	}
+
+	if (nb_pkt == 0)
+		return;
+
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
 	/*
 	 * Retry if necessary