[v2,2/3] app/testpmd: gather Rx and Tx routines profiling

Message ID 1584625851-10291-3-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Changes Requested, archived
Delegated to: Ferruh Yigit
Headers
Series app/testpmd: qualify Rx/Tx profiling data on burst size |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko March 19, 2020, 1:50 p.m. UTC
  This patch counts the ticks spent in the rx_burst and tx_burst routines
in dedicated counters and displays the gathered profiling statistics.

The feature is enabled only if CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES
is configured as 'Y'. The "set fwdprof (flags)" command can be used
to select which counters should be involved.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 app/test-pmd/csumonly.c   | 21 +++++++++------------
 app/test-pmd/flowgen.c    | 21 +++++++++------------
 app/test-pmd/icmpecho.c   | 21 +++++++++------------
 app/test-pmd/iofwd.c      | 21 +++++++++------------
 app/test-pmd/macfwd.c     | 21 +++++++++------------
 app/test-pmd/macswap.c    | 21 +++++++++------------
 app/test-pmd/rxonly.c     | 14 ++++----------
 app/test-pmd/softnicfwd.c | 21 +++++++++------------
 app/test-pmd/testpmd.c    | 18 +++++++++++++++++-
 app/test-pmd/testpmd.h    | 34 ++++++++++++++++++++++++++++++++--
 app/test-pmd/txonly.c     | 20 ++++++++------------
 11 files changed, 124 insertions(+), 109 deletions(-)
  

Comments

Thomas Monjalon April 2, 2020, 11:20 a.m. UTC | #1
19/03/2020 14:50, Viacheslav Ovsiienko:
> +	if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
> +		printf("\n  rx CPU cycles/packet=%u (total cycles="
> +		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
> +		       (unsigned int)(rx_cycles / total_recv),
> +		       rx_cycles, total_recv);
> +	if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
> +		printf("\n  tx CPU cycles/packet=%u (total cycles="
> +		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
> +		       (unsigned int)(tx_cycles / total_xmit),
> +		       tx_cycles, total_xmit);

This is the "UI", so I think it deserves a cautious review.

Instead of "=" without space, I think ": " is easier to read.

Please use "Rx" and "Tx" instead of lowercase ones.

"CPU cycles/packet" is hard to read. I would prefer either
	"cycles/packet" without CPU
or
	"cycles per packet"

I will continue some UI comments in next patch.
  
Slava Ovsiienko April 2, 2020, 11:23 a.m. UTC | #2
Hi,

Thanks to all for the reviews and comments; I will address them in v3.

With best regards,
Slava

> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Thursday, April 2, 2020 14:20
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; ferruh.yigit@intel.com; bernard.iremonger@intel.com
> Subject: Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: gather Rx and Tx
> routines profiling
> 
> 19/03/2020 14:50, Viacheslav Ovsiienko:
> > +	if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
> > +		printf("\n  rx CPU cycles/packet=%u (total cycles="
> > +		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
> > +		       (unsigned int)(rx_cycles / total_recv),
> > +		       rx_cycles, total_recv);
> > +	if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
> > +		printf("\n  tx CPU cycles/packet=%u (total cycles="
> > +		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
> > +		       (unsigned int)(tx_cycles / total_xmit),
> > +		       tx_cycles, total_xmit);
> 
> This is the "UI", so I think it deserves a cautious review.
> 
> Instead of "=" without space, I think ": " is easier to read.
> 
> Please use "Rx" and "Tx" instead of lowercase ones.
> 
> "CPU cycles/packet" is hard to read. I would prefer either
> 	"cycles/packet" without CPU
> or
> 	"cycles per packet"
> 
> I will continue some UI comments in next patch.
>
  

Patch

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 25091de..4104737 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -789,18 +789,15 @@  struct simple_gre_hdr {
 	int ret;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/* receive a burst of packet */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -1067,8 +1064,10 @@  struct simple_gre_hdr {
 		printf("Preparing packet burst to transmit failed: %s\n",
 				rte_strerror(rte_errno));
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
 			nb_prep);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/*
 	 * Retry if necessary
@@ -1077,8 +1076,10 @@  struct simple_gre_hdr {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&tx_pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -1096,11 +1097,7 @@  struct simple_gre_hdr {
 		} while (++nb_tx < nb_rx);
 	}
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine csum_fwd_engine = {
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 4bd351e..51e87b0 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -98,19 +98,16 @@ 
 	uint32_t retry;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 	static int next_flow = 0;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
-#endif
-
 	/* Receive a burst of packets and discard them. */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 	for (i = 0; i < nb_rx; i++)
@@ -180,7 +177,9 @@ 
 		next_flow = (next_flow + 1) % cfg_n_flows;
 	}
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -188,8 +187,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -207,11 +208,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_pkt);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine flow_gen_engine = {
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 65aece16..8843183 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -294,20 +294,17 @@ 
 	uint8_t  i;
 	int l2_len;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * First, receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -492,8 +489,10 @@ 
 
 	/* Send back ICMP echo replies, if any. */
 	if (nb_replies > 0) {
+		TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 		nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
 					 nb_replies);
+		TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		/*
 		 * Retry if necessary
 		 */
@@ -502,10 +501,12 @@ 
 			while (nb_tx < nb_replies &&
 					retry++ < burst_tx_retry_num) {
 				rte_delay_us(burst_tx_delay_time);
+				TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 				nb_tx += rte_eth_tx_burst(fs->tx_port,
 						fs->tx_queue,
 						&pkts_burst[nb_tx],
 						nb_replies - nb_tx);
+				TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 			}
 		}
 		fs->tx_packets += nb_tx;
@@ -520,11 +521,7 @@ 
 		}
 	}
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine icmp_echo_engine = {
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9dce76e..9ff6531 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -52,20 +52,17 @@ 
 	uint32_t retry;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 	fs->rx_packets += nb_rx;
@@ -73,8 +70,10 @@ 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -82,8 +81,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -96,11 +97,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine io_fwd_engine = {
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index d2ebb11..f4a213e 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -57,20 +57,17 @@ 
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -103,7 +100,9 @@ 
 		mb->vlan_tci = txp->tx_vlan_id;
 		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
 	}
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -111,8 +110,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 
@@ -126,11 +127,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_fwd_engine = {
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 8428c26..5cb3133 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -58,20 +58,17 @@ 
 	uint16_t nb_tx;
 	uint32_t retry;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -83,7 +80,9 @@ 
 
 	do_macswap(pkts_burst, nb_rx, txp);
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -91,8 +90,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -105,11 +106,7 @@ 
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_swap_engine = {
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 5c65fc4..2820d7f 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -51,18 +51,16 @@ 
 	uint16_t i;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
 #endif
 
 	/*
 	 * Receive a burst of packets.
 	 */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 				 nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	if (unlikely(nb_rx == 0))
 		return;
 
@@ -73,11 +71,7 @@ 
 	for (i = 0; i < nb_rx; i++)
 		rte_pktmbuf_free(pkts_burst[i]);
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine rx_only_engine = {
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index e9d4373..b78f2ce 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -88,34 +88,35 @@  struct tm_hierarchy {
 	uint32_t retry;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
 	/*  Packets Receive */
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
 			pkts_burst, nb_pkt_per_burst);
+	TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
 	fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 			pkts_burst, nb_rx);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
 	/* Retry if necessary */
 	if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
 		retry = 0;
 		while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_rx - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -130,11 +131,7 @@  struct tm_hierarchy {
 			rte_pktmbuf_free(pkts_burst[nb_tx]);
 		} while (++nb_tx < nb_rx);
 	}
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index c93fa35..b195880 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1625,6 +1625,8 @@  struct extmem_param {
 	struct rte_eth_stats stats;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t fwd_cycles = 0;
+	uint64_t rx_cycles = 0;
+	uint64_t tx_cycles = 0;
 #endif
 	uint64_t total_recv = 0;
 	uint64_t total_xmit = 0;
@@ -1655,6 +1657,8 @@  struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fwd_cycles += fs->core_cycles;
+		rx_cycles += fs->core_rx_cycles;
+		tx_cycles += fs->core_tx_cycles;
 #endif
 	}
 	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
@@ -1785,11 +1789,21 @@  struct extmem_param {
 	       "%s\n",
 	       acc_stats_border, acc_stats_border);
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	if (total_recv > 0)
+	if (fwdprof_flags & RECORD_CORE_CYCLES_FWD && total_recv > 0)
 		printf("\n  CPU cycles/packet=%u (total cycles="
 		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
 		       (unsigned int)(fwd_cycles / total_recv),
 		       fwd_cycles, total_recv);
+	if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
+		printf("\n  rx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total RX packets=%"PRIu64")\n",
+		       (unsigned int)(rx_cycles / total_recv),
+		       rx_cycles, total_recv);
+	if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
+		printf("\n  tx CPU cycles/packet=%u (total cycles="
+		       "%"PRIu64" / total TX packets=%"PRIu64")\n",
+		       (unsigned int)(tx_cycles / total_xmit),
+		       tx_cycles, total_xmit);
 #endif
 }
 
@@ -1820,6 +1834,8 @@  struct extmem_param {
 #endif
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 		fs->core_cycles = 0;
+		fs->core_rx_cycles = 0;
+		fs->core_tx_cycles = 0;
 #endif
 	}
 }
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 466e611..6177a50 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -136,7 +136,9 @@  struct fwd_stream {
 	/**< received packets has bad outer l4 checksum */
 	unsigned int gro_times;	/**< GRO operation times */
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t     core_cycles; /**< used for RX and TX processing */
+	uint64_t core_cycles; /**< used for RX and TX processing */
+	uint64_t core_tx_cycles; /**< used for tx_burst processing */
+	uint64_t core_rx_cycles; /**< used for rx_burst processing */
 #endif
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 	struct pkt_burst_stats rx_burst_stats;
@@ -325,7 +327,35 @@  struct queue_stats_mappings {
 #define RECORD_CORE_CYCLES_FWD (1<<0)
 #define RECORD_CORE_CYCLES_RX (1<<1)
 #define RECORD_CORE_CYCLES_TX (1<<2)
-#endif
+
+/*
+ * Profiling macros: each expands to one statement and is a no-op unless
+ * its flag is set in fwdprof_flags. *_START latches the TSC into (a);
+ * *_ADD adds the cycles elapsed since (s) to the matching (fs) counter.
+ */
+#define TEST_PMD_CORE_CYC_TX_START(a) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_TX) (a) = rte_rdtsc(); } while (0)
+/* The Rx start stamp is also the start of the whole forwarding pass. */
+#define TEST_PMD_CORE_CYC_RX_START(a) \
+do { if (fwdprof_flags & (RECORD_CORE_CYCLES_FWD | RECORD_CORE_CYCLES_RX)) \
+	(a) = rte_rdtsc(); } while (0)
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
+	(fs)->core_cycles += rte_rdtsc() - (s); } while (0)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
+	(fs)->core_tx_cycles += rte_rdtsc() - (s); } while (0)
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
+	(fs)->core_rx_cycles += rte_rdtsc() - (s); } while (0)
+#else
+/* No profiling statistics are configured. */
+#define TEST_PMD_CORE_CYC_TX_START(a)
+#define TEST_PMD_CORE_CYC_RX_START(a)
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s)
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s)
+#endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
 
 /* globals used for configuration */
 extern uint16_t verbose_level; /**< Drives messages being displayed, if any. */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 8a1989f..8ff7410 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -241,15 +241,11 @@ 
 	uint64_t ol_flags = 0;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	uint64_t start_tsc;
-	uint64_t end_tsc;
-	uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	start_tsc = rte_rdtsc();
+	uint64_t start_rx_tsc = 0;
+	uint64_t start_tx_tsc = 0;
 #endif
 
+	TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
 	mbp = current_fwd_lcore()->mbp;
 	txp = &ports[fs->tx_port];
 	tx_offloads = txp->dev_conf.txmode.offloads;
@@ -301,7 +297,9 @@ 
 	if (nb_pkt == 0)
 		return;
 
+	TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+	TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 	/*
 	 * Retry if necessary
 	 */
@@ -309,8 +307,10 @@ 
 		retry = 0;
 		while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
 			rte_delay_us(burst_tx_delay_time);
+			TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
 			nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
 					&pkts_burst[nb_tx], nb_pkt - nb_tx);
+			TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 		}
 	}
 	fs->tx_packets += nb_tx;
@@ -334,11 +334,7 @@ 
 		} while (++nb_tx < nb_pkt);
 	}
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-	end_tsc = rte_rdtsc();
-	core_cycles = (end_tsc - start_tsc);
-	fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+	TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void