[dpdk-dev,v2,1/4] app/test: unit test for rx and tx cycles/packet

Message ID 1412944201-30703-2-git-send-email-cunming.liang@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Cunming Liang Oct. 10, 2014, 12:29 p.m. UTC
  It provides unit test to measure cycles/packet in NIC loopback mode.
It simply gives the average cycles of IO used per packet without test equipment.
When doing the test, make sure the link is UP.

Usage Example:
1. Run unit test app in interactive mode
    app/test -c f -n 4 -- -i
2. Run and wait for the result
    pmd_perf_autotest

There's option to choose rx/tx pair, default is vector.
    set_rxtx_mode [vector|scalar|full|hybrid]
Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without INC_VEC=y in config

Signed-off-by: Cunming Liang <cunming.liang@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 app/test/Makefile                   |    1 +
 app/test/commands.c                 |   38 +++
 app/test/packet_burst_generator.c   |    4 +-
 app/test/test.h                     |    4 +
 app/test/test_pmd_perf.c            |  626 +++++++++++++++++++++++++++++++++++
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
 6 files changed, 677 insertions(+), 2 deletions(-)
 create mode 100644 app/test/test_pmd_perf.c
  

Comments

Neil Horman Oct. 10, 2014, 5:52 p.m. UTC | #1
On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> It provides unit test to measure cycles/packet in NIC loopback mode.
> It simply gives the average cycles of IO used per packet without test equipment.
> When doing the test, make sure the link is UP.
> 
> Usage Example:
> 1. Run unit test app in interactive mode
>     app/test -c f -n 4 -- -i
> 2. Run and wait for the result
>     pmd_perf_autotest
> 
> There's option to choose rx/tx pair, default is vector.
>     set_rxtx_mode [vector|scalar|full|hybrid]
> Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without INC_VEC=y in config
> 
> Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Notes inline

> ---
>  app/test/Makefile                   |    1 +
>  app/test/commands.c                 |   38 +++
>  app/test/packet_burst_generator.c   |    4 +-
>  app/test/test.h                     |    4 +
>  app/test/test_pmd_perf.c            |  626 +++++++++++++++++++++++++++++++++++
>  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
>  6 files changed, 677 insertions(+), 2 deletions(-)
>  create mode 100644 app/test/test_pmd_perf.c
> 
> diff --git a/app/test/Makefile b/app/test/Makefile
> index 6af6d76..ebfa0ba 100644
> --- a/app/test/Makefile
> +++ b/app/test/Makefile
> @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
>  
>  SRCS-y += test_ring.c
>  SRCS-y += test_ring_perf.c
> +SRCS-y += test_pmd_perf.c
>  
>  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
>  SRCS-y += test_table.c
> diff --git a/app/test/commands.c b/app/test/commands.c
> index a9e36b1..f1e746e 100644
> --- a/app/test/commands.c
> +++ b/app/test/commands.c
> @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
>  
> +#define NB_ETHPORTS_USED                (1)
> +#define NB_SOCKETS                      (2)
> +#define MEMPOOL_CACHE_SIZE 250
> +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
Don't you want to size this in accordance with the amount of data your sending
(64 Bytes as noted above)?

> +static void
> +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> +{
> +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> +		eth_addr->addr_bytes[0],
> +		eth_addr->addr_bytes[1],
> +		eth_addr->addr_bytes[2],
> +		eth_addr->addr_bytes[3],
> +		eth_addr->addr_bytes[4],
> +		eth_addr->addr_bytes[5]);
> +}
> +
This was copieed from print_ethaddr.  Seems like a good candidate for a common
function in rte_ether.h


> +}
> +
> +static void
> +signal_handler(int signum)
> +{
> +	/* When we receive a USR1 signal, print stats */
I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits the
program

> +	if (signum == SIGUSR1) {
SIGINT instead.  Thats the common practice.

> +		printf("Force Stop!\n");
> +		stop = 1;
> +	}
> +	if (signum == SIGUSR2)
> +		stats_display(0);
> +}
> +/* main processing loop */
> +static int
> +main_loop(__rte_unused void *args)
> +{
> +#define PACKET_SIZE 64
> +#define FRAME_GAP 12
> +#define MAC_PREAMBLE 8
> +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> +	unsigned lcore_id;
> +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> +	struct lcore_conf *conf;
> +	uint64_t prev_tsc, cur_tsc;
> +	int pkt_per_port;
> +	uint64_t packets_per_second, total_packets;
> +
> +	lcore_id = rte_lcore_id();
> +	conf = &lcore_conf[lcore_id];
> +	if (conf->status != LCORE_USED)
> +		return 0;
> +
> +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> +
> +	int idx = 0;
> +	for (i = 0; i < conf->nb_ports; i++) {
> +		int num = pkt_per_port;
> +		portid = conf->portlist[i];
> +		printf("inject %d packet to port %d\n", num, portid);
> +		while (num) {
> +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> +			nb_tx = rte_eth_tx_burst(portid, 0,
> +						&tx_burst[idx], nb_tx);
> +			num -= nb_tx;
> +			idx += nb_tx;
> +		}
> +	}
> +	printf("Total packets inject to prime ports = %u\n", idx);
> +
> +	packets_per_second = (link_mbps * 1000 * 1000) /
> +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
> +	printf("Each port will do %"PRIu64" packets per second\n",
> +		+packets_per_second);
> +
> +	total_packets = RTE_TEST_DURATION * conf->nb_ports * packets_per_second;
> +	printf("Test will stop after at least %"PRIu64" packets received\n",
> +		+ total_packets);
> +
> +	prev_tsc = rte_rdtsc();
> +
> +	while (likely(!stop)) {
> +		for (i = 0; i < conf->nb_ports; i++) {
> +			portid = conf->portlist[i];
> +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> +						 pkts_burst, MAX_PKT_BURST);
> +			if (unlikely(nb_rx == 0)) {
> +				idle++;
> +				continue;
> +			}
> +
> +			count += nb_rx;
Doesn't take into consideration error conditions.  rte_eth_rx_burst can return
-ENOTSUP

> +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
Ditto with -ENOTSUP

> +			if (unlikely(nb_tx < nb_rx)) {
What makes this unlikely?  Seems like a perfectly reasonable condition to happen
to me.  If the network is busy, its completely likely that you will receive more
frames than you send, if you elect to receive all frames.

> +				drop += (nb_rx - nb_tx);
> +				do {
> +					rte_pktmbuf_free(pkts_burst[nb_tx]);
Defer this, it skews your timing

> +				} while (++nb_tx < nb_rx);
> +			}
> +		}
> +		if (unlikely(count >= total_packets))
> +			break;
Whats the reasoning here?  Do you only ever expect to receive frames that you
send?  If so, seems like this should call for a big warning to get printed.

> +	}
> +
> +	cur_tsc = rte_rdtsc();
> +
> +	for (i = 0; i < conf->nb_ports; i++) {
> +		portid = conf->portlist[i];
> +		int nb_free = pkt_per_port;
> +		do { /* dry out */
> +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> +						 pkts_burst, MAX_PKT_BURST);
> +			nb_tx = 0;
> +			while (nb_tx < nb_rx)
> +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> +			nb_free -= nb_rx;
> +		} while (nb_free != 0);
> +		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
> +	}
> +
Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it be
enough just to stop the interface?

> +	if (count == 0)
> +		return -1;
> +
> +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> +
Bad math here.  Theres no guarantee that the tsc hasn't wrapped (potentially
more than once) depending on your test length.  you need to check the tsc before
and after each burst and record an average of deltas instead, accounting in each
instance for the possibility of wrap.

> +	return 0;
> +}
> +
> +static int
> +test_pmd_perf(void)
> +{
> +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> +	uint16_t portid;
> +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> +	int socketid = -1;
> +	int ret;
> +
> +	printf("Start PMD RXTX cycles cost test.\n");
> +
> +	signal(SIGUSR1, signal_handler);
Again SIGINT here.

> +	signal(SIGUSR2, signal_handler);
> +
> +	nb_ports = rte_eth_dev_count();
> +	if (nb_ports < NB_ETHPORTS_USED) {
> +		printf("At least %u port(s) used for perf. test\n",
> +		       NB_ETHPORTS_USED);
> +		return -1;
> +	}
> +
> +	if (nb_ports > RTE_MAX_ETHPORTS)
> +		nb_ports = RTE_MAX_ETHPORTS;
> +
> +	nb_lcores = rte_lcore_count();
> +
> +	memset(lcore_conf, 0, sizeof(lcore_conf));
> +	init_lcores();
> +
> +	init_mbufpool(NB_MBUF);
> +
> +	reset_count();
> +	num = 0;
> +	for (portid = 0; portid < nb_ports; portid++) {
> +		if (socketid == -1) {
> +			socketid = rte_eth_dev_socket_id(portid);
> +			slave_id = alloc_lcore(socketid);
> +			if (slave_id == (uint16_t)-1) {
> +				printf("No avail lcore to run test\n");
> +				return -1;
> +			}
> +			printf("Performance test runs on lcore %u socket %u\n",
> +			       slave_id, socketid);
> +		}
> +
> +		if (socketid != rte_eth_dev_socket_id(portid)) {
> +			printf("Skip port %d\n", portid);
> +			continue;
> +		}
> +
> +		/* port configure */
> +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> +					    nb_tx_queue, &port_conf);
> +		if (ret < 0)
> +			rte_exit(EXIT_FAILURE,
> +				"Cannot configure device: err=%d, port=%d\n",
> +				 ret, portid);
> +
> +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> +		printf("Port %u ", portid);
> +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> +		printf("\n");
> +
> +		/* tx queue setup */
> +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> +					     socketid, &tx_conf);
> +		if (ret < 0)
> +			rte_exit(EXIT_FAILURE,
> +				"rte_eth_tx_queue_setup: err=%d, "
> +				"port=%d\n", ret, portid);
> +
> +		/* rx queue steup */
> +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> +						socketid, &rx_conf,
> +						mbufpool[socketid]);
> +		if (ret < 0)
> +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
> +				 "port=%d\n", ret, portid);
> +
> +		/* Start device */
> +		stop = 0;
> +		ret = rte_eth_dev_start(portid);
> +		if (ret < 0)
> +			rte_exit(EXIT_FAILURE,
> +				"rte_eth_dev_start: err=%d, port=%d\n",
> +				ret, portid);
> +
> +		/* always eanble promiscuous */
> +		rte_eth_promiscuous_enable(portid);
> +
> +		lcore_conf[slave_id].portlist[num++] = portid;
> +		lcore_conf[slave_id].nb_ports++;
> +	}
> +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> +
> +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> +
> +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> +	if (rte_eal_wait_lcore(slave_id) < 0)
> +		return -1;
> +
> +	/* port tear down */
> +	for (portid = 0; portid < nb_ports; portid++) {
> +		if (socketid != rte_eth_dev_socket_id(portid))
> +			continue;
> +
> +		rte_eth_dev_stop(portid);
> +	}
> +
Clean up your allocated memory/lcores/etc?

Neil
  
Cunming Liang Oct. 12, 2014, 11:10 a.m. UTC | #2
Hi Neil,

Very appreciate your comments.
I add inline reply, will send v3 asap when we get alignment.

BRs,
Liang Cunming

> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Saturday, October 11, 2014 1:52 AM
> To: Liang, Cunming
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx cycles/packet
> 
> On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > It provides unit test to measure cycles/packet in NIC loopback mode.
> > It simply gives the average cycles of IO used per packet without test equipment.
> > When doing the test, make sure the link is UP.
> >
> > Usage Example:
> > 1. Run unit test app in interactive mode
> >     app/test -c f -n 4 -- -i
> > 2. Run and wait for the result
> >     pmd_perf_autotest
> >
> > There's option to choose rx/tx pair, default is vector.
> >     set_rxtx_mode [vector|scalar|full|hybrid]
> > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> INC_VEC=y in config
> >
> > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> 
> Notes inline
> 
> > ---
> >  app/test/Makefile                   |    1 +
> >  app/test/commands.c                 |   38 +++
> >  app/test/packet_burst_generator.c   |    4 +-
> >  app/test/test.h                     |    4 +
> >  app/test/test_pmd_perf.c            |  626
> +++++++++++++++++++++++++++++++++++
> >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> >  6 files changed, 677 insertions(+), 2 deletions(-)
> >  create mode 100644 app/test/test_pmd_perf.c
> >
> > diff --git a/app/test/Makefile b/app/test/Makefile
> > index 6af6d76..ebfa0ba 100644
> > --- a/app/test/Makefile
> > +++ b/app/test/Makefile
> > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> >
> >  SRCS-y += test_ring.c
> >  SRCS-y += test_ring_perf.c
> > +SRCS-y += test_pmd_perf.c
> >
> >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> >  SRCS-y += test_table.c
> > diff --git a/app/test/commands.c b/app/test/commands.c
> > index a9e36b1..f1e746e 100644
> > --- a/app/test/commands.c
> > +++ b/app/test/commands.c
> > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> >
> > +#define NB_ETHPORTS_USED                (1)
> > +#define NB_SOCKETS                      (2)
> > +#define MEMPOOL_CACHE_SIZE 250
> > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> RTE_PKTMBUF_HEADROOM)
> Don't you want to size this in accordance with the amount of data your sending
> (64 Bytes as noted above)?
[Liang, Cunming] The case is designed to measure small packet IO cost with normal mbuf size.
Even if decreasing the size, it won't gain significant cycles.
> 
> > +static void
> > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > +{
> > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > +		eth_addr->addr_bytes[0],
> > +		eth_addr->addr_bytes[1],
> > +		eth_addr->addr_bytes[2],
> > +		eth_addr->addr_bytes[3],
> > +		eth_addr->addr_bytes[4],
> > +		eth_addr->addr_bytes[5]);
> > +}
> > +
> This was copieed from print_ethaddr.  Seems like a good candidate for a common
> function in rte_ether.h
[Liang, Cunming] Agree with you, some of samples now use it with the same copy.
I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the 48bits address output.
And leaving other prints for application customization.
> 
> 
> > +}
> > +
> > +static void
> > +signal_handler(int signum)
> > +{
> > +	/* When we receive a USR1 signal, print stats */
> I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits the
> program
[Liang, Cunming] Thanks, it's a typo.
> 
> > +	if (signum == SIGUSR1) {
> SIGINT instead.  Thats the common practice.
[Liang, Cunming] I understood your opinion. 
The considerations I'm not using SIGINT instead are:
1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command interactive.
  It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
2. By SIGINT semantic, expect to terminate the process.
  Here I expect to force stop this case, but still alive in command line.
  After it stopped, it can run again or start to run other test cases.
  So I keep SIGINT, SIGUSR1 in different behavior.
3. It should be rarely used. 
  Only when exception timeout, I leave this backdoor for automation test control.
  For manual test, we can easily force kill the process.

> 
> > +		printf("Force Stop!\n");
> > +		stop = 1;
> > +	}
> > +	if (signum == SIGUSR2)
> > +		stats_display(0);
> > +}
> > +/* main processing loop */
> > +static int
> > +main_loop(__rte_unused void *args)
> > +{
> > +#define PACKET_SIZE 64
> > +#define FRAME_GAP 12
> > +#define MAC_PREAMBLE 8
> > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > +	unsigned lcore_id;
> > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > +	struct lcore_conf *conf;
> > +	uint64_t prev_tsc, cur_tsc;
> > +	int pkt_per_port;
> > +	uint64_t packets_per_second, total_packets;
> > +
> > +	lcore_id = rte_lcore_id();
> > +	conf = &lcore_conf[lcore_id];
> > +	if (conf->status != LCORE_USED)
> > +		return 0;
> > +
> > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > +
> > +	int idx = 0;
> > +	for (i = 0; i < conf->nb_ports; i++) {
> > +		int num = pkt_per_port;
> > +		portid = conf->portlist[i];
> > +		printf("inject %d packet to port %d\n", num, portid);
> > +		while (num) {
> > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > +						&tx_burst[idx], nb_tx);
> > +			num -= nb_tx;
> > +			idx += nb_tx;
> > +		}
> > +	}
> > +	printf("Total packets inject to prime ports = %u\n", idx);
> > +
> > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
> > +	printf("Each port will do %"PRIu64" packets per second\n",
> > +		+packets_per_second);
> > +
> > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> packets_per_second;
> > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > +		+ total_packets);
> > +
> > +	prev_tsc = rte_rdtsc();
> > +
> > +	while (likely(!stop)) {
> > +		for (i = 0; i < conf->nb_ports; i++) {
> > +			portid = conf->portlist[i];
> > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > +						 pkts_burst, MAX_PKT_BURST);
> > +			if (unlikely(nb_rx == 0)) {
> > +				idle++;
> > +				continue;
> > +			}
> > +
> > +			count += nb_rx;
> Doesn't take into consideration error conditions.  rte_eth_rx_burst can return
> -ENOTSUP
[Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG CONFIG.
The error is used to avoid no function call register. 
When ETHDEV_DEBUG turn off, the NULL function call cause segfault directly.
So I think it's a library internal error.
In such library exceptional case, I prefer not expecting sample/application to condition check library functional error.
> 
> > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
> Ditto with -ENOTSUP
> 
> > +			if (unlikely(nb_tx < nb_rx)) {
> What makes this unlikely?  Seems like a perfectly reasonable condition to happen
> to me.  If the network is busy, its completely likely that you will receive more
> frames than you send, if you elect to receive all frames.
[Liang, Cunming] For this case, NIC works in MAC loopback mode.
It firstly injects numbers of packets to NIC. Then NIC will loopback all packets to ingress side.
The code here will receive the packets, and send them back to NIC
Packets looping inside all come from initial injection.
As the total number of injected packets is much less than in-chip queue size, the tx egress queue shouldn't block desc. ring update.
So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot archive line rate. 
When it happens, the cycles/packets result make no sense, as the bottle neck is NIC.
The drop counter can record it.
> 
> > +				drop += (nb_rx - nb_tx);
> > +				do {
> > +					rte_pktmbuf_free(pkts_burst[nb_tx]);
> Defer this, it skews your timing
[Liang, Cunming] Agree with you, I ever thought about it.
This test cases is designed to measure pure IO RX/TX routine.
When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or drop it.
Each way introduces noise(adding additional control code), resending much times even cost more than free it.
The cycles/packets is useful when there's no packet drop, otherwise it gives the hint where the problem comes from (by idle or drop).
> 
> > +				} while (++nb_tx < nb_rx);
> > +			}
> > +		}
> > +		if (unlikely(count >= total_packets))
> > +			break;
> Whats the reasoning here?  Do you only ever expect to receive frames that you
> send?  If so, seems like this should call for a big warning to get printed.
[Liang, Cunming] The loop exits when the pre-calculated total_packets are received.
As the nb_rx is unpredictable, the packet counter may large equal than total_packets the last time.
The reason unlikely used here is because the condition becomes true only the last time. 
> 
> > +	}
> > +
> > +	cur_tsc = rte_rdtsc();
> > +
> > +	for (i = 0; i < conf->nb_ports; i++) {
> > +		portid = conf->portlist[i];
> > +		int nb_free = pkt_per_port;
> > +		do { /* dry out */
> > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > +						 pkts_burst, MAX_PKT_BURST);
> > +			nb_tx = 0;
> > +			while (nb_tx < nb_rx)
> > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > +			nb_free -= nb_rx;
> > +		} while (nb_free != 0);
> > +		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
> > +	}
> > +
> Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it be
> enough just to stop the interface?
[Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
But it's designed to run multi-times without exit, for the purpose of warming up or for getting average number.
So stopping device is not enough, we have to release the flying packets.

> 
> > +	if (count == 0)
> > +		return -1;
> > +
> > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > +
> Bad math here.  Theres no guarantee that the tsc hasn't wrapped (potentially
> more than once) depending on your test length.  you need to check the tsc before
> and after each burst and record an average of deltas instead, accounting in each
> instance for the possibility of wrap.
[Liang, Cunming] I'm not sure catch your point correctly.
I think both cur_tsc and prev_tsc are 64 bits width. 
For 3GHz, I think it won't wrapped so quick.
As it's uint64_t, so even get wrapped, the delta should still be correct.
> 
> > +	return 0;
> > +}
> > +
> > +static int
> > +test_pmd_perf(void)
> > +{
> > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > +	uint16_t portid;
> > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > +	int socketid = -1;
> > +	int ret;
> > +
> > +	printf("Start PMD RXTX cycles cost test.\n");
> > +
> > +	signal(SIGUSR1, signal_handler);
> Again SIGINT here.
> 
> > +	signal(SIGUSR2, signal_handler);
> > +
> > +	nb_ports = rte_eth_dev_count();
> > +	if (nb_ports < NB_ETHPORTS_USED) {
> > +		printf("At least %u port(s) used for perf. test\n",
> > +		       NB_ETHPORTS_USED);
> > +		return -1;
> > +	}
> > +
> > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > +		nb_ports = RTE_MAX_ETHPORTS;
> > +
> > +	nb_lcores = rte_lcore_count();
> > +
> > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > +	init_lcores();
> > +
> > +	init_mbufpool(NB_MBUF);
> > +
> > +	reset_count();
> > +	num = 0;
> > +	for (portid = 0; portid < nb_ports; portid++) {
> > +		if (socketid == -1) {
> > +			socketid = rte_eth_dev_socket_id(portid);
> > +			slave_id = alloc_lcore(socketid);
> > +			if (slave_id == (uint16_t)-1) {
> > +				printf("No avail lcore to run test\n");
> > +				return -1;
> > +			}
> > +			printf("Performance test runs on lcore %u socket %u\n",
> > +			       slave_id, socketid);
> > +		}
> > +
> > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > +			printf("Skip port %d\n", portid);
> > +			continue;
> > +		}
> > +
> > +		/* port configure */
> > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > +					    nb_tx_queue, &port_conf);
> > +		if (ret < 0)
> > +			rte_exit(EXIT_FAILURE,
> > +				"Cannot configure device: err=%d, port=%d\n",
> > +				 ret, portid);
> > +
> > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > +		printf("Port %u ", portid);
> > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > +		printf("\n");
> > +
> > +		/* tx queue setup */
> > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > +					     socketid, &tx_conf);
> > +		if (ret < 0)
> > +			rte_exit(EXIT_FAILURE,
> > +				"rte_eth_tx_queue_setup: err=%d, "
> > +				"port=%d\n", ret, portid);
> > +
> > +		/* rx queue steup */
> > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > +						socketid, &rx_conf,
> > +						mbufpool[socketid]);
> > +		if (ret < 0)
> > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
> > +				 "port=%d\n", ret, portid);
> > +
> > +		/* Start device */
> > +		stop = 0;
> > +		ret = rte_eth_dev_start(portid);
> > +		if (ret < 0)
> > +			rte_exit(EXIT_FAILURE,
> > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > +				ret, portid);
> > +
> > +		/* always eanble promiscuous */
> > +		rte_eth_promiscuous_enable(portid);
> > +
> > +		lcore_conf[slave_id].portlist[num++] = portid;
> > +		lcore_conf[slave_id].nb_ports++;
> > +	}
> > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > +
> > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > +
> > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > +		return -1;
> > +
> > +	/* port tear down */
> > +	for (portid = 0; portid < nb_ports; portid++) {
> > +		if (socketid != rte_eth_dev_socket_id(portid))
> > +			continue;
> > +
> > +		rte_eth_dev_stop(portid);
> > +	}
> > +
> Clean up your allocated memory/lcores/etc?
[Liang, Cunming] It do cleanup on the beginning of case.
"Init_lcores","init_mbufpool","reset_count" guarantee the data clean before each testing.
And mbufpool only allocated once even if we run multiple times.
> 
> Neil
  
De Lara Guarch, Pablo Oct. 13, 2014, 12:56 p.m. UTC | #3
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Liang, Cunming
> Sent: Sunday, October 12, 2014 12:11 PM
> To: Neil Horman
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> Hi Neil,
> 
> Very appreciate your comments.
> I add inline reply, will send v3 asap when we get alignment.
> 
> BRs,
> Liang Cunming
> 
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Saturday, October 11, 2014 1:52 AM
> > To: Liang, Cunming
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> >
> > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > It simply gives the average cycles of IO used per packet without test
> equipment.
> > > When doing the test, make sure the link is UP.
> > >
> > > Usage Example:
> > > 1. Run unit test app in interactive mode
> > >     app/test -c f -n 4 -- -i
> > > 2. Run and wait for the result
> > >     pmd_perf_autotest
> > >
> > > There's option to choose rx/tx pair, default is vector.
> > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> > INC_VEC=y in config
> > >
> > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> >
> > Notes inline
> >
> > > ---
> > >  app/test/Makefile                   |    1 +
> > >  app/test/commands.c                 |   38 +++
> > >  app/test/packet_burst_generator.c   |    4 +-
> > >  app/test/test.h                     |    4 +
> > >  app/test/test_pmd_perf.c            |  626
> > +++++++++++++++++++++++++++++++++++
> > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > >  create mode 100644 app/test/test_pmd_perf.c
> > >
> > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > index 6af6d76..ebfa0ba 100644
> > > --- a/app/test/Makefile
> > > +++ b/app/test/Makefile
> > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > >
> > >  SRCS-y += test_ring.c
> > >  SRCS-y += test_ring_perf.c
> > > +SRCS-y += test_pmd_perf.c
> > >
> > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > >  SRCS-y += test_table.c
> > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > index a9e36b1..f1e746e 100644
> > > --- a/app/test/commands.c
> > > +++ b/app/test/commands.c
> > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > >
> > > +#define NB_ETHPORTS_USED                (1)
> > > +#define NB_SOCKETS                      (2)
> > > +#define MEMPOOL_CACHE_SIZE 250
> > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > RTE_PKTMBUF_HEADROOM)
> > Don't you want to size this in accordance with the amount of data your
> sending
> > (64 Bytes as noted above)?
> [Liang, Cunming] The case is designed to measure small packet IO cost with
> normal mbuf size.
> Even if decreasing the size, it won't gain significant cycles.
> >
> > > +static void
> > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > +{
> > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > +		eth_addr->addr_bytes[0],
> > > +		eth_addr->addr_bytes[1],
> > > +		eth_addr->addr_bytes[2],
> > > +		eth_addr->addr_bytes[3],
> > > +		eth_addr->addr_bytes[4],
> > > +		eth_addr->addr_bytes[5]);
> > > +}
> > > +
> > This was copieed from print_ethaddr.  Seems like a good candidate for a
> common
> > function in rte_ether.h
> [Liang, Cunming] Agree with you, some of samples now use it with the same
> copy.
> I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the
> 48bits address output.
> And leaving other prints for application customization.
> >
> >
> > > +}
> > > +
> > > +static void
> > > +signal_handler(int signum)
> > > +{
> > > +	/* When we receive a USR1 signal, print stats */
> > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits
> the
> > program
> [Liang, Cunming] Thanks, it's a typo.
> >
> > > +	if (signum == SIGUSR1) {
> > SIGINT instead.  Thats the common practice.
> [Liang, Cunming] I understood your opinion.
> The considerations I'm not using SIGINT instead are:
> 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command
> interactive.
>   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> 2. By SIGINT semantic, expect to terminate the process.
>   Here I expect to force stop this case, but still alive in command line.
>   After it stopped, it can run again or start to run other test cases.
>   So I keep SIGINT, SIGUSR1 in different behavior.
> 3. It should be rarely used.
>   Only when exception timeout, I leave this backdoor for automation test
> control.
>   For manual test, we can easily force kill the process.
> 
> >
> > > +		printf("Force Stop!\n");
> > > +		stop = 1;
> > > +	}
> > > +	if (signum == SIGUSR2)
> > > +		stats_display(0);
> > > +}
> > > +/* main processing loop */
> > > +static int
> > > +main_loop(__rte_unused void *args)
> > > +{
> > > +#define PACKET_SIZE 64
> > > +#define FRAME_GAP 12
> > > +#define MAC_PREAMBLE 8
> > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > +	unsigned lcore_id;
> > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > +	struct lcore_conf *conf;
> > > +	uint64_t prev_tsc, cur_tsc;
> > > +	int pkt_per_port;
> > > +	uint64_t packets_per_second, total_packets;
> > > +
> > > +	lcore_id = rte_lcore_id();
> > > +	conf = &lcore_conf[lcore_id];
> > > +	if (conf->status != LCORE_USED)
> > > +		return 0;
> > > +
> > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > +
> > > +	int idx = 0;
> > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > +		int num = pkt_per_port;
> > > +		portid = conf->portlist[i];
> > > +		printf("inject %d packet to port %d\n", num, portid);
> > > +		while (num) {
> > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > +						&tx_burst[idx], nb_tx);
> > > +			num -= nb_tx;
> > > +			idx += nb_tx;
> > > +		}
> > > +	}
> > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > +
> > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) *
> CHAR_BIT);
> > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > +		+packets_per_second);
> > > +
> > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > packets_per_second;
> > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > +		+ total_packets);
> > > +
> > > +	prev_tsc = rte_rdtsc();
> > > +
> > > +	while (likely(!stop)) {
> > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > +			portid = conf->portlist[i];
> > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > +						 pkts_burst,
> MAX_PKT_BURST);
> > > +			if (unlikely(nb_rx == 0)) {
> > > +				idle++;
> > > +				continue;
> > > +			}
> > > +
> > > +			count += nb_rx;
> > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> return
> > -ENOTSUP
> [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> CONFIG.
> The error is used to avoid no function call register.
> When ETHDEV_DEBUG turn off, the NULL function call cause segfault
> directly.
> So I think it's a library internal error.
> In such library exceptional case, I prefer not expecting sample/application to
> condition check library functional error.
> >
> > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst,
> nb_rx);
> > Ditto with -ENOTSUP
> >
> > > +			if (unlikely(nb_tx < nb_rx)) {
> > What makes this unlikely?  Seems like a perfectly reasonable condition to
> happen
> > to me.  If the network is busy, its completely likely that you will receive
> more
> > frames than you send, if you elect to receive all frames.
> [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> It firstly injects numbers of packets to NIC. Then NIC will loopback all packets
> to ingress side.
> The code here will receive the packets, and send them back to NIC
> Packets looping inside all come from initial injection.
> As the total number of injected packets is much less than in-chip queue size,
> the tx egress queue shouldn't block desc. ring update.
> So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot
> archive line rate.
> When it happens, the cycles/packets result make no sense, as the bottle
> neck is NIC.
> The drop counter can record it.
> >
> > > +				drop += (nb_rx - nb_tx);
> > > +				do {
> > > +
> 	rte_pktmbuf_free(pkts_burst[nb_tx]);
> > Defer this, it skews your timing
> [Liang, Cunming] Agree with you, I ever thought about it.
> This test cases is designed to measure pure IO RX/TX routine.
> When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or
> drop it.
> Each way introduces noise(adding additional control code), resending much
> times even cost more than free it.
> The cycles/packets is useful when there's no packet drop, otherwise it gives
> the hint where the problem comes from (by idle or drop).
> >
> > > +				} while (++nb_tx < nb_rx);
> > > +			}
> > > +		}
> > > +		if (unlikely(count >= total_packets))
> > > +			break;
> > Whats the reasoning here?  Do you only ever expect to receive frames that
> you
> > send?  If so, seems like this should call for a big warning to get printed.
> [Liang, Cunming] The loop exits when the pre-calculated total_packets are
> received.
> As the nb_rx is unpredictable, the packet counter may large equal than
> total_packets the last time.
> The reason unlikely used here is because the condition becomes true only
> the last time.
> >
> > > +	}
> > > +
> > > +	cur_tsc = rte_rdtsc();
> > > +
> > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > +		portid = conf->portlist[i];
> > > +		int nb_free = pkt_per_port;
> > > +		do { /* dry out */
> > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > +						 pkts_burst,
> MAX_PKT_BURST);
> > > +			nb_tx = 0;
> > > +			while (nb_tx < nb_rx)
> > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > +			nb_free -= nb_rx;
> > > +		} while (nb_free != 0);
> > > +		printf("free %d mbuf left in port %u\n", pkt_per_port,
> portid);
> > > +	}
> > > +
> > Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it
> be
> > enough just to stop the interface?
> [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> But it's designed to run multi-times without exit, for the purpose of warming
> up or for getting average number.
> So stopping device is not enough, we have to release the flying packets.
> 
> >
> > > +	if (count == 0)
> > > +		return -1;
> > > +
> > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > +
> > Bad math here.  Theres no guarantee that the tsc hasn't wrapped
> (potentially
> > more than once) depending on your test length.  you need to check the tsc
> before
> > and after each burst and record an average of deltas instead, accounting in
> each
> > instance for the possibility of wrap.
> [Liang, Cunming] I'm not sure catch your point correctly.
> I think both cur_tsc and prev_tsc are 64 bits width.
> For 3GHz, I think it won't wrapped so quick.
> As it's uint64_t, so even get wrapped, the delta should still be correct.

You need to change those %lu to %PRIu64, or you will get a compilation error when 
using 32-bit targets, since those variables are uint64_t. There are other parts of the 
code with same problem, as well.


> >
> > > +	return 0;
> > > +}
> > > +
> > > +static int
> > > +test_pmd_perf(void)
> > > +{
> > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > +	uint16_t portid;
> > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > +	int socketid = -1;
> > > +	int ret;
> > > +
> > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > +
> > > +	signal(SIGUSR1, signal_handler);
> > Again SIGINT here.
> >
> > > +	signal(SIGUSR2, signal_handler);
> > > +
> > > +	nb_ports = rte_eth_dev_count();
> > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > +		printf("At least %u port(s) used for perf. test\n",
> > > +		       NB_ETHPORTS_USED);
> > > +		return -1;
> > > +	}
> > > +
> > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > +
> > > +	nb_lcores = rte_lcore_count();
> > > +
> > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > +	init_lcores();
> > > +
> > > +	init_mbufpool(NB_MBUF);
> > > +
> > > +	reset_count();
> > > +	num = 0;
> > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > +		if (socketid == -1) {
> > > +			socketid = rte_eth_dev_socket_id(portid);
> > > +			slave_id = alloc_lcore(socketid);
> > > +			if (slave_id == (uint16_t)-1) {
> > > +				printf("No avail lcore to run test\n");
> > > +				return -1;
> > > +			}
> > > +			printf("Performance test runs on lcore %u socket
> %u\n",
> > > +			       slave_id, socketid);
> > > +		}
> > > +
> > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > +			printf("Skip port %d\n", portid);
> > > +			continue;
> > > +		}
> > > +
> > > +		/* port configure */
> > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > +					    nb_tx_queue, &port_conf);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE,
> > > +				"Cannot configure device: err=%d,
> port=%d\n",
> > > +				 ret, portid);
> > > +
> > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > +		printf("Port %u ", portid);
> > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > +		printf("\n");
> > > +
> > > +		/* tx queue setup */
> > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > +					     socketid, &tx_conf);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE,
> > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > +				"port=%d\n", ret, portid);
> > > +
> > > +		/* rx queue steup */
> > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > +						socketid, &rx_conf,
> > > +						mbufpool[socketid]);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:
> err=%d,"
> > > +				 "port=%d\n", ret, portid);
> > > +
> > > +		/* Start device */
> > > +		stop = 0;
> > > +		ret = rte_eth_dev_start(portid);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE,
> > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > +				ret, portid);
> > > +
> > > +		/* always eanble promiscuous */
> > > +		rte_eth_promiscuous_enable(portid);
> > > +
> > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > +		lcore_conf[slave_id].nb_ports++;
> > > +	}
> > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > +
> > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > +
> > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > +		return -1;
> > > +
> > > +	/* port tear down */
> > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > +			continue;
> > > +
> > > +		rte_eth_dev_stop(portid);
> > > +	}
> > > +
> > Clean up your allocated memory/lcores/etc?
> [Liang, Cunming] It do cleanup on the beginning of case.
> "Init_lcores","init_mbufpool","reset_count" guarantee the data clean before
> each testing.
> And mbufpool only allocated once even if we run multiple times.
> >
> > Neil
  
Cunming Liang Oct. 14, 2014, 12:54 a.m. UTC | #4
> -----Original Message-----
> From: De Lara Guarch, Pablo
> Sent: Monday, October 13, 2014 8:56 PM
> To: Liang, Cunming; Neil Horman
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Liang, Cunming
> > Sent: Sunday, October 12, 2014 12:11 PM
> > To: Neil Horman
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> >
> > Hi Neil,
> >
> > Very appreciate your comments.
> > I add inline reply, will send v3 asap when we get alignment.
> >
> > BRs,
> > Liang Cunming
> >
> > > -----Original Message-----
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Saturday, October 11, 2014 1:52 AM
> > > To: Liang, Cunming
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> > >
> > > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > > It simply gives the average cycles of IO used per packet without test
> > equipment.
> > > > When doing the test, make sure the link is UP.
> > > >
> > > > Usage Example:
> > > > 1. Run unit test app in interactive mode
> > > >     app/test -c f -n 4 -- -i
> > > > 2. Run and wait for the result
> > > >     pmd_perf_autotest
> > > >
> > > > There's option to choose rx/tx pair, default is vector.
> > > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> > > INC_VEC=y in config
> > > >
> > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > >
> > > Notes inline
> > >
> > > > ---
> > > >  app/test/Makefile                   |    1 +
> > > >  app/test/commands.c                 |   38 +++
> > > >  app/test/packet_burst_generator.c   |    4 +-
> > > >  app/test/test.h                     |    4 +
> > > >  app/test/test_pmd_perf.c            |  626
> > > +++++++++++++++++++++++++++++++++++
> > > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > > >  create mode 100644 app/test/test_pmd_perf.c
> > > >
> > > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > > index 6af6d76..ebfa0ba 100644
> > > > --- a/app/test/Makefile
> > > > +++ b/app/test/Makefile
> > > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > > >
> > > >  SRCS-y += test_ring.c
> > > >  SRCS-y += test_ring_perf.c
> > > > +SRCS-y += test_pmd_perf.c
> > > >
> > > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > > >  SRCS-y += test_table.c
> > > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > > index a9e36b1..f1e746e 100644
> > > > --- a/app/test/commands.c
> > > > +++ b/app/test/commands.c
> > > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > > >
> > > > +#define NB_ETHPORTS_USED                (1)
> > > > +#define NB_SOCKETS                      (2)
> > > > +#define MEMPOOL_CACHE_SIZE 250
> > > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > > RTE_PKTMBUF_HEADROOM)
> > > Don't you want to size this in accordance with the amount of data your
> > sending
> > > (64 Bytes as noted above)?
> > [Liang, Cunming] The case is designed to measure small packet IO cost with
> > normal mbuf size.
> > Even if decreasing the size, it won't gain significant cycles.
> > >
> > > > +static void
> > > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > > +{
> > > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > > +		eth_addr->addr_bytes[0],
> > > > +		eth_addr->addr_bytes[1],
> > > > +		eth_addr->addr_bytes[2],
> > > > +		eth_addr->addr_bytes[3],
> > > > +		eth_addr->addr_bytes[4],
> > > > +		eth_addr->addr_bytes[5]);
> > > > +}
> > > > +
> > > This was copieed from print_ethaddr.  Seems like a good candidate for a
> > common
> > > function in rte_ether.h
> > [Liang, Cunming] Agree with you, some of samples now use it with the same
> > copy.
> > I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the
> > 48bits address output.
> > And leaving other prints for application customization.
> > >
> > >
> > > > +}
> > > > +
> > > > +static void
> > > > +signal_handler(int signum)
> > > > +{
> > > > +	/* When we receive a USR1 signal, print stats */
> > > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits
> > the
> > > program
> > [Liang, Cunming] Thanks, it's a typo.
> > >
> > > > +	if (signum == SIGUSR1) {
> > > SIGINT instead.  Thats the common practice.
> > [Liang, Cunming] I understood your opinion.
> > The considerations I'm not using SIGINT instead are:
> > 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command
> > interactive.
> >   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> > 2. By SIGINT semantic, expect to terminate the process.
> >   Here I expect to force stop this case, but still alive in command line.
> >   After it stopped, it can run again or start to run other test cases.
> >   So I keep SIGINT, SIGUSR1 in different behavior.
> > 3. It should be rarely used.
> >   Only when exception timeout, I leave this backdoor for automation test
> > control.
> >   For manual test, we can easily force kill the process.
> >
> > >
> > > > +		printf("Force Stop!\n");
> > > > +		stop = 1;
> > > > +	}
> > > > +	if (signum == SIGUSR2)
> > > > +		stats_display(0);
> > > > +}
> > > > +/* main processing loop */
> > > > +static int
> > > > +main_loop(__rte_unused void *args)
> > > > +{
> > > > +#define PACKET_SIZE 64
> > > > +#define FRAME_GAP 12
> > > > +#define MAC_PREAMBLE 8
> > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > +	unsigned lcore_id;
> > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > +	struct lcore_conf *conf;
> > > > +	uint64_t prev_tsc, cur_tsc;
> > > > +	int pkt_per_port;
> > > > +	uint64_t packets_per_second, total_packets;
> > > > +
> > > > +	lcore_id = rte_lcore_id();
> > > > +	conf = &lcore_conf[lcore_id];
> > > > +	if (conf->status != LCORE_USED)
> > > > +		return 0;
> > > > +
> > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > +
> > > > +	int idx = 0;
> > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > +		int num = pkt_per_port;
> > > > +		portid = conf->portlist[i];
> > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > +		while (num) {
> > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > +						&tx_burst[idx], nb_tx);
> > > > +			num -= nb_tx;
> > > > +			idx += nb_tx;
> > > > +		}
> > > > +	}
> > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > +
> > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) *
> > CHAR_BIT);
> > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > +		+packets_per_second);
> > > > +
> > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > packets_per_second;
> > > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > > +		+ total_packets);
> > > > +
> > > > +	prev_tsc = rte_rdtsc();
> > > > +
> > > > +	while (likely(!stop)) {
> > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > +			portid = conf->portlist[i];
> > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > +						 pkts_burst,
> > MAX_PKT_BURST);
> > > > +			if (unlikely(nb_rx == 0)) {
> > > > +				idle++;
> > > > +				continue;
> > > > +			}
> > > > +
> > > > +			count += nb_rx;
> > > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> > return
> > > -ENOTSUP
> > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> > CONFIG.
> > The error is used to avoid no function call register.
> > When ETHDEV_DEBUG turn off, the NULL function call cause segfault
> > directly.
> > So I think it's a library internal error.
> > In such library exceptional case, I prefer not expecting sample/application to
> > condition check library functional error.
> > >
> > > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst,
> > nb_rx);
> > > Ditto with -ENOTSUP
> > >
> > > > +			if (unlikely(nb_tx < nb_rx)) {
> > > What makes this unlikely?  Seems like a perfectly reasonable condition to
> > happen
> > > to me.  If the network is busy, its completely likely that you will receive
> > more
> > > frames than you send, if you elect to receive all frames.
> > [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> > It firstly injects numbers of packets to NIC. Then NIC will loopback all packets
> > to ingress side.
> > The code here will receive the packets, and send them back to NIC
> > Packets looping inside all come from initial injection.
> > As the total number of injected packets is much less than in-chip queue size,
> > the tx egress queue shouldn't block desc. ring update.
> > So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot
> > archive line rate.
> > When it happens, the cycles/packets result make no sense, as the bottle
> > neck is NIC.
> > The drop counter can record it.
> > >
> > > > +				drop += (nb_rx - nb_tx);
> > > > +				do {
> > > > +
> > 	rte_pktmbuf_free(pkts_burst[nb_tx]);
> > > Defer this, it skews your timing
> > [Liang, Cunming] Agree with you, I ever thought about it.
> > This test cases is designed to measure pure IO RX/TX routine.
> > When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or
> > drop it.
> > Each way introduces noise(adding additional control code), resending much
> > times even cost more than free it.
> > The cycles/packets is useful when there's no packet drop, otherwise it gives
> > the hint where the problem comes from (by idle or drop).
> > >
> > > > +				} while (++nb_tx < nb_rx);
> > > > +			}
> > > > +		}
> > > > +		if (unlikely(count >= total_packets))
> > > > +			break;
> > > Whats the reasoning here?  Do you only ever expect to receive frames that
> > you
> > > send?  If so, seems like this should call for a big warning to get printed.
> > [Liang, Cunming] The loop exits when the pre-calculated total_packets are
> > received.
> > As the nb_rx is unpredictable, the packet counter may large equal than
> > total_packets the last time.
> > The reason unlikely used here is because the condition becomes true only
> > the last time.
> > >
> > > > +	}
> > > > +
> > > > +	cur_tsc = rte_rdtsc();
> > > > +
> > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > +		portid = conf->portlist[i];
> > > > +		int nb_free = pkt_per_port;
> > > > +		do { /* dry out */
> > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > +						 pkts_burst,
> > MAX_PKT_BURST);
> > > > +			nb_tx = 0;
> > > > +			while (nb_tx < nb_rx)
> > > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > > +			nb_free -= nb_rx;
> > > > +		} while (nb_free != 0);
> > > > +		printf("free %d mbuf left in port %u\n", pkt_per_port,
> > portid);
> > > > +	}
> > > > +
> > > Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it
> > be
> > > enough just to stop the interface?
> > [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> > But it's designed to run multi-times without exit, for the purpose of warming
> > up or for getting average number.
> > So stopping device is not enough, we have to release the flying packets.
> >
> > >
> > > > +	if (count == 0)
> > > > +		return -1;
> > > > +
> > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > +
> > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped
> > (potentially
> > > more than once) depending on your test length.  you need to check the tsc
> > before
> > > and after each burst and record an average of deltas instead, accounting in
> > each
> > > instance for the possibility of wrap.
> > [Liang, Cunming] I'm not sure catch your point correctly.
> > I think both cur_tsc and prev_tsc are 64 bits width.
> > For 3GHz, I think it won't wrapped so quick.
> > As it's uint64_t, so even get wrapped, the delta should still be correct.
> 
> You need to change those %lu to %PRIu64, or you will get a compilation error
> when
> using 32-bit targets, since those variables are uint64_t. There are other parts of
> the
> code with same problem, as well.
[Liang, Cunming] Make sense to me. Thanks.
> 
> 
> > >
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int
> > > > +test_pmd_perf(void)
> > > > +{
> > > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > > +	uint16_t portid;
> > > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > > +	int socketid = -1;
> > > > +	int ret;
> > > > +
> > > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > > +
> > > > +	signal(SIGUSR1, signal_handler);
> > > Again SIGINT here.
> > >
> > > > +	signal(SIGUSR2, signal_handler);
> > > > +
> > > > +	nb_ports = rte_eth_dev_count();
> > > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > > +		printf("At least %u port(s) used for perf. test\n",
> > > > +		       NB_ETHPORTS_USED);
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > > +
> > > > +	nb_lcores = rte_lcore_count();
> > > > +
> > > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > > +	init_lcores();
> > > > +
> > > > +	init_mbufpool(NB_MBUF);
> > > > +
> > > > +	reset_count();
> > > > +	num = 0;
> > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > +		if (socketid == -1) {
> > > > +			socketid = rte_eth_dev_socket_id(portid);
> > > > +			slave_id = alloc_lcore(socketid);
> > > > +			if (slave_id == (uint16_t)-1) {
> > > > +				printf("No avail lcore to run test\n");
> > > > +				return -1;
> > > > +			}
> > > > +			printf("Performance test runs on lcore %u socket
> > %u\n",
> > > > +			       slave_id, socketid);
> > > > +		}
> > > > +
> > > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > > +			printf("Skip port %d\n", portid);
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		/* port configure */
> > > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > > +					    nb_tx_queue, &port_conf);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE,
> > > > +				"Cannot configure device: err=%d,
> > port=%d\n",
> > > > +				 ret, portid);
> > > > +
> > > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > > +		printf("Port %u ", portid);
> > > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > > +		printf("\n");
> > > > +
> > > > +		/* tx queue setup */
> > > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > > +					     socketid, &tx_conf);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE,
> > > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > > +				"port=%d\n", ret, portid);
> > > > +
> > > > +		/* rx queue steup */
> > > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > > +						socketid, &rx_conf,
> > > > +						mbufpool[socketid]);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:
> > err=%d,"
> > > > +				 "port=%d\n", ret, portid);
> > > > +
> > > > +		/* Start device */
> > > > +		stop = 0;
> > > > +		ret = rte_eth_dev_start(portid);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE,
> > > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > > +				ret, portid);
> > > > +
> > > > +		/* always eanble promiscuous */
> > > > +		rte_eth_promiscuous_enable(portid);
> > > > +
> > > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > > +		lcore_conf[slave_id].nb_ports++;
> > > > +	}
> > > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > > +
> > > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > > +
> > > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > > +		return -1;
> > > > +
> > > > +	/* port tear down */
> > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > > +			continue;
> > > > +
> > > > +		rte_eth_dev_stop(portid);
> > > > +	}
> > > > +
> > > Clean up your allocated memory/lcores/etc?
> > [Liang, Cunming] It do cleanup on the beginning of case.
> > "Init_lcores","init_mbufpool","reset_count" guarantee the data clean before
> > each testing.
> > And mbufpool only allocated once even if we run multiple times.
> > >
> > > Neil
  
Neil Horman Oct. 21, 2014, 10:33 a.m. UTC | #5
On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> Hi Neil,
> 
> Very appreciate your comments.
> I add inline reply, will send v3 asap when we get alignment.
> 
> BRs,
> Liang Cunming
> 
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Saturday, October 11, 2014 1:52 AM
> > To: Liang, Cunming
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx cycles/packet
> > 
> > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > It simply gives the average cycles of IO used per packet without test equipment.
> > > When doing the test, make sure the link is UP.
> > >
> > > Usage Example:
> > > 1. Run unit test app in interactive mode
> > >     app/test -c f -n 4 -- -i
> > > 2. Run and wait for the result
> > >     pmd_perf_autotest
> > >
> > > There's option to choose rx/tx pair, default is vector.
> > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> > INC_VEC=y in config
> > >
> > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > 
> > Notes inline
> > 
> > > ---
> > >  app/test/Makefile                   |    1 +
> > >  app/test/commands.c                 |   38 +++
> > >  app/test/packet_burst_generator.c   |    4 +-
> > >  app/test/test.h                     |    4 +
> > >  app/test/test_pmd_perf.c            |  626
> > +++++++++++++++++++++++++++++++++++
> > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > >  create mode 100644 app/test/test_pmd_perf.c
> > >
> > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > index 6af6d76..ebfa0ba 100644
> > > --- a/app/test/Makefile
> > > +++ b/app/test/Makefile
> > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > >
> > >  SRCS-y += test_ring.c
> > >  SRCS-y += test_ring_perf.c
> > > +SRCS-y += test_pmd_perf.c
> > >
> > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > >  SRCS-y += test_table.c
> > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > index a9e36b1..f1e746e 100644
> > > --- a/app/test/commands.c
> > > +++ b/app/test/commands.c
> > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > >
> > > +#define NB_ETHPORTS_USED                (1)
> > > +#define NB_SOCKETS                      (2)
> > > +#define MEMPOOL_CACHE_SIZE 250
> > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > RTE_PKTMBUF_HEADROOM)
> > Don't you want to size this in accordance with the amount of data your sending
> > (64 Bytes as noted above)?
> [Liang, Cunming] The case is designed to measure small packet IO cost with normal mbuf size.
> Even if decreasing the size, it won't gain significant cycles.
> > 
That presumes a non-memory constrained system, doesn't it?  I suppose in the end
as long as you have consistency, its not overly relevant, but it seems like
you'll want to add data sizing as a feature to this eventually (i.e. the ability
to test performance for larger frames sizes), at which point you'll need to make
this non-static anyway.

> > > +static void
> > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > +{
> > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > +		eth_addr->addr_bytes[0],
> > > +		eth_addr->addr_bytes[1],
> > > +		eth_addr->addr_bytes[2],
> > > +		eth_addr->addr_bytes[3],
> > > +		eth_addr->addr_bytes[4],
> > > +		eth_addr->addr_bytes[5]);
> > > +}
> > > +
> > This was copieed from print_ethaddr.  Seems like a good candidate for a common
> > function in rte_ether.h
> [Liang, Cunming] Agree with you, some of samples now use it with the same copy.
> I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the 48bits address output.
> And leaving other prints for application customization.
> > 
Sounds good.

> > 
> > > +}
> > > +
> > > +static void
> > > +signal_handler(int signum)
> > > +{
> > > +	/* When we receive a USR1 signal, print stats */
> > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits the
> > program
> [Liang, Cunming] Thanks, it's a typo.
> > 
> > > +	if (signum == SIGUSR1) {
> > SIGINT instead.  Thats the common practice.
> [Liang, Cunming] I understood your opinion. 
> The considerations I'm not using SIGINT instead are:
> 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command interactive.
>   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> 2. By SIGINT semantic, expect to terminate the process.
>   Here I expect to force stop this case, but still alive in command line.
>   After it stopped, it can run again or start to run other test cases.
>   So I keep SIGINT, SIGUSR1 in different behavior.
> 3. It should be rarely used. 
>   Only when exception timeout, I leave this backdoor for automation test control.
>   For manual test, we can easily force kill the process.
> 
Hmm, ok, that sounds reasonable.
 
> > 
> > > +		printf("Force Stop!\n");
> > > +		stop = 1;
> > > +	}
> > > +	if (signum == SIGUSR2)
> > > +		stats_display(0);
> > > +}
> > > +/* main processing loop */
> > > +static int
> > > +main_loop(__rte_unused void *args)
> > > +{
> > > +#define PACKET_SIZE 64
> > > +#define FRAME_GAP 12
> > > +#define MAC_PREAMBLE 8
> > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > +	unsigned lcore_id;
> > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > +	struct lcore_conf *conf;
> > > +	uint64_t prev_tsc, cur_tsc;
> > > +	int pkt_per_port;
> > > +	uint64_t packets_per_second, total_packets;
> > > +
> > > +	lcore_id = rte_lcore_id();
> > > +	conf = &lcore_conf[lcore_id];
> > > +	if (conf->status != LCORE_USED)
> > > +		return 0;
> > > +
> > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > +
> > > +	int idx = 0;
> > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > +		int num = pkt_per_port;
> > > +		portid = conf->portlist[i];
> > > +		printf("inject %d packet to port %d\n", num, portid);
> > > +		while (num) {
> > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > +						&tx_burst[idx], nb_tx);
> > > +			num -= nb_tx;
> > > +			idx += nb_tx;
> > > +		}
> > > +	}
> > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > +
> > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
> > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > +		+packets_per_second);
> > > +
> > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > packets_per_second;
> > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > +		+ total_packets);
> > > +
> > > +	prev_tsc = rte_rdtsc();
> > > +
> > > +	while (likely(!stop)) {
> > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > +			portid = conf->portlist[i];
> > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > +						 pkts_burst, MAX_PKT_BURST);
> > > +			if (unlikely(nb_rx == 0)) {
> > > +				idle++;
> > > +				continue;
> > > +			}
> > > +
> > > +			count += nb_rx;
> > Doesn't take into consideration error conditions.  rte_eth_rx_burst can return
> > -ENOTSUP
> [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG CONFIG.
> The error is used to avoid no function call register. 
> When ETHDEV_DEBUG turn off, the NULL function call cause segfault directly.
> So I think it's a library internal error.
No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is null,
-ENOTSUPP will be returned to the application, you need to handle the error
condition.

> In such library exceptional case, I prefer not expecting sample/application to condition check library functional error.
But you're assertion about the library handling the exception is wrong.

> > 
> > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
> > Ditto with -ENOTSUP
> > 
> > > +			if (unlikely(nb_tx < nb_rx)) {
> > What makes this unlikely?  Seems like a perfectly reasonable condition to happen
> > to me.  If the network is busy, its completely likely that you will receive more
> > frames than you send, if you elect to receive all frames.
> [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> It firstly injects numbers of packets to NIC. Then NIC will loopback all packets to ingress side.
> The code here will receive the packets, and send them back to NIC
> Packets looping inside all come from initial injection.
> As the total number of injected packets is much less than in-chip queue size, the tx egress queue shouldn't block desc. ring update.
> So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot archive line rate. 
> When it happens, the cycles/packets result make no sense, as the bottle neck is NIC.
> The drop counter can record it.
Ok.

> > 
> > > +				drop += (nb_rx - nb_tx);
> > > +				do {
> > > +					rte_pktmbuf_free(pkts_burst[nb_tx]);
> > Defer this, it skews your timing
> [Liang, Cunming] Agree with you, I ever thought about it.
> This test cases is designed to measure pure IO RX/TX routine.
> When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or drop it.
> Each way introduces noise(adding additional control code), resending much times even cost more than free it.
> The cycles/packets is useful when there's no packet drop, otherwise it gives the hint where the problem comes from (by idle or drop).
I'm not sure what you're asserting here.  Are you suggesting that you want to
include the time it takes to free memory buffers in your testing?  That seems
dubious at best to me.  If you want to measure I/O, thats great, but memory
management of packet buffers is a separate operation from that I/O

> > 
> > > +				} while (++nb_tx < nb_rx);
> > > +			}
> > > +		}
> > > +		if (unlikely(count >= total_packets))
> > > +			break;
> > Whats the reasoning here?  Do you only ever expect to receive frames that you
> > send?  If so, seems like this should call for a big warning to get printed.
> [Liang, Cunming] The loop exits when the pre-calculated total_packets are received.
> As the nb_rx is unpredictable, the packet counter may large equal than total_packets the last time.
> The reason unlikely used here is because the condition becomes true only the last time. 
ok

> > 
> > > +	}
> > > +
> > > +	cur_tsc = rte_rdtsc();
> > > +
> > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > +		portid = conf->portlist[i];
> > > +		int nb_free = pkt_per_port;
> > > +		do { /* dry out */
> > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > +						 pkts_burst, MAX_PKT_BURST);
> > > +			nb_tx = 0;
> > > +			while (nb_tx < nb_rx)
> > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > +			nb_free -= nb_rx;
> > > +		} while (nb_free != 0);
> > > +		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
> > > +	}
> > > +
> > Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it be
> > enough just to stop the interface?
> [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> But it's designed to run multi-times without exit, for the purpose of warming up or for getting average number.
> So stopping device is not enough, we have to release the flying packets.
> 
Ok

> > 
> > > +	if (count == 0)
> > > +		return -1;
> > > +
> > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > +
> > Bad math here.  Theres no guarantee that the tsc hasn't wrapped (potentially
> > more than once) depending on your test length.  you need to check the tsc before
> > and after each burst and record an average of deltas instead, accounting in each
> > instance for the possibility of wrap.
> [Liang, Cunming] I'm not sure catch your point correctly.
> I think both cur_tsc and prev_tsc are 64 bits width. 
> For 3GHz, I think it won't wrapped so quick.
> As it's uint64_t, so even get wrapped, the delta should still be correct.
But theres no guarantee that the tsc starts at zero when you begin your test.
The system may have been up for a long time and near wrapping already.
Regardless, you need to account for the possibility that cur_tsc is smaller
than prev_tsc, or this breaks.

> > 
> > > +	return 0;
> > > +}
> > > +
> > > +static int
> > > +test_pmd_perf(void)
> > > +{
> > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > +	uint16_t portid;
> > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > +	int socketid = -1;
> > > +	int ret;
> > > +
> > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > +
> > > +	signal(SIGUSR1, signal_handler);
> > Again SIGINT here.
> > 
> > > +	signal(SIGUSR2, signal_handler);
> > > +
> > > +	nb_ports = rte_eth_dev_count();
> > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > +		printf("At least %u port(s) used for perf. test\n",
> > > +		       NB_ETHPORTS_USED);
> > > +		return -1;
> > > +	}
> > > +
> > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > +
> > > +	nb_lcores = rte_lcore_count();
> > > +
> > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > +	init_lcores();
> > > +
> > > +	init_mbufpool(NB_MBUF);
> > > +
> > > +	reset_count();
> > > +	num = 0;
> > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > +		if (socketid == -1) {
> > > +			socketid = rte_eth_dev_socket_id(portid);
> > > +			slave_id = alloc_lcore(socketid);
> > > +			if (slave_id == (uint16_t)-1) {
> > > +				printf("No avail lcore to run test\n");
> > > +				return -1;
> > > +			}
> > > +			printf("Performance test runs on lcore %u socket %u\n",
> > > +			       slave_id, socketid);
> > > +		}
> > > +
> > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > +			printf("Skip port %d\n", portid);
> > > +			continue;
> > > +		}
> > > +
> > > +		/* port configure */
> > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > +					    nb_tx_queue, &port_conf);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE,
> > > +				"Cannot configure device: err=%d, port=%d\n",
> > > +				 ret, portid);
> > > +
> > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > +		printf("Port %u ", portid);
> > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > +		printf("\n");
> > > +
> > > +		/* tx queue setup */
> > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > +					     socketid, &tx_conf);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE,
> > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > +				"port=%d\n", ret, portid);
> > > +
> > > +		/* rx queue steup */
> > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > +						socketid, &rx_conf,
> > > +						mbufpool[socketid]);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
> > > +				 "port=%d\n", ret, portid);
> > > +
> > > +		/* Start device */
> > > +		stop = 0;
> > > +		ret = rte_eth_dev_start(portid);
> > > +		if (ret < 0)
> > > +			rte_exit(EXIT_FAILURE,
> > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > +				ret, portid);
> > > +
> > > +		/* always eanble promiscuous */
> > > +		rte_eth_promiscuous_enable(portid);
> > > +
> > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > +		lcore_conf[slave_id].nb_ports++;
> > > +	}
> > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > +
> > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > +
> > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > +		return -1;
> > > +
> > > +	/* port tear down */
> > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > +			continue;
> > > +
> > > +		rte_eth_dev_stop(portid);
> > > +	}
> > > +
> > Clean up your allocated memory/lcores/etc?
> [Liang, Cunming] It do cleanup on the beginning of case.
> "Init_lcores","init_mbufpool","reset_count" guarantee the data clean before each testing.
> And mbufpool only allocated once even if we run multiple times.
Its a janitorial issue.  Before the program exits, you need to free any
resources that you've allocated.

Neil

> > 
> > Neil
> 
>
  
Bruce Richardson Oct. 21, 2014, 10:43 a.m. UTC | #6
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> Sent: Tuesday, October 21, 2014 11:33 AM
> To: Liang, Cunming
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> > >
> > > > +	if (count == 0)
> > > > +		return -1;
> > > > +
> > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > +
> > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped (potentially
> > > more than once) depending on your test length.  you need to check the tsc
> before
> > > and after each burst and record an average of deltas instead, accounting in
> each
> > > instance for the possibility of wrap.
> > [Liang, Cunming] I'm not sure catch your point correctly.
> > I think both cur_tsc and prev_tsc are 64 bits width.
> > For 3GHz, I think it won't wrapped so quick.
> > As it's uint64_t, so even get wrapped, the delta should still be correct.
> But theres no guarantee that the tsc starts at zero when you begin your test.
> The system may have been up for a long time and near wrapping already.
> Regardless, you need to account for the possibility that cur_tsc is smaller
> than prev_tsc, or this breaks.
> 

The tsc. is 64-bit and so only wraps around every couple of hundred years or so on a 2GHz machine, so I don't think it's necessary to handle that case. 

/Bruce
  
Cunming Liang Oct. 21, 2014, 1:17 p.m. UTC | #7
> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Tuesday, October 21, 2014 6:33 PM
> To: Liang, Cunming
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> > Hi Neil,
> >
> > Very appreciate your comments.
> > I add inline reply, will send v3 asap when we get alignment.
> >
> > BRs,
> > Liang Cunming
> >
> > > -----Original Message-----
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Saturday, October 11, 2014 1:52 AM
> > > To: Liang, Cunming
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> > >
> > > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > > It simply gives the average cycles of IO used per packet without test
> equipment.
> > > > When doing the test, make sure the link is UP.
> > > >
> > > > Usage Example:
> > > > 1. Run unit test app in interactive mode
> > > >     app/test -c f -n 4 -- -i
> > > > 2. Run and wait for the result
> > > >     pmd_perf_autotest
> > > >
> > > > There's option to choose rx/tx pair, default is vector.
> > > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> > > INC_VEC=y in config
> > > >
> > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > >
> > > Notes inline
> > >
> > > > ---
> > > >  app/test/Makefile                   |    1 +
> > > >  app/test/commands.c                 |   38 +++
> > > >  app/test/packet_burst_generator.c   |    4 +-
> > > >  app/test/test.h                     |    4 +
> > > >  app/test/test_pmd_perf.c            |  626
> > > +++++++++++++++++++++++++++++++++++
> > > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > > >  create mode 100644 app/test/test_pmd_perf.c
> > > >
> > > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > > index 6af6d76..ebfa0ba 100644
> > > > --- a/app/test/Makefile
> > > > +++ b/app/test/Makefile
> > > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > > >
> > > >  SRCS-y += test_ring.c
> > > >  SRCS-y += test_ring_perf.c
> > > > +SRCS-y += test_pmd_perf.c
> > > >
> > > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > > >  SRCS-y += test_table.c
> > > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > > index a9e36b1..f1e746e 100644
> > > > --- a/app/test/commands.c
> > > > +++ b/app/test/commands.c
> > > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > > >
> > > > +#define NB_ETHPORTS_USED                (1)
> > > > +#define NB_SOCKETS                      (2)
> > > > +#define MEMPOOL_CACHE_SIZE 250
> > > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > > RTE_PKTMBUF_HEADROOM)
> > > Don't you want to size this in accordance with the amount of data your
> sending
> > > (64 Bytes as noted above)?
> > [Liang, Cunming] The case is designed to measure small packet IO cost with
> normal mbuf size.
> > Even if decreasing the size, it won't gain significant cycles.
> > >
> That presumes a non-memory constrained system, doesn't it?  I suppose in the
> end
> as long as you have consistency, its not overly relevant, but it seems like
> you'll want to add data sizing as a feature to this eventually (i.e. the ability
> to test performance for larger frames sizes), at which point you'll need to make
> this non-static anyway.
[Liang, Cunming] For a normal Ethernet packet(w/o jumbo frame), packet size is 1518B.
As in really network, there won't have huge number of jumbo frames.
The mbuf size 2048 is a reasonable value to cover most of the packet size.
It's also be chosen by lots of NIC as the default receiving buffer size in DMA register.
In case larger than the size, it need do scatter and gather but lose some performance.
The unit test won't measure size from 64 to 9600, won't plan to measure scatter-gather rx/tx.
It focus on 64B packet size and taking the mbuf size being used the most often.
> 
> > > > +static void
> > > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > > +{
> > > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > > +		eth_addr->addr_bytes[0],
> > > > +		eth_addr->addr_bytes[1],
> > > > +		eth_addr->addr_bytes[2],
> > > > +		eth_addr->addr_bytes[3],
> > > > +		eth_addr->addr_bytes[4],
> > > > +		eth_addr->addr_bytes[5]);
> > > > +}
> > > > +
> > > This was copieed from print_ethaddr.  Seems like a good candidate for a
> common
> > > function in rte_ether.h
> > [Liang, Cunming] Agree with you, some of samples now use it with the same
> copy.
> > I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the
> 48bits address output.
> > And leaving other prints for application customization.
> > >
> Sounds good.
> 
> > >
> > > > +}
> > > > +
> > > > +static void
> > > > +signal_handler(int signum)
> > > > +{
> > > > +	/* When we receive a USR1 signal, print stats */
> > > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits the
> > > program
> > [Liang, Cunming] Thanks, it's a typo.
> > >
> > > > +	if (signum == SIGUSR1) {
> > > SIGINT instead.  Thats the common practice.
> > [Liang, Cunming] I understood your opinion.
> > The considerations I'm not using SIGINT instead are:
> > 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command
> interactive.
> >   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> > 2. By SIGINT semantic, expect to terminate the process.
> >   Here I expect to force stop this case, but still alive in command line.
> >   After it stopped, it can run again or start to run other test cases.
> >   So I keep SIGINT, SIGUSR1 in different behavior.
> > 3. It should be rarely used.
> >   Only when exception timeout, I leave this backdoor for automation test
> control.
> >   For manual test, we can easily force kill the process.
> >
> Hmm, ok, that sounds reasonable.
> 
> > >
> > > > +		printf("Force Stop!\n");
> > > > +		stop = 1;
> > > > +	}
> > > > +	if (signum == SIGUSR2)
> > > > +		stats_display(0);
> > > > +}
> > > > +/* main processing loop */
> > > > +static int
> > > > +main_loop(__rte_unused void *args)
> > > > +{
> > > > +#define PACKET_SIZE 64
> > > > +#define FRAME_GAP 12
> > > > +#define MAC_PREAMBLE 8
> > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > +	unsigned lcore_id;
> > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > +	struct lcore_conf *conf;
> > > > +	uint64_t prev_tsc, cur_tsc;
> > > > +	int pkt_per_port;
> > > > +	uint64_t packets_per_second, total_packets;
> > > > +
> > > > +	lcore_id = rte_lcore_id();
> > > > +	conf = &lcore_conf[lcore_id];
> > > > +	if (conf->status != LCORE_USED)
> > > > +		return 0;
> > > > +
> > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > +
> > > > +	int idx = 0;
> > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > +		int num = pkt_per_port;
> > > > +		portid = conf->portlist[i];
> > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > +		while (num) {
> > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > +						&tx_burst[idx], nb_tx);
> > > > +			num -= nb_tx;
> > > > +			idx += nb_tx;
> > > > +		}
> > > > +	}
> > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > +
> > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
> > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > +		+packets_per_second);
> > > > +
> > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > packets_per_second;
> > > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > > +		+ total_packets);
> > > > +
> > > > +	prev_tsc = rte_rdtsc();
> > > > +
> > > > +	while (likely(!stop)) {
> > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > +			portid = conf->portlist[i];
> > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > +			if (unlikely(nb_rx == 0)) {
> > > > +				idle++;
> > > > +				continue;
> > > > +			}
> > > > +
> > > > +			count += nb_rx;
> > > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> return
> > > -ENOTSUP
> > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> CONFIG.
> > The error is used to avoid no function call register.
> > When ETHDEV_DEBUG turn off, the NULL function call cause segfault directly.
> > So I think it's a library internal error.
> No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is null,
> -ENOTSUPP will be returned to the application, you need to handle the error
> condition.
[Liang, Cunming] The runtime rte_eth_rx_burst is a inline function in rte_ethdev.h.
The one you're talking about is the one defined in rte_ethdev.c which is a extern non-inline function.
It works only when RTE_LIBRTE_ETHDEV_DEBUG turns on.
If we always turns such library internal checking on, it lose performance.
So I insist it's a library internal error checking, doesn't need to take care in application level.
> 
> > In such library exceptional case, I prefer not expecting sample/application to
> condition check library functional error.
> But you're assertion about the library handling the exception is wrong.
> 
> > >
> > > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
> > > Ditto with -ENOTSUP
> > >
> > > > +			if (unlikely(nb_tx < nb_rx)) {
> > > What makes this unlikely?  Seems like a perfectly reasonable condition to
> happen
> > > to me.  If the network is busy, its completely likely that you will receive more
> > > frames than you send, if you elect to receive all frames.
> > [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> > It firstly injects numbers of packets to NIC. Then NIC will loopback all packets to
> ingress side.
> > The code here will receive the packets, and send them back to NIC
> > Packets looping inside all come from initial injection.
> > As the total number of injected packets is much less than in-chip queue size,
> the tx egress queue shouldn't block desc. ring update.
> > So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot
> archive line rate.
> > When it happens, the cycles/packets result make no sense, as the bottle neck is
> NIC.
> > The drop counter can record it.
> Ok.
> 
> > >
> > > > +				drop += (nb_rx - nb_tx);
> > > > +				do {
> > > > +					rte_pktmbuf_free(pkts_burst[nb_tx]);
> > > Defer this, it skews your timing
> > [Liang, Cunming] Agree with you, I ever thought about it.
> > This test cases is designed to measure pure IO RX/TX routine.
> > When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or
> drop it.
> > Each way introduces noise(adding additional control code), resending much
> times even cost more than free it.
> > The cycles/packets is useful when there's no packet drop, otherwise it gives the
> hint where the problem comes from (by idle or drop).
> I'm not sure what you're asserting here.  Are you suggesting that you want to
> include the time it takes to free memory buffers in your testing?  That seems
> dubious at best to me.  If you want to measure I/O, thats great, but memory
> management of packet buffers is a separate operation from that I/O
[Liang, Cunming] Agree with you. I means it doesn't need to take care of the mbuf free cost.
As I said in previous reply, it rarely happens in line rate.
The cycle measurement doesn't make much sense if happens.
On that time the unit test just notify it happens, and keep safe free all the unsent mbuf.

> 
> > >
> > > > +				} while (++nb_tx < nb_rx);
> > > > +			}
> > > > +		}
> > > > +		if (unlikely(count >= total_packets))
> > > > +			break;
> > > Whats the reasoning here?  Do you only ever expect to receive frames that
> you
> > > send?  If so, seems like this should call for a big warning to get printed.
> > [Liang, Cunming] The loop exits when the pre-calculated total_packets are
> received.
> > As the nb_rx is unpredictable, the packet counter may large equal than
> total_packets the last time.
> > The reason unlikely used here is because the condition becomes true only the
> last time.
> ok
> 
> > >
> > > > +	}
> > > > +
> > > > +	cur_tsc = rte_rdtsc();
> > > > +
> > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > +		portid = conf->portlist[i];
> > > > +		int nb_free = pkt_per_port;
> > > > +		do { /* dry out */
> > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > +			nb_tx = 0;
> > > > +			while (nb_tx < nb_rx)
> > > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > > +			nb_free -= nb_rx;
> > > > +		} while (nb_free != 0);
> > > > +		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
> > > > +	}
> > > > +
> > > Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it
> be
> > > enough just to stop the interface?
> > [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> > But it's designed to run multi-times without exit, for the purpose of warming
> up or for getting average number.
> > So stopping device is not enough, we have to release the flying packets.
> >
> Ok
> 
> > >
> > > > +	if (count == 0)
> > > > +		return -1;
> > > > +
> > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > +
> > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped
> (potentially
> > > more than once) depending on your test length.  you need to check the tsc
> before
> > > and after each burst and record an average of deltas instead, accounting in
> each
> > > instance for the possibility of wrap.
> > [Liang, Cunming] I'm not sure catch your point correctly.
> > I think both cur_tsc and prev_tsc are 64 bits width.
> > For 3GHz, I think it won't wrapped so quick.
> > As it's uint64_t, so even get wrapped, the delta should still be correct.
> But theres no guarantee that the tsc starts at zero when you begin your test.
> The system may have been up for a long time and near wrapping already.
> Regardless, you need to account for the possibility that cur_tsc is smaller
> than prev_tsc, or this breaks.
[Liang, Cunming] In case prev_tsc near wrapping, and cur_tsc get wrapped.
As they are unsigned, it still ok.
e.g. cur_tsc=0x2, prev_tsc=0xFFFFFFFFFFFFFFFC
Delta=cur_tsc-prev_tsc is 6, which is still correct.
And for uint64_t, we need to start computer for hundreds of years.
> 
> > >
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int
> > > > +test_pmd_perf(void)
> > > > +{
> > > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > > +	uint16_t portid;
> > > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > > +	int socketid = -1;
> > > > +	int ret;
> > > > +
> > > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > > +
> > > > +	signal(SIGUSR1, signal_handler);
> > > Again SIGINT here.
> > >
> > > > +	signal(SIGUSR2, signal_handler);
> > > > +
> > > > +	nb_ports = rte_eth_dev_count();
> > > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > > +		printf("At least %u port(s) used for perf. test\n",
> > > > +		       NB_ETHPORTS_USED);
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > > +
> > > > +	nb_lcores = rte_lcore_count();
> > > > +
> > > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > > +	init_lcores();
> > > > +
> > > > +	init_mbufpool(NB_MBUF);
> > > > +
> > > > +	reset_count();
> > > > +	num = 0;
> > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > +		if (socketid == -1) {
> > > > +			socketid = rte_eth_dev_socket_id(portid);
> > > > +			slave_id = alloc_lcore(socketid);
> > > > +			if (slave_id == (uint16_t)-1) {
> > > > +				printf("No avail lcore to run test\n");
> > > > +				return -1;
> > > > +			}
> > > > +			printf("Performance test runs on lcore %u socket %u\n",
> > > > +			       slave_id, socketid);
> > > > +		}
> > > > +
> > > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > > +			printf("Skip port %d\n", portid);
> > > > +			continue;
> > > > +		}
> > > > +
> > > > +		/* port configure */
> > > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > > +					    nb_tx_queue, &port_conf);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE,
> > > > +				"Cannot configure device: err=%d, port=%d\n",
> > > > +				 ret, portid);
> > > > +
> > > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > > +		printf("Port %u ", portid);
> > > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > > +		printf("\n");
> > > > +
> > > > +		/* tx queue setup */
> > > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > > +					     socketid, &tx_conf);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE,
> > > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > > +				"port=%d\n", ret, portid);
> > > > +
> > > > +		/* rx queue steup */
> > > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > > +						socketid, &rx_conf,
> > > > +						mbufpool[socketid]);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
> > > > +				 "port=%d\n", ret, portid);
> > > > +
> > > > +		/* Start device */
> > > > +		stop = 0;
> > > > +		ret = rte_eth_dev_start(portid);
> > > > +		if (ret < 0)
> > > > +			rte_exit(EXIT_FAILURE,
> > > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > > +				ret, portid);
> > > > +
> > > > +		/* always eanble promiscuous */
> > > > +		rte_eth_promiscuous_enable(portid);
> > > > +
> > > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > > +		lcore_conf[slave_id].nb_ports++;
> > > > +	}
> > > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > > +
> > > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > > +
> > > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > > +		return -1;
> > > > +
> > > > +	/* port tear down */
> > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > > +			continue;
> > > > +
> > > > +		rte_eth_dev_stop(portid);
> > > > +	}
> > > > +
> > > Clean up your allocated memory/lcores/etc?
> > [Liang, Cunming] It do cleanup on the beginning of case.
> > "Init_lcores","init_mbufpool","reset_count" guarantee the data clean before
> each testing.
> > And mbufpool only allocated once even if we run multiple times.
> Its a janitorial issue.  Before the program exits, you need to free any
> resources that you've allocated.
[Liang, Cunming] Yes, I do but only not for the mbuf pool.
Firstly the case exit is not equal to program exit. I expect the re-enter will use the same mempool, so that can do cache warm up.
Secondly, DPDK doesn't supply really mempool release. The program exit, the memzone then free.
> 
> Neil
> 
> > >
> > > Neil
> >
> >
  
Neil Horman Oct. 21, 2014, 1:37 p.m. UTC | #8
On Tue, Oct 21, 2014 at 10:43:03AM +0000, Richardson, Bruce wrote:
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> > Sent: Tuesday, October 21, 2014 11:33 AM
> > To: Liang, Cunming
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> > 
> > > >
> > > > > +	if (count == 0)
> > > > > +		return -1;
> > > > > +
> > > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > > +
> > > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped (potentially
> > > > more than once) depending on your test length.  you need to check the tsc
> > before
> > > > and after each burst and record an average of deltas instead, accounting in
> > each
> > > > instance for the possibility of wrap.
> > > [Liang, Cunming] I'm not sure catch your point correctly.
> > > I think both cur_tsc and prev_tsc are 64 bits width.
> > > For 3GHz, I think it won't wrapped so quick.
> > > As it's uint64_t, so even get wrapped, the delta should still be correct.
> > But theres no guarantee that the tsc starts at zero when you begin your test.
> > The system may have been up for a long time and near wrapping already.
> > Regardless, you need to account for the possibility that cur_tsc is smaller
> > than prev_tsc, or this breaks.
> > 
> 
> The tsc. is 64-bit and so only wraps around every couple of hundred years or so on a 2GHz machine, so I don't think it's necessary to handle that case. 
> 
But that presumes that no one has written the TSC via IA32_TIME_STAMP_COUNTER.
Assuming that something will never wrap just seems like bad practice here.  We
should have a general purpose macro to handle wrapping counters like this, if
not for this case specficially, then in general.

Neil

> /Bruce
>
  
Ananyev, Konstantin Oct. 21, 2014, 10:43 p.m. UTC | #9
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> Sent: Tuesday, October 21, 2014 2:38 PM
> To: Richardson, Bruce
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx cycles/packet
> 
> On Tue, Oct 21, 2014 at 10:43:03AM +0000, Richardson, Bruce wrote:
> >
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> > > Sent: Tuesday, October 21, 2014 11:33 AM
> > > To: Liang, Cunming
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > cycles/packet
> > >
> > > > >
> > > > > > +	if (count == 0)
> > > > > > +		return -1;
> > > > > > +
> > > > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > > > +
> > > > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped (potentially
> > > > > more than once) depending on your test length.  you need to check the tsc
> > > before
> > > > > and after each burst and record an average of deltas instead, accounting in
> > > each
> > > > > instance for the possibility of wrap.
> > > > [Liang, Cunming] I'm not sure catch your point correctly.
> > > > I think both cur_tsc and prev_tsc are 64 bits width.
> > > > For 3GHz, I think it won't wrapped so quick.
> > > > As it's uint64_t, so even get wrapped, the delta should still be correct.
> > > But theres no guarantee that the tsc starts at zero when you begin your test.
> > > The system may have been up for a long time and near wrapping already.
> > > Regardless, you need to account for the possibility that cur_tsc is smaller
> > > than prev_tsc, or this breaks.
> > >
> >
> > The tsc. is 64-bit and so only wraps around every couple of hundred years or so on a 2GHz machine, so I don't think it's necessary to
> handle that case.
> >
> But that presumes that no one has written the TSC via IA32_TIME_STAMP_COUNTER.

Then the test app would just print the wrong number :)
I suppose user will just repeat the test.
Again, imagine someone wrmsr(TSC), but set the value just much bigger than current.
Then your statistics will be incorrect, but you have no wait to figure that out. 
I think that trying to handle all such hypothetical situations is a pure overkill.

> Assuming that something will never wrap just seems like bad practice here.  We
> should have a general purpose macro to handle wrapping counters like this, if
> not for this case specficially, then in general.
> 
> Neil
> 
> > /Bruce
> >
  
Neil Horman Oct. 22, 2014, 2:03 p.m. UTC | #10
On Tue, Oct 21, 2014 at 01:17:01PM +0000, Liang, Cunming wrote:
> 
> 
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Tuesday, October 21, 2014 6:33 PM
> > To: Liang, Cunming
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> > 
> > On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> > > Hi Neil,
> > >
> > > Very appreciate your comments.
> > > I add inline reply, will send v3 asap when we get alignment.
> > >
> > > BRs,
> > > Liang Cunming
> > >
> > > > -----Original Message-----
> > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > Sent: Saturday, October 11, 2014 1:52 AM
> > > > To: Liang, Cunming
> > > > Cc: dev@dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> > > >
> > > > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > > > It simply gives the average cycles of IO used per packet without test
> > equipment.
> > > > > When doing the test, make sure the link is UP.
> > > > >
> > > > > Usage Example:
> > > > > 1. Run unit test app in interactive mode
> > > > >     app/test -c f -n 4 -- -i
> > > > > 2. Run and wait for the result
> > > > >     pmd_perf_autotest
> > > > >
> > > > > There's option to choose rx/tx pair, default is vector.
> > > > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> > > > INC_VEC=y in config
> > > > >
> > > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > > >
> > > > Notes inline
> > > >
> > > > > ---
> > > > >  app/test/Makefile                   |    1 +
> > > > >  app/test/commands.c                 |   38 +++
> > > > >  app/test/packet_burst_generator.c   |    4 +-
> > > > >  app/test/test.h                     |    4 +
> > > > >  app/test/test_pmd_perf.c            |  626
> > > > +++++++++++++++++++++++++++++++++++
> > > > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > > > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > > > >  create mode 100644 app/test/test_pmd_perf.c
> > > > >
> > > > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > > > index 6af6d76..ebfa0ba 100644
> > > > > --- a/app/test/Makefile
> > > > > +++ b/app/test/Makefile
> > > > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > > > >
> > > > >  SRCS-y += test_ring.c
> > > > >  SRCS-y += test_ring_perf.c
> > > > > +SRCS-y += test_pmd_perf.c
> > > > >
> > > > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > > > >  SRCS-y += test_table.c
> > > > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > > > index a9e36b1..f1e746e 100644
> > > > > --- a/app/test/commands.c
> > > > > +++ b/app/test/commands.c
> > > > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > > > >
> > > > > +#define NB_ETHPORTS_USED                (1)
> > > > > +#define NB_SOCKETS                      (2)
> > > > > +#define MEMPOOL_CACHE_SIZE 250
> > > > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > > > RTE_PKTMBUF_HEADROOM)
> > > > Don't you want to size this in accordance with the amount of data your
> > sending
> > > > (64 Bytes as noted above)?
> > > [Liang, Cunming] The case is designed to measure small packet IO cost with
> > normal mbuf size.
> > > Even if decreasing the size, it won't gain significant cycles.
> > > >
> > That presumes a non-memory constrained system, doesn't it?  I suppose in the
> > end
> > as long as you have consistency, its not overly relevant, but it seems like
> > you'll want to add data sizing as a feature to this eventually (i.e. the ability
> > to test performance for larger frames sizes), at which point you'll need to make
> > this non-static anyway.
> [Liang, Cunming] For a normal Ethernet packet(w/o jumbo frame), packet size is 1518B.
> As in really network, there won't have huge number of jumbo frames.
> The mbuf size 2048 is a reasonable value to cover most of the packet size.
> It's also be chosen by lots of NIC as the default receiving buffer size in DMA register.
> In case larger than the size, it need do scatter and gather but lose some performance.
> The unit test won't measure size from 64 to 9600, won't plan to measure scatter-gather rx/tx.
> It focus on 64B packet size and taking the mbuf size being used the most often.
Fine.

> > 
> > > > > +static void
> > > > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > > > +{
> > > > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > > > +		eth_addr->addr_bytes[0],
> > > > > +		eth_addr->addr_bytes[1],
> > > > > +		eth_addr->addr_bytes[2],
> > > > > +		eth_addr->addr_bytes[3],
> > > > > +		eth_addr->addr_bytes[4],
> > > > > +		eth_addr->addr_bytes[5]);
> > > > > +}
> > > > > +
> > > > This was copieed from print_ethaddr.  Seems like a good candidate for a
> > common
> > > > function in rte_ether.h
> > > [Liang, Cunming] Agree with you, some of samples now use it with the same
> > copy.
> > > I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the
> > 48bits address output.
> > > And leaving other prints for application customization.
> > > >
> > Sounds good.
> > 
> > > >
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +signal_handler(int signum)
> > > > > +{
> > > > > +	/* When we receive a USR1 signal, print stats */
> > > > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits the
> > > > program
> > > [Liang, Cunming] Thanks, it's a typo.
> > > >
> > > > > +	if (signum == SIGUSR1) {
> > > > SIGINT instead.  Thats the common practice.
> > > [Liang, Cunming] I understood your opinion.
> > > The considerations I'm not using SIGINT instead are:
> > > 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command
> > interactive.
> > >   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> > > 2. By SIGINT semantic, expect to terminate the process.
> > >   Here I expect to force stop this case, but still alive in command line.
> > >   After it stopped, it can run again or start to run other test cases.
> > >   So I keep SIGINT, SIGUSR1 in different behavior.
> > > 3. It should be rarely used.
> > >   Only when exception timeout, I leave this backdoor for automation test
> > control.
> > >   For manual test, we can easily force kill the process.
> > >
> > Hmm, ok, that sounds reasonable.
> > 
> > > >
> > > > > +		printf("Force Stop!\n");
> > > > > +		stop = 1;
> > > > > +	}
> > > > > +	if (signum == SIGUSR2)
> > > > > +		stats_display(0);
> > > > > +}
> > > > > +/* main processing loop */
> > > > > +static int
> > > > > +main_loop(__rte_unused void *args)
> > > > > +{
> > > > > +#define PACKET_SIZE 64
> > > > > +#define FRAME_GAP 12
> > > > > +#define MAC_PREAMBLE 8
> > > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > > +	unsigned lcore_id;
> > > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > > +	struct lcore_conf *conf;
> > > > > +	uint64_t prev_tsc, cur_tsc;
> > > > > +	int pkt_per_port;
> > > > > +	uint64_t packets_per_second, total_packets;
> > > > > +
> > > > > +	lcore_id = rte_lcore_id();
> > > > > +	conf = &lcore_conf[lcore_id];
> > > > > +	if (conf->status != LCORE_USED)
> > > > > +		return 0;
> > > > > +
> > > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > > +
> > > > > +	int idx = 0;
> > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > +		int num = pkt_per_port;
> > > > > +		portid = conf->portlist[i];
> > > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > > +		while (num) {
> > > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > > +						&tx_burst[idx], nb_tx);
> > > > > +			num -= nb_tx;
> > > > > +			idx += nb_tx;
> > > > > +		}
> > > > > +	}
> > > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > > +
> > > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
> > > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > > +		+packets_per_second);
> > > > > +
> > > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > > packets_per_second;
> > > > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > > > +		+ total_packets);
> > > > > +
> > > > > +	prev_tsc = rte_rdtsc();
> > > > > +
> > > > > +	while (likely(!stop)) {
> > > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > > +			portid = conf->portlist[i];
> > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > > +			if (unlikely(nb_rx == 0)) {
> > > > > +				idle++;
> > > > > +				continue;
> > > > > +			}
> > > > > +
> > > > > +			count += nb_rx;
> > > > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> > return
> > > > -ENOTSUP
> > > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> > CONFIG.
> > > The error is used to avoid no function call register.
> > > When ETHDEV_DEBUG turn off, the NULL function call cause segfault directly.
> > > So I think it's a library internal error.
> > No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is null,
> > -ENOTSUPP will be returned to the application, you need to handle the error
> > condition.
> [Liang, Cunming] The runtime rte_eth_rx_burst is a inline function in rte_ethdev.h.
> The one you're talking about is the one defined in rte_ethdev.c which is a extern non-inline function.
> It works only when RTE_LIBRTE_ETHDEV_DEBUG turns on.
> If we always turns such library internal checking on, it lose performance.
> So I insist it's a library internal error checking, doesn't need to take care in application level.
I'm really flored that you would respond this way. 

What you have is two versions of a function, one returns errors and one doesn't,
and the version used is based on compile time configuration.  You've written
your application such that it can only gracefully handle one of the two
versions, and you're happy to allow the other version, the one thats supposed to
be in use when you are debugging applications, to create silent accounting
failures in your application.  And it completely ignores the fact that the
non-debug version might one day return errors as well (even if it doesn't
currently).  Your assertion above that we can ignore it because its debug code
is the most short sighted thing I've read in a long time.

> > 
> > > In such library exceptional case, I prefer not expecting sample/application to
> > condition check library functional error.
> > But you're assertion about the library handling the exception is wrong.
> > 
> > > >
> > > > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
> > > > Ditto with -ENOTSUP
> > > >
> > > > > +			if (unlikely(nb_tx < nb_rx)) {
> > > > What makes this unlikely?  Seems like a perfectly reasonable condition to
> > happen
> > > > to me.  If the network is busy, its completely likely that you will receive more
> > > > frames than you send, if you elect to receive all frames.
> > > [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> > > It firstly injects numbers of packets to NIC. Then NIC will loopback all packets to
> > ingress side.
> > > The code here will receive the packets, and send them back to NIC
> > > Packets looping inside all come from initial injection.
> > > As the total number of injected packets is much less than in-chip queue size,
> > the tx egress queue shouldn't block desc. ring update.
> > > So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot
> > archive line rate.
> > > When it happens, the cycles/packets result make no sense, as the bottle neck is
> > NIC.
> > > The drop counter can record it.
> > Ok.
> > 
> > > >
> > > > > +				drop += (nb_rx - nb_tx);
> > > > > +				do {
> > > > > +					rte_pktmbuf_free(pkts_burst[nb_tx]);
> > > > Defer this, it skews your timing
> > > [Liang, Cunming] Agree with you, I ever thought about it.
> > > This test cases is designed to measure pure IO RX/TX routine.
> > > When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or
> > drop it.
> > > Each way introduces noise(adding additional control code), resending much
> > times even cost more than free it.
> > > The cycles/packets is useful when there's no packet drop, otherwise it gives the
> > hint where the problem comes from (by idle or drop).
> > I'm not sure what you're asserting here.  Are you suggesting that you want to
> > include the time it takes to free memory buffers in your testing?  That seems
> > dubious at best to me.  If you want to measure I/O, thats great, but memory
> > management of packet buffers is a separate operation from that I/O
> [Liang, Cunming] Agree with you. I means it doesn't need to take care of the mbuf free cost.
> As I said in previous reply, it rarely happens in line rate.
> The cycle measurement doesn't make much sense if happens.
> On that time the unit test just notify it happens, and keep safe free all the unsent mbuf.
> 
> > 
> > > >
> > > > > +				} while (++nb_tx < nb_rx);
> > > > > +			}
> > > > > +		}
> > > > > +		if (unlikely(count >= total_packets))
> > > > > +			break;
> > > > Whats the reasoning here?  Do you only ever expect to receive frames that
> > you
> > > > send?  If so, seems like this should call for a big warning to get printed.
> > > [Liang, Cunming] The loop exits when the pre-calculated total_packets are
> > received.
> > > As the nb_rx is unpredictable, the packet counter may large equal than
> > total_packets the last time.
> > > The reason unlikely used here is because the condition becomes true only the
> > last time.
> > ok
> > 
> > > >
> > > > > +	}
> > > > > +
> > > > > +	cur_tsc = rte_rdtsc();
> > > > > +
> > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > +		portid = conf->portlist[i];
> > > > > +		int nb_free = pkt_per_port;
> > > > > +		do { /* dry out */
> > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > > +			nb_tx = 0;
> > > > > +			while (nb_tx < nb_rx)
> > > > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > > > +			nb_free -= nb_rx;
> > > > > +		} while (nb_free != 0);
> > > > > +		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
> > > > > +	}
> > > > > +
> > > > Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it
> > be
> > > > enough just to stop the interface?
> > > [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> > > But it's designed to run multi-times without exit, for the purpose of warming
> > up or for getting average number.
> > > So stopping device is not enough, we have to release the flying packets.
> > >
> > Ok
> > 
> > > >
> > > > > +	if (count == 0)
> > > > > +		return -1;
> > > > > +
> > > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > > +
> > > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped
> > (potentially
> > > > more than once) depending on your test length.  you need to check the tsc
> > before
> > > > and after each burst and record an average of deltas instead, accounting in
> > each
> > > > instance for the possibility of wrap.
> > > [Liang, Cunming] I'm not sure catch your point correctly.
> > > I think both cur_tsc and prev_tsc are 64 bits width.
> > > For 3GHz, I think it won't wrapped so quick.
> > > As it's uint64_t, so even get wrapped, the delta should still be correct.
> > But theres no guarantee that the tsc starts at zero when you begin your test.
> > The system may have been up for a long time and near wrapping already.
> > Regardless, you need to account for the possibility that cur_tsc is smaller
> > than prev_tsc, or this breaks.
> [Liang, Cunming] In case prev_tsc near wrapping, and cur_tsc get wrapped.
> As they are unsigned, it still ok.
> e.g. cur_tsc=0x2, prev_tsc=0xFFFFFFFFFFFFFFFC
> Delta=cur_tsc-prev_tsc is 6, which is still correct.
Ah, you're right, my fault.
> And for uint64_t, we need to start computer for hundreds of years.
That only assumes that the tsc has been kept at its initial state from boot, its
entirely possible that the tsc is writen to a different value.  IIRC, in several
debug configurations, the linux kernel inits the tsc to a large value so that it
rolls over shortly after boot to catch wrapping errors.  Though as you note,
your used of uint64 makes it irrelevant.

> > 
> > > >
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +test_pmd_perf(void)
> > > > > +{
> > > > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > > > +	uint16_t portid;
> > > > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > > > +	int socketid = -1;
> > > > > +	int ret;
> > > > > +
> > > > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > > > +
> > > > > +	signal(SIGUSR1, signal_handler);
> > > > Again SIGINT here.
> > > >
> > > > > +	signal(SIGUSR2, signal_handler);
> > > > > +
> > > > > +	nb_ports = rte_eth_dev_count();
> > > > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > > > +		printf("At least %u port(s) used for perf. test\n",
> > > > > +		       NB_ETHPORTS_USED);
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > > > +
> > > > > +	nb_lcores = rte_lcore_count();
> > > > > +
> > > > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > > > +	init_lcores();
> > > > > +
> > > > > +	init_mbufpool(NB_MBUF);
> > > > > +
> > > > > +	reset_count();
> > > > > +	num = 0;
> > > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > > +		if (socketid == -1) {
> > > > > +			socketid = rte_eth_dev_socket_id(portid);
> > > > > +			slave_id = alloc_lcore(socketid);
> > > > > +			if (slave_id == (uint16_t)-1) {
> > > > > +				printf("No avail lcore to run test\n");
> > > > > +				return -1;
> > > > > +			}
> > > > > +			printf("Performance test runs on lcore %u socket %u\n",
> > > > > +			       slave_id, socketid);
> > > > > +		}
> > > > > +
> > > > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > > > +			printf("Skip port %d\n", portid);
> > > > > +			continue;
> > > > > +		}
> > > > > +
> > > > > +		/* port configure */
> > > > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > > > +					    nb_tx_queue, &port_conf);
> > > > > +		if (ret < 0)
> > > > > +			rte_exit(EXIT_FAILURE,
> > > > > +				"Cannot configure device: err=%d, port=%d\n",
> > > > > +				 ret, portid);
> > > > > +
> > > > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > > > +		printf("Port %u ", portid);
> > > > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > > > +		printf("\n");
> > > > > +
> > > > > +		/* tx queue setup */
> > > > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > > > +					     socketid, &tx_conf);
> > > > > +		if (ret < 0)
> > > > > +			rte_exit(EXIT_FAILURE,
> > > > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > > > +				"port=%d\n", ret, portid);
> > > > > +
> > > > > +		/* rx queue steup */
> > > > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > > > +						socketid, &rx_conf,
> > > > > +						mbufpool[socketid]);
> > > > > +		if (ret < 0)
> > > > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
> > > > > +				 "port=%d\n", ret, portid);
> > > > > +
> > > > > +		/* Start device */
> > > > > +		stop = 0;
> > > > > +		ret = rte_eth_dev_start(portid);
> > > > > +		if (ret < 0)
> > > > > +			rte_exit(EXIT_FAILURE,
> > > > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > > > +				ret, portid);
> > > > > +
> > > > > +		/* always eanble promiscuous */
> > > > > +		rte_eth_promiscuous_enable(portid);
> > > > > +
> > > > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > > > +		lcore_conf[slave_id].nb_ports++;
> > > > > +	}
> > > > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > > > +
> > > > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > > > +
> > > > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > > > +		return -1;
> > > > > +
> > > > > +	/* port tear down */
> > > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > > > +			continue;
> > > > > +
> > > > > +		rte_eth_dev_stop(portid);
> > > > > +	}
> > > > > +
> > > > Clean up your allocated memory/lcores/etc?
> > > [Liang, Cunming] It do cleanup on the beginning of case.
> > > "Init_lcores","init_mbufpool","reset_count" guarantee the data clean before
> > each testing.
> > > And mbufpool only allocated once even if we run multiple times.
> > Its a janitorial issue.  Before the program exits, you need to free any
> > resources that you've allocated.
> [Liang, Cunming] Yes, I do but only not for the mbuf pool.
> Firstly the case exit is not equal to program exit. I expect the re-enter will use the same mempool, so that can do cache warm up.
> Secondly, DPDK doesn't supply really mempool release. The program exit, the memzone then free.
I continue to find the fact that freeing mempools isn't supported is rediculous,
but I suppose thats neither here nor there, you need to be able to clean up
resources that you allocate, weather you are exiting a test case, a program, or
simply a function, its just not sane to not be able to do that.

Given the return code check above, all thats left to say I think is NAK to this
code.  If Thomas wants to ignore me, I suppose thats his perogative, but I can't
see how you can ignore that case.

Neil
  
Cunming Liang Oct. 22, 2014, 2:48 p.m. UTC | #11
> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Wednesday, October 22, 2014 10:03 PM
> To: Liang, Cunming
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> On Tue, Oct 21, 2014 at 01:17:01PM +0000, Liang, Cunming wrote:
> >
> >
> > > -----Original Message-----
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Tuesday, October 21, 2014 6:33 PM
> > > To: Liang, Cunming
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > cycles/packet
> > >
> > > On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> > > > Hi Neil,
> > > >
> > > > Very appreciate your comments.
> > > > I add inline reply, will send v3 asap when we get alignment.
> > > >
> > > > BRs,
> > > > Liang Cunming
> > > >
> > > > > -----Original Message-----
> > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > Sent: Saturday, October 11, 2014 1:52 AM
> > > > > To: Liang, Cunming
> > > > > Cc: dev@dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > cycles/packet
> > > > >
> > > > > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > > > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > > > > It simply gives the average cycles of IO used per packet without test
> > > equipment.
> > > > > > When doing the test, make sure the link is UP.
> > > > > >
> > > > > > Usage Example:
> > > > > > 1. Run unit test app in interactive mode
> > > > > >     app/test -c f -n 4 -- -i
> > > > > > 2. Run and wait for the result
> > > > > >     pmd_perf_autotest
> > > > > >
> > > > > > There's option to choose rx/tx pair, default is vector.
> > > > > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > > > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid'
> without
> > > > > INC_VEC=y in config
> > > > > >
> > > > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > > > >
> > > > > Notes inline
> > > > >
> > > > > > ---
> > > > > >  app/test/Makefile                   |    1 +
> > > > > >  app/test/commands.c                 |   38 +++
> > > > > >  app/test/packet_burst_generator.c   |    4 +-
> > > > > >  app/test/test.h                     |    4 +
> > > > > >  app/test/test_pmd_perf.c            |  626
> > > > > +++++++++++++++++++++++++++++++++++
> > > > > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > > > > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > > > > >  create mode 100644 app/test/test_pmd_perf.c
> > > > > >
> > > > > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > > > > index 6af6d76..ebfa0ba 100644
> > > > > > --- a/app/test/Makefile
> > > > > > +++ b/app/test/Makefile
> > > > > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > > > > >
> > > > > >  SRCS-y += test_ring.c
> > > > > >  SRCS-y += test_ring_perf.c
> > > > > > +SRCS-y += test_pmd_perf.c
> > > > > >
> > > > > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > > > > >  SRCS-y += test_table.c
> > > > > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > > > > index a9e36b1..f1e746e 100644
> > > > > > --- a/app/test/commands.c
> > > > > > +++ b/app/test/commands.c
> > > > > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > > > > >
> > > > > > +#define NB_ETHPORTS_USED                (1)
> > > > > > +#define NB_SOCKETS                      (2)
> > > > > > +#define MEMPOOL_CACHE_SIZE 250
> > > > > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > > > > RTE_PKTMBUF_HEADROOM)
> > > > > Don't you want to size this in accordance with the amount of data your
> > > sending
> > > > > (64 Bytes as noted above)?
> > > > [Liang, Cunming] The case is designed to measure small packet IO cost with
> > > normal mbuf size.
> > > > Even if decreasing the size, it won't gain significant cycles.
> > > > >
> > > That presumes a non-memory constrained system, doesn't it?  I suppose in
> the
> > > end
> > > as long as you have consistency, its not overly relevant, but it seems like
> > > you'll want to add data sizing as a feature to this eventually (i.e. the ability
> > > to test performance for larger frames sizes), at which point you'll need to
> make
> > > this non-static anyway.
> > [Liang, Cunming] For a normal Ethernet packet(w/o jumbo frame), packet size is
> 1518B.
> > As in really network, there won't have huge number of jumbo frames.
> > The mbuf size 2048 is a reasonable value to cover most of the packet size.
> > It's also be chosen by lots of NIC as the default receiving buffer size in DMA
> register.
> > In case larger than the size, it need do scatter and gather but lose some
> performance.
> > The unit test won't measure size from 64 to 9600, won't plan to measure
> scatter-gather rx/tx.
> > It focus on 64B packet size and taking the mbuf size being used the most often.
> Fine.
> 
> > >
> > > > > > +static void
> > > > > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > > > > +{
> > > > > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > > > > +		eth_addr->addr_bytes[0],
> > > > > > +		eth_addr->addr_bytes[1],
> > > > > > +		eth_addr->addr_bytes[2],
> > > > > > +		eth_addr->addr_bytes[3],
> > > > > > +		eth_addr->addr_bytes[4],
> > > > > > +		eth_addr->addr_bytes[5]);
> > > > > > +}
> > > > > > +
> > > > > This was copieed from print_ethaddr.  Seems like a good candidate for a
> > > common
> > > > > function in rte_ether.h
> > > > [Liang, Cunming] Agree with you, some of samples now use it with the
> same
> > > copy.
> > > > I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the
> > > 48bits address output.
> > > > And leaving other prints for application customization.
> > > > >
> > > Sounds good.
> > >
> > > > >
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +signal_handler(int signum)
> > > > > > +{
> > > > > > +	/* When we receive a USR1 signal, print stats */
> > > > > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits
> the
> > > > > program
> > > > [Liang, Cunming] Thanks, it's a typo.
> > > > >
> > > > > > +	if (signum == SIGUSR1) {
> > > > > SIGINT instead.  Thats the common practice.
> > > > [Liang, Cunming] I understood your opinion.
> > > > The considerations I'm not using SIGINT instead are:
> > > > 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in
> command
> > > interactive.
> > > >   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> > > > 2. By SIGINT semantic, expect to terminate the process.
> > > >   Here I expect to force stop this case, but still alive in command line.
> > > >   After it stopped, it can run again or start to run other test cases.
> > > >   So I keep SIGINT, SIGUSR1 in different behavior.
> > > > 3. It should be rarely used.
> > > >   Only when exception timeout, I leave this backdoor for automation test
> > > control.
> > > >   For manual test, we can easily force kill the process.
> > > >
> > > Hmm, ok, that sounds reasonable.
> > >
> > > > >
> > > > > > +		printf("Force Stop!\n");
> > > > > > +		stop = 1;
> > > > > > +	}
> > > > > > +	if (signum == SIGUSR2)
> > > > > > +		stats_display(0);
> > > > > > +}
> > > > > > +/* main processing loop */
> > > > > > +static int
> > > > > > +main_loop(__rte_unused void *args)
> > > > > > +{
> > > > > > +#define PACKET_SIZE 64
> > > > > > +#define FRAME_GAP 12
> > > > > > +#define MAC_PREAMBLE 8
> > > > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > > > +	unsigned lcore_id;
> > > > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > > > +	struct lcore_conf *conf;
> > > > > > +	uint64_t prev_tsc, cur_tsc;
> > > > > > +	int pkt_per_port;
> > > > > > +	uint64_t packets_per_second, total_packets;
> > > > > > +
> > > > > > +	lcore_id = rte_lcore_id();
> > > > > > +	conf = &lcore_conf[lcore_id];
> > > > > > +	if (conf->status != LCORE_USED)
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > > > +
> > > > > > +	int idx = 0;
> > > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > > +		int num = pkt_per_port;
> > > > > > +		portid = conf->portlist[i];
> > > > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > > > +		while (num) {
> > > > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > > > +						&tx_burst[idx], nb_tx);
> > > > > > +			num -= nb_tx;
> > > > > > +			idx += nb_tx;
> > > > > > +		}
> > > > > > +	}
> > > > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > > > +
> > > > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) *
> CHAR_BIT);
> > > > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > > > +		+packets_per_second);
> > > > > > +
> > > > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > > > packets_per_second;
> > > > > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > > > > +		+ total_packets);
> > > > > > +
> > > > > > +	prev_tsc = rte_rdtsc();
> > > > > > +
> > > > > > +	while (likely(!stop)) {
> > > > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > > > +			portid = conf->portlist[i];
> > > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > > > +			if (unlikely(nb_rx == 0)) {
> > > > > > +				idle++;
> > > > > > +				continue;
> > > > > > +			}
> > > > > > +
> > > > > > +			count += nb_rx;
> > > > > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> > > return
> > > > > -ENOTSUP
> > > > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> > > CONFIG.
> > > > The error is used to avoid no function call register.
> > > > When ETHDEV_DEBUG turn off, the NULL function call cause segfault
> directly.
> > > > So I think it's a library internal error.
> > > No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is null,
> > > -ENOTSUPP will be returned to the application, you need to handle the error
> > > condition.
> > [Liang, Cunming] The runtime rte_eth_rx_burst is a inline function in
> rte_ethdev.h.
> > The one you're talking about is the one defined in rte_ethdev.c which is a
> extern non-inline function.
> > It works only when RTE_LIBRTE_ETHDEV_DEBUG turns on.
> > If we always turns such library internal checking on, it lose performance.
> > So I insist it's a library internal error checking, doesn't need to take care in
> application level.
> I'm really flored that you would respond this way.
> 
> What you have is two versions of a function, one returns errors and one doesn't,
> and the version used is based on compile time configuration.  You've written
> your application such that it can only gracefully handle one of the two
> versions, and you're happy to allow the other version, the one thats supposed to
> be in use when you are debugging applications, to create silent accounting
> failures in your application.  And it completely ignores the fact that the
> non-debug version might one day return errors as well (even if it doesn't
> currently).  Your assertion above that we can ignore it because its debug code
> is the most short sighted thing I've read in a long time.
> 
[Liang, Cunming] If as you said, two version of function, one returns errors and one doesn't.
I'll agree with your point. But in fact in either one, will cause error.
Non-debug version will just segment fault as calling NULL function pointer.
Debug version will print the error root cause.
If you searching the rte_eth_rx_burst calling in DPDK, it's used in every sample, no one checks it.
This function call is in fast path, I don't think it's a good idea pay additional cycles on such branch condition.
> > >
> > > > In such library exceptional case, I prefer not expecting sample/application to
> > > condition check library functional error.
> > > But you're assertion about the library handling the exception is wrong.
> > >
> > > > >
> > > > > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
> > > > > Ditto with -ENOTSUP
> > > > >
> > > > > > +			if (unlikely(nb_tx < nb_rx)) {
> > > > > What makes this unlikely?  Seems like a perfectly reasonable condition to
> > > happen
> > > > > to me.  If the network is busy, its completely likely that you will receive
> more
> > > > > frames than you send, if you elect to receive all frames.
> > > > [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> > > > It firstly injects numbers of packets to NIC. Then NIC will loopback all
> packets to
> > > ingress side.
> > > > The code here will receive the packets, and send them back to NIC
> > > > Packets looping inside all come from initial injection.
> > > > As the total number of injected packets is much less than in-chip queue
> size,
> > > the tx egress queue shouldn't block desc. ring update.
> > > > So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot
> > > archive line rate.
> > > > When it happens, the cycles/packets result make no sense, as the bottle
> neck is
> > > NIC.
> > > > The drop counter can record it.
> > > Ok.
> > >
> > > > >
> > > > > > +				drop += (nb_rx - nb_tx);
> > > > > > +				do {
> > > > > > +					rte_pktmbuf_free(pkts_burst[nb_tx]);
> > > > > Defer this, it skews your timing
> > > > [Liang, Cunming] Agree with you, I ever thought about it.
> > > > This test cases is designed to measure pure IO RX/TX routine.
> > > > When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or
> > > drop it.
> > > > Each way introduces noise(adding additional control code), resending much
> > > times even cost more than free it.
> > > > The cycles/packets is useful when there's no packet drop, otherwise it gives
> the
> > > hint where the problem comes from (by idle or drop).
> > > I'm not sure what you're asserting here.  Are you suggesting that you want
> to
> > > include the time it takes to free memory buffers in your testing?  That seems
> > > dubious at best to me.  If you want to measure I/O, thats great, but memory
> > > management of packet buffers is a separate operation from that I/O
> > [Liang, Cunming] Agree with you. I means it doesn't need to take care of the
> mbuf free cost.
> > As I said in previous reply, it rarely happens in line rate.
> > The cycle measurement doesn't make much sense if happens.
> > On that time the unit test just notify it happens, and keep safe free all the
> unsent mbuf.
> >
> > >
> > > > >
> > > > > > +				} while (++nb_tx < nb_rx);
> > > > > > +			}
> > > > > > +		}
> > > > > > +		if (unlikely(count >= total_packets))
> > > > > > +			break;
> > > > > Whats the reasoning here?  Do you only ever expect to receive frames
> that
> > > you
> > > > > send?  If so, seems like this should call for a big warning to get printed.
> > > > [Liang, Cunming] The loop exits when the pre-calculated total_packets are
> > > received.
> > > > As the nb_rx is unpredictable, the packet counter may large equal than
> > > total_packets the last time.
> > > > The reason unlikely used here is because the condition becomes true only
> the
> > > last time.
> > > ok
> > >
> > > > >
> > > > > > +	}
> > > > > > +
> > > > > > +	cur_tsc = rte_rdtsc();
> > > > > > +
> > > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > > +		portid = conf->portlist[i];
> > > > > > +		int nb_free = pkt_per_port;
> > > > > > +		do { /* dry out */
> > > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > > > +			nb_tx = 0;
> > > > > > +			while (nb_tx < nb_rx)
> > > > > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > > > > +			nb_free -= nb_rx;
> > > > > > +		} while (nb_free != 0);
> > > > > > +		printf("free %d mbuf left in port %u\n", pkt_per_port,
> portid);
> > > > > > +	}
> > > > > > +
> > > > > Whats the purpose of this?  Are you trying to flush the device?
> Wouldn't it
> > > be
> > > > > enough just to stop the interface?
> > > > [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> > > > But it's designed to run multi-times without exit, for the purpose of
> warming
> > > up or for getting average number.
> > > > So stopping device is not enough, we have to release the flying packets.
> > > >
> > > Ok
> > >
> > > > >
> > > > > > +	if (count == 0)
> > > > > > +		return -1;
> > > > > > +
> > > > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) /
> count);
> > > > > > +
> > > > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped
> > > (potentially
> > > > > more than once) depending on your test length.  you need to check the
> tsc
> > > before
> > > > > and after each burst and record an average of deltas instead, accounting
> in
> > > each
> > > > > instance for the possibility of wrap.
> > > > [Liang, Cunming] I'm not sure catch your point correctly.
> > > > I think both cur_tsc and prev_tsc are 64 bits width.
> > > > For 3GHz, I think it won't wrapped so quick.
> > > > As it's uint64_t, so even get wrapped, the delta should still be correct.
> > > But theres no guarantee that the tsc starts at zero when you begin your test.
> > > The system may have been up for a long time and near wrapping already.
> > > Regardless, you need to account for the possibility that cur_tsc is smaller
> > > than prev_tsc, or this breaks.
> > [Liang, Cunming] In case prev_tsc near wrapping, and cur_tsc get wrapped.
> > As they are unsigned, it still ok.
> > e.g. cur_tsc=0x2, prev_tsc=0xFFFFFFFFFFFFFFFC
> > Delta=cur_tsc-prev_tsc is 6, which is still correct.
> Ah, you're right, my fault.
> > And for uint64_t, we need to start computer for hundreds of years.
> That only assumes that the tsc has been kept at its initial state from boot, its
> entirely possible that the tsc is writen to a different value.  IIRC, in several
> debug configurations, the linux kernel inits the tsc to a large value so that it
> rolls over shortly after boot to catch wrapping errors.  Though as you note,
> your used of uint64 makes it irrelevant.
[Liang, Cunming] Ok, you're right. 
I think we've already got agreement on delta of tsc.
So we can close this one.
> 
> > >
> > > > >
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int
> > > > > > +test_pmd_perf(void)
> > > > > > +{
> > > > > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > > > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > > > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > > > > +	uint16_t portid;
> > > > > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > > > > +	int socketid = -1;
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > > > > +
> > > > > > +	signal(SIGUSR1, signal_handler);
> > > > > Again SIGINT here.
> > > > >
> > > > > > +	signal(SIGUSR2, signal_handler);
> > > > > > +
> > > > > > +	nb_ports = rte_eth_dev_count();
> > > > > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > > > > +		printf("At least %u port(s) used for perf. test\n",
> > > > > > +		       NB_ETHPORTS_USED);
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > > > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > > > > +
> > > > > > +	nb_lcores = rte_lcore_count();
> > > > > > +
> > > > > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > > > > +	init_lcores();
> > > > > > +
> > > > > > +	init_mbufpool(NB_MBUF);
> > > > > > +
> > > > > > +	reset_count();
> > > > > > +	num = 0;
> > > > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > > > +		if (socketid == -1) {
> > > > > > +			socketid = rte_eth_dev_socket_id(portid);
> > > > > > +			slave_id = alloc_lcore(socketid);
> > > > > > +			if (slave_id == (uint16_t)-1) {
> > > > > > +				printf("No avail lcore to run test\n");
> > > > > > +				return -1;
> > > > > > +			}
> > > > > > +			printf("Performance test runs on lcore %u socket %u\n",
> > > > > > +			       slave_id, socketid);
> > > > > > +		}
> > > > > > +
> > > > > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > > > > +			printf("Skip port %d\n", portid);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		/* port configure */
> > > > > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > > > > +					    nb_tx_queue, &port_conf);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE,
> > > > > > +				"Cannot configure device: err=%d, port=%d\n",
> > > > > > +				 ret, portid);
> > > > > > +
> > > > > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > > > > +		printf("Port %u ", portid);
> > > > > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > > > > +		printf("\n");
> > > > > > +
> > > > > > +		/* tx queue setup */
> > > > > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > > > > +					     socketid, &tx_conf);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE,
> > > > > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > > > > +				"port=%d\n", ret, portid);
> > > > > > +
> > > > > > +		/* rx queue steup */
> > > > > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > > > > +						socketid, &rx_conf,
> > > > > > +						mbufpool[socketid]);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:
> err=%d,"
> > > > > > +				 "port=%d\n", ret, portid);
> > > > > > +
> > > > > > +		/* Start device */
> > > > > > +		stop = 0;
> > > > > > +		ret = rte_eth_dev_start(portid);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE,
> > > > > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > > > > +				ret, portid);
> > > > > > +
> > > > > > +		/* always eanble promiscuous */
> > > > > > +		rte_eth_promiscuous_enable(portid);
> > > > > > +
> > > > > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > > > > +		lcore_conf[slave_id].nb_ports++;
> > > > > > +	}
> > > > > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > > > > +
> > > > > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > > > > +
> > > > > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > > > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > > > > +		return -1;
> > > > > > +
> > > > > > +	/* port tear down */
> > > > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		rte_eth_dev_stop(portid);
> > > > > > +	}
> > > > > > +
> > > > > Clean up your allocated memory/lcores/etc?
> > > > [Liang, Cunming] It do cleanup on the beginning of case.
> > > > "Init_lcores","init_mbufpool","reset_count" guarantee the data clean
> before
> > > each testing.
> > > > And mbufpool only allocated once even if we run multiple times.
> > > Its a janitorial issue.  Before the program exits, you need to free any
> > > resources that you've allocated.
> > [Liang, Cunming] Yes, I do but only not for the mbuf pool.
> > Firstly the case exit is not equal to program exit. I expect the re-enter will use
> the same mempool, so that can do cache warm up.
> > Secondly, DPDK doesn't supply really mempool release. The program exit, the
> memzone then free.
> I continue to find the fact that freeing mempools isn't supported is rediculous,
> but I suppose thats neither here nor there, you need to be able to clean up
> resources that you allocate, weather you are exiting a test case, a program, or
> simply a function, its just not sane to not be able to do that.
[Liang, Cunming] Got it, agree with you on the principle.
> 
> Given the return code check above, all thats left to say I think is NAK to this
> code.  If Thomas wants to ignore me, I suppose thats his perogative, but I can't
> see how you can ignore that case.
> 
> Neil
  
Ananyev, Konstantin Oct. 22, 2014, 2:53 p.m. UTC | #12
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> Sent: Wednesday, October 22, 2014 3:03 PM
> To: Liang, Cunming
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx cycles/packet
> 
> On Tue, Oct 21, 2014 at 01:17:01PM +0000, Liang, Cunming wrote:
> >
> >
> > > -----Original Message-----
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Tuesday, October 21, 2014 6:33 PM
> > > To: Liang, Cunming
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > cycles/packet
> > >
> > > On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> > > > Hi Neil,
> > > >
> > > > Very appreciate your comments.
> > > > I add inline reply, will send v3 asap when we get alignment.
> > > >
> > > > BRs,
> > > > Liang Cunming
> > > >
> > > > > -----Original Message-----
> > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > Sent: Saturday, October 11, 2014 1:52 AM
> > > > > To: Liang, Cunming
> > > > > Cc: dev@dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > cycles/packet
> > > > >
> > > > > On Fri, Oct 10, 2014 at 08:29:58PM +0800, Cunming Liang wrote:
> > > > > > It provides unit test to measure cycles/packet in NIC loopback mode.
> > > > > > It simply gives the average cycles of IO used per packet without test
> > > equipment.
> > > > > > When doing the test, make sure the link is UP.
> > > > > >
> > > > > > Usage Example:
> > > > > > 1. Run unit test app in interactive mode
> > > > > >     app/test -c f -n 4 -- -i
> > > > > > 2. Run and wait for the result
> > > > > >     pmd_perf_autotest
> > > > > >
> > > > > > There's option to choose rx/tx pair, default is vector.
> > > > > >     set_rxtx_mode [vector|scalar|full|hybrid]
> > > > > > Note: To get acurate scalar fast, please choose 'vector' or 'hybrid' without
> > > > > INC_VEC=y in config
> > > > > >
> > > > > > Signed-off-by: Cunming Liang <cunming.liang@intel.com>
> > > > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > > > >
> > > > > Notes inline
> > > > >
> > > > > > ---
> > > > > >  app/test/Makefile                   |    1 +
> > > > > >  app/test/commands.c                 |   38 +++
> > > > > >  app/test/packet_burst_generator.c   |    4 +-
> > > > > >  app/test/test.h                     |    4 +
> > > > > >  app/test/test_pmd_perf.c            |  626
> > > > > +++++++++++++++++++++++++++++++++++
> > > > > >  lib/librte_pmd_ixgbe/ixgbe_ethdev.c |    6 +
> > > > > >  6 files changed, 677 insertions(+), 2 deletions(-)
> > > > > >  create mode 100644 app/test/test_pmd_perf.c
> > > > > >
> > > > > > diff --git a/app/test/Makefile b/app/test/Makefile
> > > > > > index 6af6d76..ebfa0ba 100644
> > > > > > --- a/app/test/Makefile
> > > > > > +++ b/app/test/Makefile
> > > > > > @@ -56,6 +56,7 @@ SRCS-y += test_memzone.c
> > > > > >
> > > > > >  SRCS-y += test_ring.c
> > > > > >  SRCS-y += test_ring_perf.c
> > > > > > +SRCS-y += test_pmd_perf.c
> > > > > >
> > > > > >  ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
> > > > > >  SRCS-y += test_table.c
> > > > > > diff --git a/app/test/commands.c b/app/test/commands.c
> > > > > > index a9e36b1..f1e746e 100644
> > > > > > --- a/app/test/commands.c
> > > > > > +++ b/app/test/commands.c
> > > > > > @@ -310,12 +310,50 @@ cmdline_parse_inst_t cmd_quit = {
> > > > > >
> > > > > > +#define NB_ETHPORTS_USED                (1)
> > > > > > +#define NB_SOCKETS                      (2)
> > > > > > +#define MEMPOOL_CACHE_SIZE 250
> > > > > > +#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) +
> > > > > RTE_PKTMBUF_HEADROOM)
> > > > > Don't you want to size this in accordance with the amount of data your
> > > sending
> > > > > (64 Bytes as noted above)?
> > > > [Liang, Cunming] The case is designed to measure small packet IO cost with
> > > normal mbuf size.
> > > > Even if decreasing the size, it won't gain significant cycles.
> > > > >
> > > That presumes a non-memory constrained system, doesn't it?  I suppose in the
> > > end
> > > as long as you have consistency, its not overly relevant, but it seems like
> > > you'll want to add data sizing as a feature to this eventually (i.e. the ability
> > > to test performance for larger frames sizes), at which point you'll need to make
> > > this non-static anyway.
> > [Liang, Cunming] For a normal Ethernet packet(w/o jumbo frame), packet size is 1518B.
> > As in really network, there won't have huge number of jumbo frames.
> > The mbuf size 2048 is a reasonable value to cover most of the packet size.
> > It's also be chosen by lots of NIC as the default receiving buffer size in DMA register.
> > In case larger than the size, it need do scatter and gather but lose some performance.
> > The unit test won't measure size from 64 to 9600, won't plan to measure scatter-gather rx/tx.
> > It focus on 64B packet size and taking the mbuf size being used the most often.
> Fine.
> 
> > >
> > > > > > +static void
> > > > > > +print_ethaddr(const char *name, const struct ether_addr *eth_addr)
> > > > > > +{
> > > > > > +	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
> > > > > > +		eth_addr->addr_bytes[0],
> > > > > > +		eth_addr->addr_bytes[1],
> > > > > > +		eth_addr->addr_bytes[2],
> > > > > > +		eth_addr->addr_bytes[3],
> > > > > > +		eth_addr->addr_bytes[4],
> > > > > > +		eth_addr->addr_bytes[5]);
> > > > > > +}
> > > > > > +
> > > > > This was copieed from print_ethaddr.  Seems like a good candidate for a
> > > common
> > > > > function in rte_ether.h
> > > > [Liang, Cunming] Agree with you, some of samples now use it with the same
> > > copy.
> > > > I'll rework it. Adding 'ether_format_addr' in rte_ether.h only for format the
> > > 48bits address output.
> > > > And leaving other prints for application customization.
> > > > >
> > > Sounds good.
> > >
> > > > >
> > > > > > +}
> > > > > > +
> > > > > > +static void
> > > > > > +signal_handler(int signum)
> > > > > > +{
> > > > > > +	/* When we receive a USR1 signal, print stats */
> > > > > I think you mean SIGUSR2, below, SIGUSR1 tears the test down and exits the
> > > > > program
> > > > [Liang, Cunming] Thanks, it's a typo.
> > > > >
> > > > > > +	if (signum == SIGUSR1) {
> > > > > SIGINT instead.  Thats the common practice.
> > > > [Liang, Cunming] I understood your opinion.
> > > > The considerations I'm not using SIGINT instead are:
> > > > 1. We unset ISIG in c_lflag of term. CRTL+C won't trigger SIGINT in command
> > > interactive.
> > > >   It always has to explicitly send signal. No matter SIGUSR1 or SIGINT.
> > > > 2. By SIGINT semantic, expect to terminate the process.
> > > >   Here I expect to force stop this case, but still alive in command line.
> > > >   After it stopped, it can run again or start to run other test cases.
> > > >   So I keep SIGINT, SIGUSR1 in different behavior.
> > > > 3. It should be rarely used.
> > > >   Only when exception timeout, I leave this backdoor for automation test
> > > control.
> > > >   For manual test, we can easily force kill the process.
> > > >
> > > Hmm, ok, that sounds reasonable.
> > >
> > > > >
> > > > > > +		printf("Force Stop!\n");
> > > > > > +		stop = 1;
> > > > > > +	}
> > > > > > +	if (signum == SIGUSR2)
> > > > > > +		stats_display(0);
> > > > > > +}
> > > > > > +/* main processing loop */
> > > > > > +static int
> > > > > > +main_loop(__rte_unused void *args)
> > > > > > +{
> > > > > > +#define PACKET_SIZE 64
> > > > > > +#define FRAME_GAP 12
> > > > > > +#define MAC_PREAMBLE 8
> > > > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > > > +	unsigned lcore_id;
> > > > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > > > +	struct lcore_conf *conf;
> > > > > > +	uint64_t prev_tsc, cur_tsc;
> > > > > > +	int pkt_per_port;
> > > > > > +	uint64_t packets_per_second, total_packets;
> > > > > > +
> > > > > > +	lcore_id = rte_lcore_id();
> > > > > > +	conf = &lcore_conf[lcore_id];
> > > > > > +	if (conf->status != LCORE_USED)
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > > > +
> > > > > > +	int idx = 0;
> > > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > > +		int num = pkt_per_port;
> > > > > > +		portid = conf->portlist[i];
> > > > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > > > +		while (num) {
> > > > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > > > +						&tx_burst[idx], nb_tx);
> > > > > > +			num -= nb_tx;
> > > > > > +			idx += nb_tx;
> > > > > > +		}
> > > > > > +	}
> > > > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > > > +
> > > > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
> > > > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > > > +		+packets_per_second);
> > > > > > +
> > > > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > > > packets_per_second;
> > > > > > +	printf("Test will stop after at least %"PRIu64" packets received\n",
> > > > > > +		+ total_packets);
> > > > > > +
> > > > > > +	prev_tsc = rte_rdtsc();
> > > > > > +
> > > > > > +	while (likely(!stop)) {
> > > > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > > > +			portid = conf->portlist[i];
> > > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > > > +			if (unlikely(nb_rx == 0)) {
> > > > > > +				idle++;
> > > > > > +				continue;
> > > > > > +			}
> > > > > > +
> > > > > > +			count += nb_rx;
> > > > > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> > > return
> > > > > -ENOTSUP
> > > > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> > > CONFIG.
> > > > The error is used to avoid no function call register.
> > > > When ETHDEV_DEBUG turn off, the NULL function call cause segfault directly.
> > > > So I think it's a library internal error.
> > > No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is null,
> > > -ENOTSUPP will be returned to the application, you need to handle the error
> > > condition.
> > [Liang, Cunming] The runtime rte_eth_rx_burst is a inline function in rte_ethdev.h.
> > The one you're talking about is the one defined in rte_ethdev.c which is a extern non-inline function.
> > It works only when RTE_LIBRTE_ETHDEV_DEBUG turns on.
> > If we always turns such library internal checking on, it lose performance.
> > So I insist it's a library internal error checking, doesn't need to take care in application level.
> I'm really flored that you would respond this way.
> 
> What you have is two versions of a function, one returns errors and one doesn't,
> and the version used is based on compile time configuration.  You've written
> your application such that it can only gracefully handle one of the two
> versions, and you're happy to allow the other version, the one thats supposed to
> be in use when you are debugging applications, to create silent accounting
> failures in your application.  And it completely ignores the fact that the
> non-debug version might one day return errors as well (even if it doesn't
> currently).  Your assertion above that we can ignore it because its debug code
> is the most short sighted thing I've read in a long time.

Actually looking at rte_eth_rx_burst(): it returns uint16_t. 
Which is sort of a strange if it expects to return a negative value as error code.
Also reading though 'formal' comments of rte_eth_rx_burst() - it seems that it not supposed to return negative values:
"...
The rte_eth_rx_burst() function does not provide any error
 notification to avoid the corresponding overhead. As a hint, the
 upper-level application might check the status of the device link once
 being systematically returned a 0 value for a given number of tries.
...
@return
 *   The number of packets actually retrieved, which is the number
 *   of pointers to *rte_mbuf* structures effectively supplied to the
 *   *rx_pkts* array."

Again, if you looks at rte_eth_rx_burst() implementation, when RTE_LIBRTE_ETHDEV_DEBUG is on -
For some error cases it returns '0', foor other's: -ENOTSUP:

FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, -ENOTSUP);
if (queue_id >= dev->data->nb_rx_queues) {
                PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
                return 0;
}

Which makes me thing that we just have errnoneous implementation of rte_eth_rx_burst()
when RTE_LIBRTE_ETHDEV_DEBUG is on.
Probably just a mechanical mistake, when FUNC_PTR_OR_ERR_RET() was added.
 I'd say we change rte_eth_rx_burst() to always return 0 (as was initially planned).

Konstantin


> 
> > >
> > > > In such library exceptional case, I prefer not expecting sample/application to
> > > condition check library functional error.
> > > But you're assertion about the library handling the exception is wrong.
> > >
> > > > >
> > > > > > +			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
> > > > > Ditto with -ENOTSUP
> > > > >
> > > > > > +			if (unlikely(nb_tx < nb_rx)) {
> > > > > What makes this unlikely?  Seems like a perfectly reasonable condition to
> > > happen
> > > > > to me.  If the network is busy, its completely likely that you will receive more
> > > > > frames than you send, if you elect to receive all frames.
> > > > [Liang, Cunming] For this case, NIC works in MAC loopback mode.
> > > > It firstly injects numbers of packets to NIC. Then NIC will loopback all packets to
> > > ingress side.
> > > > The code here will receive the packets, and send them back to NIC
> > > > Packets looping inside all come from initial injection.
> > > > As the total number of injected packets is much less than in-chip queue size,
> > > the tx egress queue shouldn't block desc. ring update.
> > > > So when receiving packets in line rate and nb_tx < nb_rx, it means tx cannot
> > > archive line rate.
> > > > When it happens, the cycles/packets result make no sense, as the bottle neck is
> > > NIC.
> > > > The drop counter can record it.
> > > Ok.
> > >
> > > > >
> > > > > > +				drop += (nb_rx - nb_tx);
> > > > > > +				do {
> > > > > > +					rte_pktmbuf_free(pkts_burst[nb_tx]);
> > > > > Defer this, it skews your timing
> > > > [Liang, Cunming] Agree with you, I ever thought about it.
> > > > This test cases is designed to measure pure IO RX/TX routine.
> > > > When doing tx burst with result nb_tx < nb_rx, we either repeat re-send or
> > > drop it.
> > > > Each way introduces noise(adding additional control code), resending much
> > > times even cost more than free it.
> > > > The cycles/packets is useful when there's no packet drop, otherwise it gives the
> > > hint where the problem comes from (by idle or drop).
> > > I'm not sure what you're asserting here.  Are you suggesting that you want to
> > > include the time it takes to free memory buffers in your testing?  That seems
> > > dubious at best to me.  If you want to measure I/O, thats great, but memory
> > > management of packet buffers is a separate operation from that I/O
> > [Liang, Cunming] Agree with you. I means it doesn't need to take care of the mbuf free cost.
> > As I said in previous reply, it rarely happens in line rate.
> > The cycle measurement doesn't make much sense if happens.
> > On that time the unit test just notify it happens, and keep safe free all the unsent mbuf.
> >
> > >
> > > > >
> > > > > > +				} while (++nb_tx < nb_rx);
> > > > > > +			}
> > > > > > +		}
> > > > > > +		if (unlikely(count >= total_packets))
> > > > > > +			break;
> > > > > Whats the reasoning here?  Do you only ever expect to receive frames that
> > > you
> > > > > send?  If so, seems like this should call for a big warning to get printed.
> > > > [Liang, Cunming] The loop exits when the pre-calculated total_packets are
> > > received.
> > > > As the nb_rx is unpredictable, the packet counter may large equal than
> > > total_packets the last time.
> > > > The reason unlikely used here is because the condition becomes true only the
> > > last time.
> > > ok
> > >
> > > > >
> > > > > > +	}
> > > > > > +
> > > > > > +	cur_tsc = rte_rdtsc();
> > > > > > +
> > > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > > +		portid = conf->portlist[i];
> > > > > > +		int nb_free = pkt_per_port;
> > > > > > +		do { /* dry out */
> > > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > > +						 pkts_burst, MAX_PKT_BURST);
> > > > > > +			nb_tx = 0;
> > > > > > +			while (nb_tx < nb_rx)
> > > > > > +				rte_pktmbuf_free(pkts_burst[nb_tx++]);
> > > > > > +			nb_free -= nb_rx;
> > > > > > +		} while (nb_free != 0);
> > > > > > +		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
> > > > > > +	}
> > > > > > +
> > > > > Whats the purpose of this?  Are you trying to flush the device?  Wouldn't it
> > > be
> > > > > enough just to stop the interface?
> > > > [Liang, Cunming] If we only run the cases once and exit, it doesn't matter.
> > > > But it's designed to run multi-times without exit, for the purpose of warming
> > > up or for getting average number.
> > > > So stopping device is not enough, we have to release the flying packets.
> > > >
> > > Ok
> > >
> > > > >
> > > > > > +	if (count == 0)
> > > > > > +		return -1;
> > > > > > +
> > > > > > +	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
> > > > > > +	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
> > > > > > +
> > > > > Bad math here.  Theres no guarantee that the tsc hasn't wrapped
> > > (potentially
> > > > > more than once) depending on your test length.  you need to check the tsc
> > > before
> > > > > and after each burst and record an average of deltas instead, accounting in
> > > each
> > > > > instance for the possibility of wrap.
> > > > [Liang, Cunming] I'm not sure catch your point correctly.
> > > > I think both cur_tsc and prev_tsc are 64 bits width.
> > > > For 3GHz, I think it won't wrapped so quick.
> > > > As it's uint64_t, so even get wrapped, the delta should still be correct.
> > > But theres no guarantee that the tsc starts at zero when you begin your test.
> > > The system may have been up for a long time and near wrapping already.
> > > Regardless, you need to account for the possibility that cur_tsc is smaller
> > > than prev_tsc, or this breaks.
> > [Liang, Cunming] In case prev_tsc near wrapping, and cur_tsc get wrapped.
> > As they are unsigned, it still ok.
> > e.g. cur_tsc=0x2, prev_tsc=0xFFFFFFFFFFFFFFFC
> > Delta=cur_tsc-prev_tsc is 6, which is still correct.
> Ah, you're right, my fault.
> > And for uint64_t, we need to start computer for hundreds of years.
> That only assumes that the tsc has been kept at its initial state from boot, its
> entirely possible that the tsc is writen to a different value.  IIRC, in several
> debug configurations, the linux kernel inits the tsc to a large value so that it
> rolls over shortly after boot to catch wrapping errors.  Though as you note,
> your used of uint64 makes it irrelevant.
> 
> > >
> > > > >
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int
> > > > > > +test_pmd_perf(void)
> > > > > > +{
> > > > > > +	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
> > > > > > +	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
> > > > > > +	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
> > > > > > +	uint16_t portid;
> > > > > > +	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
> > > > > > +	int socketid = -1;
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	printf("Start PMD RXTX cycles cost test.\n");
> > > > > > +
> > > > > > +	signal(SIGUSR1, signal_handler);
> > > > > Again SIGINT here.
> > > > >
> > > > > > +	signal(SIGUSR2, signal_handler);
> > > > > > +
> > > > > > +	nb_ports = rte_eth_dev_count();
> > > > > > +	if (nb_ports < NB_ETHPORTS_USED) {
> > > > > > +		printf("At least %u port(s) used for perf. test\n",
> > > > > > +		       NB_ETHPORTS_USED);
> > > > > > +		return -1;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (nb_ports > RTE_MAX_ETHPORTS)
> > > > > > +		nb_ports = RTE_MAX_ETHPORTS;
> > > > > > +
> > > > > > +	nb_lcores = rte_lcore_count();
> > > > > > +
> > > > > > +	memset(lcore_conf, 0, sizeof(lcore_conf));
> > > > > > +	init_lcores();
> > > > > > +
> > > > > > +	init_mbufpool(NB_MBUF);
> > > > > > +
> > > > > > +	reset_count();
> > > > > > +	num = 0;
> > > > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > > > +		if (socketid == -1) {
> > > > > > +			socketid = rte_eth_dev_socket_id(portid);
> > > > > > +			slave_id = alloc_lcore(socketid);
> > > > > > +			if (slave_id == (uint16_t)-1) {
> > > > > > +				printf("No avail lcore to run test\n");
> > > > > > +				return -1;
> > > > > > +			}
> > > > > > +			printf("Performance test runs on lcore %u socket %u\n",
> > > > > > +			       slave_id, socketid);
> > > > > > +		}
> > > > > > +
> > > > > > +		if (socketid != rte_eth_dev_socket_id(portid)) {
> > > > > > +			printf("Skip port %d\n", portid);
> > > > > > +			continue;
> > > > > > +		}
> > > > > > +
> > > > > > +		/* port configure */
> > > > > > +		ret = rte_eth_dev_configure(portid, nb_rx_queue,
> > > > > > +					    nb_tx_queue, &port_conf);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE,
> > > > > > +				"Cannot configure device: err=%d, port=%d\n",
> > > > > > +				 ret, portid);
> > > > > > +
> > > > > > +		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
> > > > > > +		printf("Port %u ", portid);
> > > > > > +		print_ethaddr("Address:", &ports_eth_addr[portid]);
> > > > > > +		printf("\n");
> > > > > > +
> > > > > > +		/* tx queue setup */
> > > > > > +		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
> > > > > > +					     socketid, &tx_conf);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE,
> > > > > > +				"rte_eth_tx_queue_setup: err=%d, "
> > > > > > +				"port=%d\n", ret, portid);
> > > > > > +
> > > > > > +		/* rx queue steup */
> > > > > > +		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
> > > > > > +						socketid, &rx_conf,
> > > > > > +						mbufpool[socketid]);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
> > > > > > +				 "port=%d\n", ret, portid);
> > > > > > +
> > > > > > +		/* Start device */
> > > > > > +		stop = 0;
> > > > > > +		ret = rte_eth_dev_start(portid);
> > > > > > +		if (ret < 0)
> > > > > > +			rte_exit(EXIT_FAILURE,
> > > > > > +				"rte_eth_dev_start: err=%d, port=%d\n",
> > > > > > +				ret, portid);
> > > > > > +
> > > > > > +		/* always eanble promiscuous */
> > > > > > +		rte_eth_promiscuous_enable(portid);
> > > > > > +
> > > > > > +		lcore_conf[slave_id].portlist[num++] = portid;
> > > > > > +		lcore_conf[slave_id].nb_ports++;
> > > > > > +	}
> > > > > > +	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
> > > > > > +
> > > > > > +	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
> > > > > > +
> > > > > > +	rte_eal_remote_launch(main_loop, NULL, slave_id);
> > > > > > +	if (rte_eal_wait_lcore(slave_id) < 0)
> > > > > > +		return -1;
> > > > > > +
> > > > > > +	/* port tear down */
> > > > > > +	for (portid = 0; portid < nb_ports; portid++) {
> > > > > > +		if (socketid != rte_eth_dev_socket_id(portid))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		rte_eth_dev_stop(portid);
> > > > > > +	}
> > > > > > +
> > > > > Clean up your allocated memory/lcores/etc?
> > > > [Liang, Cunming] It do cleanup on the beginning of case.
> > > > "Init_lcores","init_mbufpool","reset_count" guarantee the data clean before
> > > each testing.
> > > > And mbufpool only allocated once even if we run multiple times.
> > > Its a janitorial issue.  Before the program exits, you need to free any
> > > resources that you've allocated.
> > [Liang, Cunming] Yes, I do but only not for the mbuf pool.
> > Firstly the case exit is not equal to program exit. I expect the re-enter will use the same mempool, so that can do cache warm up.
> > Secondly, DPDK doesn't supply really mempool release. The program exit, the memzone then free.
> I continue to find the fact that freeing mempools isn't supported is rediculous,
> but I suppose thats neither here nor there, you need to be able to clean up
> resources that you allocate, weather you are exiting a test case, a program, or
> simply a function, its just not sane to not be able to do that.
> 
> Given the return code check above, all thats left to say I think is NAK to this
> code.  If Thomas wants to ignore me, I suppose thats his perogative, but I can't
> see how you can ignore that case.
> 
> Neil
  
Bruce Richardson Oct. 22, 2014, 3:09 p.m. UTC | #13
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev, Konstantin
> Sent: Wednesday, October 22, 2014 3:53 PM
> To: Neil Horman; Liang, Cunming
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> > Sent: Wednesday, October 22, 2014 3:03 PM
> > To: Liang, Cunming
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> >
> > On Tue, Oct 21, 2014 at 01:17:01PM +0000, Liang, Cunming wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > Sent: Tuesday, October 21, 2014 6:33 PM
> > > > To: Liang, Cunming
> > > > Cc: dev@dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > > cycles/packet
> > > >
> > > > On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> > > > > Hi Neil,
> > > > >
> > > > > Very appreciate your comments.
> > > > > I add inline reply, will send v3 asap when we get alignment.
> > > > >
> > > > > BRs,
> > > > > Liang Cunming
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > > Sent: Saturday, October 11, 2014 1:52 AM
> > > > > > To: Liang, Cunming
> > > > > > Cc: dev@dpdk.org
> > > > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > > cycles/packet
> > > > > >
<...snip...>
> > > > > >
> > > > > > > +		printf("Force Stop!\n");
> > > > > > > +		stop = 1;
> > > > > > > +	}
> > > > > > > +	if (signum == SIGUSR2)
> > > > > > > +		stats_display(0);
> > > > > > > +}
> > > > > > > +/* main processing loop */
> > > > > > > +static int
> > > > > > > +main_loop(__rte_unused void *args)
> > > > > > > +{
> > > > > > > +#define PACKET_SIZE 64
> > > > > > > +#define FRAME_GAP 12
> > > > > > > +#define MAC_PREAMBLE 8
> > > > > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > > > > +	unsigned lcore_id;
> > > > > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > > > > +	struct lcore_conf *conf;
> > > > > > > +	uint64_t prev_tsc, cur_tsc;
> > > > > > > +	int pkt_per_port;
> > > > > > > +	uint64_t packets_per_second, total_packets;
> > > > > > > +
> > > > > > > +	lcore_id = rte_lcore_id();
> > > > > > > +	conf = &lcore_conf[lcore_id];
> > > > > > > +	if (conf->status != LCORE_USED)
> > > > > > > +		return 0;
> > > > > > > +
> > > > > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > > > > +
> > > > > > > +	int idx = 0;
> > > > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > > > +		int num = pkt_per_port;
> > > > > > > +		portid = conf->portlist[i];
> > > > > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > > > > +		while (num) {
> > > > > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > > > > +						&tx_burst[idx], nb_tx);
> > > > > > > +			num -= nb_tx;
> > > > > > > +			idx += nb_tx;
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > > > > +
> > > > > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) *
> CHAR_BIT);
> > > > > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > > > > +		+packets_per_second);
> > > > > > > +
> > > > > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > > > > packets_per_second;
> > > > > > > +	printf("Test will stop after at least %"PRIu64" packets
> received\n",
> > > > > > > +		+ total_packets);
> > > > > > > +
> > > > > > > +	prev_tsc = rte_rdtsc();
> > > > > > > +
> > > > > > > +	while (likely(!stop)) {
> > > > > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > > > > +			portid = conf->portlist[i];
> > > > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > > > +						 pkts_burst,
> MAX_PKT_BURST);
> > > > > > > +			if (unlikely(nb_rx == 0)) {
> > > > > > > +				idle++;
> > > > > > > +				continue;
> > > > > > > +			}
> > > > > > > +
> > > > > > > +			count += nb_rx;
> > > > > > Doesn't take into consideration error conditions.  rte_eth_rx_burst can
> > > > return
> > > > > > -ENOTSUP
> > > > > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> > > > CONFIG.
> > > > > The error is used to avoid no function call register.
> > > > > When ETHDEV_DEBUG turn off, the NULL function call cause segfault
> directly.
> > > > > So I think it's a library internal error.
> > > > No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is
> null,
> > > > -ENOTSUPP will be returned to the application, you need to handle the
> error
> > > > condition.
> > > [Liang, Cunming] The runtime rte_eth_rx_burst is a inline function in
> rte_ethdev.h.
> > > The one you're talking about is the one defined in rte_ethdev.c which is a
> extern non-inline function.
> > > It works only when RTE_LIBRTE_ETHDEV_DEBUG turns on.
> > > If we always turns such library internal checking on, it lose performance.
> > > So I insist it's a library internal error checking, doesn't need to take care in
> application level.
> > I'm really flored that you would respond this way.
> >
> > What you have is two versions of a function, one returns errors and one
> doesn't,
> > and the version used is based on compile time configuration.  You've written
> > your application such that it can only gracefully handle one of the two
> > versions, and you're happy to allow the other version, the one thats supposed
> to
> > be in use when you are debugging applications, to create silent accounting
> > failures in your application.  And it completely ignores the fact that the
> > non-debug version might one day return errors as well (even if it doesn't
> > currently).  Your assertion above that we can ignore it because its debug code
> > is the most short sighted thing I've read in a long time.
> 
> Actually looking at rte_eth_rx_burst(): it returns uint16_t.
> Which is sort of a strange if it expects to return a negative value as error code.
> Also reading though 'formal' comments of rte_eth_rx_burst() - it seems that it
> not supposed to return negative values:
> "...
> The rte_eth_rx_burst() function does not provide any error
>  notification to avoid the corresponding overhead. As a hint, the
>  upper-level application might check the status of the device link once
>  being systematically returned a 0 value for a given number of tries.
> ...
> @return
>  *   The number of packets actually retrieved, which is the number
>  *   of pointers to *rte_mbuf* structures effectively supplied to the
>  *   *rx_pkts* array."
> 
> Again, if you looks at rte_eth_rx_burst() implementation, when
> RTE_LIBRTE_ETHDEV_DEBUG is on -
> For some error cases it returns '0', foor other's: -ENOTSUP:
> 
> FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, -ENOTSUP);
> if (queue_id >= dev->data->nb_rx_queues) {
>                 PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
>                 return 0;
> }
> 
> Which makes me thing that we just have errnoneous implementation of
> rte_eth_rx_burst()
> when RTE_LIBRTE_ETHDEV_DEBUG is on.
> Probably just a mechanical mistake, when FUNC_PTR_OR_ERR_RET() was
> added.
>  I'd say we change rte_eth_rx_burst() to always return 0 (as was initially
> planned).
> 
> Konstantin

Agreed. For any errors other than no-packets available yet, we can also consider setting rte_errno when returning 0. 

/Bruce
  
Cunming Liang Oct. 24, 2014, 3:06 a.m. UTC | #14
It's reasonable to me.
I'll make a patch for rte_ethdev.c.

> -----Original Message-----
> From: Richardson, Bruce
> Sent: Wednesday, October 22, 2014 11:10 PM
> To: Ananyev, Konstantin; Neil Horman; Liang, Cunming
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> cycles/packet
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Ananyev, Konstantin
> > Sent: Wednesday, October 22, 2014 3:53 PM
> > To: Neil Horman; Liang, Cunming
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> >
> >
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Neil Horman
> > > Sent: Wednesday, October 22, 2014 3:03 PM
> > > To: Liang, Cunming
> > > Cc: dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > cycles/packet
> > >
> > > On Tue, Oct 21, 2014 at 01:17:01PM +0000, Liang, Cunming wrote:
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > Sent: Tuesday, October 21, 2014 6:33 PM
> > > > > To: Liang, Cunming
> > > > > Cc: dev@dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > > > cycles/packet
> > > > >
> > > > > On Sun, Oct 12, 2014 at 11:10:39AM +0000, Liang, Cunming wrote:
> > > > > > Hi Neil,
> > > > > >
> > > > > > Very appreciate your comments.
> > > > > > I add inline reply, will send v3 asap when we get alignment.
> > > > > >
> > > > > > BRs,
> > > > > > Liang Cunming
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > > > Sent: Saturday, October 11, 2014 1:52 AM
> > > > > > > To: Liang, Cunming
> > > > > > > Cc: dev@dpdk.org
> > > > > > > Subject: Re: [dpdk-dev] [PATCH v2 1/4] app/test: unit test for rx and tx
> > > > > cycles/packet
> > > > > > >
> <...snip...>
> > > > > > >
> > > > > > > > +		printf("Force Stop!\n");
> > > > > > > > +		stop = 1;
> > > > > > > > +	}
> > > > > > > > +	if (signum == SIGUSR2)
> > > > > > > > +		stats_display(0);
> > > > > > > > +}
> > > > > > > > +/* main processing loop */
> > > > > > > > +static int
> > > > > > > > +main_loop(__rte_unused void *args)
> > > > > > > > +{
> > > > > > > > +#define PACKET_SIZE 64
> > > > > > > > +#define FRAME_GAP 12
> > > > > > > > +#define MAC_PREAMBLE 8
> > > > > > > > +	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> > > > > > > > +	unsigned lcore_id;
> > > > > > > > +	unsigned i, portid, nb_rx = 0, nb_tx = 0;
> > > > > > > > +	struct lcore_conf *conf;
> > > > > > > > +	uint64_t prev_tsc, cur_tsc;
> > > > > > > > +	int pkt_per_port;
> > > > > > > > +	uint64_t packets_per_second, total_packets;
> > > > > > > > +
> > > > > > > > +	lcore_id = rte_lcore_id();
> > > > > > > > +	conf = &lcore_conf[lcore_id];
> > > > > > > > +	if (conf->status != LCORE_USED)
> > > > > > > > +		return 0;
> > > > > > > > +
> > > > > > > > +	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
> > > > > > > > +
> > > > > > > > +	int idx = 0;
> > > > > > > > +	for (i = 0; i < conf->nb_ports; i++) {
> > > > > > > > +		int num = pkt_per_port;
> > > > > > > > +		portid = conf->portlist[i];
> > > > > > > > +		printf("inject %d packet to port %d\n", num, portid);
> > > > > > > > +		while (num) {
> > > > > > > > +			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
> > > > > > > > +			nb_tx = rte_eth_tx_burst(portid, 0,
> > > > > > > > +						&tx_burst[idx], nb_tx);
> > > > > > > > +			num -= nb_tx;
> > > > > > > > +			idx += nb_tx;
> > > > > > > > +		}
> > > > > > > > +	}
> > > > > > > > +	printf("Total packets inject to prime ports = %u\n", idx);
> > > > > > > > +
> > > > > > > > +	packets_per_second = (link_mbps * 1000 * 1000) /
> > > > > > > > +		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) *
> > CHAR_BIT);
> > > > > > > > +	printf("Each port will do %"PRIu64" packets per second\n",
> > > > > > > > +		+packets_per_second);
> > > > > > > > +
> > > > > > > > +	total_packets = RTE_TEST_DURATION * conf->nb_ports *
> > > > > > > packets_per_second;
> > > > > > > > +	printf("Test will stop after at least %"PRIu64" packets
> > received\n",
> > > > > > > > +		+ total_packets);
> > > > > > > > +
> > > > > > > > +	prev_tsc = rte_rdtsc();
> > > > > > > > +
> > > > > > > > +	while (likely(!stop)) {
> > > > > > > > +		for (i = 0; i < conf->nb_ports; i++) {
> > > > > > > > +			portid = conf->portlist[i];
> > > > > > > > +			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
> > > > > > > > +						 pkts_burst,
> > MAX_PKT_BURST);
> > > > > > > > +			if (unlikely(nb_rx == 0)) {
> > > > > > > > +				idle++;
> > > > > > > > +				continue;
> > > > > > > > +			}
> > > > > > > > +
> > > > > > > > +			count += nb_rx;
> > > > > > > Doesn't take into consideration error conditions.  rte_eth_rx_burst
> can
> > > > > return
> > > > > > > -ENOTSUP
> > > > > > [Liang, Cunming] It returns -ENOTSUP when turning on ETHDEV_DEBUG
> > > > > CONFIG.
> > > > > > The error is used to avoid no function call register.
> > > > > > When ETHDEV_DEBUG turn off, the NULL function call cause segfault
> > directly.
> > > > > > So I think it's a library internal error.
> > > > > No, look at FUNC_PTR_OR_ERR_RET.  If the passed in function pointer is
> > null,
> > > > > -ENOTSUPP will be returned to the application, you need to handle the
> > error
> > > > > condition.
> > > > [Liang, Cunming] The runtime rte_eth_rx_burst is a inline function in
> > rte_ethdev.h.
> > > > The one you're talking about is the one defined in rte_ethdev.c which is a
> > extern non-inline function.
> > > > It works only when RTE_LIBRTE_ETHDEV_DEBUG turns on.
> > > > If we always turns such library internal checking on, it lose performance.
> > > > So I insist it's a library internal error checking, doesn't need to take care in
> > application level.
> > > I'm really flored that you would respond this way.
> > >
> > > What you have is two versions of a function, one returns errors and one
> > doesn't,
> > > and the version used is based on compile time configuration.  You've written
> > > your application such that it can only gracefully handle one of the two
> > > versions, and you're happy to allow the other version, the one thats
> supposed
> > to
> > > be in use when you are debugging applications, to create silent accounting
> > > failures in your application.  And it completely ignores the fact that the
> > > non-debug version might one day return errors as well (even if it doesn't
> > > currently).  Your assertion above that we can ignore it because its debug code
> > > is the most short sighted thing I've read in a long time.
> >
> > Actually looking at rte_eth_rx_burst(): it returns uint16_t.
> > Which is sort of a strange if it expects to return a negative value as error code.
> > Also reading though 'formal' comments of rte_eth_rx_burst() - it seems that it
> > not supposed to return negative values:
> > "...
> > The rte_eth_rx_burst() function does not provide any error
> >  notification to avoid the corresponding overhead. As a hint, the
> >  upper-level application might check the status of the device link once
> >  being systematically returned a 0 value for a given number of tries.
> > ...
> > @return
> >  *   The number of packets actually retrieved, which is the number
> >  *   of pointers to *rte_mbuf* structures effectively supplied to the
> >  *   *rx_pkts* array."
> >
> > Again, if you looks at rte_eth_rx_burst() implementation, when
> > RTE_LIBRTE_ETHDEV_DEBUG is on -
> > For some error cases it returns '0', foor other's: -ENOTSUP:
> >
> > FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, -ENOTSUP);
> > if (queue_id >= dev->data->nb_rx_queues) {
> >                 PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", queue_id);
> >                 return 0;
> > }
> >
> > Which makes me thing that we just have errnoneous implementation of
> > rte_eth_rx_burst()
> > when RTE_LIBRTE_ETHDEV_DEBUG is on.
> > Probably just a mechanical mistake, when FUNC_PTR_OR_ERR_RET() was
> > added.
> >  I'd say we change rte_eth_rx_burst() to always return 0 (as was initially
> > planned).
> >
> > Konstantin
> 
> Agreed. For any errors other than no-packets available yet, we can also consider
> setting rte_errno when returning 0.
> 
> /Bruce
  

Patch

diff --git a/app/test/Makefile b/app/test/Makefile
index 6af6d76..ebfa0ba 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -56,6 +56,7 @@  SRCS-y += test_memzone.c
 
 SRCS-y += test_ring.c
 SRCS-y += test_ring_perf.c
+SRCS-y += test_pmd_perf.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
 SRCS-y += test_table.c
diff --git a/app/test/commands.c b/app/test/commands.c
index a9e36b1..f1e746e 100644
--- a/app/test/commands.c
+++ b/app/test/commands.c
@@ -310,12 +310,50 @@  cmdline_parse_inst_t cmd_quit = {
 
 /****************/
 
+/****************/
+
+struct cmd_set_rxtx_result {
+	cmdline_fixed_string_t set;
+	cmdline_fixed_string_t mode;
+};
+
+static void cmd_set_rxtx_parsed(void *parsed_result, struct cmdline *cl,
+				__attribute__((unused)) void *data)
+{
+	struct cmd_set_rxtx_result *res = parsed_result;
+	if (test_set_rxtx_conf(res->mode) < 0)
+		cmdline_printf(cl, "Cannot find such mode\n");
+}
+
+cmdline_parse_token_string_t cmd_set_rxtx_set =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_rxtx_result, set,
+				 "set_rxtx_mode");
+
+cmdline_parse_token_string_t cmd_set_rxtx_mode =
+	TOKEN_STRING_INITIALIZER(struct cmd_set_rxtx_result, mode, NULL);
+
+cmdline_parse_inst_t cmd_set_rxtx = {
+	.f = cmd_set_rxtx_parsed,  /* function to call */
+	.data = NULL,      /* 2nd arg of func */
+	.help_str = "set rxtx routine: "
+			"set_rxtx <mode>",
+	.tokens = {        /* token list, NULL terminated */
+		(void *)&cmd_set_rxtx_set,
+		(void *)&cmd_set_rxtx_mode,
+		NULL,
+	},
+};
+
+/****************/
+
+
 cmdline_parse_ctx_t main_ctx[] = {
 	(cmdline_parse_inst_t *)&cmd_autotest,
 	(cmdline_parse_inst_t *)&cmd_dump,
 	(cmdline_parse_inst_t *)&cmd_dump_one,
 	(cmdline_parse_inst_t *)&cmd_set_ring,
 	(cmdline_parse_inst_t *)&cmd_quit,
+	(cmdline_parse_inst_t *)&cmd_set_rxtx,
 	NULL,
 };
 
diff --git a/app/test/packet_burst_generator.c b/app/test/packet_burst_generator.c
index 9e747a4..9100b1d 100644
--- a/app/test/packet_burst_generator.c
+++ b/app/test/packet_burst_generator.c
@@ -191,12 +191,12 @@  initialize_ipv4_header(struct ipv4_hdr *ip_hdr, uint32_t src_addr,
  */
 #define RTE_MAX_SEGS_PER_PKT 255 /**< pkt.nb_segs is a 8-bit unsigned char. */
 
-#define TXONLY_DEF_PACKET_LEN 64
+#define TXONLY_DEF_PACKET_LEN 60
 #define TXONLY_DEF_PACKET_LEN_128 128
 
 uint16_t tx_pkt_length = TXONLY_DEF_PACKET_LEN;
 uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT] = {
-		TXONLY_DEF_PACKET_LEN_128,
+		TXONLY_DEF_PACKET_LEN,
 };
 
 uint8_t  tx_pkt_nb_segs = 1;
diff --git a/app/test/test.h b/app/test/test.h
index 98ab804..76033a8 100644
--- a/app/test/test.h
+++ b/app/test/test.h
@@ -125,6 +125,9 @@  int unit_test_suite_runner(struct unit_test_suite *suite);
 
 #define RECURSIVE_ENV_VAR "RTE_TEST_RECURSIVE"
 
+#include <cmdline_parse.h>
+#include <cmdline_parse_string.h>
+
 extern const char *prgname;
 
 int commands_init(void);
@@ -137,6 +140,7 @@  int test_pci_run;
 int test_mp_secondary(void);
 
 int test_ivshmem(void);
+int test_set_rxtx_conf(cmdline_fixed_string_t mode);
 
 typedef int (test_callback)(void);
 TAILQ_HEAD(test_commands_list, test_command);
diff --git a/app/test/test_pmd_perf.c b/app/test/test_pmd_perf.c
new file mode 100644
index 0000000..bca4a9a
--- /dev/null
+++ b/app/test/test_pmd_perf.c
@@ -0,0 +1,626 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <unistd.h>
+#include <rte_cycles.h>
+#include <rte_ethdev.h>
+#include <rte_byteorder.h>
+#include "packet_burst_generator.h"
+#include "test.h"
+
+#define NB_ETHPORTS_USED                (1)
+#define NB_SOCKETS                      (2)
+#define MEMPOOL_CACHE_SIZE 250
+#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
+#define MAX_PKT_BURST                   (32)
+#define RTE_TEST_RX_DESC_DEFAULT        (128)
+#define RTE_TEST_TX_DESC_DEFAULT        (512)
+#define RTE_PORT_ALL            (~(uint8_t)0x0)
+
+/* how long test would take at full line rate */
+#define RTE_TEST_DURATION                (2)
+
+/*
+ * RX and TX Prefetch, Host, and Write-back threshold values should be
+ * carefully set for optimal performance. Consult the network
+ * controller's datasheet and supporting DPDK documentation for guidance
+ * on how these parameters should be set.
+ */
+#define RX_PTHRESH 8 /**< Default values of RX prefetch threshold reg. */
+#define RX_HTHRESH 8 /**< Default values of RX host threshold reg. */
+#define RX_WTHRESH 0 /**< Default values of RX write-back threshold reg. */
+
+/*
+ * These default values are optimized for use with the Intel(R) 82599 10 GbE
+ * Controller and the DPDK ixgbe PMD. Consider using other values for other
+ * network controllers and/or network drivers.
+ */
+#define TX_PTHRESH 32 /**< Default values of TX prefetch threshold reg. */
+#define TX_HTHRESH 0  /**< Default values of TX host threshold reg. */
+#define TX_WTHRESH 0  /**< Default values of TX write-back threshold reg. */
+
+
+#define NB_MBUF RTE_MAX(						\
+	  (unsigned)(nb_ports*nb_rx_queue*RTE_TEST_RX_DESC_DEFAULT +	\
+	   nb_ports*nb_lcores*MAX_PKT_BURST +				\
+	   nb_ports*nb_tx_queue*RTE_TEST_TX_DESC_DEFAULT +		\
+	   nb_lcores*MEMPOOL_CACHE_SIZE),				\
+	  (unsigned)8192)
+
+
+static struct rte_mempool *mbufpool[NB_SOCKETS];
+/* ethernet addresses of ports */
+static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
+
+static struct rte_eth_conf port_conf = {
+	.rxmode = {
+		.mq_mode = ETH_MQ_RX_NONE,
+		.max_rx_pkt_len = ETHER_MAX_LEN,
+		.split_hdr_size = 0,
+		.header_split   = 0, /**< Header Split disabled */
+		.hw_ip_checksum = 0, /**< IP checksum offload enabled */
+		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
+		.hw_vlan_strip  = 0, /**< VLAN strip enabled. */
+		.hw_vlan_extend = 0, /**< Extended VLAN disabled. */
+		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
+		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
+		.enable_scatter = 0, /**< scatter rx disabled */
+	},
+	.txmode = {
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.lpbk_mode = 1,  /* enable loopback */
+};
+
+static struct rte_eth_rxconf rx_conf = {
+	.rx_thresh = {
+		.pthresh = RX_PTHRESH,
+		.hthresh = RX_HTHRESH,
+		.wthresh = RX_WTHRESH,
+	},
+	.rx_free_thresh = 32,
+};
+
+static struct rte_eth_txconf tx_conf = {
+	.tx_thresh = {
+		.pthresh = TX_PTHRESH,
+		.hthresh = TX_HTHRESH,
+		.wthresh = TX_WTHRESH,
+	},
+	.tx_free_thresh = 32, /* Use PMD default values */
+	.tx_rs_thresh = 32, /* Use PMD default values */
+	.txq_flags = (ETH_TXQ_FLAGS_NOMULTSEGS |
+		      ETH_TXQ_FLAGS_NOVLANOFFL |
+		      ETH_TXQ_FLAGS_NOXSUMSCTP |
+		      ETH_TXQ_FLAGS_NOXSUMUDP |
+		      ETH_TXQ_FLAGS_NOXSUMTCP)
+};
+
+enum {
+	LCORE_INVALID = 0,
+	LCORE_AVAIL,
+	LCORE_USED,
+};
+
+struct lcore_conf {
+	uint8_t status;
+	uint8_t socketid;
+	uint16_t nb_ports;
+	uint8_t portlist[RTE_MAX_ETHPORTS];
+} __rte_cache_aligned;
+
+struct lcore_conf lcore_conf[RTE_MAX_LCORE];
+
+static uint64_t link_mbps;
+
+/* Check the link status of all ports in up to 3s, and print them finally */
+static void
+check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
+{
+#define CHECK_INTERVAL 100 /* 100ms */
+#define MAX_CHECK_TIME 30 /* 3s (30 * 100ms) in total */
+	uint8_t portid, count, all_ports_up, print_flag = 0;
+	struct rte_eth_link link;
+
+	printf("Checking link statuses...\n");
+	fflush(stdout);
+	for (count = 0; count <= MAX_CHECK_TIME; count++) {
+		all_ports_up = 1;
+		for (portid = 0; portid < port_num; portid++) {
+			if ((port_mask & (1 << portid)) == 0)
+				continue;
+			memset(&link, 0, sizeof(link));
+			rte_eth_link_get_nowait(portid, &link);
+			/* print link status if flag set */
+			if (print_flag == 1) {
+				if (link.link_status) {
+					printf("Port %d Link Up - speed %u "
+						"Mbps - %s\n", (uint8_t)portid,
+						(unsigned)link.link_speed,
+				(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
+					("full-duplex") : ("half-duplex\n"));
+					if (link_mbps == 0)
+						link_mbps = link.link_speed;
+				} else
+					printf("Port %d Link Down\n",
+						(uint8_t)portid);
+				continue;
+			}
+			/* clear all_ports_up flag if any link down */
+			if (link.link_status == 0) {
+				all_ports_up = 0;
+				break;
+			}
+		}
+		/* after finally printing all link status, get out */
+		if (print_flag == 1)
+			break;
+
+		if (all_ports_up == 0) {
+			fflush(stdout);
+			rte_delay_ms(CHECK_INTERVAL);
+		}
+
+		/* set the print_flag if all ports up or timeout */
+		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1))
+			print_flag = 1;
+	}
+}
+
+static void
+print_ethaddr(const char *name, const struct ether_addr *eth_addr)
+{
+	printf("%s%02X:%02X:%02X:%02X:%02X:%02X", name,
+		eth_addr->addr_bytes[0],
+		eth_addr->addr_bytes[1],
+		eth_addr->addr_bytes[2],
+		eth_addr->addr_bytes[3],
+		eth_addr->addr_bytes[4],
+		eth_addr->addr_bytes[5]);
+}
+
+static int
+init_traffic(struct rte_mempool *mp,
+	     struct rte_mbuf **pkts_burst, uint32_t burst_size)
+{
+	struct ether_hdr pkt_eth_hdr;
+	struct ipv4_hdr pkt_ipv4_hdr;
+	struct udp_hdr pkt_udp_hdr;
+	uint32_t pktlen;
+	static uint8_t src_mac[] = { 0x00, 0xFF, 0xAA, 0xFF, 0xAA, 0xFF };
+	static uint8_t dst_mac[] = { 0x00, 0xAA, 0xFF, 0xAA, 0xFF, 0xAA };
+
+
+	initialize_eth_header(&pkt_eth_hdr,
+		(struct ether_addr *)src_mac,
+		(struct ether_addr *)dst_mac, 0, 0);
+	pkt_eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	pktlen = initialize_ipv4_header(&pkt_ipv4_hdr,
+					IPV4_ADDR(10, 0, 0, 1),
+					IPV4_ADDR(10, 0, 0, 2), 26);
+	printf("IPv4 pktlen %u\n", pktlen);
+
+	pktlen = initialize_udp_header(&pkt_udp_hdr, 0, 0, 18);
+
+	printf("UDP pktlen %u\n", pktlen);
+
+	return generate_packet_burst(mp, pkts_burst, &pkt_eth_hdr,
+				     0, &pkt_ipv4_hdr, 1,
+				     &pkt_udp_hdr, burst_size);
+}
+
+static int
+init_lcores(void)
+{
+	unsigned lcore_id;
+
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		lcore_conf[lcore_id].socketid =
+			rte_lcore_to_socket_id(lcore_id);
+		if (rte_lcore_is_enabled(lcore_id) == 0) {
+			lcore_conf[lcore_id].status = LCORE_INVALID;
+			continue;
+		} else
+			lcore_conf[lcore_id].status = LCORE_AVAIL;
+	}
+	return 0;
+}
+
+static int
+init_mbufpool(unsigned nb_mbuf)
+{
+	int socketid;
+	unsigned lcore_id;
+	char s[64];
+
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		if (rte_lcore_is_enabled(lcore_id) == 0)
+			continue;
+
+		socketid = rte_lcore_to_socket_id(lcore_id);
+		if (socketid >= NB_SOCKETS) {
+			rte_exit(EXIT_FAILURE,
+				"Socket %d of lcore %u is out of range %d\n",
+				socketid, lcore_id, NB_SOCKETS);
+		}
+		if (mbufpool[socketid] == NULL) {
+			snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
+			mbufpool[socketid] =
+				rte_mempool_create(s, nb_mbuf, MBUF_SIZE,
+					MEMPOOL_CACHE_SIZE,
+					sizeof(struct rte_pktmbuf_pool_private),
+					rte_pktmbuf_pool_init, NULL,
+					rte_pktmbuf_init, NULL,
+					socketid, 0);
+			if (mbufpool[socketid] == NULL)
+				rte_exit(EXIT_FAILURE,
+					"Cannot init mbuf pool on socket %d\n",
+					socketid);
+			else
+				printf("Allocated mbuf pool on socket %d\n",
+					socketid);
+		}
+	}
+	return 0;
+}
+
+static uint16_t
+alloc_lcore(uint16_t socketid)
+{
+	unsigned lcore_id;
+
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		if (LCORE_AVAIL != lcore_conf[lcore_id].status ||
+		    lcore_conf[lcore_id].socketid != socketid ||
+		    lcore_id == rte_get_master_lcore())
+			continue;
+		lcore_conf[lcore_id].status = LCORE_USED;
+		lcore_conf[lcore_id].nb_ports = 0;
+		return lcore_id;
+	}
+
+	return (uint16_t)-1;
+}
+
+volatile uint64_t stop;
+uint64_t count;
+uint64_t drop;
+uint64_t idle;
+
+static void
+reset_count(void)
+{
+	count = 0;
+	drop = 0;
+	idle = 0;
+}
+
+static void
+stats_display(uint8_t port_id)
+{
+	struct rte_eth_stats stats;
+	rte_eth_stats_get(port_id, &stats);
+
+	printf("  RX-packets: %-10"PRIu64" RX-missed: %-10"PRIu64" RX-bytes:  "
+	       "%-"PRIu64"\n",
+	       stats.ipackets, stats.imissed, stats.ibytes);
+	printf("  RX-badcrc:  %-10"PRIu64" RX-badlen: %-10"PRIu64" RX-errors: "
+	       "%-"PRIu64"\n",
+	       stats.ibadcrc, stats.ibadlen, stats.ierrors);
+	printf("  RX-nombuf:  %-10"PRIu64"\n",
+	       stats.rx_nombuf);
+	printf("  TX-packets: %-10"PRIu64" TX-errors: %-10"PRIu64" TX-bytes:  "
+	       "%-"PRIu64"\n",
+	       stats.opackets, stats.oerrors, stats.obytes);
+}
+
+static void
+signal_handler(int signum)
+{
+	/* When we receive a USR1 signal, print stats */
+	if (signum == SIGUSR1) {
+		printf("Force Stop!\n");
+		stop = 1;
+	}
+	if (signum == SIGUSR2)
+		stats_display(0);
+}
+
+#define MAX_TRAFIC_BURST                (4096)
+struct rte_mbuf *tx_burst[MAX_TRAFIC_BURST];
+
+/* main processing loop */
+static int
+main_loop(__rte_unused void *args)
+{
+#define PACKET_SIZE 64
+#define FRAME_GAP 12
+#define MAC_PREAMBLE 8
+	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+	unsigned lcore_id;
+	unsigned i, portid, nb_rx = 0, nb_tx = 0;
+	struct lcore_conf *conf;
+	uint64_t prev_tsc, cur_tsc;
+	int pkt_per_port;
+	uint64_t packets_per_second, total_packets;
+
+	lcore_id = rte_lcore_id();
+	conf = &lcore_conf[lcore_id];
+	if (conf->status != LCORE_USED)
+		return 0;
+
+	pkt_per_port =  MAX_TRAFIC_BURST / conf->nb_ports;
+
+	int idx = 0;
+	for (i = 0; i < conf->nb_ports; i++) {
+		int num = pkt_per_port;
+		portid = conf->portlist[i];
+		printf("inject %d packet to port %d\n", num, portid);
+		while (num) {
+			nb_tx = RTE_MIN(MAX_PKT_BURST, num);
+			nb_tx = rte_eth_tx_burst(portid, 0,
+						&tx_burst[idx], nb_tx);
+			num -= nb_tx;
+			idx += nb_tx;
+		}
+	}
+	printf("Total packets inject to prime ports = %u\n", idx);
+
+	packets_per_second = (link_mbps * 1000 * 1000) /
+		+((PACKET_SIZE + FRAME_GAP + MAC_PREAMBLE) * CHAR_BIT);
+	printf("Each port will do %"PRIu64" packets per second\n",
+		+packets_per_second);
+
+	total_packets = RTE_TEST_DURATION * conf->nb_ports * packets_per_second;
+	printf("Test will stop after at least %"PRIu64" packets received\n",
+		+ total_packets);
+
+	prev_tsc = rte_rdtsc();
+
+	while (likely(!stop)) {
+		for (i = 0; i < conf->nb_ports; i++) {
+			portid = conf->portlist[i];
+			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
+						 pkts_burst, MAX_PKT_BURST);
+			if (unlikely(nb_rx == 0)) {
+				idle++;
+				continue;
+			}
+
+			count += nb_rx;
+			nb_tx = rte_eth_tx_burst(portid, 0, pkts_burst, nb_rx);
+			if (unlikely(nb_tx < nb_rx)) {
+				drop += (nb_rx - nb_tx);
+				do {
+					rte_pktmbuf_free(pkts_burst[nb_tx]);
+				} while (++nb_tx < nb_rx);
+			}
+		}
+		if (unlikely(count >= total_packets))
+			break;
+	}
+
+	cur_tsc = rte_rdtsc();
+
+	for (i = 0; i < conf->nb_ports; i++) {
+		portid = conf->portlist[i];
+		int nb_free = pkt_per_port;
+		do { /* dry out */
+			nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
+						 pkts_burst, MAX_PKT_BURST);
+			nb_tx = 0;
+			while (nb_tx < nb_rx)
+				rte_pktmbuf_free(pkts_burst[nb_tx++]);
+			nb_free -= nb_rx;
+		} while (nb_free != 0);
+		printf("free %d mbuf left in port %u\n", pkt_per_port, portid);
+	}
+
+	if (count == 0)
+		return -1;
+
+	printf("%lu packet, %lu drop, %lu idle\n", count, drop, idle);
+	printf("Result: %ld cycles per packet\n", (cur_tsc - prev_tsc) / count);
+
+	return 0;
+}
+
+static int
+test_pmd_perf(void)
+{
+	uint16_t nb_ports, num, nb_lcores, slave_id = (uint16_t)-1;
+	uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
+	uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
+	uint16_t portid;
+	uint16_t nb_rx_queue = 1, nb_tx_queue = 1;
+	int socketid = -1;
+	int ret;
+
+	printf("Start PMD RXTX cycles cost test.\n");
+
+	signal(SIGUSR1, signal_handler);
+	signal(SIGUSR2, signal_handler);
+
+	nb_ports = rte_eth_dev_count();
+	if (nb_ports < NB_ETHPORTS_USED) {
+		printf("At least %u port(s) used for perf. test\n",
+		       NB_ETHPORTS_USED);
+		return -1;
+	}
+
+	if (nb_ports > RTE_MAX_ETHPORTS)
+		nb_ports = RTE_MAX_ETHPORTS;
+
+	nb_lcores = rte_lcore_count();
+
+	memset(lcore_conf, 0, sizeof(lcore_conf));
+	init_lcores();
+
+	init_mbufpool(NB_MBUF);
+
+	reset_count();
+	num = 0;
+	for (portid = 0; portid < nb_ports; portid++) {
+		if (socketid == -1) {
+			socketid = rte_eth_dev_socket_id(portid);
+			slave_id = alloc_lcore(socketid);
+			if (slave_id == (uint16_t)-1) {
+				printf("No avail lcore to run test\n");
+				return -1;
+			}
+			printf("Performance test runs on lcore %u socket %u\n",
+			       slave_id, socketid);
+		}
+
+		if (socketid != rte_eth_dev_socket_id(portid)) {
+			printf("Skip port %d\n", portid);
+			continue;
+		}
+
+		/* port configure */
+		ret = rte_eth_dev_configure(portid, nb_rx_queue,
+					    nb_tx_queue, &port_conf);
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE,
+				"Cannot configure device: err=%d, port=%d\n",
+				 ret, portid);
+
+		rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
+		printf("Port %u ", portid);
+		print_ethaddr("Address:", &ports_eth_addr[portid]);
+		printf("\n");
+
+		/* tx queue setup */
+		ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
+					     socketid, &tx_conf);
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE,
+				"rte_eth_tx_queue_setup: err=%d, "
+				"port=%d\n", ret, portid);
+
+		/* rx queue steup */
+		ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
+						socketid, &rx_conf,
+						mbufpool[socketid]);
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
+				 "port=%d\n", ret, portid);
+
+		/* Start device */
+		stop = 0;
+		ret = rte_eth_dev_start(portid);
+		if (ret < 0)
+			rte_exit(EXIT_FAILURE,
+				"rte_eth_dev_start: err=%d, port=%d\n",
+				ret, portid);
+
+		/* always eanble promiscuous */
+		rte_eth_promiscuous_enable(portid);
+
+		lcore_conf[slave_id].portlist[num++] = portid;
+		lcore_conf[slave_id].nb_ports++;
+	}
+	check_all_ports_link_status(nb_ports, RTE_PORT_ALL);
+
+	init_traffic(mbufpool[socketid], tx_burst, MAX_TRAFIC_BURST);
+
+	rte_eal_remote_launch(main_loop, NULL, slave_id);
+	if (rte_eal_wait_lcore(slave_id) < 0)
+		return -1;
+
+	/* port tear down */
+	for (portid = 0; portid < nb_ports; portid++) {
+		if (socketid != rte_eth_dev_socket_id(portid))
+			continue;
+
+		rte_eth_dev_stop(portid);
+	}
+
+	return 0;
+}
+
+int
+test_set_rxtx_conf(cmdline_fixed_string_t mode)
+{
+	printf("mode is %s\n", mode);
+
+	if (!strcmp(mode, "vector")) {
+		/* vector rx, tx */
+		tx_conf.txq_flags = 0xf01;
+		tx_conf.tx_rs_thresh = 32;
+		tx_conf.tx_free_thresh = 32;
+		port_conf.rxmode.hw_ip_checksum = 0;
+		port_conf.rxmode.enable_scatter = 0;
+		return 0;
+	} else if (!strcmp(mode, "scalar")) {
+		/* bulk alloc rx, simple tx */
+		tx_conf.txq_flags = 0xf01;
+		tx_conf.tx_rs_thresh = 128;
+		tx_conf.tx_free_thresh = 128;
+		port_conf.rxmode.hw_ip_checksum = 1;
+		port_conf.rxmode.enable_scatter = 0;
+		return 0;
+	} else if (!strcmp(mode, "hybrid")) {
+		/* bulk alloc rx, vector tx
+		 * when vec macro not define,
+		 * using the same rx/tx as scalar
+		 */
+		tx_conf.txq_flags = 0xf01;
+		tx_conf.tx_rs_thresh = 32;
+		tx_conf.tx_free_thresh = 32;
+		port_conf.rxmode.hw_ip_checksum = 1;
+		port_conf.rxmode.enable_scatter = 0;
+		return 0;
+	} else if (!strcmp(mode, "full")) {
+		/* full feature rx,tx pair */
+		tx_conf.txq_flags = 0x0;   /* must condition */
+		tx_conf.tx_rs_thresh = 32;
+		tx_conf.tx_free_thresh = 32;
+		port_conf.rxmode.hw_ip_checksum = 0;
+		port_conf.rxmode.enable_scatter = 1; /* must condition */
+		return 0;
+	}
+
+	return -1;
+}
+
+static struct test_command pmd_perf_cmd = {
+	.command = "pmd_perf_autotest",
+	.callback = test_pmd_perf,
+};
+REGISTER_TEST_COMMAND(pmd_perf_cmd);
diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index 215e563..8b87f98 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -1586,6 +1586,9 @@  ixgbe_dev_stop(struct rte_eth_dev *dev)
 
 	ixgbe_dev_clear_queues(dev);
 
+	/* Clear stored conf */
+	dev->data->scattered_rx = 0;
+
 	/* Clear recorded link status */
 	memset(&link, 0, sizeof(link));
 	rte_ixgbe_dev_atomic_write_link_status(dev, &link);
@@ -2852,6 +2855,9 @@  ixgbevf_dev_stop(struct rte_eth_dev *dev)
 	  */
 	ixgbevf_set_vfta_all(dev,0);
 
+	/* Clear stored conf */
+	dev->data->scattered_rx = 0;
+
 	ixgbe_dev_clear_queues(dev);
 }