Hi Simon,
> Currently the rx/tx queue is allocated from the buffer pool on socket of:
> - port's socket if --port-numa-config specified
> - or ring-numa-config setting per port
>
> All the above will "bind" queue to single socket per port configuration.
> But it can actually achieve better performance if one port's queue can
> be spread across multiple NUMA nodes, and the rx/tx queue is allocated
> per lcpu socket.
>
> With this patch, testpmd can utilize the PCI-e bus bandwidth on another
> NUMA nodes. With 64-byte packets, when running in PowerPC with Mellanox
> CX-4 card, single port(100G), with 8 cores, fw mode:
> - Without this patch: 52.5Mpps throughput
> - With this patch: 66Mpps throughput
>
> Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
> diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
> index fbe6284..d02059d 100644
> --- a/app/test-pmd/parameters.c
> +++ b/app/test-pmd/parameters.c
> @@ -130,6 +130,11 @@
> "(flag: 1 for RX; 2 for TX; 3 for RX and TX).\n");
> printf(" --socket-num=N: set socket from which all memory is allocated "
> "in NUMA mode.\n");
> + printf(" --ring-bind-lcpu: "
> + "specify TX/RX rings will be allocated on local socket of lcpu."
> + "It will overrridden ring-numa-config or port-numa-config if success."
> + "If local ring buffer is unavailable, it will use --ring-numa-config or port-numa-config instead."
> + "It allows one port binds to multiple NUMA nodes.\n");
I think it's a good patch to give the APP an example of how to choose the appropriate core.
I just have some concern about the priority. Maybe ring-numa-config and port-numa-config should have higher priority,
because if the APP assigned a specific socket for some purpose, it's not good to overwrite it silently.
> printf(" --mbuf-size=N: set the data size of mbuf to N bytes.\n");
> printf(" --total-num-mbufs=N: set the number of mbufs to be allocated "
> "in mbuf pools.\n");
@@ -130,6 +130,11 @@
"(flag: 1 for RX; 2 for TX; 3 for RX and TX).\n");
printf(" --socket-num=N: set socket from which all memory is allocated "
"in NUMA mode.\n");
+ printf(" --ring-bind-lcpu: "
+ "specify TX/RX rings will be allocated on local socket of lcpu."
+ "It will overrridden ring-numa-config or port-numa-config if success."
+ "If local ring buffer is unavailable, it will use --ring-numa-config or port-numa-config instead."
+ "It allows one port binds to multiple NUMA nodes.\n");
printf(" --mbuf-size=N: set the data size of mbuf to N bytes.\n");
printf(" --total-num-mbufs=N: set the number of mbufs to be allocated "
"in mbuf pools.\n");
@@ -563,6 +568,7 @@
{ "interactive", 0, 0, 0 },
{ "cmdline-file", 1, 0, 0 },
{ "auto-start", 0, 0, 0 },
+ { "ring-bind-lcpu", 0, 0, 0 },
{ "eth-peers-configfile", 1, 0, 0 },
{ "eth-peer", 1, 0, 0 },
#endif
@@ -674,6 +680,10 @@
printf("Auto-start selected\n");
auto_start = 1;
}
+ if (!strcmp(lgopts[opt_idx].name, "ring-bind-lcpu")) {
+ printf("RingBuffer bind with local CPU selected\n");
+ ring_bind_lcpu = 1;
+ }
if (!strcmp(lgopts[opt_idx].name,
"eth-peers-configfile")) {
if (init_peer_eth_addrs(optarg) != 0)
@@ -98,6 +98,7 @@
/* use master core for command line ? */
uint8_t interactive = 0;
uint8_t auto_start = 0;
+uint8_t ring_bind_lcpu;
char cmdline_filename[PATH_MAX] = {0};
/*
@@ -1395,6 +1396,46 @@ static void eth_event_callback(uint8_t port_id,
return 1;
}
+static int find_local_socket(queueid_t qi, int is_rxq)
+{
+ /*
+ * try to find the local mp with following logic:
+ * 1) Find the correct stream for the queue;
+ * 2) Find the correct lcore for the stream;
+ * 3) Find the correct socket for the lcore;
+ * 4) Find the correct mp for the scoket;
+ *
+ * If failed, failover to the old implementation.
+ */
+ int i, j, socket = NUMA_NO_CONFIG;
+
+ /* find the stream based on queue no*/
+ for (i = 0; i < nb_fwd_streams; i++) {
+ if (is_rxq) {
+ if (fwd_streams[i]->rx_queue == qi)
+ break;
+ } else {
+ if (fwd_streams[i]->tx_queue == qi)
+ break;
+ }
+ }
+ if (i == nb_fwd_streams)
+ return NUMA_NO_CONFIG;
+
+ /* find the lcore based on stream idx */
+ for (j = 0; j < nb_lcores; j++) {
+ if (fwd_lcores[j]->stream_idx == i)
+ break;
+ }
+ if (j == nb_lcores)
+ return NUMA_NO_CONFIG;
+
+ /* find the scoket for the lcore */
+ socket = rte_lcore_to_socket_id(fwd_lcores_cpuids[j]);
+
+ return socket;
+}
+
int
start_port(portid_t pid)
{
@@ -1445,14 +1486,19 @@ static void eth_event_callback(uint8_t port_id,
port->need_reconfig_queues = 0;
/* setup tx queues */
for (qi = 0; qi < nb_txq; qi++) {
+ int socket = port->socket_id;
if ((numa_support) &&
(txring_numa[pi] != NUMA_NO_CONFIG))
- diag = rte_eth_tx_queue_setup(pi, qi,
- nb_txd,txring_numa[pi],
- &(port->tx_conf));
- else
- diag = rte_eth_tx_queue_setup(pi, qi,
- nb_txd,port->socket_id,
+ socket = txring_numa[pi];
+
+ if (ring_bind_lcpu) {
+ int ret = find_local_socket(qi, 0);
+ if (ret != NUMA_NO_CONFIG)
+ socket = ret;
+ }
+
+ diag = rte_eth_tx_queue_setup(pi, qi,
+ nb_txd, socket,
&(port->tx_conf));
if (diag == 0)
@@ -1471,35 +1517,29 @@ static void eth_event_callback(uint8_t port_id,
}
/* setup rx queues */
for (qi = 0; qi < nb_rxq; qi++) {
+ int socket = port->socket_id;
if ((numa_support) &&
- (rxring_numa[pi] != NUMA_NO_CONFIG)) {
- struct rte_mempool * mp =
- mbuf_pool_find(rxring_numa[pi]);
- if (mp == NULL) {
- printf("Failed to setup RX queue:"
- "No mempool allocation"
- " on the socket %d\n",
- rxring_numa[pi]);
- return -1;
- }
-
- diag = rte_eth_rx_queue_setup(pi, qi,
- nb_rxd,rxring_numa[pi],
- &(port->rx_conf),mp);
- } else {
- struct rte_mempool *mp =
- mbuf_pool_find(port->socket_id);
- if (mp == NULL) {
- printf("Failed to setup RX queue:"
+ (rxring_numa[pi] != NUMA_NO_CONFIG))
+ socket = rxring_numa[pi];
+
+ if (ring_bind_lcpu) {
+ int ret = find_local_socket(qi, 1);
+ if (ret != NUMA_NO_CONFIG)
+ socket = ret;
+ }
+
+ struct rte_mempool *mp =
+ mbuf_pool_find(socket);
+ if (mp == NULL) {
+ printf("Failed to setup RX queue:"
"No mempool allocation"
" on the socket %d\n",
- port->socket_id);
- return -1;
- }
- diag = rte_eth_rx_queue_setup(pi, qi,
- nb_rxd,port->socket_id,
- &(port->rx_conf), mp);
+ socket);
+ return -1;
}
+ diag = rte_eth_rx_queue_setup(pi, qi,
+ nb_rxd, socket,
+ &(port->rx_conf), mp);
if (diag == 0)
continue;
@@ -299,6 +299,7 @@ struct queue_stats_mappings {
extern uint16_t verbose_level; /**< Drives messages being displayed, if any. */
extern uint8_t interactive;
extern uint8_t auto_start;
+extern uint8_t ring_bind_lcpu;
extern char cmdline_filename[PATH_MAX]; /**< offline commands file */
extern uint8_t numa_support; /**< set by "--numa" parameter */
extern uint16_t port_topology; /**< set by "--port-topology" parameter */