diff mbox series

[RFC,3/3] net/af_xdp: preferred busy polling

Message ID 20210218092307.29575-4-ciara.loftus@intel.com (mailing list archive)
State Superseded
Delegated to: Ferruh Yigit
Headers show
Series AF_XDP Preferred Busy Polling | expand

Checks

Context Check Description
ci/intel-Testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Ciara Loftus Feb. 18, 2021, 9:23 a.m. UTC
This commit introduces support for preferred busy polling
to the AF_XDP PMD. This feature aims to improve single-core
performance for AF_XDP sockets under heavy load.

A new vdev arg is introduced called 'busy_budget' whose default
value is 64. busy_budget is the value supplied to the kernel
with the SO_BUSY_POLL_BUDGET socket option and represents the
busy-polling NAPI budget. To set the budget to a different value,
e.g. 256:

--vdev=net_af_xdp0,iface=eth0,busy_budget=256

Preferred busy polling is enabled by default provided a kernel with
version >= v5.11 is in use. To disable it, set the budget to zero.

The following settings are also strongly recommended to be used in
conjunction with this feature:

echo 2 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs
echo 200000 | sudo tee /sys/class/net/eth0/gro_flush_timeout

.. where eth0 is the interface being used by the PMD.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 doc/guides/nics/af_xdp.rst          | 38 +++++++++++++-
 drivers/net/af_xdp/compat.h         | 13 +++++
 drivers/net/af_xdp/rte_eth_af_xdp.c | 80 ++++++++++++++++++++++++-----
 3 files changed, 116 insertions(+), 15 deletions(-)
diff mbox series

Patch

diff --git a/doc/guides/nics/af_xdp.rst b/doc/guides/nics/af_xdp.rst
index 5ed24374f8..8bf40b5f0f 100644
--- a/doc/guides/nics/af_xdp.rst
+++ b/doc/guides/nics/af_xdp.rst
@@ -35,6 +35,7 @@  The following options can be provided to set up an af_xdp port in DPDK.
 *   ``shared_umem`` - PMD will attempt to share UMEM with others (optional,
     default 0);
 *   ``xdp_prog`` - path to custom xdp program (optional, default none);
+*   ``busy_budget`` - busy polling budget (optional, default 64);
 
 Prerequisites
 -------------
@@ -51,6 +52,7 @@  This is a Linux-specific PMD, thus the following prerequisites apply:
 *  For shared_umem, it requires kernel version v5.10 or later and libbpf version
    v0.2.0 or later.
 *  For 32-bit OS, a kernel with version 5.4 or later is required.
+*  For busy polling, kernel version v5.11 or later is required.
 
 Set up an af_xdp interface
 -----------------------------
@@ -107,4 +109,38 @@  Limitations
   .. code-block:: console
 
     --vdev net_af_xdp0,iface=ens786f1,shared_umem=1 \
-    --vdev net_af_xdp1,iface=ens786f2,shared_umem=1 \
\ No newline at end of file
+    --vdev net_af_xdp1,iface=ens786f2,shared_umem=1 \
+
+- **Preferred Busy Polling**
+
+  The SO_PREFER_BUSY_POLL socket option was introduced in kernel v5.11. It can
+  deliver a performance improvement for sockets with heavy traffic loads and
+  can significantly improve single-core performance in this context.
+
+  The feature is enabled by default in the AF_XDP PMD. To disable it, set the
+  'busy_budget' vdev argument to zero:
+
+  .. code-block:: console
+
+    --vdev net_af_xdp0,iface=ens786f1,busy_budget=0
+
+  The default 'busy_budget' is 64 and it represents the number of packets the
+  kernel will attempt to process in the netdev's NAPI context. You can change
+  the value, for example to 256, like so:
+
+  .. code-block:: console
+
+    --vdev net_af_xdp0,iface=ens786f1,busy_budget=256
+
+  It is also strongly recommended to set the following for optimal performance:
+
+  .. code-block:: console
+
+    echo 2 | sudo tee /sys/class/net/ens786f1/napi_defer_hard_irqs
+    echo 200000 | sudo tee /sys/class/net/ens786f1/gro_flush_timeout
+
+  The above defers interrupts for interface ens786f1 and schedules its NAPI
+  context from a watchdog timer instead of from softirqs. More information
+  on this feature can be found at [1].
+
+  [1] https://lwn.net/Articles/837010/
\ No newline at end of file
diff --git a/drivers/net/af_xdp/compat.h b/drivers/net/af_xdp/compat.h
index 7aa40d522e..1d247a50b2 100644
--- a/drivers/net/af_xdp/compat.h
+++ b/drivers/net/af_xdp/compat.h
@@ -39,3 +39,22 @@  create_shared_socket(struct xsk_socket **xsk_ptr __rte_unused,
 	return -1;
 }
 #endif
+
+/*
+ * Decide whether a syscall is required to drive ring q. The result is
+ * xsk_ring_prod__needs_wakeup(q) OR'ed with busy_budget when
+ * XDP_USE_NEED_WAKEUP is defined, and busy_budget alone otherwise.
+ */
+#ifdef XDP_USE_NEED_WAKEUP
+static int
+syscall_needed(struct xsk_ring_prod *q, uint32_t busy_budget)
+{
+	return xsk_ring_prod__needs_wakeup(q) | busy_budget;
+}
+#else
+static int
+syscall_needed(struct xsk_ring_prod *q __rte_unused, uint32_t busy_budget)
+{
+	return busy_budget;
+}
+#endif
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index 34b15aa3d0..c586f3042a 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -65,6 +65,8 @@  RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);
 #define ETH_AF_XDP_DFLT_NUM_DESCS	XSK_RING_CONS__DEFAULT_NUM_DESCS
 #define ETH_AF_XDP_DFLT_START_QUEUE_IDX	0
 #define ETH_AF_XDP_DFLT_QUEUE_COUNT	1
+#define ETH_AF_XDP_DFLT_BUSY_BUDGET	64
+#define ETH_AF_XDP_DFLT_BUSY_TIMEOUT	20
 
 #define ETH_AF_XDP_RX_BATCH_SIZE	512
 #define ETH_AF_XDP_TX_BATCH_SIZE	512
@@ -100,6 +102,7 @@  struct pkt_rx_queue {
 	struct pkt_tx_queue *pair;
 	struct pollfd fds[1];
 	int xsk_queue_idx;
+	uint32_t busy_budget;
 };
 
 struct tx_stats {
@@ -140,6 +143,7 @@  struct pmd_internals {
 #define ETH_AF_XDP_QUEUE_COUNT_ARG		"queue_count"
 #define ETH_AF_XDP_SHARED_UMEM_ARG		"shared_umem"
 #define ETH_AF_XDP_PROG_ARG			"xdp_prog"
+#define ETH_AF_XDP_BUDGET_ARG			"busy_budget"
 
 static const char * const valid_arguments[] = {
 	ETH_AF_XDP_IFACE_ARG,
@@ -147,6 +151,7 @@  static const char * const valid_arguments[] = {
 	ETH_AF_XDP_QUEUE_COUNT_ARG,
 	ETH_AF_XDP_SHARED_UMEM_ARG,
 	ETH_AF_XDP_PROG_ARG,
+	ETH_AF_XDP_BUDGET_ARG,
 	NULL
 };
 
@@ -261,11 +266,9 @@  af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
 
 	if (nb_pkts == 0) {
-#if defined(XDP_USE_NEED_WAKEUP)
-		if (xsk_ring_prod__needs_wakeup(fq))
+		if (syscall_needed(&rxq->fq, rxq->busy_budget))
 			recvfrom(xsk_socket__fd(rxq->xsk), NULL, 0,
 				MSG_DONTWAIT, NULL, NULL);
-#endif
 		return 0;
 	}
 
@@ -334,11 +337,9 @@  af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 	nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
 	if (nb_pkts == 0) {
-#if defined(XDP_USE_NEED_WAKEUP)
-		if (xsk_ring_prod__needs_wakeup(fq))
+		if (syscall_needed(&rxq->fq, rxq->busy_budget))
 			recvfrom(xsk_socket__fd(rxq->xsk), NULL, 0,
 				MSG_DONTWAIT, NULL, NULL);
-#endif
 		return 0;
 	}
 
@@ -422,9 +423,7 @@  kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
 
 	pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
 
-#if defined(XDP_USE_NEED_WAKEUP)
-	if (xsk_ring_prod__needs_wakeup(&txq->tx))
-#endif
+	if (syscall_needed(&txq->tx, txq->pair->busy_budget)) {
 		while (send(xsk_socket__fd(txq->pair->xsk), NULL,
 			    0, MSG_DONTWAIT) < 0) {
 			/* some thing unexpected */
@@ -437,6 +436,7 @@  kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
 					     XSK_RING_CONS__DEFAULT_NUM_DESCS,
 					     cq);
 		}
+	}
 }
 
 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
@@ -1145,6 +1145,34 @@  xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
 		goto err;
 	}
 
+#ifdef SO_PREFER_BUSY_POLL
+	int sock_opt = 1;
+	int fd = xsk_socket__fd(rxq->xsk);
+
+	if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, (void *)&sock_opt,
+			sizeof(sock_opt)) < 0) {
+		AF_XDP_LOG(ERR, "Failed to set SO_PREFER_BUSY_POLL\n");
+		goto err;
+	}
+
+	sock_opt = ETH_AF_XDP_DFLT_BUSY_TIMEOUT;
+	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
+			sizeof(sock_opt)) < 0) {
+		AF_XDP_LOG(ERR, "Failed to set SO_BUSY_POLL\n");
+		goto err;
+	}
+
+	sock_opt = rxq->busy_budget;
+	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt,
+			sizeof(sock_opt)) < 0) {
+		AF_XDP_LOG(ERR, "Failed to set SO_BUSY_POLL_BUDGET\n");
+		goto err;
+	} else {
+		AF_XDP_LOG(INFO, "Busy polling budget set to: %u\n",
+					rxq->busy_budget);
+	}
+#endif
+
 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
 	if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
 		AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
@@ -1416,7 +1444,8 @@  xdp_get_channels_info(const char *if_name, int *max_queues,
 
 static int
 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
-			int *queue_cnt, int *shared_umem, char *prog_path)
+			int *queue_cnt, int *shared_umem, char *prog_path,
+			int *busy_budget)
 {
 	int ret;
 
@@ -1447,6 +1476,11 @@  parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
 	if (ret < 0)
 		goto free_kvlist;
 
+	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_BUDGET_ARG,
+				&parse_integer_arg, busy_budget);
+	if (ret < 0)
+		goto free_kvlist;
+
 free_kvlist:
 	rte_kvargs_free(kvlist);
 	return ret;
@@ -1485,7 +1519,7 @@  get_iface_info(const char *if_name,
 static struct rte_eth_dev *
 init_internals(struct rte_vdev_device *dev, const char *if_name,
 		int start_queue_idx, int queue_cnt, int shared_umem,
-		const char *prog_path)
+		const char *prog_path, int busy_budget)
 {
 	const char *name = rte_vdev_device_name(dev);
 	const unsigned int numa_node = dev->device.numa_node;
@@ -1546,6 +1580,7 @@  init_internals(struct rte_vdev_device *dev, const char *if_name,
 		internals->rx_queues[i].pair = &internals->tx_queues[i];
 		internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
 		internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
+		internals->rx_queues[i].busy_budget = busy_budget;
 	}
 
 	ret = get_iface_info(if_name, &internals->eth_addr,
@@ -1589,6 +1624,7 @@  rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 	int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
 	int shared_umem = 0;
 	char prog_path[PATH_MAX] = {'\0'};
+	int busy_budget = -1;
 	struct rte_eth_dev *eth_dev = NULL;
 	const char *name;
 
@@ -1618,7 +1654,8 @@  rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 		dev->device.numa_node = rte_socket_id();
 
 	if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
-			     &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
+			     &xsk_queue_cnt, &shared_umem, prog_path,
+			     &busy_budget) < 0) {
 		AF_XDP_LOG(ERR, "Invalid kvargs value\n");
 		return -EINVAL;
 	}
@@ -1628,8 +1665,22 @@  rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
 		return -EINVAL;
 	}
 
+#ifdef SO_PREFER_BUSY_POLL
+	busy_budget = busy_budget == -1 ? ETH_AF_XDP_DFLT_BUSY_BUDGET :
+					busy_budget;
+	if (!busy_budget)
+		AF_XDP_LOG(ERR, "Preferred busy polling disabled\n");
+#else
+	if (busy_budget > 0) {
+		AF_XDP_LOG(ERR, "Kernel does not support SO_PREFER_BUSY_POLL\n");
+		return -ENOTSUP;
+	}
+	busy_budget = 0;
+#endif
+
 	eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
-					xsk_queue_cnt, shared_umem, prog_path);
+					xsk_queue_cnt, shared_umem, prog_path,
+					busy_budget);
 	if (eth_dev == NULL) {
 		AF_XDP_LOG(ERR, "Failed to init internals\n");
 		return -1;
@@ -1674,4 +1725,5 @@  RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
 			      "start_queue=<int> "
 			      "queue_count=<int> "
 			      "shared_umem=<int> "
-			      "xdp_prog=<string> ");
+			      "xdp_prog=<string> "
+			      "busy_budget=<int>");