event/eth_tx: prefetch mbuf headers
Commit Message
Prefetch mbuf headers, resulting in ~10% throughput improvement when
the Ethernet RX and TX Adapters are hosted on the same core (likely
~2x in case a dedicated TX core is used).
Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Tested-by: Peter Nilsson <peter.j.nilsson@ericsson.com>
---
lib/eventdev/rte_event_eth_tx_adapter.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
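
For reference, rte_mbuf_prefetch_part1() is a thin wrapper in rte_mbuf.h; at the time of writing it amounts to prefetching the first cache line of the mbuf (rte_mbuf_prefetch_part2() covers the second):

	static inline void
	rte_mbuf_prefetch_part1(struct rte_mbuf *m)
	{
		rte_prefetch0(&m->cacheline0);
	}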
Comments
On 2025-03-28 06:43, Mattias Rönnblom wrote:
> Prefetch mbuf headers, resulting in ~10% throughput improvement when
> the Ethernet RX and TX Adapters are hosted on the same core (likely
> ~2x in case a dedicated TX core is used).
>
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Tested-by: Peter Nilsson <peter.j.nilsson@ericsson.com>
What should be added is that only the non-RTE_EVENT_TYPE_VECTOR case has been tested.
> ---
> lib/eventdev/rte_event_eth_tx_adapter.c | 20 ++++++++++++++++++++
> 1 file changed, 20 insertions(+)
>
> diff --git a/lib/eventdev/rte_event_eth_tx_adapter.c b/lib/eventdev/rte_event_eth_tx_adapter.c
> index 67fff8b7d6..d740ae00f9 100644
> --- a/lib/eventdev/rte_event_eth_tx_adapter.c
> +++ b/lib/eventdev/rte_event_eth_tx_adapter.c
> @@ -598,6 +598,12 @@ txa_process_event_vector(struct txa_service_data *txa,
>  	return nb_tx;
>  }
>  
> +static inline void
> +txa_prefetch_mbuf(struct rte_mbuf *mbuf)
> +{
> +	rte_mbuf_prefetch_part1(mbuf);
> +}
> +
>  static void
>  txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
>  	       uint32_t n)
> @@ -608,6 +614,20 @@ txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
>  
>  	stats = &txa->stats;
>  
> +	for (i = 0; i < n; i++) {
> +		struct rte_event *event = &ev[i];
> +
> +		if (unlikely(event->event_type & RTE_EVENT_TYPE_VECTOR)) {
> +			struct rte_event_vector *vec = event->vec;
> +			struct rte_mbuf **mbufs = vec->mbufs;
> +			uint32_t k;
> +
> +			for (k = 0; k < vec->nb_elem; k++)
> +				txa_prefetch_mbuf(mbufs[k]);
> +		} else
> +			txa_prefetch_mbuf(event->mbuf);
> +	}
> +
>  	nb_tx = 0;
>  	for (i = 0; i < n; i++) {
>  		uint16_t port;
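
The point of the extra pass is software pipelining: all prefetches are issued before the first mbuf header is dereferenced, giving the memory loads time to complete while the rest of the event array is still being walked. A minimal sketch of the same two-pass pattern on a plain mbuf array; sum_pkt_lens() is a hypothetical helper, not part of the patch:

	#include <rte_mbuf.h>

	/* Hypothetical example: prefetch all headers first, then touch
	 * them. pkt_len lives in the mbuf's first cache line, which is
	 * what rte_mbuf_prefetch_part1() pulls in.
	 */
	static inline uint64_t
	sum_pkt_lens(struct rte_mbuf **mbufs, uint16_t n)
	{
		uint16_t i;
		uint64_t bytes = 0;

		for (i = 0; i < n; i++)
			rte_mbuf_prefetch_part1(mbufs[i]);

		for (i = 0; i < n; i++)
			bytes += mbufs[i]->pkt_len;

		return bytes;
	}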
On 2025-03-28 07:07, Mattias Rönnblom wrote:
> On 2025-03-28 06:43, Mattias Rönnblom wrote:
>> Prefetch mbuf headers, resulting in ~10% throughput improvement when
>> the Ethernet RX and TX Adapters are hosted on the same core (likely
>> ~2x in case a dedicated TX core is used).
>>
>> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
>> Tested-by: Peter Nilsson <peter.j.nilsson@ericsson.com>
>
<snip>
Naga, could you comment on this patch?
> -----Original Message-----
> From: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Sent: Friday, March 28, 2025 11:14 AM
> To: dev@dpdk.org
> Cc: Mattias Rönnblom <hofors@lysator.liu.se>; Naga Harish K, S V
> <s.v.naga.harish.k@intel.com>; Jerin Jacob <jerinj@marvell.com>; Mattias
> Rönnblom <mattias.ronnblom@ericsson.com>; Peter Nilsson
> <peter.j.nilsson@ericsson.com>
> Subject: [PATCH] event/eth_tx: prefetch mbuf headers
>
> Prefetch mbuf headers, resulting in ~10% throughput improvement when the
> Ethernet RX and TX Adapters are hosted on the same core (likely ~2x in case a
> dedicated TX core is used).
>
> Signed-off-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Tested-by: Peter Nilsson <peter.j.nilsson@ericsson.com>
> ---
> lib/eventdev/rte_event_eth_tx_adapter.c | 20 ++++++++++++++++++++
> 1 file changed, 20 insertions(+)
>
> diff --git a/lib/eventdev/rte_event_eth_tx_adapter.c b/lib/eventdev/rte_event_eth_tx_adapter.c
> index 67fff8b7d6..d740ae00f9 100644
> --- a/lib/eventdev/rte_event_eth_tx_adapter.c
> +++ b/lib/eventdev/rte_event_eth_tx_adapter.c
> @@ -598,6 +598,12 @@ txa_process_event_vector(struct txa_service_data *txa,
>  	return nb_tx;
>  }
>  
> +static inline void
> +txa_prefetch_mbuf(struct rte_mbuf *mbuf)
> +{
> +	rte_mbuf_prefetch_part1(mbuf);
> +}
> +
>  static void
>  txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
>  	       uint32_t n)
> @@ -608,6 +614,20 @@ txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
>  
>  	stats = &txa->stats;
>  
> +	for (i = 0; i < n; i++) {
> +		struct rte_event *event = &ev[i];
> +
> +		if (unlikely(event->event_type & RTE_EVENT_TYPE_VECTOR)) {

This gives a branch prediction advantage to non-vector events. Is that the intention?

> +			struct rte_event_vector *vec = event->vec;
> +			struct rte_mbuf **mbufs = vec->mbufs;
> +			uint32_t k;
> +
> +			for (k = 0; k < vec->nb_elem; k++)
> +				txa_prefetch_mbuf(mbufs[k]);
> +		} else
> +			txa_prefetch_mbuf(event->mbuf);
> +	}
> +
>  	nb_tx = 0;
>  	for (i = 0; i < n; i++) {
>  		uint16_t port;
> --
> 2.43.0
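
For context on the branch-prediction question above: unlikely() in DPDK is the standard GCC expectation hint from rte_branch_prediction.h, roughly:

	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

Marking the vector branch unlikely makes the non-vector path the predicted fall-through, which is only a win if vector events are in fact the rare case.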