[v3] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs
Commit Message
From: Long Li <longli@microsoft.com>
Instead of allocating mbufs one by one during RX, use
rte_pktmbuf_alloc_bulk() to allocate them in a batch.
There are no measurable performance improvements in benchmarks. However,
this patch should reduce CPU cycles and potential lock contention in
real-world applications.
Signed-off-by: Long Li <longli@microsoft.com>
---
Changes in v2:
use rte_calloc_socket() in place of rte_calloc()
v3:
add more comments explaining the benefit of alloc_bulk.
free mbufs that failed to post
drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
1 file changed, 50 insertions(+), 24 deletions(-)
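
For context, the core of the change swaps a per-descriptor rte_pktmbuf_alloc()
loop for a single rte_pktmbuf_alloc_bulk() call, i.e. one mempool get for the
whole batch instead of one get per mbuf. Below is a minimal sketch of the two
patterns; the refill_* helper names are illustrative only, not part of the
driver.

#include <errno.h>
#include <stdint.h>
#include <rte_mbuf.h>

/* Before: one mempool get per mbuf; each get may have to touch the
 * mempool ring and contend with other lcores.
 */
static int
refill_one_by_one(struct rte_mempool *mp, struct rte_mbuf **slots,
		  uint32_t count)
{
	for (uint32_t i = 0; i < count; i++) {
		slots[i] = rte_pktmbuf_alloc(mp);
		if (slots[i] == NULL)
			return -ENOMEM;
	}
	return 0;
}

/* After: a single bulk get. rte_pktmbuf_alloc_bulk() returns 0 on
 * success and a negative value on failure, and the underlying bulk
 * dequeue is all-or-nothing; a failure leaves no partially filled
 * slots to unwind, which is why the patch can charge the whole count
 * to stats.nombuf and bail out.
 */
static int
refill_bulk(struct rte_mempool *mp, struct rte_mbuf **slots, uint32_t count)
{
	return rte_pktmbuf_alloc_bulk(mp, slots, count);
}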
Comments
On Wed, Jan 31, 2024 at 07:45:50PM -0800, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> [...]
> +	/* Free the remaining mbufs that are not posted */
> +	while (i < count) {
> +		rte_pktmbuf_free(mbufs[i]);
> +		i++;
> +	}
there is also rte_pktmbuf_free_bulk() that could be used. probably won't
make any material difference to perf though so just an fyi.
> > +	/* Free the remaining mbufs that are not posted */
> > +	while (i < count) {
> > +		rte_pktmbuf_free(mbufs[i]);
> > +		i++;
> > +	}
>
> there is also rte_pktmbuf_free_bulk() that could be used. probably won't make
> any material difference to perf though so just an fyi.
Thank you! Will use rte_pktmbuf_free_bulk().
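
For reference, a sketch of that follow-up: rte_pktmbuf_free_bulk() takes an
array pointer and a count, so the tail-free loop collapses into one call. The
free_unposted() wrapper below is illustrative, not a driver function; i is
the index of the first mbuf that was not posted, as in the diff.

#include <stdint.h>
#include <rte_mbuf.h>

/* Free all mbufs from index i (the first one that failed to post)
 * through count - 1 in a single call instead of looping over
 * rte_pktmbuf_free(). With i == count this is a no-op.
 */
static void
free_unposted(struct rte_mbuf **mbufs, uint32_t i, uint32_t count)
{
	rte_pktmbuf_free_bulk(&mbufs[i], count - i);
}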
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..6112db2219 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -2,6 +2,7 @@
  * Copyright 2022 Microsoft Corporation
  */
 #include <ethdev_driver.h>
+#include <rte_malloc.h>
 
 #include <infiniband/verbs.h>
 #include <infiniband/manadv.h>
@@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
  * Post work requests for a Rx queue.
  */
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
 	uint32_t i;
+	struct rte_mbuf **mbufs;
+
+	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
+				  0, rxq->mp->socket_id);
+	if (!mbufs)
+		return -ENOMEM;
+
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+		goto fail;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+			break;
 		}
 
 #ifdef RTE_ARCH_32
@@ -144,8 +151,16 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 #endif
 	}
 
+	/* Free the remaining mbufs that are not posted */
+	while (i < count) {
+		rte_pktmbuf_free(mbufs[i]);
+		i++;
+	}
+
 	mana_rq_ring_doorbell(rxq);
 
+fail:
+	rte_free(mbufs);
 	return ret;
 }
 
@@ -404,7 +419,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +440,7 @@ uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +552,23 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of the RX WQE we need to post to replenish
+		 * consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +591,12 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
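
Taken together, the RX burst path after this patch only counts consumed WQEs
during the receive loop and replenishes them in one shot at the end: one bulk
allocation plus a single doorbell, instead of an allocation and a post per
packet. A condensed, hypothetical sketch of that flow is below;
mana_receive_one() stands in for the CQE-processing body of mana_rx_burst(),
and on 32-bit builds the diff instead replenishes right after each consumed
WQE to respect the short-doorbell increment limit.

#include <stdint.h>

struct mana_rxq;
/* Helper from the diff; declared here only to keep the sketch
 * self-contained.
 */
int mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count);
/* Hypothetical stand-in for the CQE-processing body of mana_rx_burst();
 * returns 0 while a received packet is available.
 */
int mana_receive_one(struct mana_rxq *rxq);

static uint16_t
rx_burst_sketch(struct mana_rxq *rxq, uint16_t pkts_n)
{
	uint16_t pkt_received = 0, wqe_consumed = 0;

	while (pkt_received < pkts_n && mana_receive_one(rxq) == 0) {
		pkt_received++;
		wqe_consumed++;	/* count only; no alloc, no doorbell yet */
	}

	if (wqe_consumed)
		/* one rte_pktmbuf_alloc_bulk() + one doorbell ring */
		mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);

	return pkt_received;
}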