[v2] net/af_xdp: fix umem map size for zero copy

Message ID 20240511052618.1890677-1-frank.du@intel.com (mailing list archive)
State: Under Review
Delegated to: Ferruh Yigit
Series: [v2] net/af_xdp: fix umem map size for zero copy

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/github-robot: build success github build: passed
ci/iol-abi-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-unit-amd64-testing fail Testing issues
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/intel-Functional success Functional PASS
ci/intel-Testing success Testing PASS

Commit Message

Du, Frank May 11, 2024, 5:26 a.m. UTC
  The current calculation assumes that the mbufs are contiguous. However,
this assumption is incorrect when the memory spans across a huge page.
Correct to directly read the size from the mempool memory chunks.

Signed-off-by: Frank Du <frank.du@intel.com>

---
v2:
* Add virtual contiguous detection for multiple memhdrs.
---
 drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 6 deletions(-)
  

Comments

Loftus, Ciara May 17, 2024, 1:19 p.m. UTC | #1
> 
> The current calculation assumes that the mbufs are contiguous. However,
> this assumption is incorrect when the memory spans across a huge page.
> Correct to directly read the size from the mempool memory chunks.
> 
> Signed-off-by: Frank Du <frank.du@intel.com>

Hi Frank,

Thanks for the patch.

Before your patch the umem_size was calculated using mb_pool->populated_size * rte_mempool_calc_obj_size(mb_pool->elt_size, ..)
With your patch we sum up the lens of all memhdrs in the mempool.

When debugging I see the new calculation can yield a larger value, but the new logic looks good and more thorough to me, so I'm happy to go with the new approach.
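In code terms, the two computations are roughly the following (a sketch
using the field names from the patch, not literal driver code):

    /* old estimate: assumes objects are packed back to back */
    umem_size = (uint64_t)mb_pool->populated_size *
            (uint64_t)usr_config.frame_size + align;

    /* new approach: sum the actual chunk lengths, which include any
     * per-hugepage padding holes between objects */
    size_t len = 0;
    struct rte_mempool_memhdr *memhdr;
    STAILQ_FOREACH(memhdr, &mb_pool->mem_list, next)
            len += memhdr->len;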

Acked-by: Ciara Loftus <ciara.loftus@intel.com>

Thanks,
Ciara

> 
> ---
> v2:
> * Add virtual contiguous detection for multiple memhdrs.
> ---
>  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
>  1 file changed, 28 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
> index 268a130c49..7456108d6d 100644
> --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
>  }
> 
>  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
> +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
>  {
> -	struct rte_mempool_memhdr *memhdr;
> +	struct rte_mempool_memhdr *memhdr, *next;
>  	uintptr_t memhdr_addr, aligned_addr;
> +	size_t memhdr_len = 0;
> 
> +	/* get the mempool base addr and align */
>  	memhdr = STAILQ_FIRST(&mp->mem_list);
>  	memhdr_addr = (uintptr_t)memhdr->addr;
>  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
>  	*align = memhdr_addr - aligned_addr;
> +	memhdr_len += memhdr->len;
> +
> +	/* check if virtual contiguous memory for multiple memhdrs */
> +	next = STAILQ_NEXT(memhdr, next);
> +	while (next != NULL) {
> +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
> +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
> +					"next: %p, cur: %p(len: %" PRId64 " )\n",
> +					next->addr, memhdr->addr, memhdr->len);
> +			return 0;
> +		}
> +		/* virtual contiguous */
> +		memhdr = next;
> +		memhdr_len += memhdr->len;
> +		next = STAILQ_NEXT(memhdr, next);
> +	}
> 
> +	*len = memhdr_len;
>  	return aligned_addr;
>  }
> 
> @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>  	void *base_addr = NULL;
>  	struct rte_mempool *mb_pool = rxq->mb_pool;
>  	uint64_t umem_size, align = 0;
> +	size_t len = 0;
> 
>  	if (internals->shared_umem) {
>  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
> @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>  		}
> 
>  		umem->mb_pool = mb_pool;
> -		base_addr = (void *)get_base_addr(mb_pool, &align);
> -		umem_size = (uint64_t)mb_pool->populated_size *
> -				(uint64_t)usr_config.frame_size +
> -				align;
> +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
> +		if (!base_addr) {
> +			AF_XDP_LOG(ERR, "Failed to parse memhdr info from pool\n");
> +			goto err;
> +		}
> +		umem_size = (uint64_t)len + align;
> 
>  		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
>  				&rxq->fq, &rxq->cq, &usr_config);
> --
> 2.34.1
  
Du, Frank May 20, 2024, 1:28 a.m. UTC | #2
> -----Original Message-----
> From: Loftus, Ciara <ciara.loftus@intel.com>
> Sent: Friday, May 17, 2024 9:20 PM
> To: Du, Frank <frank.du@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [PATCH v2] net/af_xdp: fix umem map size for zero copy
> 
> >
> > The current calculation assumes that the mbufs are contiguous.
> > However, this assumption is incorrect when the memory spans across a huge
> page.
> > Correct to directly read the size from the mempool memory chunks.
> >
> > Signed-off-by: Frank Du <frank.du@intel.com>
> 
> Hi Frank,
> 
> Thanks for the patch.
> 
> Before your patch the umem_size was calculated using mb_pool->populated_size
> * rte_mempool_calc_obj_size(mb_pool->elt_size, ..) With your patch we sum up
> the lens of all memhdrs in the mempool.
> 
> When debugging I see the new calculation can yield a larger value, but the new
> logic looks good and more thorough to me, so I'm happy to go with the new
> approach.

Hi Ciara,

Thanks for the review. The reason for the larger value is that the pool contains some necessary padding holes to ensure that a single mbuf does not span two virtual huge pages, so that the PA of each mbuf is safe for hardware operation.
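To put numbers on it (a sketch; the 2M huge page and the object size are
illustrative assumptions, not values taken from the driver):

    /* with 2M huge pages and 2176-byte mempool objects: */
    size_t obj_size      = 2176;
    size_t page_size     = 2 * 1024 * 1024;
    size_t objs_per_page = page_size / obj_size;                 /* 963 */
    size_t hole          = page_size - objs_per_page * obj_size; /* 1664 B */
    /* populated_size * frame_size does not count the per-page hole,
     * while memhdr->len does, hence the larger umem size. */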

> 
> Acked-by: Ciara Loftus <ciara.loftus@intel.com>
> 
> Thanks,
> Ciara
> 
> >
> > ---
> > v2:
> > * Add virtual contiguous detection for multiple memhdrs.
> > ---
> >  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
> >  1 file changed, 28 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > index 268a130c49..7456108d6d 100644
> > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
> >  }
> >
> >  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> > -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
> > +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
> >  {
> > -	struct rte_mempool_memhdr *memhdr;
> > +	struct rte_mempool_memhdr *memhdr, *next;
> >  	uintptr_t memhdr_addr, aligned_addr;
> > +	size_t memhdr_len = 0;
> >
> > +	/* get the mempool base addr and align */
> >  	memhdr = STAILQ_FIRST(&mp->mem_list);
> >  	memhdr_addr = (uintptr_t)memhdr->addr;
> >  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
> >  	*align = memhdr_addr - aligned_addr;
> > +	memhdr_len += memhdr->len;
> > +
> > +	/* check if virtual contiguous memory for multiple memhdrs */
> > +	next = STAILQ_NEXT(memhdr, next);
> > +	while (next != NULL) {
> > +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
> > +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
> > +					"next: %p, cur: %p(len: %" PRId64 " )\n",
> > +					next->addr, memhdr->addr, memhdr->len);
> > +			return 0;
> > +		}
> > +		/* virtual contiguous */
> > +		memhdr = next;
> > +		memhdr_len += memhdr->len;
> > +		next = STAILQ_NEXT(memhdr, next);
> > +	}
> >
> > +	*len = memhdr_len;
> >  	return aligned_addr;
> >  }
> >
> > @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> >  	void *base_addr = NULL;
> >  	struct rte_mempool *mb_pool = rxq->mb_pool;
> >  	uint64_t umem_size, align = 0;
> > +	size_t len = 0;
> >
> >  	if (internals->shared_umem) {
> >  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
> > @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> >  		}
> >
> >  		umem->mb_pool = mb_pool;
> > -		base_addr = (void *)get_base_addr(mb_pool, &align);
> > -		umem_size = (uint64_t)mb_pool->populated_size *
> > -				(uint64_t)usr_config.frame_size +
> > -				align;
> > +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
> > +		if (!base_addr) {
> > +			AF_XDP_LOG(ERR, "Failed to parse memhdr info from pool\n");
> > +			goto err;
> > +		}
> > +		umem_size = (uint64_t)len + align;
> >
> >  		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
> >  				&rxq->fq, &rxq->cq, &usr_config);
> > --
> > 2.34.1
  
Ferruh Yigit May 21, 2024, 3:43 p.m. UTC | #3
On 5/11/2024 6:26 AM, Frank Du wrote:
> The current calculation assumes that the mbufs are contiguous. However,
> this assumption is incorrect when the memory spans across a huge page.
> Correct to directly read the size from the mempool memory chunks.
> 
> Signed-off-by: Frank Du <frank.du@intel.com>
>

Hi Frank,

As this is a fix patch, can you please provide a fixes tag [1], this
helps us understanding the logic in the original code and enables us
figuring out which stable tree to backport this fix.

[1]
https://doc.dpdk.org/guides/contributing/patches.html#commit-messages-body
  
Ferruh Yigit May 21, 2024, 5:57 p.m. UTC | #4
On 5/11/2024 6:26 AM, Frank Du wrote:
> The current calculation assumes that the mbufs are contiguous. However,
> this assumption is incorrect when the memory spans across a huge page.
> Correct to directly read the size from the mempool memory chunks.
> 
> Signed-off-by: Frank Du <frank.du@intel.com>
> 
> ---
> v2:
> * Add virtual contiguous detection for multiple memhdrs.
> ---
>  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
>  1 file changed, 28 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
> index 268a130c49..7456108d6d 100644
> --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
>  }
>  
>  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
> +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
>  {
> -	struct rte_mempool_memhdr *memhdr;
> +	struct rte_mempool_memhdr *memhdr, *next;
>  	uintptr_t memhdr_addr, aligned_addr;
> +	size_t memhdr_len = 0;
>  
> +	/* get the mempool base addr and align */
>  	memhdr = STAILQ_FIRST(&mp->mem_list);
>  	memhdr_addr = (uintptr_t)memhdr->addr;
>  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
>  	*align = memhdr_addr - aligned_addr;
>

I am aware this is not part of this patch, but as a note, can't we use
'RTE_ALIGN_FLOOR' to calculate the aligned address?
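For reference, a minimal sketch of that suggestion (same semantics as the
open-coded mask, assuming the page size is a power of two):

    /* RTE_ALIGN_FLOOR() from rte_common.h rounds down to the boundary */
    aligned_addr = RTE_ALIGN_FLOOR(memhdr_addr, (uintptr_t)getpagesize());
    *align = memhdr_addr - aligned_addr;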


> +	memhdr_len += memhdr->len;
> +
> +	/* check if virtual contiguous memory for multiple memhdrs */
> +	next = STAILQ_NEXT(memhdr, next);
> +	while (next != NULL) {
> +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
> +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
> +					"next: %p, cur: %p(len: %" PRId64 " )\n",
> +					next->addr, memhdr->addr, memhdr->len);
> +			return 0;
> +		}
>

Isn't there a mempool flag that can help us figure out that the mempool is
not IOVA contiguous? Isn't it sufficient on its own?


> +		/* virtual contiguous */
> +		memhdr = next;
> +		memhdr_len += memhdr->len;
> +		next = STAILQ_NEXT(memhdr, next);
> +	}
>  
> +	*len = memhdr_len;
>  	return aligned_addr;
>  }
>

This function goes into too much detail of the mempool object, and any
change in mempool internals has the potential to break this code.

@Andrew, @Morten, do you think it makes sense to have an
'rte_mempool_info_get()' kind of function that provides at least the
address and length of the mempool, and could be used here?

This would help hide the internal details and complexity of the mempool from users.
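A hypothetical shape for such an API (the name and fields here are
assumptions for discussion, not an existing DPDK interface):

    struct rte_mempool_mem_info {
            void *addr;             /* base VA of the object memory */
            size_t len;             /* total length of all chunks */
            unsigned int nb_chunks; /* 1 when virtually contiguous */
    };

    int rte_mempool_get_mem_info(const struct rte_mempool *mp,
            struct rte_mempool_mem_info *info);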


>  
> @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>  	void *base_addr = NULL;
>  	struct rte_mempool *mb_pool = rxq->mb_pool;
>  	uint64_t umem_size, align = 0;
> +	size_t len = 0;
>  
>  	if (internals->shared_umem) {
>  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
> @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>  		}
>  
>  		umem->mb_pool = mb_pool;
> -		base_addr = (void *)get_base_addr(mb_pool, &align);
> -		umem_size = (uint64_t)mb_pool->populated_size *
> -				(uint64_t)usr_config.frame_size +
> -				align;
> +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
>

Is this calculation correct if the mempool is not already aligned to the page size?

For example, if the page size is '0x1000' and "memhdr_addr = 0x000a1080",
the returned aligned address is '0x000a1000', so "base_addr = 0x000a1000".

Any access between '0x000a1000' & '0x000a1080' is invalid. Is this expected?


> +		if (!base_addr) {
> +			AF_XDP_LOG(ERR, "Failed to parse memhdr info from pool\n");
>

The log message is not accurate: it is not that parsing the memhdr info
failed, but that the mempool did not satisfy the expectation.

> +			goto err;
> +		}
> +		umem_size = (uint64_t)len + align;
>  
>  		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
>  				&rxq->fq, &rxq->cq, &usr_config);
  
Du, Frank May 22, 2024, 1:25 a.m. UTC | #5
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@amd.com>
> Sent: Wednesday, May 22, 2024 1:58 AM
> To: Du, Frank <frank.du@intel.com>; dev@dpdk.org; Andrew Rybchenko
> <andrew.rybchenko@oktetlabs.ru>; Morten Brørup
> <mb@smartsharesystems.com>
> Cc: Loftus, Ciara <ciara.loftus@intel.com>; Burakov, Anatoly
> <anatoly.burakov@intel.com>
> Subject: Re: [PATCH v2] net/af_xdp: fix umem map size for zero copy
> 
> On 5/11/2024 6:26 AM, Frank Du wrote:
> > The current calculation assumes that the mbufs are contiguous.
> > However, this assumption is incorrect when the memory spans across a huge
> page.
> > Correct to directly read the size from the mempool memory chunks.
> >
> > Signed-off-by: Frank Du <frank.du@intel.com>
> >
> > ---
> > v2:
> > * Add virtual contiguous detection for multiple memhdrs.
> > ---
> >  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
> >  1 file changed, 28 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > index 268a130c49..7456108d6d 100644
> > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
> >  }
> >
> >  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> > -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
> > +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
> >  {
> > -	struct rte_mempool_memhdr *memhdr;
> > +	struct rte_mempool_memhdr *memhdr, *next;
> >  	uintptr_t memhdr_addr, aligned_addr;
> > +	size_t memhdr_len = 0;
> >
> > +	/* get the mempool base addr and align */
> >  	memhdr = STAILQ_FIRST(&mp->mem_list);
> >  	memhdr_addr = (uintptr_t)memhdr->addr;
> >  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
> >  	*align = memhdr_addr - aligned_addr;
> >
> 
> I am aware this is not part of this patch, but as note, can't we use
> 'RTE_ALIGN_FLOOR' to calculate aligned address.

Sure, will use RTE_ALIGN_FLOOR in next version.

> 
> 
> > +	memhdr_len += memhdr->len;
> > +
> > +	/* check if virtual contiguous memory for multiple memhdrs */
> > +	next = STAILQ_NEXT(memhdr, next);
> > +	while (next != NULL) {
> > +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
> > +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
> > +					"next: %p, cur: %p(len: %" PRId64 " )\n",
> > +					next->addr, memhdr->addr, memhdr->len);
> > +			return 0;
> > +		}
> >
> 
> Isn't there a mempool flag that can help us figure out mempool is not IOVA
> contiguous? Isn't it sufficient on its own?

Indeed, what we need to ascertain is whether it's contiguous in CPU virtual space, not IOVA. I haven't come across a flag specifically for CPU virtual contiguity. The major limitation in XDP is that XSK UMEM only supports registering a single contiguous virtual memory area.
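For context, the constraint is visible in the kernel UAPI: a UMEM is
registered as a single [addr, addr + len) virtual range (struct from
linux/if_xdp.h; newer kernels may carry additional fields):

    struct xdp_umem_reg {
            __u64 addr;       /* start of packet data area */
            __u64 len;        /* length of packet data area */
            __u32 chunk_size;
            __u32 headroom;
            __u32 flags;
    };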

> 
> 
> > +		/* virtual contiguous */
> > +		memhdr = next;
> > +		memhdr_len += memhdr->len;
> > +		next = STAILQ_NEXT(memhdr, next);
> > +	}
> >
> > +	*len = memhdr_len;
> >  	return aligned_addr;
> >  }
> >
> 
> This function goes too much details of the mempool object, and any change in
> mempool details has potential to break this code.
> 
> @Andrew, @Morten, do you think does it make sense to have
> 'rte_mempool_info_get()' kind of function, that provides at least address and
> length of the mempool, and used here?
> 
> This helps to hide internal details and complexity of the mempool for users.
> 
> 
> >
> > @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> >  	void *base_addr = NULL;
> >  	struct rte_mempool *mb_pool = rxq->mb_pool;
> >  	uint64_t umem_size, align = 0;
> > +	size_t len = 0;
> >
> >  	if (internals->shared_umem) {
> >  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
> > @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> >  		}
> >
> >  		umem->mb_pool = mb_pool;
> > -		base_addr = (void *)get_base_addr(mb_pool, &align);
> > -		umem_size = (uint64_t)mb_pool->populated_size *
> > -				(uint64_t)usr_config.frame_size +
> > -				align;
> > +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
> >
> 
> Is this calculation correct if mempool is not already aligned to page size?
> 
> Like in an example page size is '0x1000', and "memhdr_addr = 0x000a1080"
> returned aligned address is '0x000a1000', "base_addr = 0x000a1000"
> 
> Any access between '0x000a1000' & '0x000a1080' is invalid. Is this expected?

Yes, since the XSK UMEM memory area requires page alignment. However, no need to worry; the memory pointer in the XSK TX/RX descriptor is obtained from the mbuf data area, so we never have a chance to access the invalid range [0x000a1000: 0x000a1080] here.
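A simplified sketch of how the zero-copy path derives descriptor addresses
(condensed and approximate, not the exact driver code):

    /* umem->buffer is the page-aligned base registered with the kernel;
     * descriptor addresses are offsets of mbuf data into that area. */
    uint64_t addr = (uint64_t)((uintptr_t)rte_pktmbuf_mtod(mbuf, void *) -
                    (uintptr_t)umem->buffer);
    /* addr always lands inside an mbuf data area, so the sub-page gap
     * below the first object is never referenced. */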

> 
> 
> > +		if (!base_addr) {
> > +			AF_XDP_LOG(ERR, "Failed to parse memhdr info from pool\n");
> >
> 
> Log message is not accurate, it is not parsing memhdr info failed, but mempool
> was not satisfying expectation.

Thanks, will correct it in next version.

> 
> > +			goto err;
> > +		}
> > +		umem_size = (uint64_t)len + align;
> >
> >  		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
> >  				&rxq->fq, &rxq->cq, &usr_config);
  
Morten Brørup May 22, 2024, 7:26 a.m. UTC | #6
> From: Du, Frank [mailto:frank.du@intel.com]
> Sent: Wednesday, 22 May 2024 03.25
> 
> > From: Ferruh Yigit <ferruh.yigit@amd.com>
> > Sent: Wednesday, May 22, 2024 1:58 AM
> >
> > On 5/11/2024 6:26 AM, Frank Du wrote:
> > > The current calculation assumes that the mbufs are contiguous.
> > > However, this assumption is incorrect when the memory spans across a huge
> > page.
> > > Correct to directly read the size from the mempool memory chunks.
> > >
> > > Signed-off-by: Frank Du <frank.du@intel.com>
> > >
> > > ---
> > > v2:
> > > * Add virtual contiguous detection for multiple memhdrs.
> > > ---
> > >  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
> > >  1 file changed, 28 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > index 268a130c49..7456108d6d 100644
> > > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > > @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
> > >  }
> > >
> > >  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> > > -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
> > > +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
> > >  {
> > > -	struct rte_mempool_memhdr *memhdr;
> > > +	struct rte_mempool_memhdr *memhdr, *next;
> > >  	uintptr_t memhdr_addr, aligned_addr;
> > > +	size_t memhdr_len = 0;
> > >
> > > +	/* get the mempool base addr and align */
> > >  	memhdr = STAILQ_FIRST(&mp->mem_list);
> > >  	memhdr_addr = (uintptr_t)memhdr->addr;

This is not a new bug; but if the mempool is not populated, memhdr is NULL here.
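A guard for that case could look like this (a sketch, not part of the
patch):

    memhdr = STAILQ_FIRST(&mp->mem_list);
    if (memhdr == NULL)
            return 0;       /* pool is not populated yet */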

> > >  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
> > >  	*align = memhdr_addr - aligned_addr;
> > >
> >
> > I am aware this is not part of this patch, but as note, can't we use
> > 'RTE_ALIGN_FLOOR' to calculate aligned address.
> 
> Sure, will use RTE_ALIGN_FLOOR in next version.
> 
> >
> >
> > > +	memhdr_len += memhdr->len;
> > > +
> > > +	/* check if virtual contiguous memory for multiple memhdrs */
> > > +	next = STAILQ_NEXT(memhdr, next);
> > > +	while (next != NULL) {
> > > +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
> > > +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
> > > +					"next: %p, cur: %p(len: %" PRId64 " )\n",
> > > +					next->addr, memhdr->addr, memhdr->len);
> > > +			return 0;
> > > +		}
> > >
> >
> > Isn't there a mempool flag that can help us figure out mempool is not IOVA
> > contiguous? Isn't it sufficient on its own?
> 
> Indeed, what we need to ascertain is whether it's contiguous in CPU virtual
> space, not IOVA. I haven't come across a flag specifically for CPU virtual
> contiguity. The major limitation in XDP is XSK UMEM only supports registering
> a single contiguous virtual memory area.

I would assume that the EAL memory manager merges free memory into contiguous chunks whenever possible.
@Anatoly, please confirm?

If my assumption is correct, it means that if mp->nb_mem_chunks != 1, then the mempool is not virtual contiguous. And if mp->nb_mem_chunks == 1, then it is; there is no need to iterate through the memhdr list.
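If that holds, the whole walk would collapse to something like this (a
sketch relying on the unconfirmed assumption above):

    if (mp->nb_mem_chunks != 1) {
            AF_XDP_LOG(ERR, "mempool is not one virtually contiguous chunk\n");
            return 0;
    }
    memhdr = STAILQ_FIRST(&mp->mem_list);
    *len = memhdr->len;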

> 
> >
> >
> > > +		/* virtual contiguous */
> > > +		memhdr = next;
> > > +		memhdr_len += memhdr->len;
> > > +		next = STAILQ_NEXT(memhdr, next);
> > > +	}
> > >
> > > +	*len = memhdr_len;
> > >  	return aligned_addr;
> > >  }
> > >
> >
> > This function goes too much details of the mempool object, and any change in
> > mempool details has potential to break this code.
> >
> > @Andrew, @Morten, do you think does it make sense to have
> > 'rte_mempool_info_get()' kind of function, that provides at least address and
> > length of the mempool, and used here?
> >
> > This helps to hide internal details and complexity of the mempool for users.

I think all the relevant information is available as (public) fields directly in the rte_mempool.
I agree about hiding internal details. For discriminating between private and public information, I would prefer marking the "private" fields in the rte_mempool structure as such.

Optimally we need an rte_mempool_create() flag, specifying that the mempool objects must be allocated as one chunk of memory with contiguous virtual addresses when populating the mempool. As discussed in another thread [1], the proposed pointer compression library would also benefit from such a mempool flag.

[1] https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9F45C@smartserver.smartshare.dk/
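Hypothetical usage if such a flag existed (the flag name and value are made
up for illustration; nothing like it exists in DPDK today):

    #define RTE_MEMPOOL_F_VIRT_CONTIG 0x0100        /* hypothetical */

    mp = rte_mempool_create("xsk_pool", n, elt_size, cache_size, 0,
                    NULL, NULL, NULL, NULL, SOCKET_ID_ANY,
                    RTE_MEMPOOL_F_VIRT_CONTIG);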

> >
> >
> > >
> > > @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> > >  	void *base_addr = NULL;
> > >  	struct rte_mempool *mb_pool = rxq->mb_pool;
> > >  	uint64_t umem_size, align = 0;
> > > +	size_t len = 0;
> > >
> > >  	if (internals->shared_umem) {
> > >  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
> > > @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
> > >  		}
> > >
> > >  		umem->mb_pool = mb_pool;
> > > -		base_addr = (void *)get_base_addr(mb_pool, &align);
> > > -		umem_size = (uint64_t)mb_pool->populated_size *
> > > -				(uint64_t)usr_config.frame_size +
> > > -				align;
> > > +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
> > >
> >
> > Is this calculation correct if mempool is not already aligned to page size?

Please note: The mempool uses one memzone for the mempool structure itself. The objects in the mempool are stored in another memzone (or multiple other memzones, if necessary). I think you are talking about the alignment of the mempool object chunk, not of the mempool structure itself.

> >
> > Like in an example page size is '0x1000', and "memhdr_addr = 0x000a1080"
> > returned aligned address is '0x000a1000', "base_addr = 0x000a1000"
> >
> > Any access between '0x000a1000' & '0x000a1080' is invalid. Is this expected?
> 
> Yes, since the XSK UMEM memory area requires page alignment. However, no need
> to worry; the memory pointer in the XSK TX/RX descriptor is obtained from the
> mbuf data area. We don’t have any chance to access the invalid range
> [0x000a1000: 0x000a1080] here.
> 
> >
> >
> > > +		if (!base_addr) {
> > > +			AF_XDP_LOG(ERR, "Failed to parse memhdr info from pool\n");
> > >
> >
> > Log message is not accurate, it is not parsing memhdr info failed, but
> mempool
> > was not satisfying expectation.

Looking at get_memhdr_info() above, it could be either the mempool or a memhdr failing the check.

> 
> Thanks, will correct it in next version.
> 
> >
> > > +			goto err;
> > > +		}
> > > +		umem_size = (uint64_t)len + align;
> > >
> > >  		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
> > >  				&rxq->fq, &rxq->cq, &usr_config);
  
Ferruh Yigit May 22, 2024, 10 a.m. UTC | #7
On 5/22/2024 2:25 AM, Du, Frank wrote:
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>> Sent: Wednesday, May 22, 2024 1:58 AM
>> To: Du, Frank <frank.du@intel.com>; dev@dpdk.org; Andrew Rybchenko
>> <andrew.rybchenko@oktetlabs.ru>; Morten Brørup
>> <mb@smartsharesystems.com>
>> Cc: Loftus, Ciara <ciara.loftus@intel.com>; Burakov, Anatoly
>> <anatoly.burakov@intel.com>
>> Subject: Re: [PATCH v2] net/af_xdp: fix umem map size for zero copy
>>
>> On 5/11/2024 6:26 AM, Frank Du wrote:
>>> The current calculation assumes that the mbufs are contiguous.
>>> However, this assumption is incorrect when the memory spans across a huge
>> page.
>>> Correct to directly read the size from the mempool memory chunks.
>>>
>>> Signed-off-by: Frank Du <frank.du@intel.com>
>>>
>>> ---
>>> v2:
>>> * Add virtual contiguous detection for multiple memhdrs.
>>> ---
>>>  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
>>>  1 file changed, 28 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
>>> index 268a130c49..7456108d6d 100644
>>> --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
>>> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
>>> @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
>>>  }
>>>
>>>  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
>>> -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
>>> +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
>>>  {
>>> -	struct rte_mempool_memhdr *memhdr;
>>> +	struct rte_mempool_memhdr *memhdr, *next;
>>>  	uintptr_t memhdr_addr, aligned_addr;
>>> +	size_t memhdr_len = 0;
>>>
>>> +	/* get the mempool base addr and align */
>>>  	memhdr = STAILQ_FIRST(&mp->mem_list);
>>>  	memhdr_addr = (uintptr_t)memhdr->addr;
>>>  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
>>>  	*align = memhdr_addr - aligned_addr;
>>>
>>
>> I am aware this is not part of this patch, but as note, can't we use
>> 'RTE_ALIGN_FLOOR' to calculate aligned address.
> 
> Sure, will use RTE_ALIGN_FLOOR in next version.
> 
>>
>>
>>> +	memhdr_len += memhdr->len;
>>> +
>>> +	/* check if virtual contiguous memory for multiple memhdrs */
>>> +	next = STAILQ_NEXT(memhdr, next);
>>> +	while (next != NULL) {
>>> +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
>>> +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
>>> +					"next: %p, cur: %p(len: %" PRId64 " )\n",
>>> +					next->addr, memhdr->addr, memhdr->len);
>>> +			return 0;
>>> +		}
>>>
>>
>> Isn't there a mempool flag that can help us figure out mempool is not IOVA
>> contiguous? Isn't it sufficient on its own?
> 
> Indeed, what we need to ascertain is whether it's contiguous in CPU virtual space, not IOVA. I haven't come across a flag specifically for CPU virtual contiguity. The major limitation in XDP is XSK UMEM only supports registering a single contiguous virtual memory area.
> 

'RTE_MEMPOOL_F_NO_IOVA_CONTIG' is the flag I was looking for. This flag
being *cleared* implies IOVA contiguous, but I am not sure if that is
guaranteed; need to check.

And I may be wrong here, but as far as I remember, in IOVA-as-VA mode the
process virtual address and the IOVA address are the same, so IOVA
contiguous is the same as contiguous CPU virtual address.
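The mode can be queried at runtime, so a shortcut could look like this (a
sketch, assuming the reasoning above is confirmed):

    /* In RTE_IOVA_VA mode, IOVA == process VA for DPDK memory, so an
     * IOVA-contiguous pool is also virtually contiguous. */
    if (rte_eal_iova_mode() == RTE_IOVA_VA) {
            /* the per-memhdr VA-contiguity walk could be skipped */
    }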

>>
>>
>>> +		/* virtual contiguous */
>>> +		memhdr = next;
>>> +		memhdr_len += memhdr->len;
>>> +		next = STAILQ_NEXT(memhdr, next);
>>> +	}
>>>
>>> +	*len = memhdr_len;
>>>  	return aligned_addr;
>>>  }
>>>
>>
>> This function goes too much details of the mempool object, and any change in
>> mempool details has potential to break this code.
>>
>> @Andrew, @Morten, do you think does it make sense to have
>> 'rte_mempool_info_get()' kind of function, that provides at least address and
>> length of the mempool, and used here?
>>
>> This helps to hide internal details and complexity of the mempool for users.
>>
>>
>>>
>>> @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>>>  	void *base_addr = NULL;
>>>  	struct rte_mempool *mb_pool = rxq->mb_pool;
>>>  	uint64_t umem_size, align = 0;
>>> +	size_t len = 0;
>>>
>>>  	if (internals->shared_umem) {
>>>  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
>>> @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>>>  		}
>>>
>>>  		umem->mb_pool = mb_pool;
>>> -		base_addr = (void *)get_base_addr(mb_pool, &align);
>>> -		umem_size = (uint64_t)mb_pool->populated_size *
>>> -				(uint64_t)usr_config.frame_size +
>>> -				align;
>>> +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
>>>
>>
>> Is this calculation correct if mempool is not already aligned to page size?
>>
>> Like in an example page size is '0x1000', and "memhdr_addr = 0x000a1080"
>> returned aligned address is '0x000a1000', "base_addr = 0x000a1000"
>>
>> Any access between '0x000a1000' & '0x000a1080' is invalid. Is this expected?
> 
> Yes, since the XSK UMEM memory area requires page alignment. However, no need to worry; the memory pointer in the XSK TX/RX descriptor is obtained from the mbuf data area. We don’t have any chance to access the invalid range [0x000a1000: 0x000a1080] here.
> 

Thanks for clarification.
  
Ferruh Yigit May 22, 2024, 10:20 a.m. UTC | #8
On 5/22/2024 8:26 AM, Morten Brørup wrote:
>> From: Du, Frank [mailto:frank.du@intel.com]
>> Sent: Wednesday, 22 May 2024 03.25
>>
>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>>> Sent: Wednesday, May 22, 2024 1:58 AM
>>>
>>> On 5/11/2024 6:26 AM, Frank Du wrote:
>>>> The current calculation assumes that the mbufs are contiguous.
>>>> However, this assumption is incorrect when the memory spans across a huge
>>> page.
>>>> Correct to directly read the size from the mempool memory chunks.
>>>>
>>>> Signed-off-by: Frank Du <frank.du@intel.com>
>>>>
>>>> ---
>>>> v2:
>>>> * Add virtual contiguous detection for multiple memhdrs.
>>>> ---
>>>>  drivers/net/af_xdp/rte_eth_af_xdp.c | 34 ++++++++++++++++++++++++-----
>>>>  1 file changed, 28 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
>>>> index 268a130c49..7456108d6d 100644
>>>> --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
>>>> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
>>>> @@ -1039,16 +1039,35 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
>>>>  }
>>>>
>>>>  #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
>>>> -static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
>>>> +static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
>>>>  {
>>>> -	struct rte_mempool_memhdr *memhdr;
>>>> +	struct rte_mempool_memhdr *memhdr, *next;
>>>>  	uintptr_t memhdr_addr, aligned_addr;
>>>> +	size_t memhdr_len = 0;
>>>>
>>>> +	/* get the mempool base addr and align */
>>>>  	memhdr = STAILQ_FIRST(&mp->mem_list);
>>>>  	memhdr_addr = (uintptr_t)memhdr->addr;
> 
> This is not a new bug; but if the mempool is not populated, memhdr is NULL here.
> 
>>>>  	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
>>>>  	*align = memhdr_addr - aligned_addr;
>>>>
>>>
>>> I am aware this is not part of this patch, but as note, can't we use
>>> 'RTE_ALIGN_FLOOR' to calculate aligned address.
>>
>> Sure, will use RTE_ALIGN_FLOOR in next version.
>>
>>>
>>>
>>>> +	memhdr_len += memhdr->len;
>>>> +
>>>> +	/* check if virtual contiguous memory for multiple memhdrs */
>>>> +	next = STAILQ_NEXT(memhdr, next);
>>>> +	while (next != NULL) {
>>>> +		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
>>>> +			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
>>>> +					"next: %p, cur: %p(len: %" PRId64 " )\n",
>>>> +					next->addr, memhdr->addr, memhdr->len);
>>>> +			return 0;
>>>> +		}
>>>>
>>>
>>> Isn't there a mempool flag that can help us figure out mempool is not IOVA
>>> contiguous? Isn't it sufficient on its own?
>>
>> Indeed, what we need to ascertain is whether it's contiguous in CPU virtual
>> space, not IOVA. I haven't come across a flag specifically for CPU virtual
>> contiguity. The major limitation in XDP is XSK UMEM only supports registering
>> a single contiguous virtual memory area.
> 
> I would assume that the EAL memory manager merges free memory into contiguous chunks whenever possible.
> @Anatoly, please confirm?
> 
> If my assumption is correct, it means that if mp->nb_mem_chunks != 1, then the mempool is not virtual contiguous. And if mp->nb_mem_chunks == 1, then it is; there is no need to iterate through the memhdr list.
> 
>>
>>>
>>>
>>>> +		/* virtual contiguous */
>>>> +		memhdr = next;
>>>> +		memhdr_len += memhdr->len;
>>>> +		next = STAILQ_NEXT(memhdr, next);
>>>> +	}
>>>>
>>>> +	*len = memhdr_len;
>>>>  	return aligned_addr;
>>>>  }
>>>>
>>>
>>> This function goes too much details of the mempool object, and any change in
>>> mempool details has potential to break this code.
>>>
>>> @Andrew, @Morten, do you think does it make sense to have
>>> 'rte_mempool_info_get()' kind of function, that provides at least address
>> and
>>> length of the mempool, and used here?
>>>
>>> This helps to hide internal details and complexity of the mempool for users.
> 
> I think all the relevant information is available as (public) fields directly in the rte_mempool.
> I agree about hiding internal details. For discriminating between private and public information, I would prefer marking the "private" fields in the rte_mempool structure as such.
> 

Marking fields 'private' may break some user code, so I don't know about
this, although I agree it would be the better thing to do programmatically.

But even if the required fields are public, having an _info() API may help
abstract the implementation details and keep them within the library
(instead of leaking into drivers like this).
But I am not sure if there are other potential users of such an API.

> Optimally we need an rte_mempool_create() flag, specifying that the mempool objects must be allocated as one chunk of memory with contiguous virtual addresses when populating the mempool. As discussed in another thread [1], the proposed pointer compression library would also benefit from such a mempool flag.
> 

Ack, this also can be useful for this usecase.

> [1] https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9F45C@smartserver.smartshare.dk/
> 
>>>
>>>
>>>>
>>>> @@ -1125,6 +1144,7 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>>>>  	void *base_addr = NULL;
>>>>  	struct rte_mempool *mb_pool = rxq->mb_pool;
>>>>  	uint64_t umem_size, align = 0;
>>>> +	size_t len = 0;
>>>>
>>>>  	if (internals->shared_umem) {
>>>>  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0) @@
>>>>  		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
>>>> @@ -1156,10 +1176,12 @@ xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
>>>>
>>>>  		umem->mb_pool = mb_pool;
>>>> -		base_addr = (void *)get_base_addr(mb_pool, &align);
>>>> -		umem_size = (uint64_t)mb_pool->populated_size *
>>>> -				(uint64_t)usr_config.frame_size +
>>>> -				align;
>>>> +		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
>>>>
>>>
>>> Is this calculation correct if mempool is not already aligned to page size?
> 
> Please note: The mempool uses one memzone for the mempool structure itself. The objects in the mempool are stored in another memzone (or multiple other memzones, if necessary). I think you are talking about the alignment of the mempool object chunk, not of the mempool structure itself.
> 

Yes, about the mempool object chunk.
  
Morten Brørup May 22, 2024, 11:03 a.m. UTC | #9
> From: Ferruh Yigit [mailto:ferruh.yigit@amd.com]
> Sent: Wednesday, 22 May 2024 12.01
> 
> On 5/22/2024 2:25 AM, Du, Frank wrote:
> >> From: Ferruh Yigit <ferruh.yigit@amd.com>
> >> Sent: Wednesday, May 22, 2024 1:58 AM
> >>
> >> Isn't there a mempool flag that can help us figure out mempool is not IOVA
> >> contiguous? Isn't it sufficient on its own?
> >
> > Indeed, what we need to ascertain is whether it's contiguous in CPU virtual
> space, not IOVA. I haven't come across a flag specifically for CPU virtual
> contiguity. The major limitation in XDP is XSK UMEM only supports registering
> a single contiguous virtual memory area.
> >
> 
> 'RTE_MEMPOOL_F_NO_IOVA_CONTIG' is the flag I was looking for. This flag
> being *cleared* implies IOVA contiguous but not sure if it is
> guaranteed, need to check.

Wrong.
RTE_MEMPOOL_F_NO_IOVA_CONTIG only relates to individual objects in the pool. This flag being cleared only means that each individual object in the mempool is IOVA contiguous.
It does not imply that the entire pool of objects is IOVA contiguous.
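In code terms (the flag and the flags field are real DPDK definitions; the
interpretation follows the explanation above):

    bool per_obj_contig = !(mp->flags & RTE_MEMPOOL_F_NO_IOVA_CONTIG);
    /* true:  each object is IOVA-contiguous within itself
     * false: an object may straddle an IOVA page boundary
     * Neither case says anything about the pool being one IOVA range. */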

> 
> And I may be wrong here, but as far as I remember in IOVA as VA mode,
> process virtual address and IOVA address are same, so IOVA contiguous is
> same as contiguous CPU virtual address.
  
Ferruh Yigit May 22, 2024, 2:05 p.m. UTC | #10
On 5/22/2024 12:03 PM, Morten Brørup wrote:
>> From: Ferruh Yigit [mailto:ferruh.yigit@amd.com]
>> Sent: Wednesday, 22 May 2024 12.01
>>
>> On 5/22/2024 2:25 AM, Du, Frank wrote:
>>>> From: Ferruh Yigit <ferruh.yigit@amd.com>
>>>> Sent: Wednesday, May 22, 2024 1:58 AM
>>>>
>>>> Isn't there a mempool flag that can help us figure out mempool is not IOVA
>>>> contiguous? Isn't it sufficient on its own?
>>>
>>> Indeed, what we need to ascertain is whether it's contiguous in CPU virtual
>> space, not IOVA. I haven't come across a flag specifically for CPU virtual
>> contiguity. The major limitation in XDP is XSK UMEM only supports registering
>> a single contiguous virtual memory area.
>>>
>>
>> 'RTE_MEMPOOL_F_NO_IOVA_CONTIG' is the flag I was looking for. This flag
>> being *cleared* implies IOVA contiguous but not sure if it is
>> guaranteed, need to check.
> 
> Wrong.
> RTE_MEMPOOL_F_NO_IOVA_CONTIG only relates to individual objects in the pool. This flag being cleared only means that each individual object in the mempool is IOVA contiguous.
> It does not imply that the entire pool of objects is IOVA contiguous.
> 

Ah, thanks for clarification.
  

Patch

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
index 268a130c49..7456108d6d 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -1039,16 +1039,35 @@  eth_link_update(struct rte_eth_dev *dev __rte_unused,
 }
 
 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
-static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
+static inline uintptr_t get_memhdr_info(struct rte_mempool *mp, uint64_t *align, size_t *len)
 {
-	struct rte_mempool_memhdr *memhdr;
+	struct rte_mempool_memhdr *memhdr, *next;
 	uintptr_t memhdr_addr, aligned_addr;
+	size_t memhdr_len = 0;
 
+	/* get the mempool base addr and align */
 	memhdr = STAILQ_FIRST(&mp->mem_list);
 	memhdr_addr = (uintptr_t)memhdr->addr;
 	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
 	*align = memhdr_addr - aligned_addr;
+	memhdr_len += memhdr->len;
+
+	/* check if virtual contiguous memory for multiple memhdrs */
+	next = STAILQ_NEXT(memhdr, next);
+	while (next != NULL) {
+		if ((uintptr_t)next->addr != (uintptr_t)memhdr->addr + memhdr->len) {
+			AF_XDP_LOG(ERR, "memory chunks not virtual contiguous, "
+					"next: %p, cur: %p(len: %" PRId64 " )\n",
+					next->addr, memhdr->addr, memhdr->len);
+			return 0;
+		}
+		/* virtual contiguous */
+		memhdr = next;
+		memhdr_len += memhdr->len;
+		next = STAILQ_NEXT(memhdr, next);
+	}
 
+	*len = memhdr_len;
 	return aligned_addr;
 }
 
@@ -1125,6 +1144,7 @@  xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
 	void *base_addr = NULL;
 	struct rte_mempool *mb_pool = rxq->mb_pool;
 	uint64_t umem_size, align = 0;
+	size_t len = 0;
 
 	if (internals->shared_umem) {
 		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
@@ -1156,10 +1176,12 @@  xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
 		}
 
 		umem->mb_pool = mb_pool;
-		base_addr = (void *)get_base_addr(mb_pool, &align);
-		umem_size = (uint64_t)mb_pool->populated_size *
-				(uint64_t)usr_config.frame_size +
-				align;
+		base_addr = (void *)get_memhdr_info(mb_pool, &align, &len);
+		if (!base_addr) {
+			AF_XDP_LOG(ERR, "Failed to parse memhdr info from pool\n");
+			goto err;
+		}
+		umem_size = (uint64_t)len + align;
 
 		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
 				&rxq->fq, &rxq->cq, &usr_config);