[v2,5/6] mempool: prevent objects from being across pages

Message ID 20191030143619.4007-6-olivier.matz@6wind.com (mailing list archive)
State Superseded, archived
Series: mempool: avoid objects allocations across pages

Checks

Context Check Description
ci/Intel-compilation fail Compilation issues
ci/checkpatch success coding style OK

Commit Message

Olivier Matz Oct. 30, 2019, 2:36 p.m. UTC
  When populating a mempool, ensure that objects are not located across
several pages, except when the user did not request IOVA-contiguous objects.

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 drivers/mempool/octeontx2/Makefile           |   3 +
 drivers/mempool/octeontx2/meson.build        |   3 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
 lib/librte_mempool/rte_mempool.c             |  23 ++--
 lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
 5 files changed, 147 insertions(+), 33 deletions(-)
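
For readers of the archive, here is a standalone sketch of the page-boundary
rule the patch enforces. It mirrors the check_obj_bounds() test from the diff
(quoted in the comments below) without the RTE_PTR_ALIGN macros; the page
size, element size and the helper name crosses_page() are hypothetical,
chosen only for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same test as the patch's check_obj_bounds(): an object crosses a page
 * iff its first and last bytes fall in different pages. pg_sz == 0 means
 * "don't care", and objects larger than a page are allowed to cross. */
static bool crosses_page(uintptr_t obj, size_t pg_sz, size_t elt_sz)
{
	if (pg_sz == 0 || elt_sz > pg_sz)
		return false;
	return (obj / pg_sz) != ((obj + elt_sz - 1) / pg_sz);
}

int main(void)
{
	size_t pg_sz = 2UL << 20;      /* 2M page (hypothetical) */
	size_t elt_sz = 2304;          /* header + elt + trailer (hypothetical) */
	uintptr_t obj = pg_sz - 1000;  /* starts in the last 1000 bytes of page 0 */

	if (crosses_page(obj, pg_sz, elt_sz)) {
		/* the populate loop then advances to the next page start */
		uintptr_t next = (obj + pg_sz - 1) & ~((uintptr_t)pg_sz - 1);
		printf("object at %#lx crosses a page, next candidate %#lx\n",
		       (unsigned long)obj, (unsigned long)next);
	}
	return 0;
}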
  

Comments

Vamsi Krishna Attunuru Oct. 31, 2019, 6:54 a.m. UTC | #1
Hi Olivier,

Thanks for reworked patches.
With v2, tests with 512MB and 2M page sizes work fine with the octeontx2 mempool PMD. One more concern: the octeontx fpa mempool driver also has similar requirements. How do we address that? Can you suggest the best way to avoid code duplication in PMDs?

Regards
A Vamsi 

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Wednesday, October 30, 2019 8:06 PM
> To: dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> pages
> 
> External Email
> 
> ----------------------------------------------------------------------
> When populating a mempool, ensure that objects are not located across several
> pages, except if user did not request iova contiguous objects.
> 
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  drivers/mempool/octeontx2/Makefile           |   3 +
>  drivers/mempool/octeontx2/meson.build        |   3 +
>  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
>  lib/librte_mempool/rte_mempool.c             |  23 ++--
>  lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
>  5 files changed, 147 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/mempool/octeontx2/Makefile b/drivers/mempool/octeontx2/Makefile
> index 87cce22c6..d781cbfc6 100644
> --- a/drivers/mempool/octeontx2/Makefile
> +++ b/drivers/mempool/octeontx2/Makefile
> @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> 
>  LIBABIVER := 1
> 
> +# for rte_mempool_get_page_size
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +
>  #
>  # all source are stored in SRCS-y
>  #
> diff --git a/drivers/mempool/octeontx2/meson.build b/drivers/mempool/octeontx2/meson.build
> index 9fde40f0e..28f9634da 100644
> --- a/drivers/mempool/octeontx2/meson.build
> +++ b/drivers/mempool/octeontx2/meson.build
> @@ -21,3 +21,6 @@ foreach flag: extra_flags
>  endforeach
> 
>  deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> +
> +# for rte_mempool_get_page_size
> +allow_experimental_apis = true
> diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> index d769575f4..47117aec6 100644
> --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> @@ -713,12 +713,76 @@ static ssize_t
>  otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
>  		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> -	/*
> -	 * Simply need space for one more object to be able to
> -	 * fulfill alignment requirements.
> -	 */
> -	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1, pg_shift,
> -						    min_chunk_size, align);
> +	size_t total_elt_sz;
> +	size_t obj_per_page, pg_sz, objs_in_last_page;
> +	size_t mem_size;
> +
> +	/* derived from rte_mempool_op_calc_mem_size_default() */
> +
> +	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> +
> +	if (total_elt_sz == 0) {
> +		mem_size = 0;
> +	} else if (pg_shift == 0) {
> +		/* one object margin to fix alignment */
> +		mem_size = total_elt_sz * (obj_num + 1);
> +	} else {
> +		pg_sz = (size_t)1 << pg_shift;
> +		obj_per_page = pg_sz / total_elt_sz;
> +
> +		/* we need to keep one object to fix alignment */
> +		if (obj_per_page > 0)
> +			obj_per_page--;
> +
> +		if (obj_per_page == 0) {
> +			/*
> +			 * Note that if object size is bigger than page size,
> +			 * then it is assumed that pages are grouped in subsets
> +			 * of physically continuous pages big enough to store
> +			 * at least one object.
> +			 */
> +			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> +						pg_sz) * obj_num;
> +		} else {
> +			/* In the best case, the allocator will return a
> +			 * page-aligned address. For example, with 5 objs,
> +			 * the required space is as below:
> +			 *  |     page0     |     page1     |  page2 (last) |
> +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> +			 *  <------------- mem_size ------------->
> +			 */
> +			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
> +			/* room required for the last page */
> +			mem_size = objs_in_last_page * total_elt_sz;
> +			/* room required for other pages */
> +			mem_size += ((obj_num - objs_in_last_page) /
> +				obj_per_page) << pg_shift;
> +
> +			/* In the worst case, the allocator returns a
> +			 * non-aligned pointer, wasting up to
> +			 * total_elt_sz. Add a margin for that.
> +			 */
> +			 mem_size += total_elt_sz - 1;
> +		}
> +	}
> +
> +	*min_chunk_size = total_elt_sz * 2;
> +	*align = RTE_CACHE_LINE_SIZE;
> +
> +	return mem_size;
> +}
> +
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> +		return -1;
> +	return 0;
>  }
> 
>  static int
> @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
>  		  rte_iova_t iova, size_t len,
>  		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>  	size_t off;
> +	unsigned int i;
> +	void *obj;
> +	int ret;
> 
>  	if (iova == RTE_BAD_IOVA)
>  		return -EINVAL;
> @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
>  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> 
>  	/* Align object start address to a multiple of total_elt_sz */
> -	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> +	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> 
>  	if (len < off)
>  		return -EINVAL;
> 
> -	vaddr = (char *)vaddr + off;
> -	iova += off;
> -	len -= off;
> 
> -	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> +	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> 
>  	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
>  		return -EBUSY;
> 
> -	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
> -					       obj_cb, obj_cb_arg);
> +	/* the following is derived from rte_mempool_op_populate_default() */
> +
> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> +	if (ret < 0)
> +		return ret;
> +
> +	for (i = 0; i < max_objs; i++) {
> +		/* avoid objects to cross page boundaries, and align
> +		 * offset to a multiple of total_elt_sz.
> +		 */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> +			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> +						total_elt_sz) + 1);
> +		}
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
> +		off += mp->header_size;
> +		obj = va + off;
> +		obj_cb(mp, obj_cb_arg, obj,
> +		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> +		off += mp->elt_size + mp->trailer_size;
> +	}
> +
> +	return i;
>  }
> 
>  static struct rte_mempool_ops otx2_npa_ops = {
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 758c5410b..d3db9273d 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
> 
>  	if (!need_iova_contig_obj)
>  		*pg_sz = 0;
> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> -		*pg_sz = 0;
>  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>  		*pg_sz = get_min_page_size(mp->socket_id);
>  	else
> @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  	 * then just set page shift and page size to 0, because the user has
>  	 * indicated that there's no need to care about anything.
>  	 *
> -	 * if we do need contiguous objects, there is also an option to reserve
> -	 * the entire mempool memory as one contiguous block of memory, in
> -	 * which case the page shift and alignment wouldn't matter as well.
> +	 * if we do need contiguous objects (if a mempool driver has its
> +	 * own calc_size() method returning min_chunk_size = mem_size),
> +	 * there is also an option to reserve the entire mempool memory
> +	 * as one contiguous block of memory.
>  	 *
>  	 * if we require contiguous objects, but not necessarily the entire
> -	 * mempool reserved space to be contiguous, then there are two options.
> -	 *
> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> -	 * case), then no page shift needed - our memory allocation will give us
> -	 * contiguous IO memory as far as the hardware is concerned, so
> -	 * act as if we're getting contiguous memory.
> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> +	 * and the default ops->populate() will take care of not placing
> +	 * objects across pages.
>  	 *
>  	 * if our IO addresses are physical, we may get memory from bigger
> 	 * pages, or we might get memory from smaller pages, and how much of it
> @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  	 *
>  	 * If we fail to get enough contiguous memory, then we'll go and
>  	 * reserve space in smaller chunks.
> -	 *
> -	 * We also have to take into account the fact that memory that we're
> -	 * going to allocate from can belong to an externally allocated memory
> -	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold.
>  	 */
> 
>  	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index f6aea7662..e5cd4600f 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>  	return mem_size;
>  }
> 
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> +		return -1;
> +	return 0;
> +}
> +
>  int
>  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
>  		void *vaddr, rte_iova_t iova, size_t len,
>  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)  {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>  	size_t off;
>  	unsigned int i;
>  	void *obj;
> +	int ret;
> +
> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> +	if (ret < 0)
> +		return ret;
> 
>  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> 
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* avoid objects to cross page boundaries */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>  		off += mp->header_size;
> -		obj = (char *)vaddr + off;
> +		obj = va + off;
>  		obj_cb(mp, obj_cb_arg, obj,
>  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> --
> 2.20.1
  
Jerin Jacob Oct. 31, 2019, 8:19 a.m. UTC | #2
On Thu, Oct 31, 2019 at 12:25 PM Vamsi Krishna Attunuru
<vattunuru@marvell.com> wrote:
>
> Hi Olivier,
>
> Thanks for reworked patches.
> With V2, Tests with 512MB & 2M page sizes work fine with octeontx2 mempool pmd. One more concern is, octeontx fpa mempool driver also has the similar requirements. How do we address that, Can you suggest the best way to avoid code duplication in PMDs.

# Actually, neither driver calls any HW-specific function in those ops.
# Is it possible to move the code under the "/* derived from
rte_mempool_op_calc_mem_size_default() */" comment to a static function
in the mempool lib? That would keep the generic
rte_mempool_op_calc_mem_size_default() clean, and we could introduce
another variant of rte_mempool_op_calc_mem_size_default() for this
specific requirement which includes the static generic function.

I don't think it is a one-off requirement to have the object size
aligned to the object start address in all HW-based mempool
implementations.

The reason the HW uses such a scheme is the following:
# if the object size is aligned to the object block size, then when the
HW wants to enqueue() a packet back to the pool, it can enqueue the mbuf
address to the mempool irrespective of the free pointer offset.

Example:
X - mbuf start address
Y - the object block size
X + n <- the packet pointer used to send the packet

When submitting the packet to the HW for transmission, SW only needs to
provide X + n; when the HW frees it, it can derive X (the mbuf pointer
address) with the following arithmetic:
X = (X + n) - ((X + n) MOD Y)
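
A minimal C sketch of that recovery, with hypothetical values for X, Y and n,
just to make the arithmetic concrete; this is not code from the patch or from
any driver.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t Y = 2176;      /* object block size (hypothetical) */
	uintptr_t X = 7 * Y;     /* mbuf start address, a multiple of Y */
	uintptr_t n = 192;       /* offset to the packet data (hypothetical) */
	uintptr_t ptr = X + n;   /* pointer handed to the HW on transmit */

	/* on free, the HW recovers the mbuf address with no extra metadata */
	uintptr_t recovered = ptr - (ptr % Y);

	printf("X=%#lx recovered=%#lx\n",
	       (unsigned long)X, (unsigned long)recovered);
	return 0;
}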

Hi Olivier,
It is not worth going back and forth on this code organization. You
can decide on a scheme; we will follow that.


  
Olivier Matz Oct. 31, 2019, 8:24 a.m. UTC | #3
Hi,

On Thu, Oct 31, 2019 at 06:54:50AM +0000, Vamsi Krishna Attunuru wrote:
> Hi Olivier,
> 
> Thanks for reworked patches.
> With V2, Tests with 512MB & 2M page sizes work fine with octeontx2
> mempool pmd.

Good to hear.

> One more concern is, octeontx fpa mempool driver also has the similar
> requirements. How do we address that, Can you suggest the best way to
> avoid code duplication in PMDs.

Well, we could provide additional helpers in librte_mempool:
rte_mempool_calc_mem_size_helper() and rte_mempool_populate_helper(),
internal to the mempool lib + drivers. These helpers could take an
additional parameter to enable the alignment on objects.
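
For illustration only, here is a rough sketch of what such helper prototypes
could look like; the parameter lists and the flag name are assumptions made
for this example, not an API actually proposed in the thread.

#include <rte_mempool.h>

/* Hypothetical flag: drivers that need objects aligned on their own
 * (block) size would pass it to the helpers. */
#define MEMPOOL_POPULATE_F_ALIGN_OBJ 0x0001

/* Hypothetical prototypes, internal to the mempool lib and its drivers. */
ssize_t rte_mempool_calc_mem_size_helper(const struct rte_mempool *mp,
					 uint32_t obj_num, uint32_t pg_shift,
					 unsigned int flags,
					 size_t *min_chunk_size,
					 size_t *align);

int rte_mempool_populate_helper(struct rte_mempool *mp, unsigned int flags,
				unsigned int max_objs, void *vaddr,
				rte_iova_t iova, size_t len,
				rte_mempool_populate_obj_cb_t *obj_cb,
				void *obj_cb_arg);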

But: with this approach, we are moving driver specificities back into
the common code, and if tomorrow more drivers with different
requirements also ask to factorize them in common code, we may end up
with common functions that must support hardware specificities, making
them harder to maintain.

I agree that duplicating code is not satisfying either, but I think it
is still better than starting to move HW requirements into common code,
given that mempool drivers were introduced for that reason.

@Andrew, any opinion?



  
Olivier Matz Oct. 31, 2019, 8:29 a.m. UTC | #4
Hi Jerin,

On Thu, Oct 31, 2019 at 01:49:10PM +0530, Jerin Jacob wrote:
> On Thu, Oct 31, 2019 at 12:25 PM Vamsi Krishna Attunuru
> <vattunuru@marvell.com> wrote:
> >
> > Hi Olivier,
> >
> > Thanks for reworked patches.
> > With V2, Tests with 512MB & 2M page sizes work fine with octeontx2 mempool pmd. One more concern is, octeontx fpa mempool driver also has the similar requirements. How do we address that, Can you suggest the best way to avoid code duplication in PMDs.
> 
> # Actually both drivers don't call any HW specific function on those ops
> # Is it possible to move the code under "  /* derived from
> rte_mempool_op_calc_mem_size_default() */"
> to static function in the mempool lib
> ie.
> it will keep the generic rte_mempool_op_calc_mem_size_default() clean
> and we can introduce
> other variant of rte_mempool_op_calc_mem_size_default() for this
> specific requirement  which inclues the static generic function.
> 
> I don't think, it is a one-off requirement to have an object size
> aligned to object start address in all HW based mempool
> implementation.
> 
> The reason for the HW scheme doing such a scheme is the following:
> # if object size aligned to object block size, when HW wants to
> enqueue() a packet back to pool,
> irrespective of the free pointer offset it can enqueue the mbuf
> address to mempool.
> 
> Example:
> X - mbuf start address
> Y - the object block size
> X + n <- is the packet pointer to send the packet
> 
> When submitting the packet o HW to send the packet, SW needs to only
> mention the X + n and
> when HW frees it, it can derive the X(mbuf pointer address) by the
> following arithmetic
> X = (X + n ) - ((X + n) MOD Y)
> 
> Hi Olivier,
> It is not worth going back and forth on this code organization. You
> can decide on a scheme, We will follow that.

Thanks for the explanation.
Our mails crossed each other. Please see my answer to Vamsi.


> 
> 
> >
> > Regards
> > A Vamsi
> >
> > > -----Original Message-----
> > > From: Olivier Matz <olivier.matz@6wind.com>
> > > Sent: Wednesday, October 30, 2019 8:06 PM
> > > To: dev@dpdk.org
> > > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > > Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> > > <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> > > pages
> > >
> > > External Email
> > >
> > > ----------------------------------------------------------------------
> > > When populating a mempool, ensure that objects are not located across several
> > > pages, except if user did not request iova contiguous objects.
> > >
> > > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > ---
> > >  drivers/mempool/octeontx2/Makefile           |   3 +
> > >  drivers/mempool/octeontx2/meson.build        |   3 +
> > >  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
> > >  lib/librte_mempool/rte_mempool.c             |  23 ++--
> > >  lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
> > >  5 files changed, 147 insertions(+), 33 deletions(-)
> > >
> > > diff --git a/drivers/mempool/octeontx2/Makefile
> > > b/drivers/mempool/octeontx2/Makefile
> > > index 87cce22c6..d781cbfc6 100644
> > > --- a/drivers/mempool/octeontx2/Makefile
> > > +++ b/drivers/mempool/octeontx2/Makefile
> > > @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> > >
> > >  LIBABIVER := 1
> > >
> > > +# for rte_mempool_get_page_size
> > > +CFLAGS += -DALLOW_EXPERIMENTAL_API
> > > +
> > >  #
> > >  # all source are stored in SRCS-y
> > >  #
> > > diff --git a/drivers/mempool/octeontx2/meson.build
> > > b/drivers/mempool/octeontx2/meson.build
> > > index 9fde40f0e..28f9634da 100644
> > > --- a/drivers/mempool/octeontx2/meson.build
> > > +++ b/drivers/mempool/octeontx2/meson.build
> > > @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
> > >
> > >  deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> > > +
> > > +# for rte_mempool_get_page_size
> > > +allow_experimental_apis = true
> > > diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > index d769575f4..47117aec6 100644
> > > --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > @@ -713,12 +713,76 @@ static ssize_t
> > >  otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
> > >                      uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> > > -     /*
> > > -      * Simply need space for one more object to be able to
> > > -      * fulfill alignment requirements.
> > > -      */
> > > -     return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
> > > pg_shift,
> > > -                                                 min_chunk_size, align);
> > > +     size_t total_elt_sz;
> > > +     size_t obj_per_page, pg_sz, objs_in_last_page;
> > > +     size_t mem_size;
> > > +
> > > +     /* derived from rte_mempool_op_calc_mem_size_default() */
> > > +
> > > +     total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > +
> > > +     if (total_elt_sz == 0) {
> > > +             mem_size = 0;
> > > +     } else if (pg_shift == 0) {
> > > +             /* one object margin to fix alignment */
> > > +             mem_size = total_elt_sz * (obj_num + 1);
> > > +     } else {
> > > +             pg_sz = (size_t)1 << pg_shift;
> > > +             obj_per_page = pg_sz / total_elt_sz;
> > > +
> > > +             /* we need to keep one object to fix alignment */
> > > +             if (obj_per_page > 0)
> > > +                     obj_per_page--;
> > > +
> > > +             if (obj_per_page == 0) {
> > > +                     /*
> > > +                      * Note that if object size is bigger than page size,
> > > +                      * then it is assumed that pages are grouped in subsets
> > > +                      * of physically continuous pages big enough to store
> > > +                      * at least one object.
> > > +                      */
> > > +                     mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> > > +                                             pg_sz) * obj_num;
> > > +             } else {
> > > +                     /* In the best case, the allocator will return a
> > > +                      * page-aligned address. For example, with 5 objs,
> > > +                      * the required space is as below:
> > > +                      *  |     page0     |     page1     |  page2 (last) |
> > > +                      *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> > > +                      *  <------------- mem_size ------------->
> > > +                      */
> > > +                     objs_in_last_page = ((obj_num - 1) % obj_per_page) +
> > > 1;
> > > +                     /* room required for the last page */
> > > +                     mem_size = objs_in_last_page * total_elt_sz;
> > > +                     /* room required for other pages */
> > > +                     mem_size += ((obj_num - objs_in_last_page) /
> > > +                             obj_per_page) << pg_shift;
> > > +
> > > +                     /* In the worst case, the allocator returns a
> > > +                      * non-aligned pointer, wasting up to
> > > +                      * total_elt_sz. Add a margin for that.
> > > +                      */
> > > +                      mem_size += total_elt_sz - 1;
> > > +             }
> > > +     }
> > > +
> > > +     *min_chunk_size = total_elt_sz * 2;
> > > +     *align = RTE_CACHE_LINE_SIZE;
> > > +
> > > +     return mem_size;
> > > +}
> > > +
> > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > > +     if (pg_sz == 0)
> > > +             return 0;
> > > +     if (elt_sz > pg_sz)
> > > +             return 0;
> > > +     if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > > pg_sz))
> > > +             return -1;
> > > +     return 0;
> > >  }
> > >
> > >  static int
> > > @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
> > > unsigned int max_objs, void *vaddr,
> > >                 rte_iova_t iova, size_t len,
> > >                 rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > {
> > > -     size_t total_elt_sz;
> > > +     char *va = vaddr;
> > > +     size_t total_elt_sz, pg_sz;
> > >       size_t off;
> > > +     unsigned int i;
> > > +     void *obj;
> > > +     int ret;
> > >
> > >       if (iova == RTE_BAD_IOVA)
> > >               return -EINVAL;
> > > @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
> > > unsigned int max_objs, void *vaddr,
> > >       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > >
> > >       /* Align object start address to a multiple of total_elt_sz */
> > > -     off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> > > +     off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> > >
> > >       if (len < off)
> > >               return -EINVAL;
> > >
> > > -     vaddr = (char *)vaddr + off;
> > > -     iova += off;
> > > -     len -= off;
> > >
> > > -     npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> > > +     npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> > >
> > >       if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
> > >               return -EBUSY;
> > >
> > > -     return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
> > > len,
> > > -                                            obj_cb, obj_cb_arg);
> > > +     /* the following is derived from rte_mempool_op_populate_default() */
> > > +
> > > +     ret = rte_mempool_get_page_size(mp, &pg_sz);
> > > +     if (ret < 0)
> > > +             return ret;
> > > +
> > > +     for (i = 0; i < max_objs; i++) {
> > > +             /* avoid objects to cross page boundaries, and align
> > > +              * offset to a multiple of total_elt_sz.
> > > +              */
> > > +             if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> > > +                     off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > > off);
> > > +                     off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> > > +                                             total_elt_sz) + 1);
> > > +             }
> > > +
> > > +             if (off + total_elt_sz > len)
> > > +                     break;
> > > +
> > > +             off += mp->header_size;
> > > +             obj = va + off;
> > > +             obj_cb(mp, obj_cb_arg, obj,
> > > +                    (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > > +             rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > > +             off += mp->elt_size + mp->trailer_size;
> > > +     }
> > > +
> > > +     return i;
> > >  }
> > >
> > >  static struct rte_mempool_ops otx2_npa_ops = { diff --git
> > > a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > > index 758c5410b..d3db9273d 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp,
> > > size_t *pg_sz)
> > >
> > >       if (!need_iova_contig_obj)
> > >               *pg_sz = 0;
> > > -     else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > > -             *pg_sz = 0;
> > >       else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> > >               *pg_sz = get_min_page_size(mp->socket_id);
> > >       else
> > > @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool
> > > *mp)
> > >        * then just set page shift and page size to 0, because the user has
> > >        * indicated that there's no need to care about anything.
> > >        *
> > > -      * if we do need contiguous objects, there is also an option to reserve
> > > -      * the entire mempool memory as one contiguous block of memory, in
> > > -      * which case the page shift and alignment wouldn't matter as well.
> > > +      * if we do need contiguous objects (if a mempool driver has its
> > > +      * own calc_size() method returning min_chunk_size = mem_size),
> > > +      * there is also an option to reserve the entire mempool memory
> > > +      * as one contiguous block of memory.
> > >        *
> > >        * if we require contiguous objects, but not necessarily the entire
> > > -      * mempool reserved space to be contiguous, then there are two
> > > options.
> > > -      *
> > > -      * if our IO addresses are virtual, not actual physical (IOVA as VA
> > > -      * case), then no page shift needed - our memory allocation will give us
> > > -      * contiguous IO memory as far as the hardware is concerned, so
> > > -      * act as if we're getting contiguous memory.
> > > +      * mempool reserved space to be contiguous, pg_sz will be != 0,
> > > +      * and the default ops->populate() will take care of not placing
> > > +      * objects across pages.
> > >        *
> > >        * if our IO addresses are physical, we may get memory from bigger
> > >        * pages, or we might get memory from smaller pages, and how much
> > > of it @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > >        *
> > >        * If we fail to get enough contiguous memory, then we'll go and
> > >        * reserve space in smaller chunks.
> > > -      *
> > > -      * We also have to take into account the fact that memory that we're
> > > -      * going to allocate from can belong to an externally allocated memory
> > > -      * area, in which case the assumption of IOVA as VA mode being
> > > -      * synonymous with IOVA contiguousness will not hold.
> > >        */
> > >
> > >       need_iova_contig_obj = !(mp->flags &
> > > MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> > > a/lib/librte_mempool/rte_mempool_ops_default.c
> > > b/lib/librte_mempool/rte_mempool_ops_default.c
> > > index f6aea7662..e5cd4600f 100644
> > > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > > @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> > >       return mem_size;
> > >  }
> > >
> > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > +static int
> > > +check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
> > > +{
> > > +     if (pg_sz == 0)
> > > +             return 0;
> > > +     if (elt_sz > pg_sz)
> > > +             return 0;
> > > +     if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> > > +             return -1;
> > > +     return 0;
> > > +}
> > > +
> > >  int
> > >  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
> > >               void *vaddr, rte_iova_t iova, size_t len,
> > >               rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > >  {
> > > -     size_t total_elt_sz;
> > > +     char *va = vaddr;
> > > +     size_t total_elt_sz, pg_sz;
> > >       size_t off;
> > >       unsigned int i;
> > >       void *obj;
> > > +     int ret;
> > > +
> > > +     ret = rte_mempool_get_page_size(mp, &pg_sz);
> > > +     if (ret < 0)
> > > +             return ret;
> > >
> > >       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > >
> > > -     for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > +     for (off = 0, i = 0; i < max_objs; i++) {
> > > +             /* avoid objects to cross page boundaries */
> > > +             if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > +                     off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > > +
> > > +             if (off + total_elt_sz > len)
> > > +                     break;
> > > +
> > >               off += mp->header_size;
> > > -             obj = (char *)vaddr + off;
> > > +             obj = va + off;
> > >               obj_cb(mp, obj_cb_arg, obj,
> > >                      (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > >               rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > > --
> > > 2.20.1
> >
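
The hunks quoted above revolve around one mechanism: before placing an
object, the populate loop checks whether it would straddle a page and, if
so, skips ahead to the next page boundary. A minimal, self-contained
illustration of that test (the sizes and the plain-division rewrite of
RTE_PTR_ALIGN below are purely illustrative, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Same test as check_obj_bounds() in the patch, using plain division:
 * an object crosses a page iff its first and last bytes fall into
 * different pages. */
static int
crosses_page(uintptr_t obj, size_t pg_sz, size_t elt_sz)
{
	if (pg_sz == 0 || elt_sz > pg_sz)
		return 0;
	return (obj / pg_sz) != ((obj + elt_sz - 1) / pg_sz);
}

int
main(void)
{
	size_t pg_sz = 4096, elt_sz = 2176;  /* illustrative sizes */
	uintptr_t obj = 10 * 4096 + 3000;    /* object starts 3000 B into a page */

	/* prints 1: the object would span two pages, so the new populate
	 * loop would move it to the start of the next page instead */
	printf("%d\n", crosses_page(obj, pg_sz, elt_sz));
	return 0;
}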
  
Andrew Rybchenko Oct. 31, 2019, 8:33 a.m. UTC | #5
On 10/31/19 11:24 AM, Olivier Matz wrote:
> Hi,
>
> On Thu, Oct 31, 2019 at 06:54:50AM +0000, Vamsi Krishna Attunuru wrote:
>> Hi Olivier,
>>
>> Thanks for reworked patches.
>> With V2, Tests with 512MB & 2M page sizes work fine with octeontx2
>> mempool pmd.
> Good to hear.
>
>> One more concern is, octeontx fpa mempool driver also has the similar
>> requirements. How do we address that, Can you suggest the best way to
>> avoid code duplication in PMDs.
> Well, we could provide additional helpers in librte_mempool:
> rte_mempool_calc_mem_size_helper() and rte_mempool_populate_helper()
> that would be internal to the mempool lib + drivers. These helpers could take
> an additional parameter to enable alignment of objects.
>
> But: with this approach, we are moving driver specificities back into
> the common code, and if tomorrow more drivers with different
> requirements also ask to factorize them in common code, we may end up
> with common functions that must support hardware specificities, making
> them harder to maintain.
>
> I agree that duplicating code is not satisfying either, but I think it
> is still better than starting to move hw requirements into common code,
> given that mempool drivers were introduced for that reason.
>
> @Andrew, any opinion?

I think I'd prefer to have helper functions. It avoids HW-specific
flags in the external interface (which was discussed initially), but also
avoids code duplication, which makes it easier to maintain.
Maybe we will reconsider it in the future, but right now I think
it is the best option.
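
For illustration, the two helpers mentioned above could be declared along
these lines; the extra parameters (chunk_reserve, flags) are assumptions
about how a driver would request the HW-specific behaviour, not a
committed API:

/* Hypothetical driver-only declarations, not part of this patch. */

/* Like rte_mempool_op_calc_mem_size_default(), but lets a driver reserve
 * extra room per chunk (e.g. one object kept aside to fix HW alignment). */
ssize_t rte_mempool_calc_mem_size_helper(const struct rte_mempool *mp,
		uint32_t obj_num, uint32_t pg_shift, size_t chunk_reserve,
		size_t *min_chunk_size, size_t *align);

/* Like rte_mempool_op_populate_default(), but can additionally align each
 * object offset on a multiple of the total element size when the driver
 * passes an "align objects" flag. */
int rte_mempool_populate_helper(struct rte_mempool *mp, unsigned int flags,
		unsigned int max_objs, void *vaddr, rte_iova_t iova,
		size_t len, rte_mempool_populate_obj_cb_t *obj_cb,
		void *obj_cb_arg);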

  
Olivier Matz Oct. 31, 2019, 8:45 a.m. UTC | #6
On Thu, Oct 31, 2019 at 11:33:30AM +0300, Andrew Rybchenko wrote:
> On 10/31/19 11:24 AM, Olivier Matz wrote:
> > Hi,
> > 
> > On Thu, Oct 31, 2019 at 06:54:50AM +0000, Vamsi Krishna Attunuru wrote:
> > > Hi Olivier,
> > > 
> > > Thanks for reworked patches.
> > > With V2, Tests with 512MB & 2M page sizes work fine with octeontx2
> > > mempool pmd.
> > Good to hear.
> > 
> > > One more concern is, octeontx fpa mempool driver also has the similar
> > > requirements. How do we address that, Can you suggest the best way to
> > > avoid code duplication in PMDs.
> > Well, we could provide additional helpers in librte_mempool:
> > rte_mempool_calc_mem_size_helper() and rte_mempool_populate_helper()
> > that would be internal to the mempool lib + drivers. These helpers could take
> > an additional parameter to enable alignment of objects.
> > 
> > But: with this approach, we are moving driver specificities back into
> > the common code, and if tomorrow more drivers with different
> > requirements also ask to factorize them in common code, we may end up
> > with common functions that must support hardware specificities, making
> > them harder to maintain.
> > 
> > I agree that duplicating code is not satisfying either, but I think it
> > is still better than starting to move hw requirements into common code,
> > given that mempool drivers were introduced for that reason.
> > 
> > @Andrew, any opinion?
> 
> I think I'd prefer to have helper functions. It avoids HW-specific
> flags in the external interface (which was discussed initially), but also
> avoids code duplication, which makes it easier to maintain.
> Maybe we will reconsider it in the future, but right now I think
> it is the best option.

OK, let me try to propose a v3 with helpers like this, and we
can discuss on this basis.
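
As a rough sketch of what the shared populate helper could look like (the
function name and the ALIGN_OBJ flag below are placeholders picked for
illustration, and check_obj_bounds() is the static helper introduced by
this patch):

#include <rte_mempool.h>

/* Placeholder flag name; the real one would be decided in v3. */
#define MEMPOOL_POPULATE_F_ALIGN_OBJ 0x0001

static int
mempool_populate_helper(struct rte_mempool *mp, unsigned int flags,
		unsigned int max_objs, void *vaddr, rte_iova_t iova,
		size_t len, rte_mempool_populate_obj_cb_t *obj_cb,
		void *obj_cb_arg)
{
	char *va = vaddr;
	size_t total_elt_sz, pg_sz, off;
	unsigned int i;
	void *obj;
	int ret;

	ret = rte_mempool_get_page_size(mp, &pg_sz);
	if (ret < 0)
		return ret;

	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;

	/* HW that needs it starts on a multiple of total_elt_sz */
	if (flags & MEMPOOL_POPULATE_F_ALIGN_OBJ)
		off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
	else
		off = 0;

	for (i = 0; i < max_objs; i++) {
		/* never place an object across a page boundary */
		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
			if (flags & MEMPOOL_POPULATE_F_ALIGN_OBJ)
				off += total_elt_sz -
					(((uintptr_t)(va + off - 1) %
					  total_elt_sz) + 1);
		}

		if (off + total_elt_sz > len)
			break;

		off += mp->header_size;
		obj = va + off;
		obj_cb(mp, obj_cb_arg, obj,
		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
		off += mp->elt_size + mp->trailer_size;
	}

	return i;
}

rte_mempool_op_populate_default() would then call it with flags == 0, and
otx2_npa_populate() would keep only its iova check and
npa_lf_aura_op_range_set() setup before calling it with the flag set.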


  

Patch

diff --git a/drivers/mempool/octeontx2/Makefile b/drivers/mempool/octeontx2/Makefile
index 87cce22c6..d781cbfc6 100644
--- a/drivers/mempool/octeontx2/Makefile
+++ b/drivers/mempool/octeontx2/Makefile
@@ -27,6 +27,9 @@  EXPORT_MAP := rte_mempool_octeontx2_version.map
 
 LIBABIVER := 1
 
+# for rte_mempool_get_page_size
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 #
 # all source are stored in SRCS-y
 #
diff --git a/drivers/mempool/octeontx2/meson.build b/drivers/mempool/octeontx2/meson.build
index 9fde40f0e..28f9634da 100644
--- a/drivers/mempool/octeontx2/meson.build
+++ b/drivers/mempool/octeontx2/meson.build
@@ -21,3 +21,6 @@  foreach flag: extra_flags
 endforeach
 
 deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
+
+# for rte_mempool_get_page_size
+allow_experimental_apis = true
diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index d769575f4..47117aec6 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -713,12 +713,76 @@  static ssize_t
 otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
 		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
 {
-	/*
-	 * Simply need space for one more object to be able to
-	 * fulfill alignment requirements.
-	 */
-	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1, pg_shift,
-						    min_chunk_size, align);
+	size_t total_elt_sz;
+	size_t obj_per_page, pg_sz, objs_in_last_page;
+	size_t mem_size;
+
+	/* derived from rte_mempool_op_calc_mem_size_default() */
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	if (total_elt_sz == 0) {
+		mem_size = 0;
+	} else if (pg_shift == 0) {
+		/* one object margin to fix alignment */
+		mem_size = total_elt_sz * (obj_num + 1);
+	} else {
+		pg_sz = (size_t)1 << pg_shift;
+		obj_per_page = pg_sz / total_elt_sz;
+
+		/* we need to keep one object to fix alignment */
+		if (obj_per_page > 0)
+			obj_per_page--;
+
+		if (obj_per_page == 0) {
+			/*
+			 * Note that if object size is bigger than page size,
+			 * then it is assumed that pages are grouped in subsets
+			 * of physically continuous pages big enough to store
+			 * at least one object.
+			 */
+			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
+						pg_sz) * obj_num;
+		} else {
+			/* In the best case, the allocator will return a
+			 * page-aligned address. For example, with 5 objs,
+			 * the required space is as below:
+			 *  |     page0     |     page1     |  page2 (last) |
+			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
+			 *  <------------- mem_size ------------->
+			 */
+			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
+			/* room required for the last page */
+			mem_size = objs_in_last_page * total_elt_sz;
+			/* room required for other pages */
+			mem_size += ((obj_num - objs_in_last_page) /
+				obj_per_page) << pg_shift;
+
+			/* In the worst case, the allocator returns a
+			 * non-aligned pointer, wasting up to
+			 * total_elt_sz. Add a margin for that.
+			 */
+			 mem_size += total_elt_sz - 1;
+		}
+	}
+
+	*min_chunk_size = total_elt_sz * 2;
+	*align = RTE_CACHE_LINE_SIZE;
+
+	return mem_size;
+}
+
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
 }
 
 static int
@@ -726,8 +790,12 @@  otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 		  rte_iova_t iova, size_t len,
 		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
+	unsigned int i;
+	void *obj;
+	int ret;
 
 	if (iova == RTE_BAD_IOVA)
 		return -EINVAL;
@@ -735,22 +803,45 @@  otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	/* Align object start address to a multiple of total_elt_sz */
-	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
 
 	if (len < off)
 		return -EINVAL;
 
-	vaddr = (char *)vaddr + off;
-	iova += off;
-	len -= off;
 
-	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
+	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
 
 	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
 		return -EBUSY;
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
-					       obj_cb, obj_cb_arg);
+	/* the following is derived from rte_mempool_op_populate_default() */
+
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < max_objs; i++) {
+		/* avoid objects to cross page boundaries, and align
+		 * offset to a multiple of total_elt_sz.
+		 */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
+						total_elt_sz) + 1);
+		}
+
+		if (off + total_elt_sz > len)
+			break;
+
+		off += mp->header_size;
+		obj = va + off;
+		obj_cb(mp, obj_cb_arg, obj,
+		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
+		off += mp->elt_size + mp->trailer_size;
+	}
+
+	return i;
 }
 
 static struct rte_mempool_ops otx2_npa_ops = {
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 758c5410b..d3db9273d 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -431,8 +431,6 @@  rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
 
 	if (!need_iova_contig_obj)
 		*pg_sz = 0;
-	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
-		*pg_sz = 0;
 	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
 		*pg_sz = get_min_page_size(mp->socket_id);
 	else
@@ -481,17 +479,15 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 	 * then just set page shift and page size to 0, because the user has
 	 * indicated that there's no need to care about anything.
 	 *
-	 * if we do need contiguous objects, there is also an option to reserve
-	 * the entire mempool memory as one contiguous block of memory, in
-	 * which case the page shift and alignment wouldn't matter as well.
+	 * if we do need contiguous objects (if a mempool driver has its
+	 * own calc_size() method returning min_chunk_size = mem_size),
+	 * there is also an option to reserve the entire mempool memory
+	 * as one contiguous block of memory.
 	 *
 	 * if we require contiguous objects, but not necessarily the entire
-	 * mempool reserved space to be contiguous, then there are two options.
-	 *
-	 * if our IO addresses are virtual, not actual physical (IOVA as VA
-	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous IO memory as far as the hardware is concerned, so
-	 * act as if we're getting contiguous memory.
+	 * mempool reserved space to be contiguous, pg_sz will be != 0,
+	 * and the default ops->populate() will take care of not placing
+	 * objects across pages.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
 	 * pages, or we might get memory from smaller pages, and how much of it
@@ -504,11 +500,6 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * If we fail to get enough contiguous memory, then we'll go and
 	 * reserve space in smaller chunks.
-	 *
-	 * We also have to take into account the fact that memory that we're
-	 * going to allocate from can belong to an externally allocated memory
-	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index f6aea7662..e5cd4600f 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -61,21 +61,47 @@  rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
+	int ret;
+
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	for (off = 0, i = 0; i < max_objs; i++) {
+		/* avoid objects to cross page boundaries */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
-		obj = (char *)vaddr + off;
+		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);