[5/5] mempool: prevent objects from being across pages

Message ID 20191028140122.9592-6-olivier.matz@6wind.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series mempool: avoid objects allocations across pages

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues
ci/travis-robot success Travis build: passed

Commit Message

Olivier Matz Oct. 28, 2019, 2:01 p.m. UTC
  When populating a mempool, ensure that objects are not located across
several pages, except if user did not request iova contiguous objects.

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
 lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++--
 2 files changed, 33 insertions(+), 19 deletions(-)
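
For reference, the "except if" case is a mempool created with the
MEMPOOL_F_NO_IOVA_CONTIG flag. A minimal sketch of such a creation call
(pool name, object count and sizes below are made-up values for
illustration):

	#include <rte_mempool.h>

	static struct rte_mempool *
	create_non_contig_pool(void)
	{
		/* Objects in this pool never need to be IOVA-contiguous, so
		 * the populate step is free to place them across page
		 * boundaries. */
		return rte_mempool_create("sw_only_pool",
				8192,		/* number of objects */
				2048,		/* object size */
				256,		/* per-lcore cache size */
				0,		/* private data size */
				NULL, NULL,	/* no mempool constructor */
				NULL, NULL,	/* no per-object init callback */
				SOCKET_ID_ANY,
				MEMPOOL_F_NO_IOVA_CONTIG);
	}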
  

Comments

Andrew Rybchenko Oct. 29, 2019, 10:59 a.m. UTC | #1
On 10/28/19 5:01 PM, Olivier Matz wrote:
> When populating a mempool, ensure that objects are not located across
> several pages, except if user did not request iova contiguous objects.

I think it breaks distribution across memory channels which could
affect performance significantly.

> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
>   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++--
>   2 files changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 7664764e5..b23fd1b06 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
>   
>   	if (!need_iova_contig_obj)
>   		*pg_sz = 0;
> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> -		*pg_sz = 0;
>   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>   		*pg_sz = get_min_page_size(mp->socket_id);
>   	else
> @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	 * then just set page shift and page size to 0, because the user has
>   	 * indicated that there's no need to care about anything.
>   	 *
> -	 * if we do need contiguous objects, there is also an option to reserve
> -	 * the entire mempool memory as one contiguous block of memory, in
> -	 * which case the page shift and alignment wouldn't matter as well.
> +	 * if we do need contiguous objects (if a mempool driver has its
> +	 * own calc_size() method returning min_chunk_size = mem_size),
> +	 * there is also an option to reserve the entire mempool memory
> +	 * as one contiguous block of memory.
>   	 *
>   	 * if we require contiguous objects, but not necessarily the entire
> -	 * mempool reserved space to be contiguous, then there are two options.
> -	 *
> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> -	 * case), then no page shift needed - our memory allocation will give us
> -	 * contiguous IO memory as far as the hardware is concerned, so
> -	 * act as if we're getting contiguous memory.
> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> +	 * and the default ops->populate() will take care of not placing
> +	 * objects across pages.
>   	 *
>   	 * if our IO addresses are physical, we may get memory from bigger
>   	 * pages, or we might get memory from smaller pages, and how much of it
> @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	 *
>   	 * If we fail to get enough contiguous memory, then we'll go and
>   	 * reserve space in smaller chunks.
> -	 *
> -	 * We also have to take into account the fact that memory that we're
> -	 * going to allocate from can belong to an externally allocated memory
> -	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold.
>   	 */
>   
>   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index f6aea7662..dd09a0a32 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   	return mem_size;
>   }
>   
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int
> +check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
> +{
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> +		return -1;
> +	return 0;
> +}
> +
>   int
>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
>   		void *vaddr, rte_iova_t iova, size_t len,
>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>   {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>   	size_t off;
>   	unsigned int i;
>   	void *obj;
>   
> +	rte_mempool_get_page_size(mp, &pg_sz);
> +

The function may return an error which should be taken into account here.

>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* align offset to next page start if required */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>   		off += mp->header_size;
> -		obj = (char *)vaddr + off;
> +		obj = va + off;
>   		obj_cb(mp, obj_cb_arg, obj,
>   		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>   		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
  
Vamsi Krishna Attunuru Oct. 29, 2019, 5:25 p.m. UTC | #2
Hi Olivier,

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Monday, October 28, 2019 7:31 PM
> To: dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; Stephen Hemminger
> <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
> Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> pages
> 
> External Email
> 
> ----------------------------------------------------------------------
> When populating a mempool, ensure that objects are not located across
> several pages, except if user did not request iova contiguous objects.
> 
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
>  lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
> -
>  2 files changed, 33 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/librte_mempool/rte_mempool.c
> b/lib/librte_mempool/rte_mempool.c
> index 7664764e5..b23fd1b06 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
> *mp, size_t *pg_sz)
> 
>  	if (!need_iova_contig_obj)
>  		*pg_sz = 0;
> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
> RTE_IOVA_VA)
> -		*pg_sz = 0;
>  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>  		*pg_sz = get_min_page_size(mp->socket_id);
>  	else
> @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
>  	 * then just set page shift and page size to 0, because the user has
>  	 * indicated that there's no need to care about anything.
>  	 *
> -	 * if we do need contiguous objects, there is also an option to reserve
> -	 * the entire mempool memory as one contiguous block of memory,
> in
> -	 * which case the page shift and alignment wouldn't matter as well.
> +	 * if we do need contiguous objects (if a mempool driver has its
> +	 * own calc_size() method returning min_chunk_size = mem_size),
> +	 * there is also an option to reserve the entire mempool memory
> +	 * as one contiguous block of memory.
>  	 *
>  	 * if we require contiguous objects, but not necessarily the entire
> -	 * mempool reserved space to be contiguous, then there are two
> options.
> -	 *
> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> -	 * case), then no page shift needed - our memory allocation will give
> us
> -	 * contiguous IO memory as far as the hardware is concerned, so
> -	 * act as if we're getting contiguous memory.
> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> +	 * and the default ops->populate() will take care of not placing
> +	 * objects across pages.
>  	 *
>  	 * if our IO addresses are physical, we may get memory from bigger
>  	 * pages, or we might get memory from smaller pages, and how
> much of it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
>  	 *
>  	 * If we fail to get enough contiguous memory, then we'll go and
>  	 * reserve space in smaller chunks.
> -	 *
> -	 * We also have to take into account the fact that memory that we're
> -	 * going to allocate from can belong to an externally allocated
> memory
> -	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold.
>  	 */
> 
>  	need_iova_contig_obj = !(mp->flags &
> MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> a/lib/librte_mempool/rte_mempool_ops_default.c
> b/lib/librte_mempool/rte_mempool_ops_default.c
> index f6aea7662..dd09a0a32 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
> struct rte_mempool *mp,
>  	return mem_size;
>  }
> 
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> pg_sz))
> +		return -1;
> +	return 0;
> +}
> +
>  int
>  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> max_objs,
>  		void *vaddr, rte_iova_t iova, size_t len,
>  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>  	size_t off;
>  	unsigned int i;
>  	void *obj;
> 
> +	rte_mempool_get_page_size(mp, &pg_sz);
> +
>  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> 
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* align offset to next page start if required */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);

Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address must be a multiple of the buffer size).

> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>  		off += mp->header_size;
> -		obj = (char *)vaddr + off;
> +		obj = va + off;
>  		obj_cb(mp, obj_cb_arg, obj,
>  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> --
> 2.20.1
  
Olivier Matz Oct. 29, 2019, 5:34 p.m. UTC | #3
On Tue, Oct 29, 2019 at 01:59:00PM +0300, Andrew Rybchenko wrote:
> On 10/28/19 5:01 PM, Olivier Matz wrote:
> > When populating a mempool, ensure that objects are not located across
> > several pages, except if user did not request iova contiguous objects.
> 
> I think it breaks distribution across memory channels which could
> affect performance significantly.

With 2M hugepages, there are ~900 mbufs per page, and all of them will
be distributed across memory channels. For larger objects, I don't think
the distribution is that important.
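
(Rough arithmetic behind the ~900 figure, assuming a default mbuf pool with
no private area and no debug trailer; the exact numbers depend on the build
configuration:

    total_elt_sz ~= 64 B object header + 128 B struct rte_mbuf
                    + 2176 B default data buffer = 2368 B
    objects per 2 MB hugepage ~= 2 MiB / 2368 B ~= 885)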

With small pages, that may be true. I think the problem was already
there except in IOVA=VA mode.

This should be fixable, but I'm not sure a use-case where we can see a
regression really exists.


> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
> >   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++--
> >   2 files changed, 33 insertions(+), 19 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 7664764e5..b23fd1b06 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
> >   	if (!need_iova_contig_obj)
> >   		*pg_sz = 0;
> > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > -		*pg_sz = 0;
> >   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> >   		*pg_sz = get_min_page_size(mp->socket_id);
> >   	else
> > @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	 * then just set page shift and page size to 0, because the user has
> >   	 * indicated that there's no need to care about anything.
> >   	 *
> > -	 * if we do need contiguous objects, there is also an option to reserve
> > -	 * the entire mempool memory as one contiguous block of memory, in
> > -	 * which case the page shift and alignment wouldn't matter as well.
> > +	 * if we do need contiguous objects (if a mempool driver has its
> > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > +	 * there is also an option to reserve the entire mempool memory
> > +	 * as one contiguous block of memory.
> >   	 *
> >   	 * if we require contiguous objects, but not necessarily the entire
> > -	 * mempool reserved space to be contiguous, then there are two options.
> > -	 *
> > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > -	 * case), then no page shift needed - our memory allocation will give us
> > -	 * contiguous IO memory as far as the hardware is concerned, so
> > -	 * act as if we're getting contiguous memory.
> > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > +	 * and the default ops->populate() will take care of not placing
> > +	 * objects across pages.
> >   	 *
> >   	 * if our IO addresses are physical, we may get memory from bigger
> >   	 * pages, or we might get memory from smaller pages, and how much of it
> > @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	 *
> >   	 * If we fail to get enough contiguous memory, then we'll go and
> >   	 * reserve space in smaller chunks.
> > -	 *
> > -	 * We also have to take into account the fact that memory that we're
> > -	 * going to allocate from can belong to an externally allocated memory
> > -	 * area, in which case the assumption of IOVA as VA mode being
> > -	 * synonymous with IOVA contiguousness will not hold.
> >   	 */
> >   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > index f6aea7662..dd09a0a32 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   	return mem_size;
> >   }
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int
> > +check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
> > +{
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> > +		return -1;
> > +	return 0;
> > +}
> > +
> >   int
> >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
> >   		void *vaddr, rte_iova_t iova, size_t len,
> >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >   {
> > -	size_t total_elt_sz;
> > +	char *va = vaddr;
> > +	size_t total_elt_sz, pg_sz;
> >   	size_t off;
> >   	unsigned int i;
> >   	void *obj;
> > +	rte_mempool_get_page_size(mp, &pg_sz);
> > +
> 
> The function may return an error which should be taken into account here.

That would be better, indeed.
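
A minimal sketch of what that check could look like at the top of
rte_mempool_op_populate_default() (the exact handling in the eventual v2
may differ):

	int ret;

	ret = rte_mempool_get_page_size(mp, &pg_sz);
	if (ret < 0)
		return ret;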


> >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +	for (off = 0, i = 0; i < max_objs; i++) {
> > +		/* align offset to next page start if required */
> > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> >   		off += mp->header_size;
> > -		obj = (char *)vaddr + off;
> > +		obj = va + off;
> >   		obj_cb(mp, obj_cb_arg, obj,
> >   		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> >   		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> 

Thanks for all the comments!
I will send a v2 tomorrow.

Olivier
  
Vamsi Krishna Attunuru Oct. 30, 2019, 3:55 a.m. UTC | #4
Hi Olivier,

> -----Original Message-----
> From: Vamsi Krishna Attunuru
> Sent: Tuesday, October 29, 2019 10:55 PM
> To: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: RE: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> pages
> 
> Hi Olivier,
> 
> > -----Original Message-----
> > From: Olivier Matz <olivier.matz@6wind.com>
> > Sent: Monday, October 28, 2019 7:31 PM
> > To: dev@dpdk.org
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit
> > <ferruh.yigit@linux.intel.com>; Giridharan, Ganesan
> > <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > <kirankumark@marvell.com>; Stephen Hemminger
> <sthemmin@microsoft.com>;
> > Thomas Monjalon <thomas@monjalon.net>; Vamsi Krishna Attunuru
> > <vattunuru@marvell.com>
> > Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> > pages
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > When populating a mempool, ensure that objects are not located across
> > several pages, except if user did not request iova contiguous objects.
> >
> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >  lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
> >  lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
> > -
> >  2 files changed, 33 insertions(+), 19 deletions(-)
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index 7664764e5..b23fd1b06 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
> *mp,
> > size_t *pg_sz)
> >
> >  	if (!need_iova_contig_obj)
> >  		*pg_sz = 0;
> > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
> > RTE_IOVA_VA)
> > -		*pg_sz = 0;
> >  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> >  		*pg_sz = get_min_page_size(mp->socket_id);
> >  	else
> > @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> >  	 * then just set page shift and page size to 0, because the user has
> >  	 * indicated that there's no need to care about anything.
> >  	 *
> > -	 * if we do need contiguous objects, there is also an option to reserve
> > -	 * the entire mempool memory as one contiguous block of memory,
> > in
> > -	 * which case the page shift and alignment wouldn't matter as well.
> > +	 * if we do need contiguous objects (if a mempool driver has its
> > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > +	 * there is also an option to reserve the entire mempool memory
> > +	 * as one contiguous block of memory.
> >  	 *
> >  	 * if we require contiguous objects, but not necessarily the entire
> > -	 * mempool reserved space to be contiguous, then there are two
> > options.
> > -	 *
> > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > -	 * case), then no page shift needed - our memory allocation will give
> > us
> > -	 * contiguous IO memory as far as the hardware is concerned, so
> > -	 * act as if we're getting contiguous memory.
> > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > +	 * and the default ops->populate() will take care of not placing
> > +	 * objects across pages.
> >  	 *
> >  	 * if our IO addresses are physical, we may get memory from bigger
> >  	 * pages, or we might get memory from smaller pages, and how much
> of
> > it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> >  	 *
> >  	 * If we fail to get enough contiguous memory, then we'll go and
> >  	 * reserve space in smaller chunks.
> > -	 *
> > -	 * We also have to take into account the fact that memory that we're
> > -	 * going to allocate from can belong to an externally allocated
> > memory
> > -	 * area, in which case the assumption of IOVA as VA mode being
> > -	 * synonymous with IOVA contiguousness will not hold.
> >  	 */
> >
> >  	need_iova_contig_obj = !(mp->flags &
> MEMPOOL_F_NO_IOVA_CONTIG); diff
> > --git a/lib/librte_mempool/rte_mempool_ops_default.c
> > b/lib/librte_mempool/rte_mempool_ops_default.c
> > index f6aea7662..dd09a0a32 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
> > struct rte_mempool *mp,
> >  	return mem_size;
> >  }
> >
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > pg_sz))
> > +		return -1;
> > +	return 0;
> > +}
> > +
> >  int
> >  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > max_objs,
> >  		void *vaddr, rte_iova_t iova, size_t len,
> >  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg) {
> > -	size_t total_elt_sz;
> > +	char *va = vaddr;
> > +	size_t total_elt_sz, pg_sz;
> >  	size_t off;
> >  	unsigned int i;
> >  	void *obj;
> >
> > +	rte_mempool_get_page_size(mp, &pg_sz);
> > +
> >  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >
> > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +	for (off = 0, i = 0; i < max_objs; i++) {
> > +		/* align offset to next page start if required */
> > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> 
> Moving the offset to the start of the next page and then freeing (vaddr + off +
> header_size) to the pool does not align with the octeontx2 mempool's buffer
> alignment requirement (the buffer address must be a multiple of the buffer size).

Earlier there was a flag to align the object to the total object size; it was later deprecated (see the link below) after the calc min chunk size callback was added. In this new patch (5/5), while moving to the next page, the object address is aligned to pg_sz. But the octeontx2 mempool HW requires that object addresses freed to it be aligned to the total object size. We need this alignment requirement fulfilled to make it work with the HW mempool.

https://git.dpdk.org/dpdk/commit/lib/librte_mempool/rte_mempool.c?id=ce1f2c61ed135e4133d0429e86e554bfd4d58cb0
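
For illustration only, a hedged sketch (hypothetical code, not the actual
octeontx2 driver) of a driver-specific populate() in the spirit of the
default callback above, which advances each object start to a multiple of
the total object size; header placement and page-boundary handling are
simplified here:

static int
otx2_populate_sketch(struct rte_mempool *mp, unsigned int max_objs,
		void *vaddr, rte_iova_t iova, size_t len,
		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
{
	char *va = vaddr;
	size_t total_elt_sz, off;
	unsigned int i;
	void *obj;

	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;

	for (off = 0, i = 0; i < max_objs; i++) {
		/* advance so that the object start is a multiple of the
		 * total object size (works for non-power-of-two sizes) */
		if ((uintptr_t)(va + off) % total_elt_sz != 0)
			off += total_elt_sz -
				((uintptr_t)(va + off) % total_elt_sz);

		if (off + total_elt_sz > len)
			break;

		off += mp->header_size;
		obj = va + off;
		obj_cb(mp, obj_cb_arg, obj,
		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
		off += mp->elt_size + mp->trailer_size;
	}

	return i;
}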



> 
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> >  		off += mp->header_size;
> > -		obj = (char *)vaddr + off;
> > +		obj = va + off;
> >  		obj_cb(mp, obj_cb_arg, obj,
> >  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> >  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > --
> > 2.20.1
  
Andrew Rybchenko Oct. 30, 2019, 7:46 a.m. UTC | #5
On 10/29/19 8:25 PM, Vamsi Krishna Attunuru wrote:
> Hi Olivier,
>
>> -----Original Message-----
>> From: Olivier Matz <olivier.matz@6wind.com>
>> Sent: Monday, October 28, 2019 7:31 PM
>> To: dev@dpdk.org
>> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
>> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
>> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
>> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
>> <kirankumark@marvell.com>; Stephen Hemminger
>> <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
>> Vamsi Krishna Attunuru <vattunuru@marvell.com>
>> Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
>> pages
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> When populating a mempool, ensure that objects are not located across
>> several pages, except if user did not request iova contiguous objects.
>>
>> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>> ---
>>   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
>>   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
>> -
>>   2 files changed, 33 insertions(+), 19 deletions(-)
>>
>> diff --git a/lib/librte_mempool/rte_mempool.c
>> b/lib/librte_mempool/rte_mempool.c
>> index 7664764e5..b23fd1b06 100644
>> --- a/lib/librte_mempool/rte_mempool.c
>> +++ b/lib/librte_mempool/rte_mempool.c
>> @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
>> *mp, size_t *pg_sz)
>>
>>   	if (!need_iova_contig_obj)
>>   		*pg_sz = 0;
>> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
>> RTE_IOVA_VA)
>> -		*pg_sz = 0;
>>   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>>   		*pg_sz = get_min_page_size(mp->socket_id);
>>   	else
>> @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
>> rte_mempool *mp)
>>   	 * then just set page shift and page size to 0, because the user has
>>   	 * indicated that there's no need to care about anything.
>>   	 *
>> -	 * if we do need contiguous objects, there is also an option to reserve
>> -	 * the entire mempool memory as one contiguous block of memory,
>> in
>> -	 * which case the page shift and alignment wouldn't matter as well.
>> +	 * if we do need contiguous objects (if a mempool driver has its
>> +	 * own calc_size() method returning min_chunk_size = mem_size),
>> +	 * there is also an option to reserve the entire mempool memory
>> +	 * as one contiguous block of memory.
>>   	 *
>>   	 * if we require contiguous objects, but not necessarily the entire
>> -	 * mempool reserved space to be contiguous, then there are two
>> options.
>> -	 *
>> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
>> -	 * case), then no page shift needed - our memory allocation will give
>> us
>> -	 * contiguous IO memory as far as the hardware is concerned, so
>> -	 * act as if we're getting contiguous memory.
>> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
>> +	 * and the default ops->populate() will take care of not placing
>> +	 * objects across pages.
>>   	 *
>>   	 * if our IO addresses are physical, we may get memory from bigger
>>   	 * pages, or we might get memory from smaller pages, and how
>> much of it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
>> rte_mempool *mp)
>>   	 *
>>   	 * If we fail to get enough contiguous memory, then we'll go and
>>   	 * reserve space in smaller chunks.
>> -	 *
>> -	 * We also have to take into account the fact that memory that we're
>> -	 * going to allocate from can belong to an externally allocated
>> memory
>> -	 * area, in which case the assumption of IOVA as VA mode being
>> -	 * synonymous with IOVA contiguousness will not hold.
>>   	 */
>>
>>   	need_iova_contig_obj = !(mp->flags &
>> MEMPOOL_F_NO_IOVA_CONTIG); diff --git
>> a/lib/librte_mempool/rte_mempool_ops_default.c
>> b/lib/librte_mempool/rte_mempool_ops_default.c
>> index f6aea7662..dd09a0a32 100644
>> --- a/lib/librte_mempool/rte_mempool_ops_default.c
>> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
>> @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
>> struct rte_mempool *mp,
>>   	return mem_size;
>>   }
>>
>> +/* Returns -1 if object crosses a page boundary, else returns 0 */
>> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
>> +	if (pg_sz == 0)
>> +		return 0;
>> +	if (elt_sz > pg_sz)
>> +		return 0;
>> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
>> pg_sz))
>> +		return -1;
>> +	return 0;
>> +}
>> +
>>   int
>>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
>> max_objs,
>>   		void *vaddr, rte_iova_t iova, size_t len,
>>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>> {
>> -	size_t total_elt_sz;
>> +	char *va = vaddr;
>> +	size_t total_elt_sz, pg_sz;
>>   	size_t off;
>>   	unsigned int i;
>>   	void *obj;
>>
>> +	rte_mempool_get_page_size(mp, &pg_sz);
>> +
>>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>>
>> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
>> +	for (off = 0, i = 0; i < max_objs; i++) {
>> +		/* align offset to next page start if required */
>> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
>> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
>> off);
> Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address must be a multiple of the buffer size).

It sounds like octeontx2 mempool should have its own populate callback
which cares about it.
  
Andrew Rybchenko Oct. 30, 2019, 7:56 a.m. UTC | #6
On 10/29/19 8:34 PM, Olivier Matz wrote:
> On Tue, Oct 29, 2019 at 01:59:00PM +0300, Andrew Rybchenko wrote:
>> On 10/28/19 5:01 PM, Olivier Matz wrote:
>>> When populating a mempool, ensure that objects are not located across
>>> several pages, except if user did not request iova contiguous objects.
>> I think it breaks distribution across memory channels which could
>> affect performance significantly.
> With 2M hugepages, there are ~900 mbufs per page, and all of them will
> be distributed across memory channels. For larger objects, I don't think
> the distribution is that important.

Yes, that's true. I was thinking about the case when the page is smaller.

> With small pages, that may be true. I think the problem was already
> there except in IOVA=VA mode.

I don't know any good real examples. In theory, previously even small
pages could be merged together and populated as one big chunk. Right now
merging does not make sense, since page boundaries will be respected by
default anyway.

> This should be fixable, but I'm not sure a use-case where we can see a
> regression really exists.

Agreed. If the total object size allows it, it is possible to spread
the offset from the page boundary.

I don't know either if it is critical or not.
  
Jerin Jacob Oct. 30, 2019, 8:38 a.m. UTC | #7
On Wed, Oct 30, 2019 at 1:16 PM Andrew Rybchenko
<arybchenko@solarflare.com> wrote:
>

> >>   int
> >>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> >> max_objs,
> >>              void *vaddr, rte_iova_t iova, size_t len,
> >>              rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >> {
> >> -    size_t total_elt_sz;
> >> +    char *va = vaddr;
> >> +    size_t total_elt_sz, pg_sz;
> >>      size_t off;
> >>      unsigned int i;
> >>      void *obj;
> >>
> >> +    rte_mempool_get_page_size(mp, &pg_sz);
> >> +
> >>      total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >>
> >> -    for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> >> +    for (off = 0, i = 0; i < max_objs; i++) {
> >> +            /* align offset to next page start if required */
> >> +            if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> >> +                    off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address must be a multiple of the buffer size).
>
> It sounds like octeontx2 mempool should have its own populate callback
> which cares about it.

A driver-specific populate function is not a bad idea. The only concerns
would be:

# We need to duplicate rte_mempool_op_populate_default() and
rte_mempool_op_calc_mem_size_default().
# We need to make sure that if someone changes
rte_mempool_op_populate_default() or
rte_mempool_op_calc_mem_size_default(), then he/she also updates the
drivers.

# I would like to add one more point here:
- The calculation of object pad requirements for MEMPOOL_F_NO_SPREAD, i.e.
optimize_object_size(), is NOT GENERIC: the get_gcd()-based logic is not
generic. The DDR controller defines the address-to-DDR-channel "spread",
and it depends on the SoC or micro-architecture.
So we need to consider that as well.
  
Olivier Matz Oct. 30, 2019, 8:42 a.m. UTC | #8
On Wed, Oct 30, 2019 at 10:46:31AM +0300, Andrew Rybchenko wrote:
> On 10/29/19 8:25 PM, Vamsi Krishna Attunuru wrote:
> > Hi Olivier,
> > 
> > > -----Original Message-----
> > > From: Olivier Matz <olivier.matz@6wind.com>
> > > Sent: Monday, October 28, 2019 7:31 PM
> > > To: dev@dpdk.org
> > > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > > <kirankumark@marvell.com>; Stephen Hemminger
> > > <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
> > > Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> > > pages
> > > 
> > > External Email
> > > 
> > > ----------------------------------------------------------------------
> > > When populating a mempool, ensure that objects are not located across
> > > several pages, except if user did not request iova contiguous objects.
> > > 
> > > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > ---
> > >   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
> > >   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
> > > -
> > >   2 files changed, 33 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/lib/librte_mempool/rte_mempool.c
> > > b/lib/librte_mempool/rte_mempool.c
> > > index 7664764e5..b23fd1b06 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
> > > *mp, size_t *pg_sz)
> > > 
> > >   	if (!need_iova_contig_obj)
> > >   		*pg_sz = 0;
> > > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
> > > RTE_IOVA_VA)
> > > -		*pg_sz = 0;
> > >   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> > >   		*pg_sz = get_min_page_size(mp->socket_id);
> > >   	else
> > > @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > >   	 * then just set page shift and page size to 0, because the user has
> > >   	 * indicated that there's no need to care about anything.
> > >   	 *
> > > -	 * if we do need contiguous objects, there is also an option to reserve
> > > -	 * the entire mempool memory as one contiguous block of memory,
> > > in
> > > -	 * which case the page shift and alignment wouldn't matter as well.
> > > +	 * if we do need contiguous objects (if a mempool driver has its
> > > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > > +	 * there is also an option to reserve the entire mempool memory
> > > +	 * as one contiguous block of memory.
> > >   	 *
> > >   	 * if we require contiguous objects, but not necessarily the entire
> > > -	 * mempool reserved space to be contiguous, then there are two
> > > options.
> > > -	 *
> > > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > > -	 * case), then no page shift needed - our memory allocation will give
> > > us
> > > -	 * contiguous IO memory as far as the hardware is concerned, so
> > > -	 * act as if we're getting contiguous memory.
> > > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > > +	 * and the default ops->populate() will take care of not placing
> > > +	 * objects across pages.
> > >   	 *
> > >   	 * if our IO addresses are physical, we may get memory from bigger
> > >   	 * pages, or we might get memory from smaller pages, and how
> > > much of it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > >   	 *
> > >   	 * If we fail to get enough contiguous memory, then we'll go and
> > >   	 * reserve space in smaller chunks.
> > > -	 *
> > > -	 * We also have to take into account the fact that memory that we're
> > > -	 * going to allocate from can belong to an externally allocated
> > > memory
> > > -	 * area, in which case the assumption of IOVA as VA mode being
> > > -	 * synonymous with IOVA contiguousness will not hold.
> > >   	 */
> > > 
> > >   	need_iova_contig_obj = !(mp->flags &
> > > MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> > > a/lib/librte_mempool/rte_mempool_ops_default.c
> > > b/lib/librte_mempool/rte_mempool_ops_default.c
> > > index f6aea7662..dd09a0a32 100644
> > > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > > @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
> > > struct rte_mempool *mp,
> > >   	return mem_size;
> > >   }
> > > 
> > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > > +	if (pg_sz == 0)
> > > +		return 0;
> > > +	if (elt_sz > pg_sz)
> > > +		return 0;
> > > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > > pg_sz))
> > > +		return -1;
> > > +	return 0;
> > > +}
> > > +
> > >   int
> > >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > > max_objs,
> > >   		void *vaddr, rte_iova_t iova, size_t len,
> > >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > {
> > > -	size_t total_elt_sz;
> > > +	char *va = vaddr;
> > > +	size_t total_elt_sz, pg_sz;
> > >   	size_t off;
> > >   	unsigned int i;
> > >   	void *obj;
> > > 
> > > +	rte_mempool_get_page_size(mp, &pg_sz);
> > > +
> > >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > 
> > > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > +	for (off = 0, i = 0; i < max_objs; i++) {
> > > +		/* align offset to next page start if required */
> > > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address must be a multiple of the buffer size).
> 
> It sounds like octeontx2 mempool should have its own populate callback
> which cares about it.
> 

Agree, even the calc_mem_size() should be modified to be rigorous.
Let me draft something in next version, so you can test it.
  
Olivier Matz Oct. 30, 2019, 2:33 p.m. UTC | #9
Hi Jerin,

On Wed, Oct 30, 2019 at 02:08:40PM +0530, Jerin Jacob wrote:
> On Wed, Oct 30, 2019 at 1:16 PM Andrew Rybchenko
> <arybchenko@solarflare.com> wrote:
> >
> 
> > >>   int
> > >>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > >> max_objs,
> > >>              void *vaddr, rte_iova_t iova, size_t len,
> > >>              rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > >> {
> > >> -    size_t total_elt_sz;
> > >> +    char *va = vaddr;
> > >> +    size_t total_elt_sz, pg_sz;
> > >>      size_t off;
> > >>      unsigned int i;
> > >>      void *obj;
> > >>
> > >> +    rte_mempool_get_page_size(mp, &pg_sz);
> > >> +
> > >>      total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > >>
> > >> -    for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > >> +    for (off = 0, i = 0; i < max_objs; i++) {
> > >> +            /* align offset to next page start if required */
> > >> +            if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > >> +                    off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address must be a multiple of the buffer size).
> >
> > It sounds like octeontx2 mempool should have its own populate callback
> > which cares about it.
> 
> A driver-specific populate function is not a bad idea. The only concerns
> would be:
> 
> # We need to duplicate rte_mempool_op_populate_default() and
> rte_mempool_op_calc_mem_size_default().
> # We need to make sure that if someone changes
> rte_mempool_op_populate_default() or
> rte_mempool_op_calc_mem_size_default(), then he/she also updates the
> drivers.

Agree, we need to be careful. Hopefully we shouldn't change this code
very often.

I'm sending a v2 with a patch to the octeontx2 driver which --I hope--
should solve the issue.

> # I would like to add one more point here:
> - The calculation of object pad requirements for MEMPOOL_F_NO_SPREAD, i.e.
> optimize_object_size(), is NOT GENERIC: the get_gcd()-based logic is not
> generic. The DDR controller defines the address-to-DDR-channel "spread",
> and it depends on the SoC or micro-architecture.
> So we need to consider that as well.

Could you give some details about how it should be optimized on your
platforms, and what is the behavior today (advertised nb_rank and
nb_channels)?

Thanks,
Olivier
  
Jerin Jacob Oct. 30, 2019, 2:54 p.m. UTC | #10
On Wed, Oct 30, 2019 at 8:03 PM Olivier Matz <olivier.matz@6wind.com> wrote:
>
> Hi Jerin,
>
> On Wed, Oct 30, 2019 at 02:08:40PM +0530, Jerin Jacob wrote:
> > On Wed, Oct 30, 2019 at 1:16 PM Andrew Rybchenko
> > <arybchenko@solarflare.com> wrote:
> > >
> >
> > > >>   int
> > > >>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > > >> max_objs,
> > > >>              void *vaddr, rte_iova_t iova, size_t len,
> > > >>              rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > >> {
> > > >> -    size_t total_elt_sz;
> > > >> +    char *va = vaddr;
> > > >> +    size_t total_elt_sz, pg_sz;
> > > >>      size_t off;
> > > >>      unsigned int i;
> > > >>      void *obj;
> > > >>
> > > >> +    rte_mempool_get_page_size(mp, &pg_sz);
> > > >> +
> > > >>      total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > >>
> > > >> -    for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > >> +    for (off = 0, i = 0; i < max_objs; i++) {
> > > >> +            /* align offset to next page start if required */
> > > >> +            if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > >> +                    off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > > > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address must be a multiple of the buffer size).
> > >
> > > It sounds like octeontx2 mempool should have its own populate callback
> > > which cares about it.
> >
> > A driver-specific populate function is not a bad idea. The only concerns
> > would be:
> >
> > # We need to duplicate rte_mempool_op_populate_default() and
> > rte_mempool_op_calc_mem_size_default().
> > # We need to make sure that if someone changes
> > rte_mempool_op_populate_default() or
> > rte_mempool_op_calc_mem_size_default(), then he/she also updates the
> > drivers.
>
> Agree, we need to be careful. Hopefully we shouldn't change this code
> very often.
>
> I'm sending a v2 with a patch to the octeontx2 driver which --I hope--
> should solve the issue.

Thanks, Olivier. We will test it.

>
> > # I would like to add one more point here:
> > - The calculation of object pad requirements for MEMPOOL_F_NO_SPREAD, i.e.
> > optimize_object_size(), is NOT GENERIC: the get_gcd()-based logic is not
> > generic. The DDR controller defines the address-to-DDR-channel "spread",
> > and it depends on the SoC or micro-architecture.
> > So we need to consider that as well.
>
> Could you give some details about how it should be optimized on your
> platforms, and what is the behavior today (advertised nb_rank and
> nb_channels)?

I will start a new thread on this CCing all arch maintainers.


>
> Thanks,
> Olivier
  

Patch

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7664764e5..b23fd1b06 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -428,8 +428,6 @@  rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
 
 	if (!need_iova_contig_obj)
 		*pg_sz = 0;
-	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
-		*pg_sz = 0;
 	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
 		*pg_sz = get_min_page_size(mp->socket_id);
 	else
@@ -478,17 +476,15 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 	 * then just set page shift and page size to 0, because the user has
 	 * indicated that there's no need to care about anything.
 	 *
-	 * if we do need contiguous objects, there is also an option to reserve
-	 * the entire mempool memory as one contiguous block of memory, in
-	 * which case the page shift and alignment wouldn't matter as well.
+	 * if we do need contiguous objects (if a mempool driver has its
+	 * own calc_size() method returning min_chunk_size = mem_size),
+	 * there is also an option to reserve the entire mempool memory
+	 * as one contiguous block of memory.
 	 *
 	 * if we require contiguous objects, but not necessarily the entire
-	 * mempool reserved space to be contiguous, then there are two options.
-	 *
-	 * if our IO addresses are virtual, not actual physical (IOVA as VA
-	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous IO memory as far as the hardware is concerned, so
-	 * act as if we're getting contiguous memory.
+	 * mempool reserved space to be contiguous, pg_sz will be != 0,
+	 * and the default ops->populate() will take care of not placing
+	 * objects across pages.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
 	 * pages, or we might get memory from smaller pages, and how much of it
@@ -501,11 +497,6 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * If we fail to get enough contiguous memory, then we'll go and
 	 * reserve space in smaller chunks.
-	 *
-	 * We also have to take into account the fact that memory that we're
-	 * going to allocate from can belong to an externally allocated memory
-	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index f6aea7662..dd09a0a32 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -61,21 +61,44 @@  rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
 
+	rte_mempool_get_page_size(mp, &pg_sz);
+
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	for (off = 0, i = 0; i < max_objs; i++) {
+		/* align offset to next page start if required */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
-		obj = (char *)vaddr + off;
+		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);