[v3,2/4] mbuf: create packet pool with external memory buffers

Message ID 1578993305-15165-3-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series mbuf: detach mbuf with pinned external buffer |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko Jan. 14, 2020, 9:15 a.m. UTC
  The dedicated routine rte_pktmbuf_pool_create_extbuf() is
provided to create mbuf pool with data buffers located in
the pinned external memory. The application provides the
external memory description and routine initialises each
mbuf with appropriate virtual and physical buffer address.
It is entirely application responsibility to register
external memory with rte_extmem_register() API, map this
memory, etc.

The new introduced flag RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF
is set in private pool structure, specifying the new special
pool type. The allocated mbufs from pool of this kind will
have the EXT_ATTACHED_MBUF flag set and NULL shared info
pointer, because external buffers are not supposed to be
freed and sharing management is not needed. Also, these
mbufs can not be attached to other mbufs (not intended to
be indirect).

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 lib/librte_mbuf/rte_mbuf.c           | 144 ++++++++++++++++++++++++++++++++++-
 lib/librte_mbuf/rte_mbuf.h           |  86 ++++++++++++++++++++-
 lib/librte_mbuf/rte_mbuf_version.map |   1 +
 3 files changed, 228 insertions(+), 3 deletions(-)
  

Comments

Olivier Matz Jan. 14, 2020, 4:04 p.m. UTC | #1
On Tue, Jan 14, 2020 at 09:15:03AM +0000, Viacheslav Ovsiienko wrote:
> The dedicated routine rte_pktmbuf_pool_create_extbuf() is
> provided to create mbuf pool with data buffers located in
> the pinned external memory. The application provides the
> external memory description and routine initialises each
> mbuf with appropriate virtual and physical buffer address.
> It is entirely application responsibility to register
> external memory with rte_extmem_register() API, map this
> memory, etc.
> 
> The new introduced flag RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF
> is set in private pool structure, specifying the new special
> pool type. The allocated mbufs from pool of this kind will
> have the EXT_ATTACHED_MBUF flag set and NULL shared info
> pointer, because external buffers are not supposed to be
> freed and sharing management is not needed. Also, these
> mbufs can not be attached to other mbufs (not intended to
> be indirect).
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  lib/librte_mbuf/rte_mbuf.c           | 144 ++++++++++++++++++++++++++++++++++-
>  lib/librte_mbuf/rte_mbuf.h           |  86 ++++++++++++++++++++-
>  lib/librte_mbuf/rte_mbuf_version.map |   1 +
>  3 files changed, 228 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 8fa7f49..d151469 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -59,9 +59,9 @@
>  	}
>  
>  	RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) +
> -		user_mbp_priv->mbuf_data_room_size +
> +		((user_mbp_priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) ?
> +		0 : user_mbp_priv->mbuf_data_room_size) +
>  		user_mbp_priv->mbuf_priv_size);

Is this check really needed?

> -	RTE_ASSERT(user_mbp_priv->flags == 0);

We can keep
 RTE_ASSERT(user_mbp_priv->flags & ~RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF == 0);

>  
>  	mbp_priv = rte_mempool_get_priv(mp);
>  	memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv));
> @@ -107,6 +107,63 @@
>  	m->next = NULL;
>  }
>  
> +/*
> + * pktmbuf constructor for the pool with pinned external buffer,
> + * given as a callback function to rte_mempool_obj_iter() in
> + * rte_pktmbuf_pool_create_extbuf(). Set the fields of a packet
> + * mbuf to their default values.
> + */
> +void
> +rte_pktmbuf_init_extmem(struct rte_mempool *mp,
> +			void *opaque_arg,
> +			void *_m,
> +			__attribute__((unused)) unsigned int i)
> +{
> +	struct rte_mbuf *m = _m;
> +	struct rte_pktmbuf_extmem_init_ctx *ctx = opaque_arg;
> +	struct rte_pktmbuf_extmem *ext_mem;
> +	uint32_t mbuf_size, buf_len, priv_size;
> +
> +	priv_size = rte_pktmbuf_priv_size(mp);
> +	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
> +	buf_len = rte_pktmbuf_data_room_size(mp);
> +
> +	RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == priv_size);
> +	RTE_ASSERT(mp->elt_size >= mbuf_size);
> +	RTE_ASSERT(buf_len <= UINT16_MAX);
> +
> +	memset(m, 0, mbuf_size);
> +	m->priv_size = priv_size;
> +	m->buf_len = (uint16_t)buf_len;
> +
> +	/* set the data buffer pointers to external memory */
> +	ext_mem = ctx->ext_mem + ctx->ext;
> +
> +	RTE_ASSERT(ctx->ext < ctx->ext_num);
> +	RTE_ASSERT(ctx->off < ext_mem->buf_len);
> +
> +	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
> +	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
> +		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
> +
> +	ctx->off += ext_mem->elt_size;
> +	if (ctx->off >= ext_mem->buf_len) {
> +		ctx->off = 0;
> +		++ctx->ext;
> +	}
> +	/* keep some headroom between start of buffer and data */
> +	m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len);
> +
> +	/* init some constant fields */
> +	m->pool = mp;
> +	m->nb_segs = 1;
> +	m->port = MBUF_INVALID_PORT;
> +	m->ol_flags = EXT_ATTACHED_MBUF;
> +	rte_mbuf_refcnt_set(m, 1);
> +	m->next = NULL;
> +}
> +
> +
>  /* Helper to create a mbuf pool with given mempool ops name*/
>  struct rte_mempool *
>  rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n,
> @@ -169,6 +226,89 @@ struct rte_mempool *
>  			data_room_size, socket_id, NULL);
>  }
>  
> +/* Helper to create a mbuf pool with pinned external data buffers. */
> +struct rte_mempool *
> +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
> +	unsigned int cache_size, uint16_t priv_size,
> +	uint16_t data_room_size, int socket_id,
> +	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num)
> +{
> +	struct rte_mempool *mp;
> +	struct rte_pktmbuf_pool_private mbp_priv;
> +	struct rte_pktmbuf_extmem_init_ctx init_ctx;
> +	const char *mp_ops_name;
> +	unsigned int elt_size;
> +	unsigned int i, n_elts = 0;
> +	int ret;
> +
> +	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
> +		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
> +			priv_size);
> +		rte_errno = EINVAL;
> +		return NULL;
> +	}
> +	/* Check the external memory descriptors. */
> +	for (i = 0; i < ext_num; i++) {
> +		struct rte_pktmbuf_extmem *extm = ext_mem + i;
> +
> +		if (!extm->elt_size || !extm->buf_len || !extm->buf_ptr) {
> +			RTE_LOG(ERR, MBUF, "invalid extmem descriptor\n");
> +			rte_errno = EINVAL;
> +			return NULL;
> +		}
> +		if (data_room_size > extm->elt_size) {
> +			RTE_LOG(ERR, MBUF, "ext elt_size=%u is too small\n",
> +				priv_size);
> +			rte_errno = EINVAL;
> +			return NULL;
> +		}
> +		n_elts += extm->buf_len / extm->elt_size;
> +	}
> +	/* Check whether enough external memory provided. */
> +	if (n_elts < n) {
> +		RTE_LOG(ERR, MBUF, "not enough extmem\n");
> +		rte_errno = ENOMEM;
> +		return NULL;
> +	}
> +	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size;
> +	memset(&mbp_priv, 0, sizeof(mbp_priv));
> +	mbp_priv.mbuf_data_room_size = data_room_size;
> +	mbp_priv.mbuf_priv_size = priv_size;
> +	mbp_priv.flags = RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
> +
> +	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
> +		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
> +	if (mp == NULL)
> +		return NULL;
> +
> +	mp_ops_name = rte_mbuf_best_mempool_ops();
> +	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
> +	if (ret != 0) {
> +		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
> +		rte_mempool_free(mp);
> +		rte_errno = -ret;
> +		return NULL;
> +	}
> +	rte_pktmbuf_pool_init(mp, &mbp_priv);
> +
> +	ret = rte_mempool_populate_default(mp);
> +	if (ret < 0) {
> +		rte_mempool_free(mp);
> +		rte_errno = -ret;
> +		return NULL;
> +	}
> +
> +	init_ctx = (struct rte_pktmbuf_extmem_init_ctx){
> +		.ext_mem = ext_mem,
> +		.ext_num = ext_num,
> +		.ext = 0,
> +		.off = 0,
> +	};
> +	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
> +
> +	return mp;
> +}

Instead of duplicating some code, would it be possible to do:

int
rte_pktmbuf_pool_attach_extbuf(struct rte_mempool *mp,
	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num)
{
	struct rte_pktmbuf_extmem_init_ctx init_ctx = { 0 };
	struct rte_pktmbuf_pool_private *priv;

	/* XXX assert mempool is fully populated? */

	priv = rte_mempool_get_priv(mp);
	mbp_priv.flags |= RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;

	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);

	return init_ctx.ret;
}

The application would have to call:

	rte_pktmbuf_pool_create(...);
	rte_pktmbuf_pool_attach_extbuf(...);


> +
>  /* do some sanity checks on a mbuf: panic if it fails */
>  void
>  rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header)
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 8f486af..7bde297 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -642,6 +642,34 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
>  void rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg,
>  		      void *m, unsigned i);
>  
> +/** The context to initialize the mbufs with pinned external buffers. */
> +struct rte_pktmbuf_extmem_init_ctx {
> +	struct rte_pktmbuf_extmem *ext_mem; /* pointer to descriptor array. */
> +	unsigned int ext_num; /* number of descriptors in array. */
> +	unsigned int ext; /* loop descriptor index. */
> +	size_t off; /* loop buffer offset. */
> +};
> +
> +/**
> + * The packet mbuf constructor for pools with pinned external memory.
> + *
> + * This function initializes some fields in the mbuf structure that are
> + * not modified by the user once created (origin pool, buffer start
> + * address, and so on). This function is given as a callback function to
> + * rte_mempool_obj_iter() called from rte_mempool_create_extmem().
> + *
> + * @param mp
> + *   The mempool from which mbufs originate.
> + * @param opaque_arg
> + *   A pointer to the rte_pktmbuf_extmem_init_ctx - initialization
> + *   context structure
> + * @param m
> + *   The mbuf to initialize.
> + * @param i
> + *   The index of the mbuf in the pool table.
> + */
> +void rte_pktmbuf_init_extmem(struct rte_mempool *mp, void *opaque_arg,
> +			     void *m, unsigned int i);
>  
>  /**
>   * A  packet mbuf pool constructor.
> @@ -743,6 +771,62 @@ struct rte_mempool *
>  	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
>  	int socket_id, const char *ops_name);
>  
> +/** A structure that describes the pinned external buffer segment. */
> +struct rte_pktmbuf_extmem {
> +	void *buf_ptr;		/**< The virtual address of data buffer. */
> +	rte_iova_t buf_iova;	/**< The IO address of the data buffer. */
> +	size_t buf_len;		/**< External buffer length in bytes. */
> +	uint16_t elt_size;	/**< mbuf element size in bytes. */
> +};
> +
> +/**
> + * Create a mbuf pool with external pinned data buffers.
> + *
> + * This function creates and initializes a packet mbuf pool that contains
> + * only mbufs with external buffer. It is a wrapper to rte_mempool functions.
> + *
> + * @param name
> + *   The name of the mbuf pool.
> + * @param n
> + *   The number of elements in the mbuf pool. The optimum size (in terms
> + *   of memory usage) for a mempool is when n is a power of two minus one:
> + *   n = (2^q - 1).
> + * @param cache_size
> + *   Size of the per-core object cache. See rte_mempool_create() for
> + *   details.
> + * @param priv_size
> + *   Size of application private are between the rte_mbuf structure
> + *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
> + * @param data_room_size
> + *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
> + * @param socket_id
> + *   The socket identifier where the memory should be allocated. The
> + *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
> + *   reserved zone.
> + * @param ext_mem
> + *   Pointer to the array of structures describing the external memory
> + *   for data buffers. It is caller responsibility to register this memory
> + *   with rte_extmem_register() (if needed), map this memory to appropriate
> + *   physical device, etc.
> + * @param ext_num
> + *   Number of elements in the ext_mem array.
> + * @return
> + *   The pointer to the new allocated mempool, on success. NULL on error
> + *   with rte_errno set appropriately. Possible rte_errno values include:
> + *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
> + *    - E_RTE_SECONDARY - function was called from a secondary process instance
> + *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
> + *    - ENOSPC - the maximum number of memzones has already been allocated
> + *    - EEXIST - a memzone with the same name already exists
> + *    - ENOMEM - no appropriate memory area found in which to create memzone
> + */
> +__rte_experimental
> +struct rte_mempool *
> +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
> +	unsigned int cache_size, uint16_t priv_size,
> +	uint16_t data_room_size, int socket_id,
> +	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num);
> +
>  /**
>   * Get the data room size of mbufs stored in a pktmbuf_pool
>   *
> @@ -818,7 +902,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
>  	m->nb_segs = 1;
>  	m->port = MBUF_INVALID_PORT;
>  
> -	m->ol_flags = 0;
> +	m->ol_flags &= EXT_ATTACHED_MBUF;
>  	m->packet_type = 0;
>  	rte_pktmbuf_reset_headroom(m);
>  

I wonder if it should go in previous patch?

> diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
> index 3bbb476..ab161bc 100644
> --- a/lib/librte_mbuf/rte_mbuf_version.map
> +++ b/lib/librte_mbuf/rte_mbuf_version.map
> @@ -44,5 +44,6 @@ EXPERIMENTAL {
>  	rte_mbuf_dyn_dump;
>  	rte_pktmbuf_copy;
>  	rte_pktmbuf_free_bulk;
> +	rte_pktmbuf_pool_create_extbuf;
>  
>  };
> -- 
> 1.8.3.1
>
  
Slava Ovsiienko Jan. 15, 2020, 6:13 p.m. UTC | #2
> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Tuesday, January 14, 2020 18:05
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: dev@dpdk.org; Matan Azrad <matan@mellanox.com>; Raslan Darawsheh
> <rasland@mellanox.com>; Ori Kam <orika@mellanox.com>; Shahaf Shuler
> <shahafs@mellanox.com>; stephen@networkplumber.org
> Subject: Re: [PATCH v3 2/4] mbuf: create packet pool with external memory
> buffers
> 
> On Tue, Jan 14, 2020 at 09:15:03AM +0000, Viacheslav Ovsiienko wrote:
> > The dedicated routine rte_pktmbuf_pool_create_extbuf() is provided to
> > create mbuf pool with data buffers located in the pinned external
> > memory. The application provides the external memory description and
> > routine initialises each mbuf with appropriate virtual and physical
> > buffer address.
> > It is entirely application responsibility to register external memory
> > with rte_extmem_register() API, map this memory, etc.
> >
> > The new introduced flag RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF is set in
> > private pool structure, specifying the new special pool type. The
> > allocated mbufs from pool of this kind will have the EXT_ATTACHED_MBUF
> > flag set and NULL shared info pointer, because external buffers are
> > not supposed to be freed and sharing management is not needed. Also,
> > these mbufs can not be attached to other mbufs (not intended to be
> > indirect).
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  lib/librte_mbuf/rte_mbuf.c           | 144
> ++++++++++++++++++++++++++++++++++-
> >  lib/librte_mbuf/rte_mbuf.h           |  86 ++++++++++++++++++++-
> >  lib/librte_mbuf/rte_mbuf_version.map |   1 +
> >  3 files changed, 228 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> > index 8fa7f49..d151469 100644
> > --- a/lib/librte_mbuf/rte_mbuf.c
> > +++ b/lib/librte_mbuf/rte_mbuf.c
> > @@ -59,9 +59,9 @@
> >  	}
> >
> >  	RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) +
> > -		user_mbp_priv->mbuf_data_room_size +
> > +		((user_mbp_priv->flags &
> RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) ?
> > +		0 : user_mbp_priv->mbuf_data_room_size) +
> >  		user_mbp_priv->mbuf_priv_size);
> 
> Is this check really needed?
It seems so, it is in separated routine, which might be called externally.

> 
> > -	RTE_ASSERT(user_mbp_priv->flags == 0);
> 
> We can keep
>  RTE_ASSERT(user_mbp_priv->flags &
> ~RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF == 0);
OK, thanks.

> 
> >
> >  	mbp_priv = rte_mempool_get_priv(mp);
> >  	memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv)); @@ -107,6
> > +107,63 @@
> >  	m->next = NULL;
> >  }
> >
> > +/*
> > + * pktmbuf constructor for the pool with pinned external buffer,
> > + * given as a callback function to rte_mempool_obj_iter() in
> > + * rte_pktmbuf_pool_create_extbuf(). Set the fields of a packet
> > + * mbuf to their default values.
> > + */
> > +void
> > +rte_pktmbuf_init_extmem(struct rte_mempool *mp,
> > +			void *opaque_arg,
> > +			void *_m,
> > +			__attribute__((unused)) unsigned int i) {
> > +	struct rte_mbuf *m = _m;
> > +	struct rte_pktmbuf_extmem_init_ctx *ctx = opaque_arg;
> > +	struct rte_pktmbuf_extmem *ext_mem;
> > +	uint32_t mbuf_size, buf_len, priv_size;
> > +
> > +	priv_size = rte_pktmbuf_priv_size(mp);
> > +	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
> > +	buf_len = rte_pktmbuf_data_room_size(mp);
> > +
> > +	RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) ==
> priv_size);
> > +	RTE_ASSERT(mp->elt_size >= mbuf_size);
> > +	RTE_ASSERT(buf_len <= UINT16_MAX);
> > +
> > +	memset(m, 0, mbuf_size);
> > +	m->priv_size = priv_size;
> > +	m->buf_len = (uint16_t)buf_len;
> > +
> > +	/* set the data buffer pointers to external memory */
> > +	ext_mem = ctx->ext_mem + ctx->ext;
> > +
> > +	RTE_ASSERT(ctx->ext < ctx->ext_num);
> > +	RTE_ASSERT(ctx->off < ext_mem->buf_len);
> > +
> > +	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
> > +	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
> > +		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
> > +
> > +	ctx->off += ext_mem->elt_size;
> > +	if (ctx->off >= ext_mem->buf_len) {
> > +		ctx->off = 0;
> > +		++ctx->ext;
> > +	}
> > +	/* keep some headroom between start of buffer and data */
> > +	m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m-
> >buf_len);
> > +
> > +	/* init some constant fields */
> > +	m->pool = mp;
> > +	m->nb_segs = 1;
> > +	m->port = MBUF_INVALID_PORT;
> > +	m->ol_flags = EXT_ATTACHED_MBUF;
> > +	rte_mbuf_refcnt_set(m, 1);
> > +	m->next = NULL;
> > +}
> > +
> > +
> >  /* Helper to create a mbuf pool with given mempool ops name*/  struct
> > rte_mempool *  rte_pktmbuf_pool_create_by_ops(const char *name,
> > unsigned int n, @@ -169,6 +226,89 @@ struct rte_mempool *
> >  			data_room_size, socket_id, NULL);
> >  }
> >
> > +/* Helper to create a mbuf pool with pinned external data buffers. */
> > +struct rte_mempool * rte_pktmbuf_pool_create_extbuf(const char *name,
> > +unsigned int n,
> > +	unsigned int cache_size, uint16_t priv_size,
> > +	uint16_t data_room_size, int socket_id,
> > +	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num) {
> > +	struct rte_mempool *mp;
> > +	struct rte_pktmbuf_pool_private mbp_priv;
> > +	struct rte_pktmbuf_extmem_init_ctx init_ctx;
> > +	const char *mp_ops_name;
> > +	unsigned int elt_size;
> > +	unsigned int i, n_elts = 0;
> > +	int ret;
> > +
> > +	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
> > +		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
> > +			priv_size);
> > +		rte_errno = EINVAL;
> > +		return NULL;
> > +	}
> > +	/* Check the external memory descriptors. */
> > +	for (i = 0; i < ext_num; i++) {
> > +		struct rte_pktmbuf_extmem *extm = ext_mem + i;
> > +
> > +		if (!extm->elt_size || !extm->buf_len || !extm->buf_ptr) {
> > +			RTE_LOG(ERR, MBUF, "invalid extmem descriptor\n");
> > +			rte_errno = EINVAL;
> > +			return NULL;
> > +		}
> > +		if (data_room_size > extm->elt_size) {
> > +			RTE_LOG(ERR, MBUF, "ext elt_size=%u is too small\n",
> > +				priv_size);
> > +			rte_errno = EINVAL;
> > +			return NULL;
> > +		}
> > +		n_elts += extm->buf_len / extm->elt_size;
> > +	}
> > +	/* Check whether enough external memory provided. */
> > +	if (n_elts < n) {
> > +		RTE_LOG(ERR, MBUF, "not enough extmem\n");
> > +		rte_errno = ENOMEM;
> > +		return NULL;
> > +	}
> > +	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size;
> > +	memset(&mbp_priv, 0, sizeof(mbp_priv));
> > +	mbp_priv.mbuf_data_room_size = data_room_size;
> > +	mbp_priv.mbuf_priv_size = priv_size;
> > +	mbp_priv.flags = RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
> > +
> > +	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
> > +		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
> > +	if (mp == NULL)
> > +		return NULL;
> > +
> > +	mp_ops_name = rte_mbuf_best_mempool_ops();
> > +	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
> > +	if (ret != 0) {
> > +		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
> > +		rte_mempool_free(mp);
> > +		rte_errno = -ret;
> > +		return NULL;
> > +	}
> > +	rte_pktmbuf_pool_init(mp, &mbp_priv);
> > +
> > +	ret = rte_mempool_populate_default(mp);
> > +	if (ret < 0) {
> > +		rte_mempool_free(mp);
> > +		rte_errno = -ret;
> > +		return NULL;
> > +	}
> > +
> > +	init_ctx = (struct rte_pktmbuf_extmem_init_ctx){
> > +		.ext_mem = ext_mem,
> > +		.ext_num = ext_num,
> > +		.ext = 0,
> > +		.off = 0,
> > +	};
> > +	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
> > +
> > +	return mp;
> > +}
> 
> Instead of duplicating some code, would it be possible to do:
> 
> int
> rte_pktmbuf_pool_attach_extbuf(struct rte_mempool *mp,
> 	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num) {
> 	struct rte_pktmbuf_extmem_init_ctx init_ctx = { 0 };
> 	struct rte_pktmbuf_pool_private *priv;
> 
> 	/* XXX assert mempool is fully populated? */
> 
> 	priv = rte_mempool_get_priv(mp);
> 	mbp_priv.flags |= RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
> 
> 	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
> 
> 	return init_ctx.ret;
> }
> 
> The application would have to call:
> 
> 	rte_pktmbuf_pool_create(...);
> 	rte_pktmbuf_pool_attach_extbuf(...);
> 
It seems there are some disadvantages:
- no data_room_size check (we should remove asserts from rte_pktmbuf_pool_init)
- rte_mempool_obj_iter would be called twice, it might involve rte_mempool_virt2iova()
  and it would take some time

The code duplication is not so large as it could be seen from the diff - the part of
the  rte_pktmbuf_pool_create_extbuf() is related to checking extmem, and the main part
of job is done in the rte_pktmbuf_init_extmem().

> 
> > +
> >  /* do some sanity checks on a mbuf: panic if it fails */  void
> > rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) diff
> > --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index
> > 8f486af..7bde297 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -642,6 +642,34 @@ static inline struct rte_mbuf
> > *rte_mbuf_raw_alloc(struct rte_mempool *mp)  void
> rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg,
> >  		      void *m, unsigned i);
> >
> > +/** The context to initialize the mbufs with pinned external buffers.
> > +*/ struct rte_pktmbuf_extmem_init_ctx {
> > +	struct rte_pktmbuf_extmem *ext_mem; /* pointer to descriptor
> array. */
> > +	unsigned int ext_num; /* number of descriptors in array. */
> > +	unsigned int ext; /* loop descriptor index. */
> > +	size_t off; /* loop buffer offset. */ };
> > +
> > +/**
> > + * The packet mbuf constructor for pools with pinned external memory.
> > + *
> > + * This function initializes some fields in the mbuf structure that
> > +are
> > + * not modified by the user once created (origin pool, buffer start
> > + * address, and so on). This function is given as a callback function
> > +to
> > + * rte_mempool_obj_iter() called from rte_mempool_create_extmem().
> > + *
> > + * @param mp
> > + *   The mempool from which mbufs originate.
> > + * @param opaque_arg
> > + *   A pointer to the rte_pktmbuf_extmem_init_ctx - initialization
> > + *   context structure
> > + * @param m
> > + *   The mbuf to initialize.
> > + * @param i
> > + *   The index of the mbuf in the pool table.
> > + */
> > +void rte_pktmbuf_init_extmem(struct rte_mempool *mp, void
> *opaque_arg,
> > +			     void *m, unsigned int i);
> >
> >  /**
> >   * A  packet mbuf pool constructor.
> > @@ -743,6 +771,62 @@ struct rte_mempool *
> >  	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
> >  	int socket_id, const char *ops_name);
> >
> > +/** A structure that describes the pinned external buffer segment. */
> > +struct rte_pktmbuf_extmem {
> > +	void *buf_ptr;		/**< The virtual address of data buffer. */
> > +	rte_iova_t buf_iova;	/**< The IO address of the data buffer. */
> > +	size_t buf_len;		/**< External buffer length in bytes. */
> > +	uint16_t elt_size;	/**< mbuf element size in bytes. */
> > +};
> > +
> > +/**
> > + * Create a mbuf pool with external pinned data buffers.
> > + *
> > + * This function creates and initializes a packet mbuf pool that
> > +contains
> > + * only mbufs with external buffer. It is a wrapper to rte_mempool
> functions.
> > + *
> > + * @param name
> > + *   The name of the mbuf pool.
> > + * @param n
> > + *   The number of elements in the mbuf pool. The optimum size (in terms
> > + *   of memory usage) for a mempool is when n is a power of two minus
> one:
> > + *   n = (2^q - 1).
> > + * @param cache_size
> > + *   Size of the per-core object cache. See rte_mempool_create() for
> > + *   details.
> > + * @param priv_size
> > + *   Size of application private are between the rte_mbuf structure
> > + *   and the data buffer. This value must be aligned to
> RTE_MBUF_PRIV_ALIGN.
> > + * @param data_room_size
> > + *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
> > + * @param socket_id
> > + *   The socket identifier where the memory should be allocated. The
> > + *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
> > + *   reserved zone.
> > + * @param ext_mem
> > + *   Pointer to the array of structures describing the external memory
> > + *   for data buffers. It is caller responsibility to register this memory
> > + *   with rte_extmem_register() (if needed), map this memory to
> appropriate
> > + *   physical device, etc.
> > + * @param ext_num
> > + *   Number of elements in the ext_mem array.
> > + * @return
> > + *   The pointer to the new allocated mempool, on success. NULL on error
> > + *   with rte_errno set appropriately. Possible rte_errno values include:
> > + *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config
> structure
> > + *    - E_RTE_SECONDARY - function was called from a secondary process
> instance
> > + *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
> > + *    - ENOSPC - the maximum number of memzones has already been
> allocated
> > + *    - EEXIST - a memzone with the same name already exists
> > + *    - ENOMEM - no appropriate memory area found in which to create
> memzone
> > + */
> > +__rte_experimental
> > +struct rte_mempool *
> > +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
> > +	unsigned int cache_size, uint16_t priv_size,
> > +	uint16_t data_room_size, int socket_id,
> > +	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num);
> > +
> >  /**
> >   * Get the data room size of mbufs stored in a pktmbuf_pool
> >   *
> > @@ -818,7 +902,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf
> *m)
> >  	m->nb_segs = 1;
> >  	m->port = MBUF_INVALID_PORT;
> >
> > -	m->ol_flags = 0;
> > +	m->ol_flags &= EXT_ATTACHED_MBUF;
> >  	m->packet_type = 0;
> >  	rte_pktmbuf_reset_headroom(m);
> >
> 
> I wonder if it should go in previous patch?

Mmm... Definitely - yes 😊
Thanks, will move this line.
> 
> > diff --git a/lib/librte_mbuf/rte_mbuf_version.map
> > b/lib/librte_mbuf/rte_mbuf_version.map
> > index 3bbb476..ab161bc 100644
> > --- a/lib/librte_mbuf/rte_mbuf_version.map
> > +++ b/lib/librte_mbuf/rte_mbuf_version.map
> > @@ -44,5 +44,6 @@ EXPERIMENTAL {
> >  	rte_mbuf_dyn_dump;
> >  	rte_pktmbuf_copy;
> >  	rte_pktmbuf_free_bulk;
> > +	rte_pktmbuf_pool_create_extbuf;
> >
> >  };
> > --
> > 1.8.3.1
> >
With best regards, Slava
  

Patch

diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 8fa7f49..d151469 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -59,9 +59,9 @@ 
 	}
 
 	RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) +
-		user_mbp_priv->mbuf_data_room_size +
+		((user_mbp_priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) ?
+		0 : user_mbp_priv->mbuf_data_room_size) +
 		user_mbp_priv->mbuf_priv_size);
-	RTE_ASSERT(user_mbp_priv->flags == 0);
 
 	mbp_priv = rte_mempool_get_priv(mp);
 	memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv));
@@ -107,6 +107,63 @@ 
 	m->next = NULL;
 }
 
+/*
+ * pktmbuf constructor for the pool with pinned external buffer,
+ * given as a callback function to rte_mempool_obj_iter() in
+ * rte_pktmbuf_pool_create_extbuf(). Set the fields of a packet
+ * mbuf to their default values.
+ */
+void
+rte_pktmbuf_init_extmem(struct rte_mempool *mp,
+			void *opaque_arg,
+			void *_m,
+			__attribute__((unused)) unsigned int i)
+{
+	struct rte_mbuf *m = _m;
+	struct rte_pktmbuf_extmem_init_ctx *ctx = opaque_arg;
+	struct rte_pktmbuf_extmem *ext_mem;
+	uint32_t mbuf_size, buf_len, priv_size;
+
+	priv_size = rte_pktmbuf_priv_size(mp);
+	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
+	buf_len = rte_pktmbuf_data_room_size(mp);
+
+	RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == priv_size);
+	RTE_ASSERT(mp->elt_size >= mbuf_size);
+	RTE_ASSERT(buf_len <= UINT16_MAX);
+
+	memset(m, 0, mbuf_size);
+	m->priv_size = priv_size;
+	m->buf_len = (uint16_t)buf_len;
+
+	/* set the data buffer pointers to external memory */
+	ext_mem = ctx->ext_mem + ctx->ext;
+
+	RTE_ASSERT(ctx->ext < ctx->ext_num);
+	RTE_ASSERT(ctx->off < ext_mem->buf_len);
+
+	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
+	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
+		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
+
+	ctx->off += ext_mem->elt_size;
+	if (ctx->off >= ext_mem->buf_len) {
+		ctx->off = 0;
+		++ctx->ext;
+	}
+	/* keep some headroom between start of buffer and data */
+	m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len);
+
+	/* init some constant fields */
+	m->pool = mp;
+	m->nb_segs = 1;
+	m->port = MBUF_INVALID_PORT;
+	m->ol_flags = EXT_ATTACHED_MBUF;
+	rte_mbuf_refcnt_set(m, 1);
+	m->next = NULL;
+}
+
+
 /* Helper to create a mbuf pool with given mempool ops name*/
 struct rte_mempool *
 rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n,
@@ -169,6 +226,89 @@  struct rte_mempool *
 			data_room_size, socket_id, NULL);
 }
 
+/* Helper to create a mbuf pool with pinned external data buffers. */
+struct rte_mempool *
+rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size,
+	uint16_t data_room_size, int socket_id,
+	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num)
+{
+	struct rte_mempool *mp;
+	struct rte_pktmbuf_pool_private mbp_priv;
+	struct rte_pktmbuf_extmem_init_ctx init_ctx;
+	const char *mp_ops_name;
+	unsigned int elt_size;
+	unsigned int i, n_elts = 0;
+	int ret;
+
+	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
+		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
+			priv_size);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	/* Check the external memory descriptors. */
+	for (i = 0; i < ext_num; i++) {
+		struct rte_pktmbuf_extmem *extm = ext_mem + i;
+
+		if (!extm->elt_size || !extm->buf_len || !extm->buf_ptr) {
+			RTE_LOG(ERR, MBUF, "invalid extmem descriptor\n");
+			rte_errno = EINVAL;
+			return NULL;
+		}
+		if (data_room_size > extm->elt_size) {
+			RTE_LOG(ERR, MBUF, "ext elt_size=%u is too small\n",
+				priv_size);
+			rte_errno = EINVAL;
+			return NULL;
+		}
+		n_elts += extm->buf_len / extm->elt_size;
+	}
+	/* Check whether enough external memory provided. */
+	if (n_elts < n) {
+		RTE_LOG(ERR, MBUF, "not enough extmem\n");
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size;
+	memset(&mbp_priv, 0, sizeof(mbp_priv));
+	mbp_priv.mbuf_data_room_size = data_room_size;
+	mbp_priv.mbuf_priv_size = priv_size;
+	mbp_priv.flags = RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
+
+	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
+		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
+	if (mp == NULL)
+		return NULL;
+
+	mp_ops_name = rte_mbuf_best_mempool_ops();
+	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+	rte_pktmbuf_pool_init(mp, &mbp_priv);
+
+	ret = rte_mempool_populate_default(mp);
+	if (ret < 0) {
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+
+	init_ctx = (struct rte_pktmbuf_extmem_init_ctx){
+		.ext_mem = ext_mem,
+		.ext_num = ext_num,
+		.ext = 0,
+		.off = 0,
+	};
+	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
+
+	return mp;
+}
+
 /* do some sanity checks on a mbuf: panic if it fails */
 void
 rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header)
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 8f486af..7bde297 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -642,6 +642,34 @@  static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
 void rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg,
 		      void *m, unsigned i);
 
+/** The context to initialize the mbufs with pinned external buffers. */
+struct rte_pktmbuf_extmem_init_ctx {
+	struct rte_pktmbuf_extmem *ext_mem; /* pointer to descriptor array. */
+	unsigned int ext_num; /* number of descriptors in array. */
+	unsigned int ext; /* loop descriptor index. */
+	size_t off; /* loop buffer offset. */
+};
+
+/**
+ * The packet mbuf constructor for pools with pinned external memory.
+ *
+ * This function initializes some fields in the mbuf structure that are
+ * not modified by the user once created (origin pool, buffer start
+ * address, and so on). This function is given as a callback function to
+ * rte_mempool_obj_iter() called from rte_mempool_create_extmem().
+ *
+ * @param mp
+ *   The mempool from which mbufs originate.
+ * @param opaque_arg
+ *   A pointer to the rte_pktmbuf_extmem_init_ctx - initialization
+ *   context structure
+ * @param m
+ *   The mbuf to initialize.
+ * @param i
+ *   The index of the mbuf in the pool table.
+ */
+void rte_pktmbuf_init_extmem(struct rte_mempool *mp, void *opaque_arg,
+			     void *m, unsigned int i);
 
 /**
  * A  packet mbuf pool constructor.
@@ -743,6 +771,62 @@  struct rte_mempool *
 	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
 	int socket_id, const char *ops_name);
 
+/** A structure that describes the pinned external buffer segment. */
+struct rte_pktmbuf_extmem {
+	void *buf_ptr;		/**< The virtual address of data buffer. */
+	rte_iova_t buf_iova;	/**< The IO address of the data buffer. */
+	size_t buf_len;		/**< External buffer length in bytes. */
+	uint16_t elt_size;	/**< mbuf element size in bytes. */
+};
+
+/**
+ * Create a mbuf pool with external pinned data buffers.
+ *
+ * This function creates and initializes a packet mbuf pool that contains
+ * only mbufs with external buffer. It is a wrapper to rte_mempool functions.
+ *
+ * @param name
+ *   The name of the mbuf pool.
+ * @param n
+ *   The number of elements in the mbuf pool. The optimum size (in terms
+ *   of memory usage) for a mempool is when n is a power of two minus one:
+ *   n = (2^q - 1).
+ * @param cache_size
+ *   Size of the per-core object cache. See rte_mempool_create() for
+ *   details.
+ * @param priv_size
+ *   Size of application private are between the rte_mbuf structure
+ *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
+ * @param data_room_size
+ *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
+ * @param socket_id
+ *   The socket identifier where the memory should be allocated. The
+ *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
+ *   reserved zone.
+ * @param ext_mem
+ *   Pointer to the array of structures describing the external memory
+ *   for data buffers. It is caller responsibility to register this memory
+ *   with rte_extmem_register() (if needed), map this memory to appropriate
+ *   physical device, etc.
+ * @param ext_num
+ *   Number of elements in the ext_mem array.
+ * @return
+ *   The pointer to the new allocated mempool, on success. NULL on error
+ *   with rte_errno set appropriately. Possible rte_errno values include:
+ *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ *    - E_RTE_SECONDARY - function was called from a secondary process instance
+ *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
+ *    - ENOSPC - the maximum number of memzones has already been allocated
+ *    - EEXIST - a memzone with the same name already exists
+ *    - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+__rte_experimental
+struct rte_mempool *
+rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size,
+	uint16_t data_room_size, int socket_id,
+	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num);
+
 /**
  * Get the data room size of mbufs stored in a pktmbuf_pool
  *
@@ -818,7 +902,7 @@  static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
 	m->nb_segs = 1;
 	m->port = MBUF_INVALID_PORT;
 
-	m->ol_flags = 0;
+	m->ol_flags &= EXT_ATTACHED_MBUF;
 	m->packet_type = 0;
 	rte_pktmbuf_reset_headroom(m);
 
diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
index 3bbb476..ab161bc 100644
--- a/lib/librte_mbuf/rte_mbuf_version.map
+++ b/lib/librte_mbuf/rte_mbuf_version.map
@@ -44,5 +44,6 @@  EXPERIMENTAL {
 	rte_mbuf_dyn_dump;
 	rte_pktmbuf_copy;
 	rte_pktmbuf_free_bulk;
+	rte_pktmbuf_pool_create_extbuf;
 
 };