[dpdk-dev,v3,1/2] mbuf: support attaching external buffer to mbuf

Message ID 20180419011105.9694-1-yskoh@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Yongseok Koh April 19, 2018, 1:11 a.m. UTC
  This patch introduces a new way of attaching an external buffer to a mbuf.

Attaching an external buffer is quite similar to mbuf indirection in
replacing buffer addresses and length of a mbuf, but a few differences:
  - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
    mbuf must be read-only. But external buffer has its own refcnt and it
    starts from 1. Unless multiple mbufs are attached to a mbuf having an
    external buffer, the external buffer is writable.
  - There's no need to allocate buffer from a mempool. Any buffer can be
    attached with appropriate free callback.
  - Smaller metadata is required to maintain shared data such as refcnt.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---

Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches
will be submitted separately rebased on a differnet patchset which
accommodates new memory hotplug design to mlx PMDs.

v3:
* implement external buffer attachment instead of introducing buf_off for
  mbuf indirection.

 lib/librte_mbuf/rte_mbuf.h | 276 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 248 insertions(+), 28 deletions(-)
  

Comments

Ananyev, Konstantin April 23, 2018, 11:53 a.m. UTC | #1
> 
> This patch introduces a new way of attaching an external buffer to a mbuf.
> 
> Attaching an external buffer is quite similar to mbuf indirection in
> replacing buffer addresses and length of a mbuf, but a few differences:
>   - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
>     mbuf must be read-only. But external buffer has its own refcnt and it
>     starts from 1. Unless multiple mbufs are attached to a mbuf having an
>     external buffer, the external buffer is writable.
>   - There's no need to allocate buffer from a mempool. Any buffer can be
>     attached with appropriate free callback.
>   - Smaller metadata is required to maintain shared data such as refcnt.
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> ---
> 
> Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches
> will be submitted separately rebased on a differnet patchset which
> accommodates new memory hotplug design to mlx PMDs.
> 
> v3:
> * implement external buffer attachment instead of introducing buf_off for
>   mbuf indirection.
> 
>  lib/librte_mbuf/rte_mbuf.h | 276 ++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 248 insertions(+), 28 deletions(-)
> 
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 06eceba37..e64160c81 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -326,7 +326,7 @@ extern "C" {
>  		PKT_TX_MACSEC |		 \
>  		PKT_TX_SEC_OFFLOAD)
> 
> -#define __RESERVED           (1ULL << 61) /**< reserved for future mbuf use */
> +#define EXT_ATTACHED_MBUF    (1ULL << 61) /**< Mbuf having external buffer */
> 
>  #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
> 
> @@ -568,6 +568,24 @@ struct rte_mbuf {
> 
>  } __rte_cache_aligned;
> 
> +/**
> + * Function typedef of callback to free externally attached buffer.
> + */
> +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
> +
> +/**
> + * Shared data at the end of an external buffer.
> + */
> +struct rte_mbuf_ext_shared_info {
> +	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
> +	void *fcb_opaque;                        /**< Free callback argument */
> +	RTE_STD_C11
> +	union {
> +		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
> +		uint16_t refcnt;          /**< Non-atomically accessed refcnt */
> +	};
> +};
> +
>  /**< Maximum number of nb_segs allowed. */
>  #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
> 
> @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
>  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> 
>  /**
> + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> + */
> +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> +
> +/**
>   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
>   */
> -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))

As a nit:
RTE_MBUF_DIRECT(mb)  (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0)

> 
>  /**
>   * Private data in case of pktmbuf pool.
> @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
> 
>  #endif /* RTE_MBUF_REFCNT_ATOMIC */
> 
> +/**
> + * Reads the refcnt of an external buffer.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @return
> + *   Reference count number.
> + */
> +static inline uint16_t
> +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)
> +{
> +	return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic));
> +}
> +
> +/**
> + * Set refcnt of an external buffer.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @param new_value
> + *   Value set
> + */
> +static inline void
> +rte_extbuf_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo,
> +	uint16_t new_value)
> +{
> +	rte_atomic16_set(&shinfo->refcnt_atomic, new_value);
> +}
> +
> +/**
> + * Add given value to refcnt of an external buffer and return its new
> + * value.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @param value
> + *   Value to add/subtract
> + * @return
> + *   Updated value
> + */
> +static inline uint16_t
> +rte_extbuf_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
> +	int16_t value)
> +{
> +	if (likely(rte_extbuf_refcnt_read(shinfo) == 1)) {
> +		rte_extbuf_refcnt_set(shinfo, 1 + value);
> +		return 1 + value;
> +	}
> +
> +	return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value);
> +}
> +
>  /** Mbuf prefetch */
>  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
>  	if ((m) != NULL)                        \
> @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>  }
> 
>  /**
> + * Return shared data of external buffer of a mbuf.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @return
> + *   The address of the shared data.
> + */
> +static inline struct rte_mbuf_ext_shared_info *
> +rte_mbuf_ext_shinfo(struct rte_mbuf *m)
> +{
> +	return (struct rte_mbuf_ext_shared_info *)
> +		RTE_PTR_ADD(m->buf_addr, m->buf_len);
> +}
> +
> +/**
> + * Attach an external buffer to a mbuf.
> + *
> + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> + * it, corresponding free callback function and its argument should be
> + * provided. This callback function will be called once all the mbufs are
> + * detached from the buffer.
> + *
> + * More mbufs can be attached to the same external buffer by
> + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> + * this API.
> + *
> + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> + * ``rte_pktmbuf_detach()``.
> + *
> + * A few bytes in the trailer of the provided buffer will be dedicated for
> + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt,
> + * callback function and so on. The shared data can be referenced by
> + * ``rte_mbuf_ext_shinfo()``
> + *
> + * Attaching an external buffer is quite similar to mbuf indirection in
> + * replacing buffer addresses and length of a mbuf, but a few differences:
> + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> + *   mbuf must be read-only. But external buffer has its own refcnt and it
> + *   starts from 1. Unless multiple mbufs are attached to a mbuf having an
> + *   external buffer, the external buffer is writable.
> + * - There's no need to allocate buffer from a mempool. Any buffer can be
> + *   attached with appropriate free callback.
> + * - Smaller metadata is required to maintain shared data such as refcnt.
> + *
> + * @warning
> + * @b EXPERIMENTAL: This API may change without prior notice.
> + * Once external buffer is enabled by allowing experimental API,
> + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> + * exclusive. A mbuf can be consiered direct if it is neither indirect nor
> + * having external buffer.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @param buf_addr
> + *   The pointer to the external buffer we're attaching to.
> + * @param buf_len
> + *   The size of the external buffer we're attaching to. This must be larger
> + *   than the size of ``struct rte_mbuf_ext_shared_info`` and padding for
> + *   alignment. If not enough, this function will return NULL.
> + * @param free_cb
> + *   Free callback function to call when the external buffer needs to be freed.
> + * @param fcb_opaque
> + *   Argument for the free callback function.
> + * @return
> + *   A pointer to the new start of the data on success, return NULL otherwise.
> + */
> +static inline char * __rte_experimental
> +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> +	uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
> +	void *fcb_opaque)
> +{
> +	void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +
> +	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
> +			sizeof(uintptr_t));
> +
> +	if ((void *)shinfo <= buf_addr)
> +		return NULL;
> +
> +	m->buf_addr = buf_addr;
> +	m->buf_iova = rte_mempool_virt2iova(buf_addr);


That wouldn't work for arbitrary extern buffer.
Only for the one that is an element in some other mempool.
For arbitrary external buffer - callee has to provide PA for it plus guarantee that
it's VA would be locked down.
From other side - if your intention is just to use only elements of other mempools -
No need to have free_cb(). mempool_put should do.

> +	m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> +	m->data_len = 0;
> +
> +	rte_pktmbuf_reset_headroom(m);
> +	m->ol_flags |= EXT_ATTACHED_MBUF;
> +
> +	rte_extbuf_refcnt_set(shinfo, 1);
> +	shinfo->free_cb = free_cb;
> +	shinfo->fcb_opaque = fcb_opaque;
> +
> +	return (char *)m->buf_addr + m->data_off;
> +}
> +
> +/**
> + * Detach the external buffer attached to a mbuf, same as
> + * ``rte_pktmbuf_detach()``
> + *
> + * @param m
> + *   The mbuf having external buffer.
> + */
> +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m)
> +
> +/**
>   * Attach packet mbuf to another packet mbuf.
>   *
> - * After attachment we refer the mbuf we attached as 'indirect',
> - * while mbuf we attached to as 'direct'.
> - * The direct mbuf's reference counter is incremented.
> + * If the mbuf we are attaching to isn't a direct buffer and is attached to
> + * an external buffer, the mbuf being attached will be attached to the
> + * external buffer instead of mbuf indirection.
> + *
> + * Otherwise, the mbuf will be indirectly attached. After attachment we
> + * refer the mbuf we attached as 'indirect', while mbuf we attached to as
> + * 'direct'.  The direct mbuf's reference counter is incremented.
>   *
>   * Right now, not supported:
>   *  - attachment for already indirect mbuf (e.g. - mi has to be direct).
> @@ -1213,19 +1397,18 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>   */
>  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  {
> -	struct rte_mbuf *md;
> -
>  	RTE_ASSERT(RTE_MBUF_DIRECT(mi) &&
>  	    rte_mbuf_refcnt_read(mi) == 1);
> 
> -	/* if m is not direct, get the mbuf that embeds the data */
> -	if (RTE_MBUF_DIRECT(m))
> -		md = m;
> -	else
> -		md = rte_mbuf_from_indirect(m);
> +	if (RTE_MBUF_HAS_EXTBUF(m)) {
> +		rte_extbuf_refcnt_update(rte_mbuf_ext_shinfo(m), 1);
> +		mi->ol_flags = m->ol_flags;
> +	} else {
> +		rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1);
> +		mi->priv_size = m->priv_size;
> +		mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
> +	}
> 
> -	rte_mbuf_refcnt_update(md, 1);
> -	mi->priv_size = m->priv_size;
>  	mi->buf_iova = m->buf_iova;
>  	mi->buf_addr = m->buf_addr;
>  	mi->buf_len = m->buf_len;
> @@ -1241,7 +1424,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  	mi->next = NULL;
>  	mi->pkt_len = mi->data_len;
>  	mi->nb_segs = 1;
> -	mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
>  	mi->packet_type = m->packet_type;
>  	mi->timestamp = m->timestamp;
> 
> @@ -1250,12 +1432,53 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  }
> 
>  /**
> - * Detach an indirect packet mbuf.
> + * @internal used by rte_pktmbuf_detach().
> + *
> + * Decrement the reference counter of the external buffer. When the
> + * reference counter becomes 0, the buffer is freed by pre-registered
> + * callback.
> + */
> +static inline void
> +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> +{
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +
> +	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> +
> +	shinfo = rte_mbuf_ext_shinfo(m);
> +
> +	if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
> +		shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);


I understand the reason but extra function call for each external mbuf - seems quite expensive.
Wonder is it possible to group them somehow and amortize the cost?

> +}
> +
> +/**
> + * @internal used by rte_pktmbuf_detach().
>   *
> + * Decrement the direct mbuf's reference counter. When the reference
> + * counter becomes 0, the direct mbuf is freed.
> + */
> +static inline void
> +__rte_pktmbuf_free_direct(struct rte_mbuf *m)
> +{
> +	struct rte_mbuf *md = rte_mbuf_from_indirect(m);
> +
> +	RTE_ASSERT(RTE_MBUF_INDIRECT(m));
> +
> +	if (rte_mbuf_refcnt_update(md, -1) == 0) {
> +		md->next = NULL;
> +		md->nb_segs = 1;
> +		rte_mbuf_refcnt_set(md, 1);
> +		rte_mbuf_raw_free(md);
> +	}
> +}
> +
> +/**
> + * Detach a packet mbuf from external buffer or direct buffer.
> + *
> + *  - decrement refcnt and free the external/direct buffer if refcnt
> + *    becomes zero.
>   *  - restore original mbuf address and length values.
>   *  - reset pktmbuf data and data_len to their default values.
> - *  - decrement the direct mbuf's reference counter. When the
> - *  reference counter becomes 0, the direct mbuf is freed.
>   *
>   * All other fields of the given packet mbuf will be left intact.
>   *
> @@ -1264,10 +1487,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>   */
>  static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>  {
> -	struct rte_mbuf *md = rte_mbuf_from_indirect(m);
>  	struct rte_mempool *mp = m->pool;
>  	uint32_t mbuf_size, buf_len, priv_size;
> 
> +	if (RTE_MBUF_HAS_EXTBUF(m))
> +		__rte_pktmbuf_free_extbuf(m);
> +	else
> +		__rte_pktmbuf_free_direct(m);
> +
>  	priv_size = rte_pktmbuf_priv_size(mp);
>  	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
>  	buf_len = rte_pktmbuf_data_room_size(mp);
> @@ -1279,13 +1506,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>  	rte_pktmbuf_reset_headroom(m);
>  	m->data_len = 0;
>  	m->ol_flags = 0;
> -
> -	if (rte_mbuf_refcnt_update(md, -1) == 0) {
> -		md->next = NULL;
> -		md->nb_segs = 1;
> -		rte_mbuf_refcnt_set(md, 1);
> -		rte_mbuf_raw_free(md);
> -	}
>  }
> 
>  /**
> @@ -1309,7 +1529,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
> 
>  	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
> 
> -		if (RTE_MBUF_INDIRECT(m))
> +		if (!RTE_MBUF_DIRECT(m))
>  			rte_pktmbuf_detach(m);
> 
>  		if (m->next != NULL) {
> @@ -1321,7 +1541,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
> 
>  	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
> 
> -		if (RTE_MBUF_INDIRECT(m))
> +		if (!RTE_MBUF_DIRECT(m))
>  			rte_pktmbuf_detach(m);
> 
>  		if (m->next != NULL) {
> --
> 2.11.0
  
Olivier Matz April 23, 2018, 4:18 p.m. UTC | #2
Hi Yongseok,

Please see some comments below.

On Wed, Apr 18, 2018 at 06:11:04PM -0700, Yongseok Koh wrote:
> This patch introduces a new way of attaching an external buffer to a mbuf.
> 
> Attaching an external buffer is quite similar to mbuf indirection in
> replacing buffer addresses and length of a mbuf, but a few differences:
>   - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
>     mbuf must be read-only. But external buffer has its own refcnt and it
>     starts from 1. Unless multiple mbufs are attached to a mbuf having an
>     external buffer, the external buffer is writable.

I'm wondering if "As refcnt of a direct mbuf is at least 2" should be
clarified. I guess we are talking about a direct mbuf that has another one
attached too.

I'm also not sure if I understand properly: to me, it is possible to have
an indirect mbuf that references a direct mbuf with a refcount of 1:
  m = rte_pktmbuf_alloc()
  mi = rte_pktmbuf_alloc()
  rte_pktmbuf_attach(mi, m)
  rte_pktmbuf_free(m)

>   - There's no need to allocate buffer from a mempool. Any buffer can be
>     attached with appropriate free callback.
>   - Smaller metadata is required to maintain shared data such as refcnt.
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>

[...]

> +/**
> + * Function typedef of callback to free externally attached buffer.
> + */
> +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
> +
> +/**
> + * Shared data at the end of an external buffer.
> + */
> +struct rte_mbuf_ext_shared_info {
> +	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
> +	void *fcb_opaque;                        /**< Free callback argument */
> +	RTE_STD_C11
> +	union {
> +		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
> +		uint16_t refcnt;          /**< Non-atomically accessed refcnt */

It looks that only refcnt_atomic is used.
I don't know if we really need the non-atomic one yet.


> +	};
> +};
> +
>  /**< Maximum number of nb_segs allowed. */
>  #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
>  
> @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
>  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
>  
>  /**
> + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> + */
> +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> +
> +/**
>   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
>   */
> -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))

I'm a bit reticent to have RTE_MBUF_DIRECT(m) different of
!RTE_MBUF_INDIRECT(m), I feel it's not very natural.

What about:
- direct = embeds its own data
- clone (or another name) = data is another mbuf
- extbuf = data is in an external buffer


>  /**
>   * Private data in case of pktmbuf pool.
> @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
>  
>  #endif /* RTE_MBUF_REFCNT_ATOMIC */
>  
> +/**
> + * Reads the refcnt of an external buffer.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @return
> + *   Reference count number.
> + */
> +static inline uint16_t
> +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)

What do you think about rte_mbuf_ext_refcnt_read() to keep name consistency?
(same for other functions below)

[...]

> @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>  }
>  
>  /**
> + * Return shared data of external buffer of a mbuf.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @return
> + *   The address of the shared data.
> + */
> +static inline struct rte_mbuf_ext_shared_info *
> +rte_mbuf_ext_shinfo(struct rte_mbuf *m)
> +{
> +	return (struct rte_mbuf_ext_shared_info *)
> +		RTE_PTR_ADD(m->buf_addr, m->buf_len);
> +}

This forces to have the shared data at the end of the buffer. Is it
always possible? I think there are use-cases where the user may want to
specify another location for it.

For instance, an application mmaps a big file (locked in memory), and
wants to send mbufs pointing to this data without doing any copy.

Maybe adding a m->shinfo field would be a better choice, what do you
think?

This would certainly break the ABI, but I wonder if that patch does
not already break it. I mean, how would react an application compiled
for 18.02 if an EXTBUF is passed to it, knowing that many functions
are inline?


> +/**
> + * Attach an external buffer to a mbuf.
> + *
> + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> + * it, corresponding free callback function and its argument should be
> + * provided. This callback function will be called once all the mbufs are
> + * detached from the buffer.
> + *
> + * More mbufs can be attached to the same external buffer by
> + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> + * this API.
> + *
> + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> + * ``rte_pktmbuf_detach()``.
> + *
> + * A few bytes in the trailer of the provided buffer will be dedicated for
> + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt,
> + * callback function and so on. The shared data can be referenced by
> + * ``rte_mbuf_ext_shinfo()``
> + *
> + * Attaching an external buffer is quite similar to mbuf indirection in
> + * replacing buffer addresses and length of a mbuf, but a few differences:
> + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> + *   mbuf must be read-only. But external buffer has its own refcnt and it
> + *   starts from 1. Unless multiple mbufs are attached to a mbuf having an
> + *   external buffer, the external buffer is writable.
> + * - There's no need to allocate buffer from a mempool. Any buffer can be
> + *   attached with appropriate free callback.
> + * - Smaller metadata is required to maintain shared data such as refcnt.
> + *
> + * @warning
> + * @b EXPERIMENTAL: This API may change without prior notice.
> + * Once external buffer is enabled by allowing experimental API,
> + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> + * exclusive. A mbuf can be consiered direct if it is neither indirect nor

small typo:
consiered -> considered

> + * having external buffer.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @param buf_addr
> + *   The pointer to the external buffer we're attaching to.
> + * @param buf_len
> + *   The size of the external buffer we're attaching to. This must be larger
> + *   than the size of ``struct rte_mbuf_ext_shared_info`` and padding for
> + *   alignment. If not enough, this function will return NULL.
> + * @param free_cb
> + *   Free callback function to call when the external buffer needs to be freed.
> + * @param fcb_opaque
> + *   Argument for the free callback function.
> + * @return
> + *   A pointer to the new start of the data on success, return NULL otherwise.
> + */
> +static inline char * __rte_experimental
> +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> +	uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
> +	void *fcb_opaque)
> +{
> +	void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +
> +	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
> +			sizeof(uintptr_t));
> +
> +	if ((void *)shinfo <= buf_addr)
> +		return NULL;
> +
> +	m->buf_addr = buf_addr;
> +	m->buf_iova = rte_mempool_virt2iova(buf_addr);

Agree with Konstantin's comment. I think buf_iova should be an argument
of the function.


> +	m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);

Related to what I said above: I think m->buf_len should be set to
the buf_len argument, so a user can point to existing read-only
data.

[...]

Few more comments:

I think we still need to find a good way to advertise to the users
if a mbuf is writable or readable. Today, the rules are quite implicit.
There are surely some use cases where the mbuf is indirect but with
only one active user, meaning it could be READ-WRITE. We could target
18.08 for this.

One side question about your implementation in mlx. I guess the
hardware will write the mbuf data in a big contiguous buffer like
this:

+-------+--------------+--------+--------------+--------+- - -
|       |mbuf1 data    |        |mbuf2 data    |        |
|       |              |        |              |        |
+-------+--------------+--------+--------------+--------+- - -

Which will be transformed in:

+--+----+--------------+---+----+--------------+---+---+- - -
|  |head|mbuf1 data    |sh |head|mbuf2 data    |sh |   |
|  |room|              |inf|room|              |inf|   |
+--+----+--------------+---+----+--------------+---+---+- - -

So, there is one shinfo (i.e one refcount) for each mbuf.
How do you know when the big buffer is not used anymore?


To summarize, I like the idea of your patchset, this is close to
what I had in mind... which does not necessarly mean it is the good
way to do ;)

I'm a bit afraid about ABI breakage, we need to check that a
18.02-compiled application still works well with this change.

About testing, I don't know if you checked the mbuf autotests,
but it could also help to check that basic stuff still work.


Thanks,
Olivier
  
Yongseok Koh April 24, 2018, 1:29 a.m. UTC | #3
On Mon, Apr 23, 2018 at 06:18:43PM +0200, Olivier Matz wrote:
> Hi Yongseok,
> 
> Please see some comments below.
> 
> On Wed, Apr 18, 2018 at 06:11:04PM -0700, Yongseok Koh wrote:
> > This patch introduces a new way of attaching an external buffer to a mbuf.
> > 
> > Attaching an external buffer is quite similar to mbuf indirection in
> > replacing buffer addresses and length of a mbuf, but a few differences:
> >   - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> >     mbuf must be read-only. But external buffer has its own refcnt and it
> >     starts from 1. Unless multiple mbufs are attached to a mbuf having an
> >     external buffer, the external buffer is writable.
> 
> I'm wondering if "As refcnt of a direct mbuf is at least 2" should be
> clarified. I guess we are talking about a direct mbuf that has another one
> attached too.
> 
> I'm also not sure if I understand properly: to me, it is possible to have
> an indirect mbuf that references a direct mbuf with a refcount of 1:
>   m = rte_pktmbuf_alloc()
>   mi = rte_pktmbuf_alloc()
>   rte_pktmbuf_attach(mi, m)
>   rte_pktmbuf_free(m)

Totally agree. Will change the comment.

[...]
> > +struct rte_mbuf_ext_shared_info {
> > +	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
> > +	void *fcb_opaque;                        /**< Free callback argument */
> > +	RTE_STD_C11
> > +	union {
> > +		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
> > +		uint16_t refcnt;          /**< Non-atomically accessed refcnt */
> 
> It looks that only refcnt_atomic is used.
> I don't know if we really need the non-atomic one yet.

Will remove.

> > +	};
> > +};
> > +
> >  /**< Maximum number of nb_segs allowed. */
> >  #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
> >  
> > @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
> >  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> >  
> >  /**
> > + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> > + */
> > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> > +
> > +/**
> >   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> >   */
> > -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> > +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
> 
> I'm a bit reticent to have RTE_MBUF_DIRECT(m) different of
> !RTE_MBUF_INDIRECT(m), I feel it's not very natural.
> 
> What about:
> - direct = embeds its own data
> - clone (or another name) = data is another mbuf
> - extbuf = data is in an external buffer

Good point. I'll clarify it in a new version by adding RTE_MBUF_CLONED().

> >  /**
> >   * Private data in case of pktmbuf pool.
> > @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
> >  
> >  #endif /* RTE_MBUF_REFCNT_ATOMIC */
> >  
> > +/**
> > + * Reads the refcnt of an external buffer.
> > + *
> > + * @param shinfo
> > + *   Shared data of the external buffer.
> > + * @return
> > + *   Reference count number.
> > + */
> > +static inline uint16_t
> > +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)
> 
> What do you think about rte_mbuf_ext_refcnt_read() to keep name consistency?
> (same for other functions below)

No problem.

> [...]
> 
> > @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> >  }
> >  
> >  /**
> > + * Return shared data of external buffer of a mbuf.
> > + *
> > + * @param m
> > + *   The pointer to the mbuf.
> > + * @return
> > + *   The address of the shared data.
> > + */
> > +static inline struct rte_mbuf_ext_shared_info *
> > +rte_mbuf_ext_shinfo(struct rte_mbuf *m)
> > +{
> > +	return (struct rte_mbuf_ext_shared_info *)
> > +		RTE_PTR_ADD(m->buf_addr, m->buf_len);
> > +}
> 
> This forces to have the shared data at the end of the buffer. Is it
> always possible? I think there are use-cases where the user may want to
> specify another location for it.
> 
> For instance, an application mmaps a big file (locked in memory), and
> wants to send mbufs pointing to this data without doing any copy.

Very good point. Will make rte_pktmbuf_attach_extbuf() take *shinfo as an
argument.

> Maybe adding a m->shinfo field would be a better choice, what do you
> think?

I like it to store in mbuf too.

> This would certainly break the ABI, but I wonder if that patch does
> not already break it. I mean, how would react an application compiled
> for 18.02 if an EXTBUF is passed to it, knowing that many functions
> are inline?

Even if I add shinfo field in rte_mbuf, I think it won't break the ABI. The
second cacheline is just 40B and it would simply make it 48B. Some code might
check the order/size of some fields (e.g. vPMD) in the struct, but if it is
added at the end of the struct, it should be okay. And there's no need to make a
change in a C file for this.

> > +/**
> > + * Attach an external buffer to a mbuf.
> > + *
> > + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> > + * it, corresponding free callback function and its argument should be
> > + * provided. This callback function will be called once all the mbufs are
> > + * detached from the buffer.
> > + *
> > + * More mbufs can be attached to the same external buffer by
> > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> > + * this API.
> > + *
> > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> > + * ``rte_pktmbuf_detach()``.
> > + *
> > + * A few bytes in the trailer of the provided buffer will be dedicated for
> > + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt,
> > + * callback function and so on. The shared data can be referenced by
> > + * ``rte_mbuf_ext_shinfo()``
> > + *
> > + * Attaching an external buffer is quite similar to mbuf indirection in
> > + * replacing buffer addresses and length of a mbuf, but a few differences:
> > + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> > + *   mbuf must be read-only. But external buffer has its own refcnt and it
> > + *   starts from 1. Unless multiple mbufs are attached to a mbuf having an
> > + *   external buffer, the external buffer is writable.
> > + * - There's no need to allocate buffer from a mempool. Any buffer can be
> > + *   attached with appropriate free callback.
> > + * - Smaller metadata is required to maintain shared data such as refcnt.
> > + *
> > + * @warning
> > + * @b EXPERIMENTAL: This API may change without prior notice.
> > + * Once external buffer is enabled by allowing experimental API,
> > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> > + * exclusive. A mbuf can be consiered direct if it is neither indirect nor
> 
> small typo:
> consiered -> considered

Will fix. Thanks.

> > + * having external buffer.
> > + *
> > + * @param m
> > + *   The pointer to the mbuf.
> > + * @param buf_addr
> > + *   The pointer to the external buffer we're attaching to.
> > + * @param buf_len
> > + *   The size of the external buffer we're attaching to. This must be larger
> > + *   than the size of ``struct rte_mbuf_ext_shared_info`` and padding for
> > + *   alignment. If not enough, this function will return NULL.
> > + * @param free_cb
> > + *   Free callback function to call when the external buffer needs to be freed.
> > + * @param fcb_opaque
> > + *   Argument for the free callback function.
> > + * @return
> > + *   A pointer to the new start of the data on success, return NULL otherwise.
> > + */
> > +static inline char * __rte_experimental
> > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> > +	uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
> > +	void *fcb_opaque)
> > +{
> > +	void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> > +	struct rte_mbuf_ext_shared_info *shinfo;
> > +
> > +	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
> > +			sizeof(uintptr_t));
> > +
> > +	if ((void *)shinfo <= buf_addr)
> > +		return NULL;
> > +
> > +	m->buf_addr = buf_addr;
> > +	m->buf_iova = rte_mempool_virt2iova(buf_addr);
> 
> Agree with Konstantin's comment. I think buf_iova should be an argument
> of the function.

Oops, that was my silly mistake. I just copied this block from
rte_pktmbuf_init(). Then, I wanted to change it to rte_malloc_virt2iova() but I
forgot. I didn't realized it during my tests because mlx devices don't use iova
but virtaddr.

If it takes iova as an argument instead, it can be faster and it can use 'real'
external memory for packet DMA, e.g. storage application you mentioned. I mean,
even if a buffer isn't allocated inside DPDK (doesn't belong to one of memseg
list), this should work. Good suggestion!

> > +	m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> 
> Related to what I said above: I think m->buf_len should be set to
> the buf_len argument, so a user can point to existing read-only
> data.

I will make a change to have an argument of extra memory for shared data. And if
shinfo is passed as NULL, it will still spare bytes at the end, just for
convenience.

> [...]
> 
> Few more comments:
> 
> I think we still need to find a good way to advertise to the users
> if a mbuf is writable or readable. Today, the rules are quite implicit.
> There are surely some use cases where the mbuf is indirect but with
> only one active user, meaning it could be READ-WRITE. We could target
> 18.08 for this.

Right. That'll be very good to have.

> One side question about your implementation in mlx. I guess the
> hardware will write the mbuf data in a big contiguous buffer like
> this:
> 
> +-------+--------------+--------+--------------+--------+- - -
> |       |mbuf1 data    |        |mbuf2 data    |        |
> |       |              |        |              |        |
> +-------+--------------+--------+--------------+--------+- - -
> 
> Which will be transformed in:
> 
> +--+----+--------------+---+----+--------------+---+---+- - -
> |  |head|mbuf1 data    |sh |head|mbuf2 data    |sh |   |
> |  |room|              |inf|room|              |inf|   |
> +--+----+--------------+---+----+--------------+---+---+- - -
> 
> So, there is one shinfo (i.e one refcount) for each mbuf.
> How do you know when the big buffer is not used anymore?
 
 +--+----+--------------+---+----+--------------+---+---+- - -
 |  |head|mbuf1 data    |sh |head|mbuf2 data    |sh |   |
 |  |room|              |inf|room|              |inf|   |
 +--+----+--------------+---+----+--------------+---+---+- - -
  ^
  |
  Metadata for the whole chunk, having another refcnt managed by PMD.
  fcb_opaque will have this pointer so that the callback func knows it.

> To summarize, I like the idea of your patchset, this is close to
> what I had in mind... which does not necessarly mean it is the good
> way to do ;)
> 
> I'm a bit afraid about ABI breakage, we need to check that a
> 18.02-compiled application still works well with this change.

I had the same concern so I made rte_pktmbuf_attach_extbuf() __rte_experimental.
Although this new ol_flag is introduced, it can only be set by the new API and
the rest of changes won't be effective unless this flag is set.
RTE_MBUF_HAS_EXTBUF() will always be false if -DALLOW_EXPERIMENTAL_API isn't
specified or rte_pktmbuf_attach_extbuf() isn't called. And there's no change
needed in a C file. For this reason, I don't think there's ABI breakage.

Sounds correct?

> About testing, I don't know if you checked the mbuf autotests,
> but it could also help to check that basic stuff still work.

I'll make sure all the tests pass before I submit a new version.


Thanks,
Yongseok
  
Yongseok Koh April 24, 2018, 2:04 a.m. UTC | #4
On Mon, Apr 23, 2018 at 11:53:04AM +0000, Ananyev, Konstantin wrote:
[...]
> > @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
> >  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> > 
> >  /**
> > + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> > + */
> > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> > +
> > +/**
> >   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> >   */
> > -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> > +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
> 
> As a nit:
> RTE_MBUF_DIRECT(mb)  (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0)

It was for better readability and I expected compiler did the same.
But, if you still want this way, I can change it.

[...]
> > +static inline char * __rte_experimental
> > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> > +	uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
> > +	void *fcb_opaque)
> > +{
> > +	void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> > +	struct rte_mbuf_ext_shared_info *shinfo;
> > +
> > +	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
> > +			sizeof(uintptr_t));
> > +
> > +	if ((void *)shinfo <= buf_addr)
> > +		return NULL;
> > +
> > +	m->buf_addr = buf_addr;
> > +	m->buf_iova = rte_mempool_virt2iova(buf_addr);
> 
> 
> That wouldn't work for arbitrary extern buffer.
> Only for the one that is an element in some other mempool.
> For arbitrary external buffer - callee has to provide PA for it plus guarantee that
> it's VA would be locked down.
> From other side - if your intention is just to use only elements of other mempools -
> No need to have free_cb(). mempool_put should do.

Of course, I didn't mean that. That was a mistake. Please refer to my reply to
Olivier.

[...]
> >  /**
> > - * Detach an indirect packet mbuf.
> > + * @internal used by rte_pktmbuf_detach().
> > + *
> > + * Decrement the reference counter of the external buffer. When the
> > + * reference counter becomes 0, the buffer is freed by pre-registered
> > + * callback.
> > + */
> > +static inline void
> > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> > +{
> > +	struct rte_mbuf_ext_shared_info *shinfo;
> > +
> > +	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> > +
> > +	shinfo = rte_mbuf_ext_shinfo(m);
> > +
> > +	if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
> > +		shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);
> 
> 
> I understand the reason but extra function call for each external mbuf - seems quite expensive.
> Wonder is it possible to group them somehow and amortize the cost?

Good point. I thought about it today.

Comparing to the regular mbuf, maybe three differences. a) free function isn't
inlined but a real branch. b) no help from core local cache like mempool's c) no
free_bulk func like rte_mempool_put_bulk(). But these look quite costly and
complicated for the external buffer attachment.

For example, to free it in bulk, external buffers should be grouped as the
buffers would have different callback functions. To do that, I have to make an
API to pre-register an external buffer group to prepare resources for the bulk
free. Then, buffers can't be anonymous anymore but have to be registered in
advance. If so, it would be better to use existing APIs, especially when a user
wants high throughput...

Let me know if you have better idea to implement it. Then, I'll gladly take
that. Or, we can push any improvement patch in the next releases.


Thanks,
Yongseok
  
Olivier Matz April 24, 2018, 3:36 p.m. UTC | #5
Hi,

On Mon, Apr 23, 2018 at 06:29:57PM -0700, Yongseok Koh wrote:
> On Mon, Apr 23, 2018 at 06:18:43PM +0200, Olivier Matz wrote:
> > I'm a bit afraid about ABI breakage, we need to check that a
> > 18.02-compiled application still works well with this change.
> 
> I had the same concern so I made rte_pktmbuf_attach_extbuf() __rte_experimental.
> Although this new ol_flag is introduced, it can only be set by the new API and
> the rest of changes won't be effective unless this flag is set.
> RTE_MBUF_HAS_EXTBUF() will always be false if -DALLOW_EXPERIMENTAL_API isn't
> specified or rte_pktmbuf_attach_extbuf() isn't called. And there's no change
> needed in a C file. For this reason, I don't think there's ABI breakage.
> 
> Sounds correct?

Hmm, imagine you compile an application on top of 18.02.
Then, you update your dpdk libraries to 18.05.

The mlx driver may send mbufs pointing to an external buffer to the
application. When the application will call the mbuf free function, it
will probably not do the expected work, because most of the functions
involved are inline. So, to me this is an ABI breakage.

This is not a technical issue, since the ABI of mbuf will already be
broken this release (control mbuf removed). This is more a process
question, because an ABI breakage and its area should be announced.

Olivier
  
Ananyev, Konstantin April 25, 2018, 1:16 p.m. UTC | #6
> 
> On Mon, Apr 23, 2018 at 11:53:04AM +0000, Ananyev, Konstantin wrote:
> [...]
> > > @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
> > >  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> > >
> > >  /**
> > > + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> > > + */
> > > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> > > +
> > > +/**
> > >   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> > >   */
> > > -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> > > +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
> >
> > As a nit:
> > RTE_MBUF_DIRECT(mb)  (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0)
> 
> It was for better readability and I expected compiler did the same.
> But, if you still want this way, I can change it.

I know compilers are quite smart these days, but you never know for sure,
so yes, I think better to do that explicitly.

> 
> [...]
> > >  /**
> > > - * Detach an indirect packet mbuf.
> > > + * @internal used by rte_pktmbuf_detach().
> > > + *
> > > + * Decrement the reference counter of the external buffer. When the
> > > + * reference counter becomes 0, the buffer is freed by pre-registered
> > > + * callback.
> > > + */
> > > +static inline void
> > > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> > > +{
> > > +	struct rte_mbuf_ext_shared_info *shinfo;
> > > +
> > > +	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> > > +
> > > +	shinfo = rte_mbuf_ext_shinfo(m);
> > > +
> > > +	if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
> > > +		shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);
> >
> >
> > I understand the reason but extra function call for each external mbuf - seems quite expensive.
> > Wonder is it possible to group them somehow and amortize the cost?
> 
> Good point. I thought about it today.
> 
> Comparing to the regular mbuf, maybe three differences. a) free function isn't
> inlined but a real branch. b) no help from core local cache like mempool's c) no
> free_bulk func like rte_mempool_put_bulk(). But these look quite costly and
> complicated for the external buffer attachment.
> 
> For example, to free it in bulk, external buffers should be grouped as the
> buffers would have different callback functions. To do that, I have to make an
> API to pre-register an external buffer group to prepare resources for the bulk
> free. Then, buffers can't be anonymous anymore but have to be registered in
> advance. If so, it would be better to use existing APIs, especially when a user
> wants high throughput...
> 
> Let me know if you have better idea to implement it. Then, I'll gladly take
> that. Or, we can push any improvement patch in the next releases.

I don't have any extra-smart thoughts here.
One option I thought about - was to introduce group of external buffers with
common free routine (I think o mentioned it already).
Second - hide all that external buffer management inside mempool,
i.e. if user wants to use external buffers he create a mempool
(with rte_mbuf_ext_shared_info as elements?), then attach external buffer to shinfo
and call mbuf_attach_external(mbuf, shinfo).
Though for free we can just call mempool_put(shinfo) and let particular implementation
decide when/how call free_cb(), etc. 

Konstantin
  
Yongseok Koh April 25, 2018, 4:44 p.m. UTC | #7
On Wed, Apr 25, 2018 at 01:16:38PM +0000, Ananyev, Konstantin wrote:
> 
> 
> > 
> > On Mon, Apr 23, 2018 at 11:53:04AM +0000, Ananyev, Konstantin wrote:
> > [...]
> > > > @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
> > > >  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> > > >
> > > >  /**
> > > > + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> > > > + */
> > > > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> > > > +
> > > > +/**
> > > >   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> > > >   */
> > > > -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> > > > +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
> > >
> > > As a nit:
> > > RTE_MBUF_DIRECT(mb)  (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0)
> > 
> > It was for better readability and I expected compiler did the same.
> > But, if you still want this way, I can change it.
> 
> I know compilers are quite smart these days, but you never know for sure,
> so yes, I think better to do that explicitly.

Okay.

> > [...]
> > > >  /**
> > > > - * Detach an indirect packet mbuf.
> > > > + * @internal used by rte_pktmbuf_detach().
> > > > + *
> > > > + * Decrement the reference counter of the external buffer. When the
> > > > + * reference counter becomes 0, the buffer is freed by pre-registered
> > > > + * callback.
> > > > + */
> > > > +static inline void
> > > > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> > > > +{
> > > > +	struct rte_mbuf_ext_shared_info *shinfo;
> > > > +
> > > > +	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> > > > +
> > > > +	shinfo = rte_mbuf_ext_shinfo(m);
> > > > +
> > > > +	if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
> > > > +		shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);
> > >
> > >
> > > I understand the reason but extra function call for each external mbuf - seems quite expensive.
> > > Wonder is it possible to group them somehow and amortize the cost?
> > 
> > Good point. I thought about it today.
> > 
> > Comparing to the regular mbuf, maybe three differences. a) free function isn't
> > inlined but a real branch. b) no help from core local cache like mempool's c) no
> > free_bulk func like rte_mempool_put_bulk(). But these look quite costly and
> > complicated for the external buffer attachment.
> > 
> > For example, to free it in bulk, external buffers should be grouped as the
> > buffers would have different callback functions. To do that, I have to make an
> > API to pre-register an external buffer group to prepare resources for the bulk
> > free. Then, buffers can't be anonymous anymore but have to be registered in
> > advance. If so, it would be better to use existing APIs, especially when a user
> > wants high throughput...
> > 
> > Let me know if you have better idea to implement it. Then, I'll gladly take
> > that. Or, we can push any improvement patch in the next releases.
> 
> I don't have any extra-smart thoughts here.
> One option I thought about - was to introduce group of external buffers with
> common free routine (I think o mentioned it already).
> Second - hide all that external buffer management inside mempool,
> i.e. if user wants to use external buffers he create a mempool
> (with rte_mbuf_ext_shared_info as elements?), then attach external buffer to shinfo
> and call mbuf_attach_external(mbuf, shinfo).
> Though for free we can just call mempool_put(shinfo) and let particular implementation
> decide when/how call free_cb(), etc. 
I don't want to restrict external buffer to mempool object. Especially for
storage users, they want to use **any** buffer, even coming outside of DPDK.

However, will open a follow-up discussion for this in the next release window
probably with more measurement data.
Thank you for suggestions.

Yongseok
  
Ananyev, Konstantin April 25, 2018, 6:05 p.m. UTC | #8
> -----Original Message-----
> From: Yongseok Koh [mailto:yskoh@mellanox.com]
> Sent: Wednesday, April 25, 2018 5:44 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; olivier.matz@6wind.com; dev@dpdk.org;
> adrien.mazarguil@6wind.com; nelio.laranjeiro@6wind.com
> Subject: Re: [PATCH v3 1/2] mbuf: support attaching external buffer to mbuf
> 
> On Wed, Apr 25, 2018 at 01:16:38PM +0000, Ananyev, Konstantin wrote:
> >
> >
> > >
> > > On Mon, Apr 23, 2018 at 11:53:04AM +0000, Ananyev, Konstantin wrote:
> > > [...]
> > > > > @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
> > > > >  #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> > > > >
> > > > >  /**
> > > > > + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> > > > > + */
> > > > > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> > > > > +
> > > > > +/**
> > > > >   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> > > > >   */
> > > > > -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> > > > > +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
> > > >
> > > > As a nit:
> > > > RTE_MBUF_DIRECT(mb)  (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0)
> > >
> > > It was for better readability and I expected compiler did the same.
> > > But, if you still want this way, I can change it.
> >
> > I know compilers are quite smart these days, but you never know for sure,
> > so yes, I think better to do that explicitly.
> 
> Okay.
> 
> > > [...]
> > > > >  /**
> > > > > - * Detach an indirect packet mbuf.
> > > > > + * @internal used by rte_pktmbuf_detach().
> > > > > + *
> > > > > + * Decrement the reference counter of the external buffer. When the
> > > > > + * reference counter becomes 0, the buffer is freed by pre-registered
> > > > > + * callback.
> > > > > + */
> > > > > +static inline void
> > > > > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> > > > > +{
> > > > > +	struct rte_mbuf_ext_shared_info *shinfo;
> > > > > +
> > > > > +	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> > > > > +
> > > > > +	shinfo = rte_mbuf_ext_shinfo(m);
> > > > > +
> > > > > +	if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
> > > > > +		shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);
> > > >
> > > >
> > > > I understand the reason but extra function call for each external mbuf - seems quite expensive.
> > > > Wonder is it possible to group them somehow and amortize the cost?
> > >
> > > Good point. I thought about it today.
> > >
> > > Comparing to the regular mbuf, maybe three differences. a) free function isn't
> > > inlined but a real branch. b) no help from core local cache like mempool's c) no
> > > free_bulk func like rte_mempool_put_bulk(). But these look quite costly and
> > > complicated for the external buffer attachment.
> > >
> > > For example, to free it in bulk, external buffers should be grouped as the
> > > buffers would have different callback functions. To do that, I have to make an
> > > API to pre-register an external buffer group to prepare resources for the bulk
> > > free. Then, buffers can't be anonymous anymore but have to be registered in
> > > advance. If so, it would be better to use existing APIs, especially when a user
> > > wants high throughput...
> > >
> > > Let me know if you have better idea to implement it. Then, I'll gladly take
> > > that. Or, we can push any improvement patch in the next releases.
> >
> > I don't have any extra-smart thoughts here.
> > One option I thought about - was to introduce group of external buffers with
> > common free routine (I think o mentioned it already).
> > Second - hide all that external buffer management inside mempool,
> > i.e. if user wants to use external buffers he create a mempool
> > (with rte_mbuf_ext_shared_info as elements?), then attach external buffer to shinfo
> > and call mbuf_attach_external(mbuf, shinfo).
> > Though for free we can just call mempool_put(shinfo) and let particular implementation
> > decide when/how call free_cb(), etc.
> I don't want to restrict external buffer to mempool object. Especially for
> storage users, they want to use **any** buffer, even coming outside of DPDK.

I am not talking about the case when external buffer can be allocated from mempool.
I am talking about the implementation where shinfo is a a mempool element.
So to bring extrernal buffer into DPDK - users get a shinfo (from mempool) and attach it
to external buffer.
When no one needs that external buffer any more (shinfo.refcnt == 0) 
mempool_put() is invoked for shinfo.
Inside put() we can either call free_cb() or keep extrenal buffer for further usage.
Anyway just a thought.
Konstantin

> 
> However, will open a follow-up discussion for this in the next release window
> probably with more measurement data.
> Thank you for suggestions.
> 
> Yongseok
  

Patch

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 06eceba37..e64160c81 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -326,7 +326,7 @@  extern "C" {
 		PKT_TX_MACSEC |		 \
 		PKT_TX_SEC_OFFLOAD)
 
-#define __RESERVED           (1ULL << 61) /**< reserved for future mbuf use */
+#define EXT_ATTACHED_MBUF    (1ULL << 61) /**< Mbuf having external buffer */
 
 #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
 
@@ -568,6 +568,24 @@  struct rte_mbuf {
 
 } __rte_cache_aligned;
 
+/**
+ * Function typedef of callback to free externally attached buffer.
+ */
+typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
+
+/**
+ * Shared data at the end of an external buffer.
+ */
+struct rte_mbuf_ext_shared_info {
+	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
+	void *fcb_opaque;                        /**< Free callback argument */
+	RTE_STD_C11
+	union {
+		rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
+		uint16_t refcnt;          /**< Non-atomically accessed refcnt */
+	};
+};
+
 /**< Maximum number of nb_segs allowed. */
 #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
 
@@ -693,9 +711,14 @@  rte_mbuf_to_baddr(struct rte_mbuf *md)
 #define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
 
 /**
+ * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
+ */
+#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
+
+/**
  * Returns TRUE if given mbuf is direct, or FALSE otherwise.
  */
-#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
+#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
 
 /**
  * Private data in case of pktmbuf pool.
@@ -821,6 +844,58 @@  rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
 
 #endif /* RTE_MBUF_REFCNT_ATOMIC */
 
+/**
+ * Reads the refcnt of an external buffer.
+ *
+ * @param shinfo
+ *   Shared data of the external buffer.
+ * @return
+ *   Reference count number.
+ */
+static inline uint16_t
+rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)
+{
+	return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic));
+}
+
+/**
+ * Set refcnt of an external buffer.
+ *
+ * @param shinfo
+ *   Shared data of the external buffer.
+ * @param new_value
+ *   Value set
+ */
+static inline void
+rte_extbuf_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo,
+	uint16_t new_value)
+{
+	rte_atomic16_set(&shinfo->refcnt_atomic, new_value);
+}
+
+/**
+ * Add given value to refcnt of an external buffer and return its new
+ * value.
+ *
+ * @param shinfo
+ *   Shared data of the external buffer.
+ * @param value
+ *   Value to add/subtract
+ * @return
+ *   Updated value
+ */
+static inline uint16_t
+rte_extbuf_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
+	int16_t value)
+{
+	if (likely(rte_extbuf_refcnt_read(shinfo) == 1)) {
+		rte_extbuf_refcnt_set(shinfo, 1 + value);
+		return 1 + value;
+	}
+
+	return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value);
+}
+
 /** Mbuf prefetch */
 #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
 	if ((m) != NULL)                        \
@@ -1195,11 +1270,120 @@  static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
 }
 
 /**
+ * Return shared data of external buffer of a mbuf.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ * @return
+ *   The address of the shared data.
+ */
+static inline struct rte_mbuf_ext_shared_info *
+rte_mbuf_ext_shinfo(struct rte_mbuf *m)
+{
+	return (struct rte_mbuf_ext_shared_info *)
+		RTE_PTR_ADD(m->buf_addr, m->buf_len);
+}
+
+/**
+ * Attach an external buffer to a mbuf.
+ *
+ * User-managed anonymous buffer can be attached to an mbuf. When attaching
+ * it, corresponding free callback function and its argument should be
+ * provided. This callback function will be called once all the mbufs are
+ * detached from the buffer.
+ *
+ * More mbufs can be attached to the same external buffer by
+ * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
+ * this API.
+ *
+ * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
+ * ``rte_pktmbuf_detach()``.
+ *
+ * A few bytes in the trailer of the provided buffer will be dedicated for
+ * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt,
+ * callback function and so on. The shared data can be referenced by
+ * ``rte_mbuf_ext_shinfo()``
+ *
+ * Attaching an external buffer is quite similar to mbuf indirection in
+ * replacing buffer addresses and length of a mbuf, but a few differences:
+ * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
+ *   mbuf must be read-only. But external buffer has its own refcnt and it
+ *   starts from 1. Unless multiple mbufs are attached to a mbuf having an
+ *   external buffer, the external buffer is writable.
+ * - There's no need to allocate buffer from a mempool. Any buffer can be
+ *   attached with appropriate free callback.
+ * - Smaller metadata is required to maintain shared data such as refcnt.
+ *
+ * @warning
+ * @b EXPERIMENTAL: This API may change without prior notice.
+ * Once external buffer is enabled by allowing experimental API,
+ * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
+ * exclusive. A mbuf can be consiered direct if it is neither indirect nor
+ * having external buffer.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ * @param buf_addr
+ *   The pointer to the external buffer we're attaching to.
+ * @param buf_len
+ *   The size of the external buffer we're attaching to. This must be larger
+ *   than the size of ``struct rte_mbuf_ext_shared_info`` and padding for
+ *   alignment. If not enough, this function will return NULL.
+ * @param free_cb
+ *   Free callback function to call when the external buffer needs to be freed.
+ * @param fcb_opaque
+ *   Argument for the free callback function.
+ * @return
+ *   A pointer to the new start of the data on success, return NULL otherwise.
+ */
+static inline char * __rte_experimental
+rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
+	uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
+	void *fcb_opaque)
+{
+	void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
+	struct rte_mbuf_ext_shared_info *shinfo;
+
+	shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
+			sizeof(uintptr_t));
+
+	if ((void *)shinfo <= buf_addr)
+		return NULL;
+
+	m->buf_addr = buf_addr;
+	m->buf_iova = rte_mempool_virt2iova(buf_addr);
+	m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
+	m->data_len = 0;
+
+	rte_pktmbuf_reset_headroom(m);
+	m->ol_flags |= EXT_ATTACHED_MBUF;
+
+	rte_extbuf_refcnt_set(shinfo, 1);
+	shinfo->free_cb = free_cb;
+	shinfo->fcb_opaque = fcb_opaque;
+
+	return (char *)m->buf_addr + m->data_off;
+}
+
+/**
+ * Detach the external buffer attached to a mbuf, same as
+ * ``rte_pktmbuf_detach()``
+ *
+ * @param m
+ *   The mbuf having external buffer.
+ */
+#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m)
+
+/**
  * Attach packet mbuf to another packet mbuf.
  *
- * After attachment we refer the mbuf we attached as 'indirect',
- * while mbuf we attached to as 'direct'.
- * The direct mbuf's reference counter is incremented.
+ * If the mbuf we are attaching to isn't a direct buffer and is attached to
+ * an external buffer, the mbuf being attached will be attached to the
+ * external buffer instead of mbuf indirection.
+ *
+ * Otherwise, the mbuf will be indirectly attached. After attachment we
+ * refer the mbuf we attached as 'indirect', while mbuf we attached to as
+ * 'direct'.  The direct mbuf's reference counter is incremented.
  *
  * Right now, not supported:
  *  - attachment for already indirect mbuf (e.g. - mi has to be direct).
@@ -1213,19 +1397,18 @@  static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
  */
 static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 {
-	struct rte_mbuf *md;
-
 	RTE_ASSERT(RTE_MBUF_DIRECT(mi) &&
 	    rte_mbuf_refcnt_read(mi) == 1);
 
-	/* if m is not direct, get the mbuf that embeds the data */
-	if (RTE_MBUF_DIRECT(m))
-		md = m;
-	else
-		md = rte_mbuf_from_indirect(m);
+	if (RTE_MBUF_HAS_EXTBUF(m)) {
+		rte_extbuf_refcnt_update(rte_mbuf_ext_shinfo(m), 1);
+		mi->ol_flags = m->ol_flags;
+	} else {
+		rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1);
+		mi->priv_size = m->priv_size;
+		mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
+	}
 
-	rte_mbuf_refcnt_update(md, 1);
-	mi->priv_size = m->priv_size;
 	mi->buf_iova = m->buf_iova;
 	mi->buf_addr = m->buf_addr;
 	mi->buf_len = m->buf_len;
@@ -1241,7 +1424,6 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 	mi->next = NULL;
 	mi->pkt_len = mi->data_len;
 	mi->nb_segs = 1;
-	mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
 	mi->packet_type = m->packet_type;
 	mi->timestamp = m->timestamp;
 
@@ -1250,12 +1432,53 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 }
 
 /**
- * Detach an indirect packet mbuf.
+ * @internal used by rte_pktmbuf_detach().
+ *
+ * Decrement the reference counter of the external buffer. When the
+ * reference counter becomes 0, the buffer is freed by pre-registered
+ * callback.
+ */
+static inline void
+__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
+{
+	struct rte_mbuf_ext_shared_info *shinfo;
+
+	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
+
+	shinfo = rte_mbuf_ext_shinfo(m);
+
+	if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
+		shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);
+}
+
+/**
+ * @internal used by rte_pktmbuf_detach().
  *
+ * Decrement the direct mbuf's reference counter. When the reference
+ * counter becomes 0, the direct mbuf is freed.
+ */
+static inline void
+__rte_pktmbuf_free_direct(struct rte_mbuf *m)
+{
+	struct rte_mbuf *md = rte_mbuf_from_indirect(m);
+
+	RTE_ASSERT(RTE_MBUF_INDIRECT(m));
+
+	if (rte_mbuf_refcnt_update(md, -1) == 0) {
+		md->next = NULL;
+		md->nb_segs = 1;
+		rte_mbuf_refcnt_set(md, 1);
+		rte_mbuf_raw_free(md);
+	}
+}
+
+/**
+ * Detach a packet mbuf from external buffer or direct buffer.
+ *
+ *  - decrement refcnt and free the external/direct buffer if refcnt
+ *    becomes zero.
  *  - restore original mbuf address and length values.
  *  - reset pktmbuf data and data_len to their default values.
- *  - decrement the direct mbuf's reference counter. When the
- *  reference counter becomes 0, the direct mbuf is freed.
  *
  * All other fields of the given packet mbuf will be left intact.
  *
@@ -1264,10 +1487,14 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
  */
 static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
 {
-	struct rte_mbuf *md = rte_mbuf_from_indirect(m);
 	struct rte_mempool *mp = m->pool;
 	uint32_t mbuf_size, buf_len, priv_size;
 
+	if (RTE_MBUF_HAS_EXTBUF(m))
+		__rte_pktmbuf_free_extbuf(m);
+	else
+		__rte_pktmbuf_free_direct(m);
+
 	priv_size = rte_pktmbuf_priv_size(mp);
 	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
 	buf_len = rte_pktmbuf_data_room_size(mp);
@@ -1279,13 +1506,6 @@  static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
 	rte_pktmbuf_reset_headroom(m);
 	m->data_len = 0;
 	m->ol_flags = 0;
-
-	if (rte_mbuf_refcnt_update(md, -1) == 0) {
-		md->next = NULL;
-		md->nb_segs = 1;
-		rte_mbuf_refcnt_set(md, 1);
-		rte_mbuf_raw_free(md);
-	}
 }
 
 /**
@@ -1309,7 +1529,7 @@  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 
 	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
 
-		if (RTE_MBUF_INDIRECT(m))
+		if (!RTE_MBUF_DIRECT(m))
 			rte_pktmbuf_detach(m);
 
 		if (m->next != NULL) {
@@ -1321,7 +1541,7 @@  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 
 	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
 
-		if (RTE_MBUF_INDIRECT(m))
+		if (!RTE_MBUF_DIRECT(m))
 			rte_pktmbuf_detach(m);
 
 		if (m->next != NULL) {