[3/8] ethdev: add mbuf dynfield for incomplete IP reassembly
Checks
Commit Message
Hardware IP reassembly may be incomplete for multiple reasons like
reassembly timeout reached, duplicate fragments, etc.
To save application cycles to process these packets again, a new
mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added to
show that the mbuf received is not reassembled properly.
Now if this flag is set, application can retreive corresponding chain of
mbufs using mbuf dynfield set by the PMD. Now, it will be upto
application to either drop those fragments or wait for more time.
Signed-off-by: Akhil Goyal <gakhil@marvell.com>
---
lib/ethdev/ethdev_driver.h | 8 ++++++
lib/ethdev/rte_ethdev.c | 16 +++++++++++
lib/ethdev/rte_ethdev.h | 57 ++++++++++++++++++++++++++++++++++++++
lib/ethdev/version.map | 2 ++
lib/mbuf/rte_mbuf_core.h | 3 +-
5 files changed, 85 insertions(+), 1 deletion(-)
Comments
> Hardware IP reassembly may be incomplete for multiple reasons like
> reassembly timeout reached, duplicate fragments, etc.
> To save application cycles to process these packets again, a new
> mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added to
> show that the mbuf received is not reassembled properly.
If we use dynfiled for data, why not use dynflag for RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE?
That way we can avoid introduced hardcoded (always defined) flags for that case.
>
> Now if this flag is set, application can retreive corresponding chain of
> mbufs using mbuf dynfield set by the PMD. Now, it will be upto
> application to either drop those fragments or wait for more time.
>
> Signed-off-by: Akhil Goyal <gakhil@marvell.com>
> ---
> lib/ethdev/ethdev_driver.h | 8 ++++++
> lib/ethdev/rte_ethdev.c | 16 +++++++++++
> lib/ethdev/rte_ethdev.h | 57 ++++++++++++++++++++++++++++++++++++++
> lib/ethdev/version.map | 2 ++
> lib/mbuf/rte_mbuf_core.h | 3 +-
> 5 files changed, 85 insertions(+), 1 deletion(-)
>
> diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
> index 0ed53c14f3..9a0bab9a61 100644
> --- a/lib/ethdev/ethdev_driver.h
> +++ b/lib/ethdev/ethdev_driver.h
> @@ -1671,6 +1671,14 @@ int
> rte_eth_hairpin_queue_peer_unbind(uint16_t cur_port, uint16_t cur_queue,
> uint32_t direction);
>
> +/**
> + * @internal
> + * Register mbuf dynamic field for IP reassembly incomplete case.
> + */
> +__rte_internal
> +int
> +rte_eth_ip_reass_dynfield_register(void);
> +
>
> /*
> * Legacy ethdev API used internally by drivers.
> diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
> index ecc6c1fe37..d53ce4eaca 100644
> --- a/lib/ethdev/rte_ethdev.c
> +++ b/lib/ethdev/rte_ethdev.c
> @@ -6503,6 +6503,22 @@ rte_eth_ip_reassembly_conf_set(uint16_t port_id,
> (*dev->dev_ops->ip_reassembly_conf_set)(dev, conf));
> }
>
> +#define RTE_ETH_IP_REASS_DYNFIELD_NAME "rte_eth_ip_reass_dynfield"
> +int rte_eth_ip_reass_dynfield_offset = -1;
> +
> +int
> +rte_eth_ip_reass_dynfield_register(void)
> +{
> + static const struct rte_mbuf_dynfield dynfield_desc = {
> + .name = RTE_ETH_IP_REASS_DYNFIELD_NAME,
> + .size = sizeof(rte_eth_ip_reass_dynfield_t),
> + .align = __alignof__(rte_eth_ip_reass_dynfield_t),
> + };
> + rte_eth_ip_reass_dynfield_offset =
> + rte_mbuf_dynfield_register(&dynfield_desc);
> + return rte_eth_ip_reass_dynfield_offset;
> +}
> +
> RTE_LOG_REGISTER_DEFAULT(rte_eth_dev_logtype, INFO);
>
> RTE_INIT(ethdev_init_telemetry)
> diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
> index 891f9a6e06..c4024d2265 100644
> --- a/lib/ethdev/rte_ethdev.h
> +++ b/lib/ethdev/rte_ethdev.h
> @@ -5245,6 +5245,63 @@ __rte_experimental
> int rte_eth_ip_reassembly_conf_set(uint16_t port_id,
> struct rte_eth_ip_reass_params *conf);
>
> +/**
> + * In case of IP reassembly offload failure, ol_flags in mbuf will be set
> + * with RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE and packets will be returned
> + * without alteration. The application can retrieve the attached fragments
> + * using mbuf dynamic field.
> + */
> +typedef struct {
> + /**
> + * Next fragment packet. Application should fetch dynamic field of
> + * each fragment until a NULL is received and nb_frags is 0.
> + */
> + struct rte_mbuf *next_frag;
> + /** Time spent(in ms) by HW in waiting for further fragments. */
> + uint16_t time_spent;
> + /** Number of more fragments attached in mbuf dynamic fields. */
> + uint16_t nb_frags;
> +} rte_eth_ip_reass_dynfield_t;
Looks like a bit of overkill to me:
We do already have 'next' and 'nb_frags' fields inside mbuf,
why can't they be used here? Why a separate ones are necessary?
> +
> +extern int rte_eth_ip_reass_dynfield_offset;
> +
>
> > Hardware IP reassembly may be incomplete for multiple reasons like
> > reassembly timeout reached, duplicate fragments, etc.
> > To save application cycles to process these packets again, a new
> > mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added to
> > show that the mbuf received is not reassembled properly.
>
> If we use dynfiled for data, why not use dynflag for
> RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE?
> That way we can avoid introduced hardcoded (always defined) flags for that
> case.
I have not looked into using dynflag. Will explore if it can be used.
> >
> > +/**
> > + * In case of IP reassembly offload failure, ol_flags in mbuf will be set
> > + * with RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE and packets will be
> returned
> > + * without alteration. The application can retrieve the attached fragments
> > + * using mbuf dynamic field.
> > + */
> > +typedef struct {
> > + /**
> > + * Next fragment packet. Application should fetch dynamic field of
> > + * each fragment until a NULL is received and nb_frags is 0.
> > + */
> > + struct rte_mbuf *next_frag;
> > + /** Time spent(in ms) by HW in waiting for further fragments. */
> > + uint16_t time_spent;
> > + /** Number of more fragments attached in mbuf dynamic fields. */
> > + uint16_t nb_frags;
> > +} rte_eth_ip_reass_dynfield_t;
>
>
> Looks like a bit of overkill to me:
> We do already have 'next' and 'nb_frags' fields inside mbuf,
> why can't they be used here? Why a separate ones are necessary?
>
The next and nb_frags in mbuf is for segmented buffers and not IP fragments.
But here we will have separate mbufs in each dynfield denoting each of the
fragments which may have further segmented buffers.
> >
> > > Hardware IP reassembly may be incomplete for multiple reasons like
> > > reassembly timeout reached, duplicate fragments, etc.
> > > To save application cycles to process these packets again, a new
> > > mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added to
> > > show that the mbuf received is not reassembled properly.
> >
> > If we use dynfiled for data, why not use dynflag for
> > RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE?
> > That way we can avoid introduced hardcoded (always defined) flags for that
> > case.
>
> I have not looked into using dynflag. Will explore if it can be used.
>
>
> > >
> > > +/**
> > > + * In case of IP reassembly offload failure, ol_flags in mbuf will be set
> > > + * with RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE and packets will be
> > returned
> > > + * without alteration. The application can retrieve the attached fragments
> > > + * using mbuf dynamic field.
> > > + */
> > > +typedef struct {
> > > + /**
> > > + * Next fragment packet. Application should fetch dynamic field of
> > > + * each fragment until a NULL is received and nb_frags is 0.
> > > + */
> > > + struct rte_mbuf *next_frag;
> > > + /** Time spent(in ms) by HW in waiting for further fragments. */
> > > + uint16_t time_spent;
> > > + /** Number of more fragments attached in mbuf dynamic fields. */
> > > + uint16_t nb_frags;
> > > +} rte_eth_ip_reass_dynfield_t;
> >
> >
> > Looks like a bit of overkill to me:
> > We do already have 'next' and 'nb_frags' fields inside mbuf,
> > why can't they be used here? Why a separate ones are necessary?
> >
> The next and nb_frags in mbuf is for segmented buffers and not IP fragments.
> But here we will have separate mbufs in each dynfield denoting each of the
> fragments which may have further segmented buffers.
Makes sense, thanks for explanation.
Though in that case just 'struct rte_mbuf *next_frag' might be enough
(user will walk though the list till mbuf->next_frag != NULL)?
The reason I am asking: current sizeof(rte_eth_ip_reass_dynfield_t) is 16B,
which is quite a lot for mbuf, especially considering that it has to be continuous 16B.
Making it smaller (8B) or even splitting into 2 fileds (8+4) will give it more chances
to coexist with other dynfields.
> > > >
> > > > +/**
> > > > + * In case of IP reassembly offload failure, ol_flags in mbuf will be set
> > > > + * with RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE and packets will
> be
> > > returned
> > > > + * without alteration. The application can retrieve the attached fragments
> > > > + * using mbuf dynamic field.
> > > > + */
> > > > +typedef struct {
> > > > + /**
> > > > + * Next fragment packet. Application should fetch dynamic field of
> > > > + * each fragment until a NULL is received and nb_frags is 0.
> > > > + */
> > > > + struct rte_mbuf *next_frag;
> > > > + /** Time spent(in ms) by HW in waiting for further fragments. */
> > > > + uint16_t time_spent;
> > > > + /** Number of more fragments attached in mbuf dynamic fields. */
> > > > + uint16_t nb_frags;
> > > > +} rte_eth_ip_reass_dynfield_t;
> > >
> > >
> > > Looks like a bit of overkill to me:
> > > We do already have 'next' and 'nb_frags' fields inside mbuf,
> > > why can't they be used here? Why a separate ones are necessary?
> > >
> > The next and nb_frags in mbuf is for segmented buffers and not IP fragments.
> > But here we will have separate mbufs in each dynfield denoting each of the
> > fragments which may have further segmented buffers.
>
> Makes sense, thanks for explanation.
> Though in that case just 'struct rte_mbuf *next_frag' might be enough
> (user will walk though the list till mbuf->next_frag != NULL)?
> The reason I am asking: current sizeof(rte_eth_ip_reass_dynfield_t) is 16B,
> which is quite a lot for mbuf, especially considering that it has to be continuous
> 16B.
> Making it smaller (8B) or even splitting into 2 fileds (8+4) will give it more
> chances
> to coexist with other dynfields.
Even if we drop nb_frags, we will be left with uint16_t time_spent.
Are you suggesting to use separate dynfield altogether for 2 bytes of time_spent?
Hi Konstantin,
> > > Hardware IP reassembly may be incomplete for multiple reasons like
> > > reassembly timeout reached, duplicate fragments, etc.
> > > To save application cycles to process these packets again, a new
> > > mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added to
> > > show that the mbuf received is not reassembled properly.
> >
> > If we use dynfiled for data, why not use dynflag for
> > RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE?
> > That way we can avoid introduced hardcoded (always defined) flags for that
> > case.
>
> I have not looked into using dynflag. Will explore if it can be used.
The intent of adding this feature is to reduce application cycles for IP reassembly.
But if we use dynflag, it will take a lot of cycles to check if dyn flag is set or not.
As I understand, it first need to be looked up in a linked list and then checked.
And this will be checked for each packet even if there is no reassembly involved.
Hi Akhil,
> Hi Konstantin,
> > > > Hardware IP reassembly may be incomplete for multiple reasons like
> > > > reassembly timeout reached, duplicate fragments, etc.
> > > > To save application cycles to process these packets again, a new
> > > > mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added to
> > > > show that the mbuf received is not reassembled properly.
> > >
> > > If we use dynfiled for data, why not use dynflag for
> > > RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE?
> > > That way we can avoid introduced hardcoded (always defined) flags for that
> > > case.
> >
> > I have not looked into using dynflag. Will explore if it can be used.
> The intent of adding this feature is to reduce application cycles for IP reassembly.
> But if we use dynflag, it will take a lot of cycles to check if dyn flag is set or not.
> As I understand, it first need to be looked up in a linked list and then checked.
> And this will be checked for each packet even if there is no reassembly involved.
No, I don't think it is correct understanding.
For dyn-flag it is the same approach as for dyn-field.
At init time it selects the bit which will be used and return it'e value to the user.
Then user will set/check the at runtime.
So no linking list walks at runtime.
All you missing comparing to hard-coded values: complier optimizations.
> Hi Akhil,
>
> > Hi Konstantin,
> > > > > Hardware IP reassembly may be incomplete for multiple reasons like
> > > > > reassembly timeout reached, duplicate fragments, etc.
> > > > > To save application cycles to process these packets again, a new
> > > > > mbuf ol_flag (RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE) is added
> to
> > > > > show that the mbuf received is not reassembled properly.
> > > >
> > > > If we use dynfiled for data, why not use dynflag for
> > > > RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE?
> > > > That way we can avoid introduced hardcoded (always defined) flags for
> that
> > > > case.
> > >
> > > I have not looked into using dynflag. Will explore if it can be used.
> > The intent of adding this feature is to reduce application cycles for IP
> reassembly.
> > But if we use dynflag, it will take a lot of cycles to check if dyn flag is set or
> not.
> > As I understand, it first need to be looked up in a linked list and then checked.
> > And this will be checked for each packet even if there is no reassembly
> involved.
>
> No, I don't think it is correct understanding.
> For dyn-flag it is the same approach as for dyn-field.
> At init time it selects the bit which will be used and return it'e value to the user.
> Then user will set/check the at runtime.
> So no linking list walks at runtime.
> All you missing comparing to hard-coded values: complier optimizations.
>
Ok, got it. rte_mbuf_dynflag_lookup() need to happen only for the first mbuf.
I was checking is_timestamp_enabled() in test-pmd. Didn't see that dynflag was
a static variable.
I thought it was happening for each packet.
> > > > >
> > > > > +/**
> > > > > + * In case of IP reassembly offload failure, ol_flags in mbuf will be set
> > > > > + * with RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE and packets will
> > be
> > > > returned
> > > > > + * without alteration. The application can retrieve the attached fragments
> > > > > + * using mbuf dynamic field.
> > > > > + */
> > > > > +typedef struct {
> > > > > + /**
> > > > > + * Next fragment packet. Application should fetch dynamic field of
> > > > > + * each fragment until a NULL is received and nb_frags is 0.
> > > > > + */
> > > > > + struct rte_mbuf *next_frag;
> > > > > + /** Time spent(in ms) by HW in waiting for further fragments. */
> > > > > + uint16_t time_spent;
> > > > > + /** Number of more fragments attached in mbuf dynamic fields. */
> > > > > + uint16_t nb_frags;
> > > > > +} rte_eth_ip_reass_dynfield_t;
> > > >
> > > >
> > > > Looks like a bit of overkill to me:
> > > > We do already have 'next' and 'nb_frags' fields inside mbuf,
> > > > why can't they be used here? Why a separate ones are necessary?
> > > >
> > > The next and nb_frags in mbuf is for segmented buffers and not IP fragments.
> > > But here we will have separate mbufs in each dynfield denoting each of the
> > > fragments which may have further segmented buffers.
> >
> > Makes sense, thanks for explanation.
> > Though in that case just 'struct rte_mbuf *next_frag' might be enough
> > (user will walk though the list till mbuf->next_frag != NULL)?
> > The reason I am asking: current sizeof(rte_eth_ip_reass_dynfield_t) is 16B,
> > which is quite a lot for mbuf, especially considering that it has to be continuous
> > 16B.
> > Making it smaller (8B) or even splitting into 2 fileds (8+4) will give it more
> > chances
> > to coexist with other dynfields.
>
> Even if we drop nb_frags, we will be left with uint16_t time_spent.
> Are you suggesting to use separate dynfield altogether for 2 bytes of time_spent?
Yes, that's was my thought - split it into two separate fields, if possible.
@@ -1671,6 +1671,14 @@ int
rte_eth_hairpin_queue_peer_unbind(uint16_t cur_port, uint16_t cur_queue,
uint32_t direction);
+/**
+ * @internal
+ * Register mbuf dynamic field for IP reassembly incomplete case.
+ */
+__rte_internal
+int
+rte_eth_ip_reass_dynfield_register(void);
+
/*
* Legacy ethdev API used internally by drivers.
@@ -6503,6 +6503,22 @@ rte_eth_ip_reassembly_conf_set(uint16_t port_id,
(*dev->dev_ops->ip_reassembly_conf_set)(dev, conf));
}
+#define RTE_ETH_IP_REASS_DYNFIELD_NAME "rte_eth_ip_reass_dynfield"
+int rte_eth_ip_reass_dynfield_offset = -1;
+
+int
+rte_eth_ip_reass_dynfield_register(void)
+{
+ static const struct rte_mbuf_dynfield dynfield_desc = {
+ .name = RTE_ETH_IP_REASS_DYNFIELD_NAME,
+ .size = sizeof(rte_eth_ip_reass_dynfield_t),
+ .align = __alignof__(rte_eth_ip_reass_dynfield_t),
+ };
+ rte_eth_ip_reass_dynfield_offset =
+ rte_mbuf_dynfield_register(&dynfield_desc);
+ return rte_eth_ip_reass_dynfield_offset;
+}
+
RTE_LOG_REGISTER_DEFAULT(rte_eth_dev_logtype, INFO);
RTE_INIT(ethdev_init_telemetry)
@@ -5245,6 +5245,63 @@ __rte_experimental
int rte_eth_ip_reassembly_conf_set(uint16_t port_id,
struct rte_eth_ip_reass_params *conf);
+/**
+ * In case of IP reassembly offload failure, ol_flags in mbuf will be set
+ * with RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE and packets will be returned
+ * without alteration. The application can retrieve the attached fragments
+ * using mbuf dynamic field.
+ */
+typedef struct {
+ /**
+ * Next fragment packet. Application should fetch dynamic field of
+ * each fragment until a NULL is received and nb_frags is 0.
+ */
+ struct rte_mbuf *next_frag;
+ /** Time spent(in ms) by HW in waiting for further fragments. */
+ uint16_t time_spent;
+ /** Number of more fragments attached in mbuf dynamic fields. */
+ uint16_t nb_frags;
+} rte_eth_ip_reass_dynfield_t;
+
+extern int rte_eth_ip_reass_dynfield_offset;
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get pointer to mbuf dynamic field for getting incomplete
+ * reassembled fragments.
+ *
+ * For performance reason, no check is done,
+ * the dynamic field may not be registered.
+ * @see rte_eth_ip_reass_dynfield_is_registered
+ *
+ * @param mbuf packet to access
+ * @return pointer to mbuf dynamic field
+ */
+__rte_experimental
+static inline rte_eth_ip_reass_dynfield_t *
+rte_eth_ip_reass_dynfield(struct rte_mbuf *mbuf)
+{
+ return RTE_MBUF_DYNFIELD(mbuf,
+ rte_eth_ip_reass_dynfield_offset,
+ rte_eth_ip_reass_dynfield_t *);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Check whether the dynamic field is registered.
+ *
+ * @return true if rte_eth_ip_reass_dynfield_register() has been called.
+ */
+__rte_experimental
+static inline bool rte_eth_ip_reass_dynfield_is_registered(void)
+{
+ return rte_eth_ip_reass_dynfield_offset >= 0;
+}
+
#include <rte_ethdev_core.h>
@@ -259,6 +259,7 @@ EXPERIMENTAL {
#added in 22.03
rte_eth_ip_reassembly_conf_set;
+ rte_eth_ip_reass_dynfield_offset;
};
INTERNAL {
@@ -282,6 +283,7 @@ INTERNAL {
rte_eth_hairpin_queue_peer_bind;
rte_eth_hairpin_queue_peer_unbind;
rte_eth_hairpin_queue_peer_update;
+ rte_eth_ip_reass_dynfield_register;
rte_eth_representor_id_get;
rte_eth_switch_domain_alloc;
rte_eth_switch_domain_free;
@@ -233,10 +233,11 @@ extern "C" {
#define PKT_RX_OUTER_L4_CKSUM_INVALID \
RTE_DEPRECATED(PKT_RX_OUTER_L4_CKSUM_INVALID) \
RTE_MBUF_F_RX_OUTER_L4_CKSUM_INVALID
+#define RTE_MBUF_F_RX_IPREASSEMBLY_INCOMPLETE (1ULL << 23)
/* add new RX flags here, don't forget to update RTE_MBUF_F_FIRST_FREE */
-#define RTE_MBUF_F_FIRST_FREE (1ULL << 23)
+#define RTE_MBUF_F_FIRST_FREE (1ULL << 24)
#define PKT_FIRST_FREE RTE_DEPRECATED(PKT_FIRST_FREE) RTE_MBUF_F_FIRST_FREE
#define RTE_MBUF_F_LAST_FREE (1ULL << 40)
#define PKT_LAST_FREE RTE_DEPRECATED(PKT_LAST_FREE) RTE_MBUF_F_LAST_FREE