diff mbox series

net/bnxt: fixes for vxlan decap full offload

Message ID 1604249501-28929-1-git-send-email-venkatkumar.duvvuru@broadcom.com (mailing list archive)
State Accepted, archived
Delegated to: Ajit Khaparde
Headers show
Series net/bnxt: fixes for vxlan decap full offload | expand

Checks

Context Check Description
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Venkat Duvvuru Nov. 1, 2020, 4:51 p.m. UTC
1. When a PMD application queries for flow counters, it could ask the
   PMD to reset the counters while the application itself is doing the
   counters accumulation. In this case, the PMD should not accumulate
   but rather reset the counter.

2. Some of the PMD applications may set the protocol field in the IPv4
   spec but don't set the mask. So, consider the mask in the proto
   value calculation.

3. The cached tunnel inner flow is not getting installed in the
   context of tunnel outer flow create because of the wrong
   error code check when the tunnel outer flow is installed in the
   hardware.

4. When a DPDK application offloads the same tunnel inner flow on
   all the uplink ports, the driver rejects all but the first one.
   However, the first tunnel inner flow request might not be from
   the correct physical port. This is fixed by caching the tunnel
   inner flow entry for all the ports on which the flow offload
   request has arrived. The tunnel inner flows which were cached
   on the irrelevant ports will eventually get aged out as there
   won't be any traffic on these ports.

Fixes: 3c0da5ee4064 ("net/bnxt: add VXLAN decap offload support")

Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
Reviewed-by: Ajit Kumar Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
---
 drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c       |  1 +
 drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c          |  9 ++---
 drivers/net/bnxt/tf_ulp/ulp_flow_db.c         | 18 +++++++--
 drivers/net/bnxt/tf_ulp/ulp_flow_db.h         |  3 +-
 drivers/net/bnxt/tf_ulp/ulp_rte_parser.c      | 14 +++++++
 drivers/net/bnxt/tf_ulp/ulp_template_struct.h |  1 +
 drivers/net/bnxt/tf_ulp/ulp_tun.c             | 55 +++++++++++++++++++++------
 drivers/net/bnxt/tf_ulp/ulp_tun.h             | 13 +++++--
 8 files changed, 89 insertions(+), 25 deletions(-)

Comments

Ajit Khaparde Nov. 3, 2020, 12:23 a.m. UTC | #1
On Sun, Nov 1, 2020 at 8:52 AM Venkat Duvvuru <
venkatkumar.duvvuru@broadcom.com> wrote:

> 1. When a PMD application queries for flow counters, it could ask PMD
>    to reset the counters when the application is doing the counters
>    accumulation. In this case, PMD should not accumulate rather reset
>    the counter.
>
> 2. Some of the PMD applications may set the protocol field in the IPv4
>    spec but don't set the mask. So, consider the mask in the proto
>    value calculation.
>
> 4. The cached tunnel inner flow is not getting installed in the
>    context of tunnel outer flow create because of the wrong
>    error code check when tunnel outer flow is installed in the
>    hardware.
>
> 5. When a dpdk application offloads the same tunnel inner flow on
>    all the uplink ports, other than the first one the driver rejects
>    the rest of them. However, the first tunnel inner flow request
>    might not be of the correct physical port. This is fixed by
>    caching the tunnel inner flow entry for all the ports on which
>    the flow offload request has arrived on. The tunnel inner flows
>    which were cached on the irrelevant ports will eventually get
>    aged out as there won't be any traffic on these ports.
>
> Fixes: 3c0da5ee4064 ("net/bnxt: add VXLAN decap offload support")
>
> Signed-off-by: Venkat Duvvuru <venkatkumar.duvvuru@broadcom.com>
> Reviewed-by: Ajit Kumar Khaparde <ajit.khaparde@broadcom.com>
> Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
>
Updated the fixes tag and commit message and applied to dpdk-next-net-brcm.

---
>  drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c       |  1 +
>  drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c          |  9 ++---
>  drivers/net/bnxt/tf_ulp/ulp_flow_db.c         | 18 +++++++--
>  drivers/net/bnxt/tf_ulp/ulp_flow_db.h         |  3 +-
>  drivers/net/bnxt/tf_ulp/ulp_rte_parser.c      | 14 +++++++
>  drivers/net/bnxt/tf_ulp/ulp_template_struct.h |  1 +
>  drivers/net/bnxt/tf_ulp/ulp_tun.c             | 55
> +++++++++++++++++++++------
>  drivers/net/bnxt/tf_ulp/ulp_tun.h             | 13 +++++--
>  8 files changed, 89 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
> b/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
> index 75a7dbe..a53b1cf 100644
> --- a/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
> +++ b/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
> @@ -175,6 +175,7 @@ bnxt_ulp_flow_create(struct rte_eth_dev *dev,
>         params.fid = fid;
>         params.func_id = func_id;
>         params.priority = attr->priority;
> +       params.port_id = bnxt_get_phy_port_id(dev->data->port_id);
>         /* Perform the rte flow post process */
>         ret = bnxt_ulp_rte_parser_post_process(&params);
>         if (ret == BNXT_TF_RC_ERROR)
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
> b/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
> index 734b419..4502551 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
> +++ b/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
> @@ -624,11 +624,10 @@ int ulp_fc_mgr_query_count_get(struct
> bnxt_ulp_context *ctxt,
>                 pthread_mutex_unlock(&ulp_fc_info->fc_lock);
>         } else if (params.resource_sub_type ==
>
> BNXT_ULP_RESOURCE_SUB_TYPE_INDEX_TYPE_INT_COUNT_ACC) {
> -               /* Get the stats from the parent child table */
> -               ulp_flow_db_parent_flow_count_get(ctxt,
> -                                                 flow_id,
> -                                                 &count->hits,
> -                                                 &count->bytes);
> +               /* Get stats from the parent child table */
> +               ulp_flow_db_parent_flow_count_get(ctxt, flow_id,
> +                                                 &count->hits,
> &count->bytes,
> +                                                 count->reset);
>                 count->hits_set = 1;
>                 count->bytes_set = 1;
>         } else {
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
> b/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
> index 5e7c8ab..a331410 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
> +++ b/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
> @@ -868,8 +868,9 @@ ulp_flow_db_fid_free(struct bnxt_ulp_context *ulp_ctxt,
>                      enum bnxt_ulp_fdb_type flow_type,
>                      uint32_t fid)
>  {
> -       struct bnxt_ulp_flow_db *flow_db;
> +       struct bnxt_tun_cache_entry *tun_tbl;
>         struct bnxt_ulp_flow_tbl *flow_tbl;
> +       struct bnxt_ulp_flow_db *flow_db;
>
>         flow_db = bnxt_ulp_cntxt_ptr2_flow_db_get(ulp_ctxt);
>         if (!flow_db) {
> @@ -908,6 +909,12 @@ ulp_flow_db_fid_free(struct bnxt_ulp_context
> *ulp_ctxt,
>         if (flow_type == BNXT_ULP_FDB_TYPE_REGULAR)
>                 ulp_flow_db_func_id_set(flow_db, fid, 0);
>
> +       tun_tbl = bnxt_ulp_cntxt_ptr2_tun_tbl_get(ulp_ctxt);
> +       if (!tun_tbl)
> +               return -EINVAL;
> +
> +       ulp_clear_tun_inner_entry(tun_tbl, fid);
> +
>         /* all good, return success */
>         return 0;
>  }
> @@ -1812,9 +1819,8 @@ ulp_flow_db_parent_flow_count_update(struct
> bnxt_ulp_context *ulp_ctxt,
>   */
>  int32_t
>  ulp_flow_db_parent_flow_count_get(struct bnxt_ulp_context *ulp_ctxt,
> -                                 uint32_t parent_fid,
> -                                 uint64_t *packet_count,
> -                                 uint64_t *byte_count)
> +                                 uint32_t parent_fid, uint64_t
> *packet_count,
> +                                 uint64_t *byte_count, uint8_t
> count_reset)
>  {
>         struct bnxt_ulp_flow_db *flow_db;
>         struct ulp_fdb_parent_child_db *p_pdb;
> @@ -1835,6 +1841,10 @@ ulp_flow_db_parent_flow_count_get(struct
> bnxt_ulp_context *ulp_ctxt,
>
> p_pdb->parent_flow_tbl[idx].pkt_count;
>                                 *byte_count =
>
> p_pdb->parent_flow_tbl[idx].byte_count;
> +                               if (count_reset) {
> +
>  p_pdb->parent_flow_tbl[idx].pkt_count = 0;
> +
>  p_pdb->parent_flow_tbl[idx].byte_count = 0;
> +                               }
>                         }
>                         return 0;
>                 }
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
> b/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
> index f7dfd67..a2a04ce 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
> +++ b/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
> @@ -390,7 +390,8 @@ int32_t
>  ulp_flow_db_parent_flow_count_get(struct bnxt_ulp_context *ulp_ctxt,
>                                   uint32_t parent_fid,
>                                   uint64_t *packet_count,
> -                                 uint64_t *byte_count);
> +                                 uint64_t *byte_count,
> +                                 uint8_t count_reset);
>
>  /*
>   * reset the parent accumulation counters
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
> b/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
> index df38b83..a54c55c 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
> +++ b/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
> @@ -1012,6 +1012,13 @@ ulp_rte_ipv4_hdr_handler(const struct rte_flow_item
> *item,
>                 ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_O_L3, 1);
>         }
>
> +       /* Some of the PMD applications may set the protocol field
> +        * in the IPv4 spec but don't set the mask. So, consider
> +        * the mask in the proto value calculation.
> +        */
> +       if (ipv4_mask)
> +               proto &= ipv4_mask->hdr.next_proto_id;
> +
>         /* Update the field protocol hdr bitmap */
>         ulp_rte_l3_proto_type_update(params, proto, inner_flag);
>         ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_L3_HDR_CNT, ++cnt);
> @@ -1150,6 +1157,13 @@ ulp_rte_ipv6_hdr_handler(const struct rte_flow_item
> *item,
>                 ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_O_L3, 1);
>         }
>
> +       /* Some of the PMD applications may set the protocol field
> +        * in the IPv6 spec but don't set the mask. So, consider
> +        * the mask in proto value calculation.
> +        */
> +       if (ipv6_mask)
> +               proto &= ipv6_mask->hdr.proto;
> +
>         /* Update the field protocol hdr bitmap */
>         ulp_rte_l3_proto_type_update(params, proto, inner_flag);
>         ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_L3_HDR_CNT, ++cnt);
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
> b/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
> index 9d690a9..6e5e8d6 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
> +++ b/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
> @@ -77,6 +77,7 @@ struct ulp_rte_parser_params {
>         uint32_t                        parent_flow;
>         uint32_t                        parent_fid;
>         uint16_t                        func_id;
> +       uint16_t                        port_id;
>         uint32_t                        class_id;
>         uint32_t                        act_tmpl;
>         struct bnxt_ulp_context         *ulp_ctx;
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_tun.c
> b/drivers/net/bnxt/tf_ulp/ulp_tun.c
> index e8d2861..44b9b4b 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_tun.c
> +++ b/drivers/net/bnxt/tf_ulp/ulp_tun.c
> @@ -55,7 +55,8 @@ ulp_install_outer_tun_flow(struct ulp_rte_parser_params
> *params,
>                RTE_ETHER_ADDR_LEN);
>
>         tun_entry->valid = true;
> -       tun_entry->state = BNXT_ULP_FLOW_STATE_TUN_O_OFFLD;
> +       tun_entry->tun_flow_info[params->port_id].state =
> +                               BNXT_ULP_FLOW_STATE_TUN_O_OFFLD;
>         tun_entry->outer_tun_flow_id = params->fid;
>
>         /* F1 and it's related F2s are correlated based on
> @@ -83,20 +84,23 @@ ulp_install_outer_tun_flow(struct
> ulp_rte_parser_params *params,
>
>  /* This function programs the inner tunnel flow in the hardware. */
>  static void
> -ulp_install_inner_tun_flow(struct bnxt_tun_cache_entry *tun_entry)
> +ulp_install_inner_tun_flow(struct bnxt_tun_cache_entry *tun_entry,
> +                          struct ulp_rte_parser_params *tun_o_params)
>  {
>         struct bnxt_ulp_mapper_create_parms mparms = { 0 };
> +       struct ulp_per_port_flow_info *flow_info;
>         struct ulp_rte_parser_params *params;
>         int ret;
>
>         /* F2 doesn't have tunnel dmac, use the tunnel dmac that was
>          * stored during F1 programming.
>          */
> -       params = &tun_entry->first_inner_tun_params;
> +       flow_info = &tun_entry->tun_flow_info[tun_o_params->port_id];
> +       params = &flow_info->first_inner_tun_params;
>         memcpy(&params->hdr_field[ULP_TUN_O_DMAC_HDR_FIELD_INDEX],
>                tun_entry->t_dmac, RTE_ETHER_ADDR_LEN);
>         params->parent_fid = tun_entry->outer_tun_flow_id;
> -       params->fid = tun_entry->first_inner_tun_flow_id;
> +       params->fid = flow_info->first_tun_i_fid;
>
>         bnxt_ulp_init_mapper_params(&mparms, params,
>                                     BNXT_ULP_FDB_TYPE_REGULAR);
> @@ -117,18 +121,20 @@ ulp_post_process_outer_tun_flow(struct
> ulp_rte_parser_params *params,
>         enum bnxt_ulp_tun_flow_state flow_state;
>         int ret;
>
> -       flow_state = tun_entry->state;
> +       flow_state = tun_entry->tun_flow_info[params->port_id].state;
>         ret = ulp_install_outer_tun_flow(params, tun_entry, tun_idx);
> -       if (ret)
> +       if (ret == BNXT_TF_RC_ERROR) {
> +               PMD_DRV_LOG(ERR, "Failed to create outer tunnel flow.");
>                 return ret;
> +       }
>
>         /* If flow_state == BNXT_ULP_FLOW_STATE_NORMAL before installing
>          * F1, that means F2 is not deferred. Hence, no need to install F2.
>          */
>         if (flow_state != BNXT_ULP_FLOW_STATE_NORMAL)
> -               ulp_install_inner_tun_flow(tun_entry);
> +               ulp_install_inner_tun_flow(tun_entry, params);
>
> -       return 0;
> +       return BNXT_TF_RC_FID;
>  }
>
>  /* This function will be called if inner tunnel flow request comes before
> @@ -138,6 +144,7 @@ static int32_t
>  ulp_post_process_first_inner_tun_flow(struct ulp_rte_parser_params
> *params,
>                                       struct bnxt_tun_cache_entry
> *tun_entry)
>  {
> +       struct ulp_per_port_flow_info *flow_info;
>         int ret;
>
>         ret = ulp_matcher_pattern_match(params, &params->class_id);
> @@ -153,11 +160,12 @@ ulp_post_process_first_inner_tun_flow(struct
> ulp_rte_parser_params *params,
>          * So, just cache the F2 information and program it in the context
>          * of F1 flow installation.
>          */
> -       memcpy(&tun_entry->first_inner_tun_params, params,
> +       flow_info = &tun_entry->tun_flow_info[params->port_id];
> +       memcpy(&flow_info->first_inner_tun_params, params,
>                sizeof(struct ulp_rte_parser_params));
>
> -       tun_entry->first_inner_tun_flow_id = params->fid;
> -       tun_entry->state = BNXT_ULP_FLOW_STATE_TUN_I_CACHED;
> +       flow_info->first_tun_i_fid = params->fid;
> +       flow_info->state = BNXT_ULP_FLOW_STATE_TUN_I_CACHED;
>
>         /* F1 and it's related F2s are correlated based on
>          * Tunnel Destination IP Address. It could be already set, if
> @@ -259,7 +267,7 @@ ulp_post_process_tun_flow(struct ulp_rte_parser_params
> *params)
>         if (rc == BNXT_TF_RC_ERROR)
>                 return rc;
>
> -       flow_state = tun_entry->state;
> +       flow_state = tun_entry->tun_flow_info[params->port_id].state;
>         /* Outer tunnel flow validation */
>         outer_tun_sig = BNXT_OUTER_TUN_SIGNATURE(l3_tun, params);
>         outer_tun_flow = BNXT_OUTER_TUN_FLOW(outer_tun_sig);
> @@ -308,3 +316,26 @@ ulp_clear_tun_entry(struct bnxt_tun_cache_entry
> *tun_tbl, uint8_t tun_idx)
>         memset(&tun_tbl[tun_idx], 0,
>                 sizeof(struct bnxt_tun_cache_entry));
>  }
> +
> +/* When a dpdk application offloads the same tunnel inner flow
> + * on all the uplink ports, a tunnel inner flow entry is cached
> + * even if it is not for the right uplink port. Such tunnel
> + * inner flows will eventually get aged out as there won't be
> + * any traffic on these ports. When such a flow destroy is
> + * called, cleanup the tunnel inner flow entry.
> + */
> +void
> +ulp_clear_tun_inner_entry(struct bnxt_tun_cache_entry *tun_tbl, uint32_t
> fid)
> +{
> +       struct ulp_per_port_flow_info *flow_info;
> +       int i, j;
> +
> +       for (i = 0; i < BNXT_ULP_MAX_TUN_CACHE_ENTRIES ; i++) {
> +               for (j = 0; j < RTE_MAX_ETHPORTS; j++) {
> +                       flow_info = &tun_tbl[i].tun_flow_info[j];
> +                       if (flow_info->first_tun_i_fid == fid &&
> +                           flow_info->state ==
> BNXT_ULP_FLOW_STATE_TUN_I_CACHED)
> +                               memset(flow_info, 0, sizeof(*flow_info));
> +               }
> +       }
> +}
> diff --git a/drivers/net/bnxt/tf_ulp/ulp_tun.h
> b/drivers/net/bnxt/tf_ulp/ulp_tun.h
> index ad70ae6..a4ccdb5 100644
> --- a/drivers/net/bnxt/tf_ulp/ulp_tun.h
> +++ b/drivers/net/bnxt/tf_ulp/ulp_tun.h
> @@ -70,8 +70,13 @@ enum bnxt_ulp_tun_flow_state {
>         BNXT_ULP_FLOW_STATE_TUN_I_CACHED
>  };
>
> -struct bnxt_tun_cache_entry {
> +struct ulp_per_port_flow_info {
>         enum bnxt_ulp_tun_flow_state    state;
> +       uint32_t                        first_tun_i_fid;
> +       struct ulp_rte_parser_params    first_inner_tun_params;
> +};
> +
> +struct bnxt_tun_cache_entry {
>         bool                            valid;
>         bool                            t_dst_ip_valid;
>         uint8_t                         t_dmac[RTE_ETHER_ADDR_LEN];
> @@ -80,13 +85,15 @@ struct bnxt_tun_cache_entry {
>                 uint8_t                 t_dst_ip6[16];
>         };
>         uint32_t                        outer_tun_flow_id;
> -       uint32_t                        first_inner_tun_flow_id;
>         uint16_t                        outer_tun_rej_cnt;
>         uint16_t                        inner_tun_rej_cnt;
> -       struct ulp_rte_parser_params    first_inner_tun_params;
> +       struct ulp_per_port_flow_info   tun_flow_info[RTE_MAX_ETHPORTS];
>  };
>
>  void
>  ulp_clear_tun_entry(struct bnxt_tun_cache_entry *tun_tbl, uint8_t
> tun_idx);
>
> +void
> +ulp_clear_tun_inner_entry(struct bnxt_tun_cache_entry *tun_tbl, uint32_t
> fid);
> +
>  #endif
> --
> 2.7.4
>
>
diff mbox series

Patch

diff --git a/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c b/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
index 75a7dbe..a53b1cf 100644
--- a/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
+++ b/drivers/net/bnxt/tf_ulp/bnxt_ulp_flow.c
@@ -175,6 +175,7 @@  bnxt_ulp_flow_create(struct rte_eth_dev *dev,
 	params.fid = fid;
 	params.func_id = func_id;
 	params.priority = attr->priority;
+	params.port_id = bnxt_get_phy_port_id(dev->data->port_id);
 	/* Perform the rte flow post process */
 	ret = bnxt_ulp_rte_parser_post_process(&params);
 	if (ret == BNXT_TF_RC_ERROR)
diff --git a/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c b/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
index 734b419..4502551 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
+++ b/drivers/net/bnxt/tf_ulp/ulp_fc_mgr.c
@@ -624,11 +624,10 @@  int ulp_fc_mgr_query_count_get(struct bnxt_ulp_context *ctxt,
 		pthread_mutex_unlock(&ulp_fc_info->fc_lock);
 	} else if (params.resource_sub_type ==
 			BNXT_ULP_RESOURCE_SUB_TYPE_INDEX_TYPE_INT_COUNT_ACC) {
-		/* Get the stats from the parent child table */
-		ulp_flow_db_parent_flow_count_get(ctxt,
-						  flow_id,
-						  &count->hits,
-						  &count->bytes);
+		/* Get stats from the parent child table */
+		ulp_flow_db_parent_flow_count_get(ctxt, flow_id,
+						  &count->hits, &count->bytes,
+						  count->reset);
 		count->hits_set = 1;
 		count->bytes_set = 1;
 	} else {
diff --git a/drivers/net/bnxt/tf_ulp/ulp_flow_db.c b/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
index 5e7c8ab..a331410 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
+++ b/drivers/net/bnxt/tf_ulp/ulp_flow_db.c
@@ -868,8 +868,9 @@  ulp_flow_db_fid_free(struct bnxt_ulp_context *ulp_ctxt,
 		     enum bnxt_ulp_fdb_type flow_type,
 		     uint32_t fid)
 {
-	struct bnxt_ulp_flow_db *flow_db;
+	struct bnxt_tun_cache_entry *tun_tbl;
 	struct bnxt_ulp_flow_tbl *flow_tbl;
+	struct bnxt_ulp_flow_db *flow_db;
 
 	flow_db = bnxt_ulp_cntxt_ptr2_flow_db_get(ulp_ctxt);
 	if (!flow_db) {
@@ -908,6 +909,12 @@  ulp_flow_db_fid_free(struct bnxt_ulp_context *ulp_ctxt,
 	if (flow_type == BNXT_ULP_FDB_TYPE_REGULAR)
 		ulp_flow_db_func_id_set(flow_db, fid, 0);
 
+	tun_tbl = bnxt_ulp_cntxt_ptr2_tun_tbl_get(ulp_ctxt);
+	if (!tun_tbl)
+		return -EINVAL;
+
+	ulp_clear_tun_inner_entry(tun_tbl, fid);
+
 	/* all good, return success */
 	return 0;
 }
@@ -1812,9 +1819,8 @@  ulp_flow_db_parent_flow_count_update(struct bnxt_ulp_context *ulp_ctxt,
  */
 int32_t
 ulp_flow_db_parent_flow_count_get(struct bnxt_ulp_context *ulp_ctxt,
-				  uint32_t parent_fid,
-				  uint64_t *packet_count,
-				  uint64_t *byte_count)
+				  uint32_t parent_fid, uint64_t *packet_count,
+				  uint64_t *byte_count, uint8_t count_reset)
 {
 	struct bnxt_ulp_flow_db *flow_db;
 	struct ulp_fdb_parent_child_db *p_pdb;
@@ -1835,6 +1841,10 @@  ulp_flow_db_parent_flow_count_get(struct bnxt_ulp_context *ulp_ctxt,
 					p_pdb->parent_flow_tbl[idx].pkt_count;
 				*byte_count =
 					p_pdb->parent_flow_tbl[idx].byte_count;
+				if (count_reset) {
+					p_pdb->parent_flow_tbl[idx].pkt_count = 0;
+					p_pdb->parent_flow_tbl[idx].byte_count = 0;
+				}
 			}
 			return 0;
 		}
diff --git a/drivers/net/bnxt/tf_ulp/ulp_flow_db.h b/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
index f7dfd67..a2a04ce 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
+++ b/drivers/net/bnxt/tf_ulp/ulp_flow_db.h
@@ -390,7 +390,8 @@  int32_t
 ulp_flow_db_parent_flow_count_get(struct bnxt_ulp_context *ulp_ctxt,
 				  uint32_t parent_fid,
 				  uint64_t *packet_count,
-				  uint64_t *byte_count);
+				  uint64_t *byte_count,
+				  uint8_t count_reset);
 
 /*
  * reset the parent accumulation counters
diff --git a/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c b/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
index df38b83..a54c55c 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
+++ b/drivers/net/bnxt/tf_ulp/ulp_rte_parser.c
@@ -1012,6 +1012,13 @@  ulp_rte_ipv4_hdr_handler(const struct rte_flow_item *item,
 		ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_O_L3, 1);
 	}
 
+	/* Some of the PMD applications may set the protocol field
+	 * in the IPv4 spec but don't set the mask. So, consider
+	 * the mask in the proto value calculation.
+	 */
+	if (ipv4_mask)
+		proto &= ipv4_mask->hdr.next_proto_id;
+
 	/* Update the field protocol hdr bitmap */
 	ulp_rte_l3_proto_type_update(params, proto, inner_flag);
 	ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_L3_HDR_CNT, ++cnt);
@@ -1150,6 +1157,13 @@  ulp_rte_ipv6_hdr_handler(const struct rte_flow_item *item,
 		ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_O_L3, 1);
 	}
 
+	/* Some of the PMD applications may set the protocol field
+	 * in the IPv6 spec but don't set the mask. So, consider
+	 * the mask in proto value calculation.
+	 */
+	if (ipv6_mask)
+		proto &= ipv6_mask->hdr.proto;
+
 	/* Update the field protocol hdr bitmap */
 	ulp_rte_l3_proto_type_update(params, proto, inner_flag);
 	ULP_COMP_FLD_IDX_WR(params, BNXT_ULP_CF_IDX_L3_HDR_CNT, ++cnt);
diff --git a/drivers/net/bnxt/tf_ulp/ulp_template_struct.h b/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
index 9d690a9..6e5e8d6 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
+++ b/drivers/net/bnxt/tf_ulp/ulp_template_struct.h
@@ -77,6 +77,7 @@  struct ulp_rte_parser_params {
 	uint32_t			parent_flow;
 	uint32_t			parent_fid;
 	uint16_t			func_id;
+	uint16_t			port_id;
 	uint32_t			class_id;
 	uint32_t			act_tmpl;
 	struct bnxt_ulp_context		*ulp_ctx;
diff --git a/drivers/net/bnxt/tf_ulp/ulp_tun.c b/drivers/net/bnxt/tf_ulp/ulp_tun.c
index e8d2861..44b9b4b 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_tun.c
+++ b/drivers/net/bnxt/tf_ulp/ulp_tun.c
@@ -55,7 +55,8 @@  ulp_install_outer_tun_flow(struct ulp_rte_parser_params *params,
 	       RTE_ETHER_ADDR_LEN);
 
 	tun_entry->valid = true;
-	tun_entry->state = BNXT_ULP_FLOW_STATE_TUN_O_OFFLD;
+	tun_entry->tun_flow_info[params->port_id].state =
+				BNXT_ULP_FLOW_STATE_TUN_O_OFFLD;
 	tun_entry->outer_tun_flow_id = params->fid;
 
 	/* F1 and it's related F2s are correlated based on
@@ -83,20 +84,23 @@  ulp_install_outer_tun_flow(struct ulp_rte_parser_params *params,
 
 /* This function programs the inner tunnel flow in the hardware. */
 static void
-ulp_install_inner_tun_flow(struct bnxt_tun_cache_entry *tun_entry)
+ulp_install_inner_tun_flow(struct bnxt_tun_cache_entry *tun_entry,
+			   struct ulp_rte_parser_params *tun_o_params)
 {
 	struct bnxt_ulp_mapper_create_parms mparms = { 0 };
+	struct ulp_per_port_flow_info *flow_info;
 	struct ulp_rte_parser_params *params;
 	int ret;
 
 	/* F2 doesn't have tunnel dmac, use the tunnel dmac that was
 	 * stored during F1 programming.
 	 */
-	params = &tun_entry->first_inner_tun_params;
+	flow_info = &tun_entry->tun_flow_info[tun_o_params->port_id];
+	params = &flow_info->first_inner_tun_params;
 	memcpy(&params->hdr_field[ULP_TUN_O_DMAC_HDR_FIELD_INDEX],
 	       tun_entry->t_dmac, RTE_ETHER_ADDR_LEN);
 	params->parent_fid = tun_entry->outer_tun_flow_id;
-	params->fid = tun_entry->first_inner_tun_flow_id;
+	params->fid = flow_info->first_tun_i_fid;
 
 	bnxt_ulp_init_mapper_params(&mparms, params,
 				    BNXT_ULP_FDB_TYPE_REGULAR);
@@ -117,18 +121,20 @@  ulp_post_process_outer_tun_flow(struct ulp_rte_parser_params *params,
 	enum bnxt_ulp_tun_flow_state flow_state;
 	int ret;
 
-	flow_state = tun_entry->state;
+	flow_state = tun_entry->tun_flow_info[params->port_id].state;
 	ret = ulp_install_outer_tun_flow(params, tun_entry, tun_idx);
-	if (ret)
+	if (ret == BNXT_TF_RC_ERROR) {
+		PMD_DRV_LOG(ERR, "Failed to create outer tunnel flow.");
 		return ret;
+	}
 
 	/* If flow_state == BNXT_ULP_FLOW_STATE_NORMAL before installing
 	 * F1, that means F2 is not deferred. Hence, no need to install F2.
 	 */
 	if (flow_state != BNXT_ULP_FLOW_STATE_NORMAL)
-		ulp_install_inner_tun_flow(tun_entry);
+		ulp_install_inner_tun_flow(tun_entry, params);
 
-	return 0;
+	return BNXT_TF_RC_FID;
 }
 
 /* This function will be called if inner tunnel flow request comes before
@@ -138,6 +144,7 @@  static int32_t
 ulp_post_process_first_inner_tun_flow(struct ulp_rte_parser_params *params,
 				      struct bnxt_tun_cache_entry *tun_entry)
 {
+	struct ulp_per_port_flow_info *flow_info;
 	int ret;
 
 	ret = ulp_matcher_pattern_match(params, &params->class_id);
@@ -153,11 +160,12 @@  ulp_post_process_first_inner_tun_flow(struct ulp_rte_parser_params *params,
 	 * So, just cache the F2 information and program it in the context
 	 * of F1 flow installation.
 	 */
-	memcpy(&tun_entry->first_inner_tun_params, params,
+	flow_info = &tun_entry->tun_flow_info[params->port_id];
+	memcpy(&flow_info->first_inner_tun_params, params,
 	       sizeof(struct ulp_rte_parser_params));
 
-	tun_entry->first_inner_tun_flow_id = params->fid;
-	tun_entry->state = BNXT_ULP_FLOW_STATE_TUN_I_CACHED;
+	flow_info->first_tun_i_fid = params->fid;
+	flow_info->state = BNXT_ULP_FLOW_STATE_TUN_I_CACHED;
 
 	/* F1 and it's related F2s are correlated based on
 	 * Tunnel Destination IP Address. It could be already set, if
@@ -259,7 +267,7 @@  ulp_post_process_tun_flow(struct ulp_rte_parser_params *params)
 	if (rc == BNXT_TF_RC_ERROR)
 		return rc;
 
-	flow_state = tun_entry->state;
+	flow_state = tun_entry->tun_flow_info[params->port_id].state;
 	/* Outer tunnel flow validation */
 	outer_tun_sig = BNXT_OUTER_TUN_SIGNATURE(l3_tun, params);
 	outer_tun_flow = BNXT_OUTER_TUN_FLOW(outer_tun_sig);
@@ -308,3 +316,26 @@  ulp_clear_tun_entry(struct bnxt_tun_cache_entry *tun_tbl, uint8_t tun_idx)
 	memset(&tun_tbl[tun_idx], 0,
 		sizeof(struct bnxt_tun_cache_entry));
 }
+
+/* When a dpdk application offloads the same tunnel inner flow
+ * on all the uplink ports, a tunnel inner flow entry is cached
+ * even if it is not for the right uplink port. Such tunnel
+ * inner flows will eventually get aged out as there won't be
+ * any traffic on these ports. When such a flow destroy is
+ * called, cleanup the tunnel inner flow entry.
+ */
+void
+ulp_clear_tun_inner_entry(struct bnxt_tun_cache_entry *tun_tbl, uint32_t fid)
+{
+	struct ulp_per_port_flow_info *flow_info;
+	int i, j;
+
+	for (i = 0; i < BNXT_ULP_MAX_TUN_CACHE_ENTRIES ; i++) {
+		for (j = 0; j < RTE_MAX_ETHPORTS; j++) {
+			flow_info = &tun_tbl[i].tun_flow_info[j];
+			if (flow_info->first_tun_i_fid == fid &&
+			    flow_info->state == BNXT_ULP_FLOW_STATE_TUN_I_CACHED)
+				memset(flow_info, 0, sizeof(*flow_info));
+		}
+	}
+}
diff --git a/drivers/net/bnxt/tf_ulp/ulp_tun.h b/drivers/net/bnxt/tf_ulp/ulp_tun.h
index ad70ae6..a4ccdb5 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_tun.h
+++ b/drivers/net/bnxt/tf_ulp/ulp_tun.h
@@ -70,8 +70,13 @@  enum bnxt_ulp_tun_flow_state {
 	BNXT_ULP_FLOW_STATE_TUN_I_CACHED
 };
 
-struct bnxt_tun_cache_entry {
+struct ulp_per_port_flow_info {
 	enum bnxt_ulp_tun_flow_state	state;
+	uint32_t			first_tun_i_fid;
+	struct ulp_rte_parser_params	first_inner_tun_params;
+};
+
+struct bnxt_tun_cache_entry {
 	bool				valid;
 	bool				t_dst_ip_valid;
 	uint8_t				t_dmac[RTE_ETHER_ADDR_LEN];
@@ -80,13 +85,15 @@  struct bnxt_tun_cache_entry {
 		uint8_t			t_dst_ip6[16];
 	};
 	uint32_t			outer_tun_flow_id;
-	uint32_t			first_inner_tun_flow_id;
 	uint16_t			outer_tun_rej_cnt;
 	uint16_t			inner_tun_rej_cnt;
-	struct ulp_rte_parser_params	first_inner_tun_params;
+	struct ulp_per_port_flow_info	tun_flow_info[RTE_MAX_ETHPORTS];
 };
 
 void
 ulp_clear_tun_entry(struct bnxt_tun_cache_entry *tun_tbl, uint8_t tun_idx);
 
+void
+ulp_clear_tun_inner_entry(struct bnxt_tun_cache_entry *tun_tbl, uint32_t fid);
+
 #endif