[3/4] net/mlx5: accelerate DV flow counter query

Message ID 1562594861-27123-4-git-send-email-matan@mellanox.com
State Superseded
Delegated to: Raslan Darawsheh
Headers show
Series
  • net/mlx5: accelerate DV flow counters management
Related show

Checks

Context Check Description
ci/Intel-compilation fail Compilation issues
ci/checkpatch warning coding style issues

Commit Message

Matan Azrad July 8, 2019, 2:07 p.m.
All the DV counters are cached in the PMD memory and are contained in
pools, which are in turn held in containers according to the counter
allocation type - batch or single.

Currently, the flow counter query is done synchronously at pool
resolution, meaning that each user request triggers a FW command to
read all the counters in the pool.

A new devX feature for asynchronously reading a batch of flow counters
allows the user query operation to be accelerated.

Using the DPDK host thread, the PMD periodically triggers an asynchronous
query at pool resolution for all the counter pools, and an interrupt is
triggered by the FW when the values are updated.
In the interrupt handler, the raw data of the pool counter values is
replaced using a double buffer algorithm (very fast).
In the user query, the PMD just returns the last queried values from the
PMD cache - no system calls and no FW commands are triggered from the user
control thread on a query operation!

More synchronization is added with the host thread:
        Container resize uses a double buffer algorithm.
        Pool growth in a container uses an atomic operation.
        Pool query buffer replacement uses a spinlock.
        Pool minimum devX counter ID uses an atomic operation.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
---
 doc/guides/rel_notes/release_19_08.rst |   6 +-
 drivers/net/mlx5/Makefile              |   5 ++
 drivers/net/mlx5/meson.build           |   2 +
 drivers/net/mlx5/mlx5.c                |   9 ++
 drivers/net/mlx5/mlx5.h                |  44 ++++++++--
 drivers/net/mlx5/mlx5_devx_cmds.c      |  48 ++++++++++-
 drivers/net/mlx5/mlx5_ethdev.c         |  85 +++++++++++++++++--
 drivers/net/mlx5/mlx5_flow.c           | 147 +++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_flow.h           |   8 ++
 drivers/net/mlx5/mlx5_flow_dv.c        | 141 ++++++++++++++++++++-----------
 drivers/net/mlx5/mlx5_glue.c           |  62 ++++++++++++++
 drivers/net/mlx5/mlx5_glue.h           |  15 ++++
 12 files changed, 506 insertions(+), 66 deletions(-)

Patch

diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index ab5052e..5fb8552 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -190,11 +190,13 @@  New Features
   Added telemetry mode to l3fwd-power application to report
   application level busyness, empty and full polls of rte_eth_rx_burst().
 
-* **Updated Mellanox mlx5 driver.**
+* **Updated Mellanox mlx5 PMD.**
 
    Updated Mellanox mlx5 driver with new features and improvements, including:
 
-   * Added support for match on ICMP/ICMP6's code and type.
+  * Added support for match on ICMP/ICMP6's code and type.
+  * Accelerate flows with count action creation and destroy.
+  * Accelerate flows counter query.
 
 Removed Items
 -------------
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index b210c80..76d40b1 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -173,6 +173,11 @@  mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum MLX5DV_FLOW_ACTION_COUNTERS_DEVX \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_IBV_DEVX_ASYNC \
+		infiniband/mlx5dv.h \
+		func mlx5dv_devx_obj_query_async \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_ETHTOOL_LINK_MODE_25G \
 		/usr/include/linux/ethtool.h \
 		enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index 3eff22e..fabd490 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -122,6 +122,8 @@  if build
 		'mlx5dv_devx_obj_create' ],
 		[ 'HAVE_IBV_FLOW_DEVX_COUNTERS', 'infiniband/mlx5dv.h',
 		'MLX5DV_FLOW_ACTION_COUNTERS_DEVX' ],
+		[ 'HAVE_IBV_DEVX_ASYNC', 'infiniband/mlx5dv.h',
+		'mlx5dv_devx_obj_query_async' ],
 		[ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
 		'MLX5DV_DR_DOMAIN_TYPE_NIC_RX' ],
 		[ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h',
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 62be141..a8d824e 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -37,6 +37,7 @@ 
 #include <rte_rwlock.h>
 #include <rte_spinlock.h>
 #include <rte_string_fns.h>
+#include <rte_alarm.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -201,7 +202,15 @@  struct mlx5_dev_spawn_data {
 	struct mlx5_counter_stats_mem_mng *mng;
 	uint8_t i;
 	int j;
+	int retries = 1024;
 
+	rte_errno = 0;
+	while (--retries) {
+		rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
+		if (rte_errno != EINPROGRESS)
+			break;
+		rte_pause();
+	}
 	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
 		struct mlx5_flow_counter_pool *pool;
 		uint32_t batch = !!(i % 2);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3944b5f..4ce352a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -257,6 +257,7 @@  struct mlx5_drop {
 };
 
 #define MLX5_COUNTERS_PER_POOL 512
+#define MLX5_MAX_PENDING_QUERIES 4
 
 struct mlx5_flow_counter_pool;
 
@@ -283,7 +284,10 @@  struct mlx5_flow_counter {
 		struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
 		struct mlx5_flow_counter_pool *pool; /**< The counter pool. */
 	};
-	uint64_t hits; /**< Reset value of hits packets. */
+	union {
+		uint64_t hits; /**< Reset value of hits packets. */
+		int64_t query_gen; /**< Generation of the last release. */
+	};
 	uint64_t bytes; /**< Reset value of bytes. */
 	void *action; /**< Pointer to the dv action. */
 };
@@ -294,10 +298,17 @@  struct mlx5_flow_counter {
 struct mlx5_flow_counter_pool {
 	TAILQ_ENTRY(mlx5_flow_counter_pool) next;
 	struct mlx5_counters counters; /* Free counter list. */
-	struct mlx5_devx_obj *min_dcs;
-	/* The devx object of the minimum counter ID in the pool. */
-	struct mlx5_counter_stats_raw *raw; /* The counter stats memory raw. */
-	struct mlx5_flow_counter counters_raw[]; /* The counters memory. */
+	union {
+		struct mlx5_devx_obj *min_dcs;
+		rte_atomic64_t a64_dcs;
+	};
+	/* The devx object of the minimum counter ID. */
+	rte_atomic64_t query_gen;
+	uint32_t n_counters: 16; /* Number of devx allocated counters. */
+	rte_spinlock_t sl; /* The pool lock. */
+	struct mlx5_counter_stats_raw *raw;
+	struct mlx5_counter_stats_raw *raw_hw; /* The raw on HW working. */
+	struct mlx5_flow_counter counters_raw[]; /* The pool counters memory. */
 };
 
 struct mlx5_counter_stats_raw;
@@ -322,7 +333,7 @@  struct mlx5_counter_stats_raw {
 
 /* Container structure for counter pools. */
 struct mlx5_pools_container {
-	uint16_t n_valid; /* Number of valid pools. */
+	rte_atomic16_t n_valid; /* Number of valid pools. */
 	uint16_t n; /* Number of pools. */
 	struct mlx5_counter_pools pool_list; /* Counter pool list. */
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
@@ -332,9 +343,16 @@  struct mlx5_pools_container {
 
 /* Counter global management structure. */
 struct mlx5_flow_counter_mng {
-	struct mlx5_pools_container ccont[2];
+	uint8_t mhi[2]; /* master \ host container index. */
+	struct mlx5_pools_container ccont[2 * 2];
+	/* 2 containers for single and for batch for double-buffer. */
 	struct mlx5_counters flow_counters; /* Legacy flow counter list. */
+	uint8_t pending_queries;
+	uint8_t batch;
+	uint16_t pool_index;
+	uint8_t query_thread_on;
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
+	LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
 };
 
 /* Per port data of shared IB device. */
@@ -408,6 +426,8 @@  struct mlx5_ibv_shared {
 	pthread_mutex_t intr_mutex; /* Interrupt config mutex. */
 	uint32_t intr_cnt; /* Interrupt handler reference counter. */
 	struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
+	struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */
+	struct mlx5dv_devx_cmd_comp *devx_comp; /* DEVX async comp obj. */
 	struct mlx5_ibv_shared_port port[]; /* per device port data array. */
 };
 
@@ -520,6 +540,7 @@  int mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
 				struct rte_pci_addr *pci_addr);
 void mlx5_dev_link_status_handler(void *arg);
 void mlx5_dev_interrupt_handler(void *arg);
+void mlx5_dev_interrupt_handler_devx(void *arg);
 void mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev);
 void mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev);
 int mlx5_set_link_down(struct rte_eth_dev *dev);
@@ -641,6 +662,10 @@  int mlx5_ctrl_flow(struct rte_eth_dev *dev,
 		   struct rte_flow_item_eth *eth_mask);
 int mlx5_flow_create_drop_queue(struct rte_eth_dev *dev);
 void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev);
+void mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh,
+				       uint64_t async_id, int status);
+void mlx5_set_query_alarm(struct mlx5_ibv_shared *sh);
+void mlx5_flow_query_alarm(void *arg);
 
 /* mlx5_mp.c */
 void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev);
@@ -678,9 +703,12 @@  struct mlx5_devx_obj *mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
 int mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_obj *dcs,
 				     int clear, uint32_t n_counters,
 				     uint64_t *pkts, uint64_t *bytes,
-				     uint32_t mkey, void *addr);
+				     uint32_t mkey, void *addr,
+				     struct mlx5dv_devx_cmd_comp *cmd_comp,
+				     uint64_t async_id);
 int mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx,
 				 struct mlx5_hca_attr *attr);
 struct mlx5_devx_obj *mlx5_devx_cmd_mkey_create(struct ibv_context *ctx,
 					     struct mlx5_devx_mkey_attr *attr);
+int mlx5_devx_get_out_command_status(void *out);
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_devx_cmds.c b/drivers/net/mlx5/mlx5_devx_cmds.c
index 92f2fc8..28d967a 100644
--- a/drivers/net/mlx5/mlx5_devx_cmds.c
+++ b/drivers/net/mlx5/mlx5_devx_cmds.c
@@ -66,14 +66,21 @@  struct mlx5_devx_obj *
  *   The mkey key for batch query.
  *  @param addr
  *    The address in the mkey range for batch query.
+ *  @param cmd_comp
+ *   The completion object for asynchronous batch query.
+ *  @param async_id
+ *    The ID to be returned in the asynchronous batch query response.
  *
  * @return
  *   0 on success, a negative value otherwise.
  */
 int
-mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_obj *dcs, int clear,
-				 uint32_t n_counters, uint64_t *pkts,
-				 uint64_t *bytes, uint32_t mkey, void *addr)
+mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_obj *dcs,
+				 int clear, uint32_t n_counters,
+				 uint64_t *pkts, uint64_t *bytes,
+				 uint32_t mkey, void *addr,
+				 struct mlx5dv_devx_cmd_comp *cmd_comp,
+				 uint64_t async_id)
 {
 	int out_len = MLX5_ST_SZ_BYTES(query_flow_counter_out) +
 			MLX5_ST_SZ_BYTES(traffic_counter);
@@ -96,7 +103,13 @@  struct mlx5_devx_obj *
 		MLX5_SET64(query_flow_counter_in, in, address,
 			   (uint64_t)(uintptr_t)addr);
 	}
-	rc = mlx5_glue->devx_obj_query(dcs->obj, in, sizeof(in), out, out_len);
+	if (!cmd_comp)
+		rc = mlx5_glue->devx_obj_query(dcs->obj, in, sizeof(in), out,
+					       out_len);
+	else
+		rc = mlx5_glue->devx_obj_query_async(dcs->obj, in, sizeof(in),
+						     out_len, async_id,
+						     cmd_comp);
 	if (rc) {
 		DRV_LOG(ERR, "Failed to query devx counters with rc %d\n ", rc);
 		rte_errno = rc;
@@ -169,6 +182,33 @@  struct mlx5_devx_obj *
 }
 
 /**
+ * Get the status field of a devx command response.
+ * Mainly used for asynchronous commands.
+ *
+ * @param[in] out
+ *   The out response buffer, read with the query_flow_counter_out layout.
+ *
+ * @return
+ *   0 on success, -EINVAL if out is NULL, the non-zero status otherwise.
+ */
+int
+mlx5_devx_get_out_command_status(void *out)
+{
+	int status;
+
+	if (!out)
+		return -EINVAL;
+	status = MLX5_GET(query_flow_counter_out, out, status);
+	if (status) {
+		int syndrome = MLX5_GET(query_flow_counter_out, out, syndrome);
+
+		DRV_LOG(ERR, "Bad devX status %x, syndrome = %x\n", status,
+			syndrome);
+	}
+	return status;
+}
+
+/**
  * Destroy any object allocated by a Devx API.
  *
  * @param[in] obj
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index eeefe4d..004901a 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1433,6 +1433,38 @@  int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 }
 
 /**
+ * Handle DEVX interrupts from the NIC by draining the async completions.
+ * This function is probably called from the DPDK host thread.
+ *
+ * @param cb_arg
+ *   Callback argument, the mlx5_ibv_shared object of the device.
+ */
+void
+mlx5_dev_interrupt_handler_devx(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_ASYNC
+	(void)cb_arg;
+	return;
+#else
+	struct mlx5_ibv_shared *sh = cb_arg;
+	union {
+		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
+		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+			    MLX5_ST_SZ_BYTES(traffic_counter) +
+			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
+	} out;
+	uint8_t *buf = out.buf + sizeof(out.cmd_resp); /* Skip the header. */
+
+	while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
+						   &out.cmd_resp,
+						   sizeof(out.buf)))
+		mlx5_flow_async_pool_query_handle
+			(sh, (uint64_t)out.cmd_resp.wr_id,
+			 mlx5_devx_get_out_command_status(buf));
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+/**
  * Uninstall shared asynchronous device events handler.
  * This function is implemented to support event sharing
  * between multiple ports of single IB device.
@@ -1464,6 +1496,17 @@  int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 				     mlx5_dev_interrupt_handler, sh);
 	sh->intr_handle.fd = 0;
 	sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	if (sh->intr_handle_devx.fd) {
+		rte_intr_callback_unregister(&sh->intr_handle_devx,
+					     mlx5_dev_interrupt_handler_devx,
+					     sh);
+		sh->intr_handle_devx.fd = 0;
+		sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
+	}
+	if (sh->devx_comp) {
+		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
+		sh->devx_comp = NULL;
+	}
 exit:
 	pthread_mutex_unlock(&sh->intr_mutex);
 }
@@ -1507,17 +1550,51 @@  int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
 	if (ret) {
 		DRV_LOG(INFO, "failed to change file descriptor"
 			      " async event queue");
-		/* Indicate there will be no interrupts. */
-		dev->data->dev_conf.intr_conf.lsc = 0;
-		dev->data->dev_conf.intr_conf.rmv = 0;
-		sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
-		goto exit;
+		goto error;
 	}
 	sh->intr_handle.fd = sh->ctx->async_fd;
 	sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
 	rte_intr_callback_register(&sh->intr_handle,
 				   mlx5_dev_interrupt_handler, sh);
+	if (priv->config.devx) {
+#ifndef HAVE_IBV_DEVX_ASYNC
+		goto error_unregister;
+#else
+		sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
+		if (sh->devx_comp) {
+			flags = fcntl(sh->devx_comp->fd, F_GETFL);
+			ret = fcntl(sh->devx_comp->fd, F_SETFL,
+				    flags | O_NONBLOCK);
+			if (ret) {
+				DRV_LOG(INFO, "failed to change file descriptor"
+					      " devx async event queue");
+				goto error_unregister;
+			}
+			sh->intr_handle_devx.fd = sh->devx_comp->fd;
+			sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
+			rte_intr_callback_register
+				(&sh->intr_handle_devx,
+				 mlx5_dev_interrupt_handler_devx, sh);
+		} else {
+			DRV_LOG(INFO, "failed to create devx async command "
+				"completion");
+			goto error_unregister;
+		}
+#endif /* HAVE_IBV_DEVX_ASYNC */
+	}
 	sh->intr_cnt++;
+	/* Success: do not fall through into the error unwinding labels. */
+	goto exit;
+error_unregister:
+	rte_intr_callback_unregister(&sh->intr_handle,
+				     mlx5_dev_interrupt_handler, sh);
+error:
+	/* Indicate there will be no interrupts. */
+	dev->data->dev_conf.intr_conf.lsc = 0;
+	dev->data->dev_conf.intr_conf.rmv = 0;
+	sh->intr_handle.fd = 0;
+	sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
 exit:
 	pthread_mutex_unlock(&sh->intr_mutex);
 }
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 534cd93..1c5431d 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -3078,3 +3078,150 @@  struct rte_flow *
 	}
 	return 0;
 }
+
+#define MLX5_POOL_QUERY_FREQ_US 1000000
+
+/**
+ * Set the periodic procedure for triggering asynchronous batch queries for all
+ * the counter pools. With no valid pool the full period is used (no div by 0).
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_ibv_shared object.
+ */
+void
+mlx5_set_query_alarm(struct mlx5_ibv_shared *sh)
+{
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(sh, 0, 0);
+	uint32_t pools_n = rte_atomic16_read(&cont->n_valid);
+	uint32_t us;
+
+	cont = MLX5_CNT_CONTAINER(sh, 1, 0);
+	pools_n += rte_atomic16_read(&cont->n_valid);
+	us = MLX5_POOL_QUERY_FREQ_US / (pools_n ? pools_n : 1);
+	DRV_LOG(DEBUG, "Set alarm for %u pools each %u us\n", pools_n, us);
+	if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) {
+		sh->cmng.query_thread_on = 0;
+		DRV_LOG(ERR, "Cannot reinitialize query alarm\n");
+	} else {
+		sh->cmng.query_thread_on = 1;
+	}
+}
+
+/**
+ * The periodic procedure for triggering asynchronous batch queries for all the
+ * counter pools, one pool per call. Probably called by the host thread.
+ *
+ * @param[in] arg
+ *   The parameter for the alarm process, the mlx5_ibv_shared object.
+ */
+void
+mlx5_flow_query_alarm(void *arg)
+{
+	struct mlx5_ibv_shared *sh = arg;
+	struct mlx5_devx_obj *dcs;
+	uint16_t offset;
+	int ret;
+	uint8_t batch = sh->cmng.batch;
+	uint16_t pool_index = sh->cmng.pool_index;
+	struct mlx5_pools_container *cont;
+	struct mlx5_pools_container *mcont;
+	struct mlx5_flow_counter_pool *pool;
+
+	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
+		goto set_alarm;
+next_container:
+	cont = MLX5_CNT_CONTAINER(sh, batch, 1);
+	mcont = MLX5_CNT_CONTAINER(sh, batch, 0);
+	/* A resize flipped the master container: sync the host one. */
+	if (cont != mcont) {
+		if (cont->pools) {
+			/* Clean the old container. */
+			rte_free(cont->pools);
+			memset(cont, 0, sizeof(*cont));
+		}
+		rte_cio_wmb();
+		 /* Flip the host container. */
+		sh->cmng.mhi[batch] ^= (uint8_t)2;
+		cont = mcont;
+	}
+	if (!cont->pools) {
+		/* Both containers empty is unexpected, just rearm. */
+		if (unlikely(batch != sh->cmng.batch))
+			goto set_alarm;
+		batch ^= 0x1;
+		pool_index = 0;
+		goto next_container;
+	}
+	pool = cont->pools[pool_index];
+	if (pool->raw_hw)
+		/* There is a pool query in progress. */
+		goto set_alarm;
+	pool->raw_hw =
+		LIST_FIRST(&sh->cmng.free_stat_raws);
+	if (!pool->raw_hw)
+		/* No free counter statistics raw memory. */
+		goto set_alarm;
+	dcs = (struct mlx5_devx_obj *)(uintptr_t)rte_atomic64_read
+							      (&pool->a64_dcs);
+	offset = batch ? 0 : dcs->id % MLX5_COUNTERS_PER_POOL;
+	ret = mlx5_devx_cmd_flow_counter_query(dcs, 0, MLX5_COUNTERS_PER_POOL -
+					       offset, NULL, NULL,
+					       pool->raw_hw->mem_mng->dm->id,
+					       (void *)(uintptr_t)
+					       (pool->raw_hw->data + offset),
+					       sh->devx_comp,
+					       (uint64_t)(uintptr_t)pool);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to trigger asynchronous query for dcs ID"
+			" %d\n", pool->min_dcs->id);
+		pool->raw_hw = NULL;
+		goto set_alarm;
+	}
+	pool->raw_hw->min_dcs_id = dcs->id;
+	LIST_REMOVE(pool->raw_hw, next); /* Taken from free_stat_raws. */
+	sh->cmng.pending_queries++;
+	pool_index++;
+	if (pool_index >= rte_atomic16_read(&cont->n_valid)) {
+		batch ^= 0x1; /* Continue with the other container type. */
+		pool_index = 0;
+	}
+set_alarm:
+	sh->cmng.batch = batch;
+	sh->cmng.pool_index = pool_index;
+	mlx5_set_query_alarm(sh);
+}
+
+/**
+ * Handler for the HW response about ready values from an asynchronous batch
+ * query. This function is probably called by the host thread.
+ *
+ * @param[in] sh
+ *   The pointer to the shared IB device context.
+ * @param[in] async_id
+ *   The Devx async ID, interpreted as the pointer of the queried pool.
+ * @param[in] status
+ *   The status of the completion, non-zero discards the new values.
+ */
+void
+mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh,
+				  uint64_t async_id, int status)
+{
+	struct mlx5_flow_counter_pool *pool =
+		(struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
+	struct mlx5_counter_stats_raw *raw_to_free;
+
+	if (unlikely(status)) {
+		raw_to_free = pool->raw_hw; /* Failed: keep old values. */
+	} else {
+		raw_to_free = pool->raw;
+		rte_spinlock_lock(&pool->sl);
+		pool->raw = pool->raw_hw;
+		rte_spinlock_unlock(&pool->sl);
+		rte_atomic64_add(&pool->query_gen, 1);
+		/* Be sure the new raw counters data is updated in memory. */
+		rte_cio_wmb();
+	}
+	LIST_INSERT_HEAD(&sh->cmng.free_stat_raws, raw_to_free, next);
+	pool->raw_hw = NULL;
+	sh->cmng.pending_queries--;
+}
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index fbd09d0..0d6f64a 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -21,6 +21,9 @@ 
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_atomic.h>
+#include <rte_alarm.h>
+
 #include "mlx5.h"
 #include "mlx5_prm.h"
 
@@ -409,6 +412,11 @@  struct mlx5_flow_driver_ops {
 	mlx5_flow_query_t query;
 };
 
+#define MLX5_CNT_CONTAINER(sh, batch, thread) (&(sh)->cmng.ccont \
+	[(((sh)->cmng.mhi[batch] >> (thread)) & 0x1) * 2 + (batch)])
+#define MLX5_CNT_CONTAINER_UNUSED(sh, batch, thread) (&(sh)->cmng.ccont \
+	[(~((sh)->cmng.mhi[batch] >> (thread)) & 0x1) * 2 + (batch)])
+
 /* mlx5_flow.c */
 
 uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow *dev_flow, int tunnel,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 3b7a43e..b4a1463 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -2115,7 +2115,6 @@  struct field_modify_info modify_tcp[] = {
 }
 
 #define MLX5_CNT_CONTAINER_RESIZE 64
-#define MLX5_CNT_CONTAINER(priv, batch) (&(priv)->sh->cmng.ccont[batch])
 
 /**
  * Get a pool by a counter.
@@ -2238,39 +2237,53 @@  struct field_modify_info modify_tcp[] = {
  *   Whether the pool is for counter that was allocated by batch command.
  *
  * @return
- *   The container pointer on success, otherwise NULL and rte_errno is set.
+ *   The new container pointer on success, otherwise NULL and rte_errno is set.
  */
 static struct mlx5_pools_container *
 flow_dv_container_resize(struct rte_eth_dev *dev, uint32_t batch)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv, batch);
+	struct mlx5_pools_container *cont =
+			MLX5_CNT_CONTAINER(priv->sh, batch, 0);
+	struct mlx5_pools_container *new_cont =
+			MLX5_CNT_CONTAINER_UNUSED(priv->sh, batch, 0);
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE;
 	uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize;
-	struct mlx5_flow_counter_pool **new_pools = rte_calloc(__func__, 1,
-							       mem_size, 0);
-	if (!new_pools) {
+	int i;
+
+	if (cont != MLX5_CNT_CONTAINER(priv->sh, batch, 1)) {
+		/* The last resize still hasn't detected by the host thread. */
+		rte_errno = EAGAIN;
+		return NULL;
+	}
+	new_cont->pools = rte_calloc(__func__, 1, mem_size, 0);
+	if (!new_cont->pools) {
 		rte_errno = ENOMEM;
 		return NULL;
 	}
+	if (cont->n)
+		memcpy(new_cont->pools, cont->pools, cont->n *
+		       sizeof(struct mlx5_flow_counter_pool *));
 	mem_mng = flow_dv_create_counter_stat_mem_mng(dev,
-						    MLX5_CNT_CONTAINER_RESIZE);
+		MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES);
 	if (!mem_mng) {
-		rte_free(new_pools);
+		rte_free(new_cont->pools);
 		return NULL;
 	}
-	if (cont->n) {
-		memcpy(new_pools, cont->pools,
-		       cont->n * sizeof(struct mlx5_flow_counter_pool *));
-		rte_free(cont->pools);
-	} else {
-		TAILQ_INIT(&cont->pool_list);
-	}
-	cont->pools = new_pools;
-	cont->n = resize;
-	cont->init_mem_mng = mem_mng;
-	return cont;
+	for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i)
+		LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws,
+				 mem_mng->raws + MLX5_CNT_CONTAINER_RESIZE +
+				 i, next);
+	new_cont->n = resize;
+	rte_atomic16_set(&new_cont->n_valid, rte_atomic16_read(&cont->n_valid));
+	TAILQ_INIT(&new_cont->pool_list);
+	TAILQ_CONCAT(&new_cont->pool_list, &cont->pool_list, next);
+	new_cont->init_mem_mng = mem_mng;
+	rte_cio_wmb();
+	 /* Flip the master container. */
+	priv->sh->cmng.mhi[batch] ^= (uint8_t)1;
+	return new_cont;
 }
 
 /**
@@ -2295,22 +2308,22 @@  struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_flow_counter_pool *pool =
 			flow_dv_counter_pool_get(cnt);
-	uint16_t offset = pool->min_dcs->id % MLX5_COUNTERS_PER_POOL;
-	int ret = mlx5_devx_cmd_flow_counter_query
-		(pool->min_dcs, 0, MLX5_COUNTERS_PER_POOL - offset, NULL,
-		 NULL, pool->raw->mem_mng->dm->id,
-		 (void *)(uintptr_t)(pool->raw->data +
-		 offset));
-
-	if (ret) {
-		DRV_LOG(ERR, "Failed to trigger synchronous"
-			" query for dcs ID %d\n",
-			pool->min_dcs->id);
-		return ret;
+	int offset = cnt - &pool->counters_raw[0];
+
+	rte_spinlock_lock(&pool->sl);
+	/*
+	 * The single counters allocation may allocate smaller ID than the
+	 * current allocated in parallel to the host reading.
+	 * In this case the new counter values must be reported as 0.
+	 */
+	if (unlikely(!cnt->batch && cnt->dcs->id < pool->raw->min_dcs_id)) {
+		*pkts = 0;
+		*bytes = 0;
+	} else {
+		*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+		*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
 	}
-	offset = cnt - &pool->counters_raw[0];
-	*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
-	*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+	rte_spinlock_unlock(&pool->sl);
 	return 0;
 }
 
@@ -2333,10 +2346,12 @@  struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv, batch);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
+							       0);
+	int16_t n_valid = rte_atomic16_read(&cont->n_valid);
 	uint32_t size;
 
-	if (cont->n == cont->n_valid) {
+	if (cont->n == n_valid) {
 		cont = flow_dv_container_resize(dev, batch);
 		if (!cont)
 			return NULL;
@@ -2349,12 +2364,21 @@  struct field_modify_info modify_tcp[] = {
 		return NULL;
 	}
 	pool->min_dcs = dcs;
-	pool->raw = cont->init_mem_mng->raws + cont->n_valid  %
-			MLX5_CNT_CONTAINER_RESIZE;
+	pool->raw = cont->init_mem_mng->raws + n_valid %
+						     MLX5_CNT_CONTAINER_RESIZE;
+	pool->raw_hw = NULL;
+	rte_spinlock_init(&pool->sl);
+	/*
+	 * The generation of the new allocated counters in this pool is 0, 2 in
+	 * the pool generation makes all the counters valid for allocation.
+	 */
+	rte_atomic64_set(&pool->query_gen, 0x2);
 	TAILQ_INIT(&pool->counters);
 	TAILQ_INSERT_TAIL(&cont->pool_list, pool, next);
-	cont->pools[cont->n_valid] = pool;
-	cont->n_valid++;
+	cont->pools[n_valid] = pool;
+	/* Pool initialization must be updated before host thread access. */
+	rte_cio_wmb();
+	rte_atomic16_add(&cont->n_valid, 1);
 	return pool;
 }
 
@@ -2388,8 +2412,8 @@  struct field_modify_info modify_tcp[] = {
 		dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0);
 		if (!dcs)
 			return NULL;
-		pool = flow_dv_find_pool_by_id(MLX5_CNT_CONTAINER(priv, batch),
-					       dcs->id);
+		pool = flow_dv_find_pool_by_id
+			(MLX5_CNT_CONTAINER(priv->sh, batch, 0), dcs->id);
 		if (!pool) {
 			pool = flow_dv_pool_create(dev, dcs, batch);
 			if (!pool) {
@@ -2397,7 +2421,8 @@  struct field_modify_info modify_tcp[] = {
 				return NULL;
 			}
 		} else if (dcs->id < pool->min_dcs->id) {
-			pool->min_dcs->id = dcs->id;
+			rte_atomic64_set(&pool->a64_dcs,
+					 (int64_t)(uintptr_t)dcs);
 		}
 		cnt = &pool->counters_raw[dcs->id % MLX5_COUNTERS_PER_POOL];
 		TAILQ_INSERT_HEAD(&pool->counters, cnt, next);
@@ -2486,8 +2511,13 @@  struct field_modify_info modify_tcp[] = {
 	 * shared counters from the single container.
 	 */
 	uint32_t batch = (group && !shared) ? 1 : 0;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv, batch);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
+							       0);
 
+#ifndef HAVE_IBV_DEVX_ASYNC
+	rte_errno = ENOTSUP;
+	return NULL;
+#endif
 	if (!priv->config.devx) {
 		rte_errno = ENOTSUP;
 		return NULL;
@@ -2504,9 +2534,22 @@  struct field_modify_info modify_tcp[] = {
 		}
 	}
 	/* Pools which has a free counters are in the start. */
-	pool = TAILQ_FIRST(&cont->pool_list);
-	if (pool)
+	TAILQ_FOREACH(pool, &cont->pool_list, next) {
+		/*
+		 * The free counter reset values must be updated between the
+		 * counter release to the counter allocation, so, at least one
+		 * query must be done in this time. ensure it by saving the
+		 * query generation in the release time.
+		 * The free list is sorted according to the generation - so if
+		 * the first one is not updated, all the others are not
+		 * updated too.
+		 */
 		cnt_free = TAILQ_FIRST(&pool->counters);
+		if (cnt_free && cnt_free->query_gen + 1 <
+		    rte_atomic64_read(&pool->query_gen))
+			break;
+		cnt_free = NULL;
+	}
 	if (!cnt_free) {
 		pool = flow_dv_counter_pool_prepare(dev, &cnt_free, batch);
 		if (!pool)
@@ -2539,6 +2582,9 @@  struct field_modify_info modify_tcp[] = {
 	cnt_free->shared = shared;
 	cnt_free->ref_cnt = 1;
 	cnt_free->id = id;
+	if (!priv->sh->cmng.query_thread_on)
+		/* Start the asynchronous batch query by the host thread. */
+		mlx5_set_query_alarm(priv->sh);
 	TAILQ_REMOVE(&pool->counters, cnt_free, next);
 	if (TAILQ_EMPTY(&pool->counters)) {
 		/* Move the pool to the end of the container pool list. */
@@ -2566,8 +2612,9 @@  struct field_modify_info modify_tcp[] = {
 		struct mlx5_flow_counter_pool *pool =
 				flow_dv_counter_pool_get(counter);
 
-		/* Put the counter in the end - the earliest one. */
+		/* Put the counter in the end - the last updated one. */
 		TAILQ_INSERT_TAIL(&pool->counters, counter, next);
+		counter->query_gen = rte_atomic64_read(&pool->query_gen);
 	}
 }
 
diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c
index ba5fd06..942f89d 100644
--- a/drivers/net/mlx5/mlx5_glue.c
+++ b/drivers/net/mlx5/mlx5_glue.c
@@ -849,6 +849,64 @@ 
 #endif
 }
 
+static struct mlx5dv_devx_cmd_comp *
+mlx5_glue_devx_create_cmd_comp(struct ibv_context *ctx)
+{
+#ifdef HAVE_IBV_DEVX_ASYNC
+	return mlx5dv_devx_create_cmd_comp(ctx);
+#else /* No async devX support: report it through a positive errno. */
+	(void)ctx;
+	errno = ENOTSUP;
+	return NULL;
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+static void
+mlx5_glue_devx_destroy_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp)
+{
+#ifdef HAVE_IBV_DEVX_ASYNC
+	mlx5dv_devx_destroy_cmd_comp(cmd_comp);
+#else /* No async devX support: report it through a positive errno. */
+	(void)cmd_comp;
+	errno = ENOTSUP;
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+static int
+mlx5_glue_devx_obj_query_async(struct mlx5dv_devx_obj *obj, const void *in,
+			       size_t inlen, size_t outlen, uint64_t wr_id,
+			       struct mlx5dv_devx_cmd_comp *cmd_comp)
+{
+#ifdef HAVE_IBV_DEVX_ASYNC
+	return mlx5dv_devx_obj_query_async(obj, in, inlen, outlen, wr_id,
+					   cmd_comp);
+#else /* Built without mlx5dv_devx_obj_query_async: always fail. */
+	(void)obj;
+	(void)in;
+	(void)inlen;
+	(void)outlen;
+	(void)wr_id;
+	(void)cmd_comp;
+	return -ENOTSUP;
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+static int
+mlx5_glue_devx_get_async_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp,
+				  struct mlx5dv_devx_async_cmd_hdr *cmd_resp,
+				  size_t cmd_resp_len)
+{
+#ifdef HAVE_IBV_DEVX_ASYNC
+	return mlx5dv_devx_get_async_cmd_comp(cmd_comp, cmd_resp,
+					      cmd_resp_len);
+#else /* Built without async devX support: always fail. */
+	(void)cmd_comp;
+	(void)cmd_resp;
+	(void)cmd_resp_len;
+	return -ENOTSUP;
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
 static struct mlx5dv_devx_umem *
 mlx5_glue_devx_umem_reg(struct ibv_context *context, void *addr, size_t size,
 			uint32_t access)
@@ -957,6 +1015,10 @@ 
 	.devx_obj_query = mlx5_glue_devx_obj_query,
 	.devx_obj_modify = mlx5_glue_devx_obj_modify,
 	.devx_general_cmd = mlx5_glue_devx_general_cmd,
+	.devx_create_cmd_comp = mlx5_glue_devx_create_cmd_comp,
+	.devx_destroy_cmd_comp = mlx5_glue_devx_destroy_cmd_comp,
+	.devx_obj_query_async = mlx5_glue_devx_obj_query_async,
+	.devx_get_async_cmd_comp = mlx5_glue_devx_get_async_cmd_comp,
 	.devx_umem_reg = mlx5_glue_devx_umem_reg,
 	.devx_umem_dereg = mlx5_glue_devx_umem_dereg,
 };
diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h
index 18b1ce6..9facdb9 100644
--- a/drivers/net/mlx5/mlx5_glue.h
+++ b/drivers/net/mlx5/mlx5_glue.h
@@ -64,6 +64,11 @@ 
 struct mlx5dv_devx_umem;
 #endif
 
+#ifndef HAVE_IBV_DEVX_ASYNC
+struct mlx5dv_devx_cmd_comp;
+struct mlx5dv_devx_async_cmd_hdr;
+#endif
+
 #ifndef HAVE_MLX5DV_DR
 enum  mlx5dv_dr_domain_type { unused, };
 struct mlx5dv_dr_domain;
@@ -210,6 +215,16 @@  struct mlx5_glue {
 	int (*devx_general_cmd)(struct ibv_context *context,
 				const void *in, size_t inlen,
 				void *out, size_t outlen);
+	struct mlx5dv_devx_cmd_comp *(*devx_create_cmd_comp)
+					(struct ibv_context *context);
+	void (*devx_destroy_cmd_comp)(struct mlx5dv_devx_cmd_comp *cmd_comp);
+	int (*devx_obj_query_async)(struct mlx5dv_devx_obj *obj,
+				    const void *in, size_t inlen,
+				    size_t outlen, uint64_t wr_id,
+				    struct mlx5dv_devx_cmd_comp *cmd_comp);
+	int (*devx_get_async_cmd_comp)(struct mlx5dv_devx_cmd_comp *cmd_comp,
+				       struct mlx5dv_devx_async_cmd_hdr *resp,
+				       size_t cmd_resp_len);
 	struct mlx5dv_devx_umem *(*devx_umem_reg)(struct ibv_context *context,
 						  void *addr, size_t size,
 						  uint32_t access);