[v4,3/7] ethdev: introduce Rx queue based fill threshold

Message ID 20220603124821.1148119-4-spiked@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Andrew Rybchenko
Headers
Series introduce per-queue fill threshold and host shaper |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Spike Du June 3, 2022, 12:48 p.m. UTC
  Fill threshold describes the fullness of a Rx queue. If the Rx
queue fullness is above the threshold, the device will trigger the event
RTE_ETH_EVENT_RX_FILL_THRESH.
Fill threshold is defined as a percentage of Rx queue size with valid
value of [0,99].
Setting fill threshold to 0 means disable it, which is the default.
Add fill threshold configuration and query driver callbacks in eth_dev_ops.
Add command line options to support fill_thresh per-rxq configure.
- Command syntax:
  set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>

- Example commands:
To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
testpmd> set port 1 rxq 0 fill_thresh 30

To disable fill_thresh on port 1 rxq 0:
testpmd> set port 1 rxq 0 fill_thresh 0

Signed-off-by: Spike Du <spiked@nvidia.com>
---
 app/test-pmd/cmdline.c     | 68 +++++++++++++++++++++++++++++++++++++++++++
 app/test-pmd/config.c      | 21 ++++++++++++++
 app/test-pmd/testpmd.c     | 18 ++++++++++++
 app/test-pmd/testpmd.h     |  2 ++
 lib/ethdev/ethdev_driver.h | 22 ++++++++++++++
 lib/ethdev/rte_ethdev.c    | 52 +++++++++++++++++++++++++++++++++
 lib/ethdev/rte_ethdev.h    | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/ethdev/version.map     |  2 ++
 8 files changed, 257 insertions(+)
  

Comments

Ray Kinsella June 3, 2022, 2:30 p.m. UTC | #1
Spike Du <spiked@nvidia.com> writes:

> Fill threshold describes the fullness of a Rx queue. If the Rx
> queue fullness is above the threshold, the device will trigger the event
> RTE_ETH_EVENT_RX_FILL_THRESH.
> Fill threshold is defined as a percentage of Rx queue size with valid
> value of [0,99].
> Setting fill threshold to 0 means disable it, which is the default.
> Add fill threshold configuration and query driver callbacks in eth_dev_ops.
> Add command line options to support fill_thresh per-rxq configure.
> - Command syntax:
>   set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
>
> - Example commands:
> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
> testpmd> set port 1 rxq 0 fill_thresh 30
>
> To disable fill_thresh on port 1 rxq 0:
> testpmd> set port 1 rxq 0 fill_thresh 0
>
> Signed-off-by: Spike Du <spiked@nvidia.com>
> ---
>  app/test-pmd/cmdline.c     | 68 +++++++++++++++++++++++++++++++++++++++++++
>  app/test-pmd/config.c      | 21 ++++++++++++++
>  app/test-pmd/testpmd.c     | 18 ++++++++++++
>  app/test-pmd/testpmd.h     |  2 ++
>  lib/ethdev/ethdev_driver.h | 22 ++++++++++++++
>  lib/ethdev/rte_ethdev.c    | 52 +++++++++++++++++++++++++++++++++
>  lib/ethdev/rte_ethdev.h    | 72 ++++++++++++++++++++++++++++++++++++++++++++++
>  lib/ethdev/version.map     |  2 ++
>  8 files changed, 257 insertions(+)
>
> diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
> index 0410bad..918581e 100644
> --- a/app/test-pmd/cmdline.c
> +++ b/app/test-pmd/cmdline.c
> @@ -17823,6 +17823,73 @@ struct cmd_show_port_flow_transfer_proxy_result {
>  	}
>  };
>  
> +/* *** SET FILL THRESHOLD FOR A RXQ OF A PORT *** */
> +struct cmd_rxq_fill_thresh_result {
> +	cmdline_fixed_string_t set;
> +	cmdline_fixed_string_t port;
> +	uint16_t port_num;
> +	cmdline_fixed_string_t rxq;
> +	uint16_t rxq_num;
> +	cmdline_fixed_string_t fill_thresh;
> +	uint16_t fill_thresh_num;
> +};
> +
> +static void cmd_rxq_fill_thresh_parsed(void *parsed_result,
> +		__rte_unused struct cmdline *cl,
> +		__rte_unused void *data)
> +{
> +	struct cmd_rxq_fill_thresh_result *res = parsed_result;
> +	int ret = 0;
> +
> +	if ((strcmp(res->set, "set") == 0) && (strcmp(res->port, "port") == 0)
> +	    && (strcmp(res->rxq, "rxq") == 0)
> +	    && (strcmp(res->fill_thresh, "fill_thresh") == 0))
> +		ret = set_rxq_fill_thresh(res->port_num, res->rxq_num,
> +				  res->fill_thresh_num);
> +	if (ret < 0)
> +		printf("rxq_fill_thresh_cmd error: (%s)\n", strerror(-ret));
> +
> +}
> +
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_set =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				set, "set");
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_port =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				port, "port");
> +cmdline_parse_token_num_t cmd_rxq_fill_thresh_portnum =
> +	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				port_num, RTE_UINT16);
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_rxq =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				rxq, "rxq");
> +cmdline_parse_token_num_t cmd_rxq_fill_thresh_rxqnum =
> +	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				rxq_num, RTE_UINT8);
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_fill_thresh =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				fill_thresh, "fill_thresh");
> +cmdline_parse_token_num_t cmd_rxq_fill_thresh_fill_threshnum =
> +	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				fill_thresh_num, RTE_UINT16);
> +
> +cmdline_parse_inst_t cmd_rxq_fill_thresh = {
> +	.f = cmd_rxq_fill_thresh_parsed,
> +	.data = (void *)0,
> +	.help_str = "set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>"
> +		"Set fill_thresh for rxq on port_id",
> +	.tokens = {
> +		(void *)&cmd_rxq_fill_thresh_set,
> +		(void *)&cmd_rxq_fill_thresh_port,
> +		(void *)&cmd_rxq_fill_thresh_portnum,
> +		(void *)&cmd_rxq_fill_thresh_rxq,
> +		(void *)&cmd_rxq_fill_thresh_rxqnum,
> +		(void *)&cmd_rxq_fill_thresh_fill_thresh,
> +		(void *)&cmd_rxq_fill_thresh_fill_threshnum,
> +		NULL,
> +	},
> +};
> +
>  /* ******************************************************************************** */
>  
>  /* list of instructions */
> @@ -18110,6 +18177,7 @@ struct cmd_show_port_flow_transfer_proxy_result {
>  	(cmdline_parse_inst_t *)&cmd_show_capability,
>  	(cmdline_parse_inst_t *)&cmd_set_flex_is_pattern,
>  	(cmdline_parse_inst_t *)&cmd_set_flex_spec_pattern,
> +	(cmdline_parse_inst_t *)&cmd_rxq_fill_thresh,
>  	NULL,
>  };
>  
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 1b1e738..d0c519b 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -6342,3 +6342,24 @@ struct igb_ring_desc_16_bytes {
>  		printf("  %s\n", buf);
>  	}
>  }
> +
> +int
> +set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx, uint16_t fill_thresh)
> +{
> +	struct rte_eth_link link;
> +	int ret;
> +
> +	if (port_id_is_invalid(port_id, ENABLED_WARN))
> +		return -EINVAL;
> +	ret = eth_link_get_nowait_print_err(port_id, &link);
> +	if (ret < 0)
> +		return -EINVAL;
> +	if (fill_thresh > 99)
> +		return -EINVAL;
> +	ret = rte_eth_rx_fill_thresh_set(port_id, queue_idx, fill_thresh);
> +
> +	if (ret)
> +		return ret;
> +	return 0;
> +}
> +
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index 767765d..1209230 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -420,6 +420,7 @@ struct fwd_engine * fwd_engines[] = {
>  	[RTE_ETH_EVENT_NEW] = "device probed",
>  	[RTE_ETH_EVENT_DESTROY] = "device released",
>  	[RTE_ETH_EVENT_FLOW_AGED] = "flow aged",
> +	[RTE_ETH_EVENT_RX_FILL_THRESH] = "rxq fill threshold reached",
>  	[RTE_ETH_EVENT_MAX] = NULL,
>  };
>  
> @@ -3616,6 +3617,10 @@ struct pmd_test_command {
>  eth_event_callback(portid_t port_id, enum rte_eth_event_type type, void *param,
>  		  void *ret_param)
>  {
> +	struct rte_eth_dev_info dev_info;
> +	uint16_t rxq_id;
> +	uint8_t fill_thresh;
> +	int ret;
>  	RTE_SET_USED(param);
>  	RTE_SET_USED(ret_param);
>  
> @@ -3647,6 +3652,19 @@ struct pmd_test_command {
>  		ports[port_id].port_status = RTE_PORT_CLOSED;
>  		printf("Port %u is closed\n", port_id);
>  		break;
> +	case RTE_ETH_EVENT_RX_FILL_THRESH:
> +		ret = rte_eth_dev_info_get(port_id, &dev_info);
> +		if (ret != 0)
> +			break;
> +		/* fill_thresh query API rewinds rxq_id, no need to check max rxq num. */
> +		for (rxq_id = 0; ; rxq_id++) {
> +			ret = rte_eth_rx_fill_thresh_query(port_id, &rxq_id, &fill_thresh);
> +			if (ret <= 0)
> +				break;
> +			printf("Received fill_thresh event, port:%d rxq_id:%d\n",
> +			       port_id, rxq_id);
> +		}
> +		break;
>  	default:
>  		break;
>  	}
> diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
> index 78a5f4e..c7a144e 100644
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -1173,6 +1173,8 @@ uint16_t tx_pkt_set_dynf(uint16_t port_id, __rte_unused uint16_t queue,
>  void flex_item_create(portid_t port_id, uint16_t flex_id, const char *filename);
>  void flex_item_destroy(portid_t port_id, uint16_t flex_id);
>  void port_flex_item_flush(portid_t port_id);
> +int set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx,
> +			uint16_t fill_thresh);
>  
>  extern int flow_parse(const char *src, void *result, unsigned int size,
>  		      struct rte_flow_attr **attr,
> diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
> index 69d9dc2..7ef7dba 100644
> --- a/lib/ethdev/ethdev_driver.h
> +++ b/lib/ethdev/ethdev_driver.h
> @@ -470,6 +470,23 @@ typedef int (*eth_rx_queue_setup_t)(struct rte_eth_dev *dev,
>  				    const struct rte_eth_rxconf *rx_conf,
>  				    struct rte_mempool *mb_pool);
>  
> +/**
> + * @internal Set Rx queue fill threshold.
> + * @see rte_eth_rx_fill_thresh_set()
> + */
> +typedef int (*eth_rx_queue_fill_thresh_set_t)(struct rte_eth_dev *dev,
> +				      uint16_t rx_queue_id,
> +				      uint8_t fill_thresh);
> +
> +/**
> + * @internal Query queue fill threshold event.
> + * @see rte_eth_rx_fill_thresh_query()
> + */
> +
> +typedef int (*eth_rx_queue_fill_thresh_query_t)(struct rte_eth_dev *dev,
> +					uint16_t *rx_queue_id,
> +					uint8_t *fill_thresh);
> +
>  /** @internal Setup a transmit queue of an Ethernet device. */
>  typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
>  				    uint16_t tx_queue_id,
> @@ -1168,6 +1185,11 @@ struct eth_dev_ops {
>  	/** Priority flow control queue configure */
>  	priority_flow_ctrl_queue_config_t priority_flow_ctrl_queue_config;
>  
> +	/** Set Rx queue fill threshold. */
> +	eth_rx_queue_fill_thresh_set_t rx_queue_fill_thresh_set;
> +	/** Query Rx queue fill threshold event. */
> +	eth_rx_queue_fill_thresh_query_t rx_queue_fill_thresh_query;
> +
>  	/** Set Unicast Table Array */
>  	eth_uc_hash_table_set_t    uc_hash_table_set;
>  	/** Set Unicast hash bitmap */
> diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
> index a175867..69a1f75 100644
> --- a/lib/ethdev/rte_ethdev.c
> +++ b/lib/ethdev/rte_ethdev.c
> @@ -4424,6 +4424,58 @@ int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx,
>  							queue_idx, tx_rate));
>  }
>  
> +int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
> +			       uint8_t fill_thresh)
> +{
> +	struct rte_eth_dev *dev;
> +	struct rte_eth_dev_info dev_info;
> +	int ret;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> +	dev = &rte_eth_devices[port_id];
> +
> +	ret = rte_eth_dev_info_get(port_id, &dev_info);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (queue_id > dev_info.max_rx_queues) {
> +		RTE_ETHDEV_LOG(ERR,
> +			"Set queue fill thresh: port %u: invalid queue ID=%u.\n",
> +			port_id, queue_id);
> +		return -EINVAL;
> +	}
> +
> +	if (fill_thresh > 99)
> +		return -EINVAL;
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_fill_thresh_set, -ENOTSUP);
> +	return eth_err(port_id, (*dev->dev_ops->rx_queue_fill_thresh_set)(dev,
> +							     queue_id, fill_thresh));
> +}
> +
> +int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
> +				 uint8_t *fill_thresh)
> +{
> +	struct rte_eth_dev_info dev_info;
> +	struct rte_eth_dev *dev;
> +	int ret;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> +	dev = &rte_eth_devices[port_id];
> +
> +	ret = rte_eth_dev_info_get(port_id, &dev_info);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (queue_id == NULL)
> +		return -EINVAL;
> +	if (*queue_id >= dev_info.max_rx_queues)
> +		*queue_id = 0;
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_fill_thresh_query, -ENOTSUP);
> +	return eth_err(port_id, (*dev->dev_ops->rx_queue_fill_thresh_query)(dev,
> +							     queue_id, fill_thresh));
> +}
> +
>  RTE_INIT(eth_dev_init_fp_ops)
>  {
>  	uint32_t i;
> diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
> index 04225bb..d44e5da 100644
> --- a/lib/ethdev/rte_ethdev.h
> +++ b/lib/ethdev/rte_ethdev.h
> @@ -1931,6 +1931,14 @@ struct rte_eth_rxq_info {
>  	uint8_t queue_state;        /**< one of RTE_ETH_QUEUE_STATE_*. */
>  	uint16_t nb_desc;           /**< configured number of RXDs. */
>  	uint16_t rx_buf_size;       /**< hardware receive buffer size. */
> +	/**
> +	 * Per-queue Rx fill threshold defined as percentage of Rx queue
> +	 * size. If Rx queue receives traffic higher than this percentage,
> +	 * the event RTE_ETH_EVENT_RX_FILL_THRESH is triggered.
> +	 * Value 0 means threshold monitoring is disabled, no event is
> +	 * triggered.
> +	 */
> +	uint8_t fill_thresh;
>  } __rte_cache_min_aligned;
>  
>  /**
> @@ -3672,6 +3680,65 @@ int rte_eth_dev_set_vlan_ether_type(uint16_t port_id,
>   */
>  int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on);
>  
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Set Rx queue based fill threshold.
> + *
> + * @param port_id
> + *  The port identifier of the Ethernet device.
> + * @param queue_id
> + *  The index of the receive queue.
> + * @param fill_thresh
> + *  The fill threshold percentage of Rx queue size which describes
> + *  the fullness of Rx queue. If the Rx queue fullness is above it,
> + *  the device will trigger the event RTE_ETH_EVENT_RX_FILL_THRESH.
> + *  [1-99] to set a new fill threshold.
> + *  0 to disable threshold monitoring.
> + *
> + * @return
> + *   - 0 if successful.
> + *   - negative if failed.
> + */
> +__rte_experimental
> +int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
> +			       uint8_t fill_thresh);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Query Rx queue based fill threshold event.
> + * The function queries all queues in the port circularly until one
> + * pending fill_thresh event is found or no pending fill_thresh event is found.
> + *
> + * @param port_id
> + *  The port identifier of the Ethernet device.
> + * @param queue_id
> + *  The API caller sets the starting Rx queue id in the pointer.
> + *  If the queue_id is bigger than maximum queue id of the port,
> + *  it's rewinded to 0 so that application can keep calling
> + *  this function to handle all pending fill_thresh events in the queues
> + *  with a simple increment between calls.
> + *  If a Rx queue has pending fill_thresh event, the pointer is updated
> + *  with this Rx queue id; otherwise this pointer's content is
> + *  unchanged.
> + * @param fill_thresh
> + *  The pointer to the fill threshold percentage of Rx queue.
> + *  If Rx queue with pending fill_thresh event is found, the queue's fill_thresh
> + *  percentage is stored in this pointer, otherwise the pointer's
> + *  content is unchanged.
> + *
> + * @return
> + *   - 1 if a Rx queue with pending fill_thresh event is found.
> + *   - 0 if no Rx queue with pending fill_thresh event is found.
> + *   - -EINVAL if queue_id is NULL.
> + */
> +__rte_experimental
> +int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
> +				 uint8_t *fill_thresh);
> +
>  typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
>  		void *userdata);
>  
> @@ -3877,6 +3944,11 @@ enum rte_eth_event_type {
>  	RTE_ETH_EVENT_DESTROY,  /**< port is released */
>  	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
>  	RTE_ETH_EVENT_FLOW_AGED,/**< New aged-out flows is detected */
> +	/**
> +	 *  Fill threshold value is exceeded in a queue.
> +	 *  @see rte_eth_rx_fill_thresh_set()
> +	 */
> +	RTE_ETH_EVENT_RX_FILL_THRESH,
>  	RTE_ETH_EVENT_MAX       /**< max value of this enum */
>  };
>  
> diff --git a/lib/ethdev/version.map b/lib/ethdev/version.map
> index daca785..29b1fe8 100644
> --- a/lib/ethdev/version.map
> +++ b/lib/ethdev/version.map
> @@ -285,6 +285,8 @@ EXPERIMENTAL {
>  	rte_mtr_color_in_protocol_priority_get;
>  	rte_mtr_color_in_protocol_set;
>  	rte_mtr_meter_vlan_table_update;

Comment with # added in 22.XX

> +	rte_eth_rx_fill_thresh_set;
> +	rte_eth_rx_fill_thresh_query;
>  };
>  
>  INTERNAL {
  
Andrew Rybchenko June 4, 2022, 12:46 p.m. UTC | #2
On 6/3/22 15:48, Spike Du wrote:
> Fill threshold describes the fullness of a Rx queue. If the Rx
> queue fullness is above the threshold, the device will trigger the event
> RTE_ETH_EVENT_RX_FILL_THRESH.

Sorry, I'm not sure that I understand. As far as I know the process to
add more Rx buffers to Rx queue is called 'refill' in many drivers. So
fill level is a number (or percentage) of free buffers in an Rx queue.
If so, fill threshold should be a minimum fill level and below the
level we should generate an event.

However reading the first paragraph of the description it looks like you
mean the opposite thing - a number (or percentage) of ready Rx buffers with
received packets.

I think that the term "fill threshold" is suggested by me, but I did it
with mine understanding of the added feature. Now I'm confused.

Moreover, I don't understand how "fill threshold" could be in terms of
ready Rx buffers. HW simply don't really know when ready Rx buffers are
processed by SW. So, HW can't say for sure how many ready Rx buffers are
pending. It could be calculated as Rx queue size minus number of free Rx
buffers, but it is imprecise. First of all not all Rx descriptors could
be used. Second, HW ring size could differ from the queue size specified in SW.
Queue size specified in SW could just limit the maximum number of free Rx
buffers provided by the driver.

> Fill threshold is defined as a percentage of Rx queue size with valid
> value of [0,99].
> Setting fill threshold to 0 means disable it, which is the default.
> Add fill threshold configuration and query driver callbacks in eth_dev_ops.
> Add command line options to support fill_thresh per-rxq configure.
> - Command syntax:
>    set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
> 
> - Example commands:
> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
> testpmd> set port 1 rxq 0 fill_thresh 30
> 
> To disable fill_thresh on port 1 rxq 0:
> testpmd> set port 1 rxq 0 fill_thresh 0
> 
> Signed-off-by: Spike Du <spiked@nvidia.com>
> ---
>   app/test-pmd/cmdline.c     | 68 +++++++++++++++++++++++++++++++++++++++++++
>   app/test-pmd/config.c      | 21 ++++++++++++++
>   app/test-pmd/testpmd.c     | 18 ++++++++++++
>   app/test-pmd/testpmd.h     |  2 ++
>   lib/ethdev/ethdev_driver.h | 22 ++++++++++++++
>   lib/ethdev/rte_ethdev.c    | 52 +++++++++++++++++++++++++++++++++
>   lib/ethdev/rte_ethdev.h    | 72 ++++++++++++++++++++++++++++++++++++++++++++++
>   lib/ethdev/version.map     |  2 ++
>   8 files changed, 257 insertions(+)
> 
> diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
> index 0410bad..918581e 100644
> --- a/app/test-pmd/cmdline.c
> +++ b/app/test-pmd/cmdline.c
> @@ -17823,6 +17823,73 @@ struct cmd_show_port_flow_transfer_proxy_result {
>   	}
>   };
>   
> +/* *** SET FILL THRESHOLD FOR A RXQ OF A PORT *** */
> +struct cmd_rxq_fill_thresh_result {
> +	cmdline_fixed_string_t set;
> +	cmdline_fixed_string_t port;
> +	uint16_t port_num;
> +	cmdline_fixed_string_t rxq;
> +	uint16_t rxq_num;
> +	cmdline_fixed_string_t fill_thresh;
> +	uint16_t fill_thresh_num;

uint8_t to be consistent with ethdev

> +};
> +
> +static void cmd_rxq_fill_thresh_parsed(void *parsed_result,
> +		__rte_unused struct cmdline *cl,
> +		__rte_unused void *data)
> +{
> +	struct cmd_rxq_fill_thresh_result *res = parsed_result;
> +	int ret = 0;
> +
> +	if ((strcmp(res->set, "set") == 0) && (strcmp(res->port, "port") == 0)
> +	    && (strcmp(res->rxq, "rxq") == 0)
> +	    && (strcmp(res->fill_thresh, "fill_thresh") == 0))
> +		ret = set_rxq_fill_thresh(res->port_num, res->rxq_num,
> +				  res->fill_thresh_num);
> +	if (ret < 0)
> +		printf("rxq_fill_thresh_cmd error: (%s)\n", strerror(-ret));
> +
> +}
> +
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_set =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				set, "set");
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_port =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				port, "port");
> +cmdline_parse_token_num_t cmd_rxq_fill_thresh_portnum =
> +	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				port_num, RTE_UINT16);
> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_rxq =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				rxq, "rxq");
> +cmdline_parse_token_num_t cmd_rxq_fill_thresh_rxqnum =
> +	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				rxq_num, RTE_UINT8);

RTE_UINT16 since it is an Rx queue ID

> +cmdline_parse_token_string_t cmd_rxq_fill_thresh_fill_thresh =
> +	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				fill_thresh, "fill_thresh");
> +cmdline_parse_token_num_t cmd_rxq_fill_thresh_fill_threshnum =
> +	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> +				fill_thresh_num, RTE_UINT16);

RTE_UINT16 -> RTE_UINT8

> +
> +cmdline_parse_inst_t cmd_rxq_fill_thresh = {
> +	.f = cmd_rxq_fill_thresh_parsed,
> +	.data = (void *)0,
> +	.help_str = "set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>"
> +		"Set fill_thresh for rxq on port_id",
> +	.tokens = {
> +		(void *)&cmd_rxq_fill_thresh_set,
> +		(void *)&cmd_rxq_fill_thresh_port,
> +		(void *)&cmd_rxq_fill_thresh_portnum,
> +		(void *)&cmd_rxq_fill_thresh_rxq,
> +		(void *)&cmd_rxq_fill_thresh_rxqnum,
> +		(void *)&cmd_rxq_fill_thresh_fill_thresh,
> +		(void *)&cmd_rxq_fill_thresh_fill_threshnum,
> +		NULL,
> +	},
> +};

Please, add 'static' keyword to all above cmdline_parse_* variable.

> +
>   /* ******************************************************************************** */
>   
>   /* list of instructions */
> @@ -18110,6 +18177,7 @@ struct cmd_show_port_flow_transfer_proxy_result {
>   	(cmdline_parse_inst_t *)&cmd_show_capability,
>   	(cmdline_parse_inst_t *)&cmd_set_flex_is_pattern,
>   	(cmdline_parse_inst_t *)&cmd_set_flex_spec_pattern,
> +	(cmdline_parse_inst_t *)&cmd_rxq_fill_thresh,
>   	NULL,
>   };
>   
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 1b1e738..d0c519b 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -6342,3 +6342,24 @@ struct igb_ring_desc_16_bytes {
>   		printf("  %s\n", buf);
>   	}
>   }
> +
> +int
> +set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx, uint16_t fill_thresh)

uint8_t for fill_threash since the type is used in ethdev API

> +{
> +	struct rte_eth_link link;
> +	int ret;
> +
> +	if (port_id_is_invalid(port_id, ENABLED_WARN))
> +		return -EINVAL;
> +	ret = eth_link_get_nowait_print_err(port_id, &link);
> +	if (ret < 0)
> +		return -EINVAL;
> +	if (fill_thresh > 99)
> +		return -EINVAL;
> +	ret = rte_eth_rx_fill_thresh_set(port_id, queue_idx, fill_thresh);
> +
> +	if (ret)

Please remove extra empty line above and compare with 0 explicitly.

> +		return ret;
> +	return 0;
> +}
> +
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index 767765d..1209230 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -420,6 +420,7 @@ struct fwd_engine * fwd_engines[] = {
>   	[RTE_ETH_EVENT_NEW] = "device probed",
>   	[RTE_ETH_EVENT_DESTROY] = "device released",
>   	[RTE_ETH_EVENT_FLOW_AGED] = "flow aged",
> +	[RTE_ETH_EVENT_RX_FILL_THRESH] = "rxq fill threshold reached",
>   	[RTE_ETH_EVENT_MAX] = NULL,
>   };
>   
> @@ -3616,6 +3617,10 @@ struct pmd_test_command {
>   eth_event_callback(portid_t port_id, enum rte_eth_event_type type, void *param,
>   		  void *ret_param)
>   {
> +	struct rte_eth_dev_info dev_info;
> +	uint16_t rxq_id;
> +	uint8_t fill_thresh;
> +	int ret;
>   	RTE_SET_USED(param);
>   	RTE_SET_USED(ret_param);
>   
> @@ -3647,6 +3652,19 @@ struct pmd_test_command {
>   		ports[port_id].port_status = RTE_PORT_CLOSED;
>   		printf("Port %u is closed\n", port_id);
>   		break;
> +	case RTE_ETH_EVENT_RX_FILL_THRESH:
> +		ret = rte_eth_dev_info_get(port_id, &dev_info);
> +		if (ret != 0)
> +			break;

What's the point to get device info above? It is unused as far as I can
see.

> +		/* fill_thresh query API rewinds rxq_id, no need to check max rxq num. */
> +		for (rxq_id = 0; ; rxq_id++) {
> +			ret = rte_eth_rx_fill_thresh_query(port_id, &rxq_id, &fill_thresh);
> +			if (ret <= 0)
> +				break;
> +			printf("Received fill_thresh event, port:%d rxq_id:%d\n",
> +			       port_id, rxq_id);
> +		}
> +		break;
>   	default:
>   		break;
>   	}
> diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
> index 78a5f4e..c7a144e 100644
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -1173,6 +1173,8 @@ uint16_t tx_pkt_set_dynf(uint16_t port_id, __rte_unused uint16_t queue,
>   void flex_item_create(portid_t port_id, uint16_t flex_id, const char *filename);
>   void flex_item_destroy(portid_t port_id, uint16_t flex_id);
>   void port_flex_item_flush(portid_t port_id);
> +int set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx,

it is better to be consistent with variable naming
queue_idx -> queue_id

> +			uint16_t fill_thresh);

uint16_t -> uint8_t since ethdev API uses uint8_t. It must be
consistent with ethdev.

>   
>   extern int flow_parse(const char *src, void *result, unsigned int size,
>   		      struct rte_flow_attr **attr,
> diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
> index 69d9dc2..7ef7dba 100644
> --- a/lib/ethdev/ethdev_driver.h
> +++ b/lib/ethdev/ethdev_driver.h
> @@ -470,6 +470,23 @@ typedef int (*eth_rx_queue_setup_t)(struct rte_eth_dev *dev,
>   				    const struct rte_eth_rxconf *rx_conf,
>   				    struct rte_mempool *mb_pool);
>   
> +/**
> + * @internal Set Rx queue fill threshold.
> + * @see rte_eth_rx_fill_thresh_set()
> + */
> +typedef int (*eth_rx_queue_fill_thresh_set_t)(struct rte_eth_dev *dev,
> +				      uint16_t rx_queue_id,
> +				      uint8_t fill_thresh);
> +
> +/**
> + * @internal Query queue fill threshold event.
> + * @see rte_eth_rx_fill_thresh_query()
> + */
> +
> +typedef int (*eth_rx_queue_fill_thresh_query_t)(struct rte_eth_dev *dev,
> +					uint16_t *rx_queue_id,
> +					uint8_t *fill_thresh);

Should fill_thresh be round UP or DOWN by the driver?
Order of prototypes definition must follow order of ops in the
structure.

> +
>   /** @internal Setup a transmit queue of an Ethernet device. */
>   typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
>   				    uint16_t tx_queue_id,
> @@ -1168,6 +1185,11 @@ struct eth_dev_ops {
>   	/** Priority flow control queue configure */
>   	priority_flow_ctrl_queue_config_t priority_flow_ctrl_queue_config;
>   
> +	/** Set Rx queue fill threshold. */
> +	eth_rx_queue_fill_thresh_set_t rx_queue_fill_thresh_set;
> +	/** Query Rx queue fill threshold event. */
> +	eth_rx_queue_fill_thresh_query_t rx_queue_fill_thresh_query;
> +

Why is it added in the middle fo the structure?
Isn't it better to add at the end?

>   	/** Set Unicast Table Array */
>   	eth_uc_hash_table_set_t    uc_hash_table_set;
>   	/** Set Unicast hash bitmap */
> diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
> index a175867..69a1f75 100644
> --- a/lib/ethdev/rte_ethdev.c
> +++ b/lib/ethdev/rte_ethdev.c
> @@ -4424,6 +4424,58 @@ int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx,
>   							queue_idx, tx_rate));
>   }
>   
> +int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
> +			       uint8_t fill_thresh)
> +{
> +	struct rte_eth_dev *dev;
> +	struct rte_eth_dev_info dev_info;
> +	int ret;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> +	dev = &rte_eth_devices[port_id];
> +
> +	ret = rte_eth_dev_info_get(port_id, &dev_info);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (queue_id > dev_info.max_rx_queues) {

We don't need dev_info to get number of Rx queues.
dev->data->nb_rx_queues does the job.

> +		RTE_ETHDEV_LOG(ERR,
> +			"Set queue fill thresh: port %u: invalid queue ID=%u.\n",
> +			port_id, queue_id);
> +		return -EINVAL;
> +	}
> +
> +	if (fill_thresh > 99)
> +		return -EINVAL;

Why do you have error log above, but don't have it here?

> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_fill_thresh_set, -ENOTSUP);
> +	return eth_err(port_id, (*dev->dev_ops->rx_queue_fill_thresh_set)(dev,
> +							     queue_id, fill_thresh));
> +}
> +
> +int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
> +				 uint8_t *fill_thresh)
> +{
> +	struct rte_eth_dev_info dev_info;
> +	struct rte_eth_dev *dev;
> +	int ret;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> +	dev = &rte_eth_devices[port_id];
> +
> +	ret = rte_eth_dev_info_get(port_id, &dev_info);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (queue_id == NULL)
> +		return -EINVAL;
> +	if (*queue_id >= dev_info.max_rx_queues)

Same here. you don't need dev_info

> +		*queue_id = 0;
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_fill_thresh_query, -ENOTSUP);
> +	return eth_err(port_id, (*dev->dev_ops->rx_queue_fill_thresh_query)(dev,
> +							     queue_id, fill_thresh));
> +}
> +
>   RTE_INIT(eth_dev_init_fp_ops)
>   {
>   	uint32_t i;
> diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
> index 04225bb..d44e5da 100644
> --- a/lib/ethdev/rte_ethdev.h
> +++ b/lib/ethdev/rte_ethdev.h
> @@ -1931,6 +1931,14 @@ struct rte_eth_rxq_info {
>   	uint8_t queue_state;        /**< one of RTE_ETH_QUEUE_STATE_*. */
>   	uint16_t nb_desc;           /**< configured number of RXDs. */
>   	uint16_t rx_buf_size;       /**< hardware receive buffer size. */
> +	/**
> +	 * Per-queue Rx fill threshold defined as percentage of Rx queue
> +	 * size. If Rx queue receives traffic higher than this percentage,
> +	 * the event RTE_ETH_EVENT_RX_FILL_THRESH is triggered.
> +	 * Value 0 means threshold monitoring is disabled, no event is
> +	 * triggered.
> +	 */
> +	uint8_t fill_thresh;
>   } __rte_cache_min_aligned;
>   
>   /**
> @@ -3672,6 +3680,65 @@ int rte_eth_dev_set_vlan_ether_type(uint16_t port_id,
>    */
>   int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on);
>   
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Set Rx queue based fill threshold.
> + *
> + * @param port_id
> + *  The port identifier of the Ethernet device.
> + * @param queue_id
> + *  The index of the receive queue.
> + * @param fill_thresh
> + *  The fill threshold percentage of Rx queue size which describes
> + *  the fullness of Rx queue. If the Rx queue fullness is above it,
> + *  the device will trigger the event RTE_ETH_EVENT_RX_FILL_THRESH.
> + *  [1-99] to set a new fill threshold.
> + *  0 to disable threshold monitoring.
> + *
> + * @return
> + *   - 0 if successful.
> + *   - negative if failed.
> + */
> +__rte_experimental
> +int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
> +			       uint8_t fill_thresh);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Query Rx queue based fill threshold event.
> + * The function queries all queues in the port circularly until one
> + * pending fill_thresh event is found or no pending fill_thresh event is found.
> + *
> + * @param port_id
> + *  The port identifier of the Ethernet device.
> + * @param queue_id
> + *  The API caller sets the starting Rx queue id in the pointer.
> + *  If the queue_id is bigger than maximum queue id of the port,
> + *  it's rewound to 0 so that application can keep calling
> + *  this function to handle all pending fill_thresh events in the queues
> + *  with a simple increment between calls.
> + *  If a Rx queue has pending fill_thresh event, the pointer is updated
> + *  with this Rx queue id; otherwise this pointer's content is
> + *  unchanged.
> + * @param fill_thresh
> + *  The pointer to the fill threshold percentage of Rx queue.
> + *  If Rx queue with pending fill_thresh event is found, the queue's fill_thresh
> + *  percentage is stored in this pointer, otherwise the pointer's
> + *  content is unchanged.
> + *
> + * @return
> + *   - 1 if a Rx queue with pending fill_thresh event is found.
> + *   - 0 if no Rx queue with pending fill_thresh event is found.
> + *   - -EINVAL if queue_id is NULL.
> + */
> +__rte_experimental
> +int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
> +				 uint8_t *fill_thresh);
> +
>   typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
>   		void *userdata);
>   
> @@ -3877,6 +3944,11 @@ enum rte_eth_event_type {
>   	RTE_ETH_EVENT_DESTROY,  /**< port is released */
>   	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
>   	RTE_ETH_EVENT_FLOW_AGED,/**< New aged-out flows is detected */
> +	/**
> +	 *  Fill threshold value is exceeded in a queue.
> +	 *  @see rte_eth_rx_fill_thresh_set()
> +	 */
> +	RTE_ETH_EVENT_RX_FILL_THRESH,
>   	RTE_ETH_EVENT_MAX       /**< max value of this enum */
>   };
>   
> diff --git a/lib/ethdev/version.map b/lib/ethdev/version.map
> index daca785..29b1fe8 100644
> --- a/lib/ethdev/version.map
> +++ b/lib/ethdev/version.map
> @@ -285,6 +285,8 @@ EXPERIMENTAL {
>   	rte_mtr_color_in_protocol_priority_get;
>   	rte_mtr_color_in_protocol_set;
>   	rte_mtr_meter_vlan_table_update;
> +	rte_eth_rx_fill_thresh_set;
> +	rte_eth_rx_fill_thresh_query;
>   };
>   
>   INTERNAL {
  
Spike Du June 6, 2022, 1:16 p.m. UTC | #3
Hi Andrew,
	Please see below for "fill threshold" concept, I'm ok with other comments about code.

Regards,
Spike.


> -----Original Message-----
> From: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
> Sent: Saturday, June 4, 2022 8:46 PM
> To: Spike Du <spiked@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> Slava Ovsiienko <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>;
> NBU-Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>;
> Wenzhuo Lu <wenzhuo.lu@intel.com>; Beilei Xing <beilei.xing@intel.com>;
> Bernard Iremonger <bernard.iremonger@intel.com>; Ray Kinsella
> <mdr@ashroe.eu>; Neil Horman <nhorman@tuxdriver.com>
> Cc: stephen@networkplumber.org; mb@smartsharesystems.com;
> dev@dpdk.org; Raslan Darawsheh <rasland@nvidia.com>
> Subject: Re: [PATCH v4 3/7] ethdev: introduce Rx queue based fill threshold
> 
> External email: Use caution opening links or attachments
> 
> 
> On 6/3/22 15:48, Spike Du wrote:
> > Fill threshold describes the fullness of a Rx queue. If the Rx queue
> > fullness is above the threshold, the device will trigger the event
> > RTE_ETH_EVENT_RX_FILL_THRESH.
> 
> Sorry, I'm not sure that I understand. As far as I know the process to add
> more Rx buffers to Rx queue is called 'refill' in many drivers. So fill level is a
> number (or percentage) of free buffers in an Rx queue.
> If so, fill threshold should be a minimum fill level and below the level we
> should generate an event.
> 
> However reading the first paragraph of the description it looks like you mean
> opposite thing - a number (or percentage) of ready Rx buffers with received
> packets.
> 
> I think that the term "fill threshold" is suggested by me, but I did it with mine
> understanding of the added feature. Now I'm confused.
> 
> Moreover, I don't understand how "fill threshold" could be in terms of ready
> Rx buffers. HW simply don't really know when ready Rx buffers are
> processed by SW. So, HW can't say for sure how many ready Rx buffers are
> pending. It could be calculated as Rx queue size minus number of free Rx
> buffers, but it is imprecise. First of all not all Rx descriptors could be used.
> Second, HW ring size could differ from queue size specified in SW.
> Queue size specified in SW could just limit maximum number of free Rx
> buffers provided by the driver.
> 

Let me use other terms because "fill"/"refill" is also ambiguous to me.
In a RX ring, there are Rx buffers with received packets, you call it "ready Rx buffers", there is a RTE api rte_eth_rx_queue_count() to get the number,
It's also called "used descriptors" in the code.
Also there are Rx buffers provided by SW to allow HW "fill in" received packets, we can call it "usable Rx buffers" (here "usable" means usable for HW).
Let's define Rx queue "fullness":
	Fullness = ready-Rx-buffers/Rxq-size
On the opposite, we have "emptiness"
	Emptiness = usable-Rx-buffers/Rxq-size
Here "fill threshold" describes "fullness", it's not "refill" described in you above words. Because in your words, "refill" is the opposite, it's filling "usable/free Rx buffers", or "emptiness".

I can only briefly explain how mlx5 works to get LWM, because I'm not a Firmware guy.
Mlx5 Rx queue is basically RDMA queue. It has two indexes: producer index which increases when HW fills in packet, consumer index which increases when SW consumes the packet.
The queue size is known when it's created. The fullness is something like (producer_index - consumer_index) (I don't consider in wrap-around here).
So mlx5 has the way to get the fullness or emptiness in HW or FW. 
Another detail is mlx5 uses the term "LWM"(limit watermark), which describes "emptiness". When usable-Rx-buffers is below LWM, we trigger an event.
But Thomas think "fullness" is easier to understand, so we use "fullness" in rte APIs and we'll translate it to LWM in mlx5 PMD.


> > Fill threshold is defined as a percentage of Rx queue size with valid
> > value of [0,99].
> > Setting fill threshold to 0 means disable it, which is the default.
> > Add fill threshold configuration and query driver callbacks in eth_dev_ops.
> > Add command line options to support fill_thresh per-rxq configure.
> > - Command syntax:
> >    set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
> >
> > - Example commands:
> > To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
> > testpmd> set port 1 rxq 0 fill_thresh 30
> >
> > To disable fill_thresh on port 1 rxq 0:
> > testpmd> set port 1 rxq 0 fill_thresh 0
> >
> > Signed-off-by: Spike Du <spiked@nvidia.com>
> > ---
> >   app/test-pmd/cmdline.c     | 68
> +++++++++++++++++++++++++++++++++++++++++++
> >   app/test-pmd/config.c      | 21 ++++++++++++++
> >   app/test-pmd/testpmd.c     | 18 ++++++++++++
> >   app/test-pmd/testpmd.h     |  2 ++
> >   lib/ethdev/ethdev_driver.h | 22 ++++++++++++++
> >   lib/ethdev/rte_ethdev.c    | 52 +++++++++++++++++++++++++++++++++
> >   lib/ethdev/rte_ethdev.h    | 72
> ++++++++++++++++++++++++++++++++++++++++++++++
> >   lib/ethdev/version.map     |  2 ++
> >   8 files changed, 257 insertions(+)
> >
> > diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index
> > 0410bad..918581e 100644
> > --- a/app/test-pmd/cmdline.c
> > +++ b/app/test-pmd/cmdline.c
> > @@ -17823,6 +17823,73 @@ struct
> cmd_show_port_flow_transfer_proxy_result {
> >       }
> >   };
> >
> > +/* *** SET FILL THRESHOLD FOR A RXQ OF A PORT *** */ struct
> > +cmd_rxq_fill_thresh_result {
> > +     cmdline_fixed_string_t set;
> > +     cmdline_fixed_string_t port;
> > +     uint16_t port_num;
> > +     cmdline_fixed_string_t rxq;
> > +     uint16_t rxq_num;
> > +     cmdline_fixed_string_t fill_thresh;
> > +     uint16_t fill_thresh_num;
> 
> uint8_t to be consistent with ethdev
> 
> > +};
> > +
> > +static void cmd_rxq_fill_thresh_parsed(void *parsed_result,
> > +             __rte_unused struct cmdline *cl,
> > +             __rte_unused void *data) {
> > +     struct cmd_rxq_fill_thresh_result *res = parsed_result;
> > +     int ret = 0;
> > +
> > +     if ((strcmp(res->set, "set") == 0) && (strcmp(res->port, "port") == 0)
> > +         && (strcmp(res->rxq, "rxq") == 0)
> > +         && (strcmp(res->fill_thresh, "fill_thresh") == 0))
> > +             ret = set_rxq_fill_thresh(res->port_num, res->rxq_num,
> > +                               res->fill_thresh_num);
> > +     if (ret < 0)
> > +             printf("rxq_fill_thresh_cmd error: (%s)\n",
> > + strerror(-ret));
> > +
> > +}
> > +
> > +cmdline_parse_token_string_t cmd_rxq_fill_thresh_set =
> > +     TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             set, "set");
> > +cmdline_parse_token_string_t cmd_rxq_fill_thresh_port =
> > +     TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             port, "port"); cmdline_parse_token_num_t
> > +cmd_rxq_fill_thresh_portnum =
> > +     TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             port_num, RTE_UINT16);
> > +cmdline_parse_token_string_t cmd_rxq_fill_thresh_rxq =
> > +     TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             rxq, "rxq"); cmdline_parse_token_num_t
> > +cmd_rxq_fill_thresh_rxqnum =
> > +     TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             rxq_num, RTE_UINT8);
> 
> RTE_UINT16 since it is an Rx queue ID
> 
> > +cmdline_parse_token_string_t cmd_rxq_fill_thresh_fill_thresh =
> > +     TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             fill_thresh, "fill_thresh");
> > +cmdline_parse_token_num_t cmd_rxq_fill_thresh_fill_threshnum =
> > +     TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
> > +                             fill_thresh_num, RTE_UINT16);
> 
> RTE_UINT16 -> RTE_UINT8
> 
> > +
> > +cmdline_parse_inst_t cmd_rxq_fill_thresh = {
> > +     .f = cmd_rxq_fill_thresh_parsed,
> > +     .data = (void *)0,
> > +     .help_str = "set port <port_id> rxq <rxq_id> fill_thresh
> <fill_thresh_num>"
> > +             "Set fill_thresh for rxq on port_id",
> > +     .tokens = {
> > +             (void *)&cmd_rxq_fill_thresh_set,
> > +             (void *)&cmd_rxq_fill_thresh_port,
> > +             (void *)&cmd_rxq_fill_thresh_portnum,
> > +             (void *)&cmd_rxq_fill_thresh_rxq,
> > +             (void *)&cmd_rxq_fill_thresh_rxqnum,
> > +             (void *)&cmd_rxq_fill_thresh_fill_thresh,
> > +             (void *)&cmd_rxq_fill_thresh_fill_threshnum,
> > +             NULL,
> > +     },
> > +};
> 
> Please, add 'static' keyword to all above cmdline_parse_* variable.
> 
> > +
> >   /*
> >
> **********************************************************
> ************
> > ********** */
> >
> >   /* list of instructions */
> > @@ -18110,6 +18177,7 @@ struct
> cmd_show_port_flow_transfer_proxy_result {
> >       (cmdline_parse_inst_t *)&cmd_show_capability,
> >       (cmdline_parse_inst_t *)&cmd_set_flex_is_pattern,
> >       (cmdline_parse_inst_t *)&cmd_set_flex_spec_pattern,
> > +     (cmdline_parse_inst_t *)&cmd_rxq_fill_thresh,
> >       NULL,
> >   };
> >
> > diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c index
> > 1b1e738..d0c519b 100644
> > --- a/app/test-pmd/config.c
> > +++ b/app/test-pmd/config.c
> > @@ -6342,3 +6342,24 @@ struct igb_ring_desc_16_bytes {
> >               printf("  %s\n", buf);
> >       }
> >   }
> > +
> > +int
> > +set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx, uint16_t
> > +fill_thresh)
> 
> uint8_t for fill_threash since the type is used in ethdev API
> 
> > +{
> > +     struct rte_eth_link link;
> > +     int ret;
> > +
> > +     if (port_id_is_invalid(port_id, ENABLED_WARN))
> > +             return -EINVAL;
> > +     ret = eth_link_get_nowait_print_err(port_id, &link);
> > +     if (ret < 0)
> > +             return -EINVAL;
> > +     if (fill_thresh > 99)
> > +             return -EINVAL;
> > +     ret = rte_eth_rx_fill_thresh_set(port_id, queue_idx,
> > + fill_thresh);
> > +
> > +     if (ret)
> 
> Please remove extra empty line above and compare with 0 explicitly.
> 
> > +             return ret;
> > +     return 0;
> > +}
> > +
> > diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index
> > 767765d..1209230 100644
> > --- a/app/test-pmd/testpmd.c
> > +++ b/app/test-pmd/testpmd.c
> > @@ -420,6 +420,7 @@ struct fwd_engine * fwd_engines[] = {
> >       [RTE_ETH_EVENT_NEW] = "device probed",
> >       [RTE_ETH_EVENT_DESTROY] = "device released",
> >       [RTE_ETH_EVENT_FLOW_AGED] = "flow aged",
> > +     [RTE_ETH_EVENT_RX_FILL_THRESH] = "rxq fill threshold reached",
> >       [RTE_ETH_EVENT_MAX] = NULL,
> >   };
> >
> > @@ -3616,6 +3617,10 @@ struct pmd_test_command {
> >   eth_event_callback(portid_t port_id, enum rte_eth_event_type type,
> void *param,
> >                 void *ret_param)
> >   {
> > +     struct rte_eth_dev_info dev_info;
> > +     uint16_t rxq_id;
> > +     uint8_t fill_thresh;
> > +     int ret;
> >       RTE_SET_USED(param);
> >       RTE_SET_USED(ret_param);
> >
> > @@ -3647,6 +3652,19 @@ struct pmd_test_command {
> >               ports[port_id].port_status = RTE_PORT_CLOSED;
> >               printf("Port %u is closed\n", port_id);
> >               break;
> > +     case RTE_ETH_EVENT_RX_FILL_THRESH:
> > +             ret = rte_eth_dev_info_get(port_id, &dev_info);
> > +             if (ret != 0)
> > +                     break;
> 
> What's the point to get device info above? It is unused as far as I can see.
> 
> > +             /* fill_thresh query API rewinds rxq_id, no need to check max rxq
> num. */
> > +             for (rxq_id = 0; ; rxq_id++) {
> > +                     ret = rte_eth_rx_fill_thresh_query(port_id, &rxq_id,
> &fill_thresh);
> > +                     if (ret <= 0)
> > +                             break;
> > +                     printf("Received fill_thresh event,
> port:%d rxq_id:%d\n",
> > +                            port_id, rxq_id);
> > +             }
> > +             break;
> >       default:
> >               break;
> >       }
> > diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h index
> > 78a5f4e..c7a144e 100644
> > --- a/app/test-pmd/testpmd.h
> > +++ b/app/test-pmd/testpmd.h
> > @@ -1173,6 +1173,8 @@ uint16_t tx_pkt_set_dynf(uint16_t port_id,
> __rte_unused uint16_t queue,
> >   void flex_item_create(portid_t port_id, uint16_t flex_id, const char
> *filename);
> >   void flex_item_destroy(portid_t port_id, uint16_t flex_id);
> >   void port_flex_item_flush(portid_t port_id);
> > +int set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx,
> 
> it is better to be consistent with variable nameing queue_idx -> queue_id
> 
> > +                     uint16_t fill_thresh);
> 
> uint16_t -> uint8_t since ethdev API uses uint8_t. It must be consistent with
> ethdev.
> 
> >
> >   extern int flow_parse(const char *src, void *result, unsigned int size,
> >                     struct rte_flow_attr **attr, diff --git
> > a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h index
> > 69d9dc2..7ef7dba 100644
> > --- a/lib/ethdev/ethdev_driver.h
> > +++ b/lib/ethdev/ethdev_driver.h
> > @@ -470,6 +470,23 @@ typedef int (*eth_rx_queue_setup_t)(struct
> rte_eth_dev *dev,
> >                                   const struct rte_eth_rxconf *rx_conf,
> >                                   struct rte_mempool *mb_pool);
> >
> > +/**
> > + * @internal Set Rx queue fill threshold.
> > + * @see rte_eth_rx_fill_thresh_set()
> > + */
> > +typedef int (*eth_rx_queue_fill_thresh_set_t)(struct rte_eth_dev *dev,
> > +                                   uint16_t rx_queue_id,
> > +                                   uint8_t fill_thresh);
> > +
> > +/**
> > + * @internal Query queue fill threshold event.
> > + * @see rte_eth_rx_fill_thresh_query()  */
> > +
> > +typedef int (*eth_rx_queue_fill_thresh_query_t)(struct rte_eth_dev
> *dev,
> > +                                     uint16_t *rx_queue_id,
> > +                                     uint8_t *fill_thresh);
> 
> Should fill_thresh be round UP or DOWN by the driver?
> Order of prototypes definition must follow order of ops in the structure.
> 
> > +
> >   /** @internal Setup a transmit queue of an Ethernet device. */
> >   typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
> >                                   uint16_t tx_queue_id, @@ -1168,6
> > +1185,11 @@ struct eth_dev_ops {
> >       /** Priority flow control queue configure */
> >       priority_flow_ctrl_queue_config_t
> > priority_flow_ctrl_queue_config;
> >
> > +     /** Set Rx queue fill threshold. */
> > +     eth_rx_queue_fill_thresh_set_t rx_queue_fill_thresh_set;
> > +     /** Query Rx queue fill threshold event. */
> > +     eth_rx_queue_fill_thresh_query_t rx_queue_fill_thresh_query;
> > +
> 
> Why is it added in the middle fo the structure?
> Isn't it better to add at the end?
> 
> >       /** Set Unicast Table Array */
> >       eth_uc_hash_table_set_t    uc_hash_table_set;
> >       /** Set Unicast hash bitmap */
> > diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c index
> > a175867..69a1f75 100644
> > --- a/lib/ethdev/rte_ethdev.c
> > +++ b/lib/ethdev/rte_ethdev.c
> > @@ -4424,6 +4424,58 @@ int rte_eth_set_queue_rate_limit(uint16_t
> port_id, uint16_t queue_idx,
> >                                                       queue_idx, tx_rate));
> >   }
> >
> > +int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
> > +                            uint8_t fill_thresh) {
> > +     struct rte_eth_dev *dev;
> > +     struct rte_eth_dev_info dev_info;
> > +     int ret;
> > +
> > +     RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> > +     dev = &rte_eth_devices[port_id];
> > +
> > +     ret = rte_eth_dev_info_get(port_id, &dev_info);
> > +     if (ret != 0)
> > +             return ret;
> > +
> > +     if (queue_id > dev_info.max_rx_queues) {
> 
> We don't need dev_info to get number of Rx queues.
> dev->data->nb_rx_queues does the job.
> 
> > +             RTE_ETHDEV_LOG(ERR,
> > +                     "Set queue fill thresh: port %u: invalid queue ID=%u.\n",
> > +                     port_id, queue_id);
> > +             return -EINVAL;
> > +     }
> > +
> > +     if (fill_thresh > 99)
> > +             return -EINVAL;
> 
> Why do you have error log above, but don't have it here?
> 
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >rx_queue_fill_thresh_set, -ENOTSUP);
> > +     return eth_err(port_id, (*dev->dev_ops-
> >rx_queue_fill_thresh_set)(dev,
> > +                                                          queue_id,
> > +fill_thresh)); }
> > +
> > +int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
> > +                              uint8_t *fill_thresh) {
> > +     struct rte_eth_dev_info dev_info;
> > +     struct rte_eth_dev *dev;
> > +     int ret;
> > +
> > +     RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
> > +     dev = &rte_eth_devices[port_id];
> > +
> > +     ret = rte_eth_dev_info_get(port_id, &dev_info);
> > +     if (ret != 0)
> > +             return ret;
> > +
> > +     if (queue_id == NULL)
> > +             return -EINVAL;
> > +     if (*queue_id >= dev_info.max_rx_queues)
> 
> Same here. you don't need dev_info
> 
> > +             *queue_id = 0;
> > +
> > +     RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >rx_queue_fill_thresh_query, -ENOTSUP);
> > +     return eth_err(port_id, (*dev->dev_ops-
> >rx_queue_fill_thresh_query)(dev,
> > +                                                          queue_id,
> > +fill_thresh)); }
> > +
> >   RTE_INIT(eth_dev_init_fp_ops)
> >   {
> >       uint32_t i;
> > diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h index
> > 04225bb..d44e5da 100644
> > --- a/lib/ethdev/rte_ethdev.h
> > +++ b/lib/ethdev/rte_ethdev.h
> > @@ -1931,6 +1931,14 @@ struct rte_eth_rxq_info {
> >       uint8_t queue_state;        /**< one of RTE_ETH_QUEUE_STATE_*. */
> >       uint16_t nb_desc;           /**< configured number of RXDs. */
> >       uint16_t rx_buf_size;       /**< hardware receive buffer size. */
> > +     /**
> > +      * Per-queue Rx fill threshold defined as percentage of Rx queue
> > +      * size. If Rx queue receives traffic higher than this percentage,
> > +      * the event RTE_ETH_EVENT_RX_FILL_THRESH is triggered.
> > +      * Value 0 means threshold monitoring is disabled, no event is
> > +      * triggered.
> > +      */
> > +     uint8_t fill_thresh;
> >   } __rte_cache_min_aligned;
> >
> >   /**
> > @@ -3672,6 +3680,65 @@ int rte_eth_dev_set_vlan_ether_type(uint16_t
> port_id,
> >    */
> >   int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int
> > on);
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Set Rx queue based fill threshold.
> > + *
> > + * @param port_id
> > + *  The port identifier of the Ethernet device.
> > + * @param queue_id
> > + *  The index of the receive queue.
> > + * @param fill_thresh
> > + *  The fill threshold percentage of Rx queue size which describes
> > + *  the fullness of Rx queue. If the Rx queue fullness is above it,
> > + *  the device will trigger the event RTE_ETH_EVENT_RX_FILL_THRESH.
> > + *  [1-99] to set a new fill threshold.
> > + *  0 to disable threshold monitoring.
> > + *
> > + * @return
> > + *   - 0 if successful.
> > + *   - negative if failed.
> > + */
> > +__rte_experimental
> > +int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
> > +                            uint8_t fill_thresh);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Query Rx queue based fill threshold event.
> > + * The function queries all queues in the port circularly until one
> > + * pending fill_thresh event is found or no pending fill_thresh event is
> found.
> > + *
> > + * @param port_id
> > + *  The port identifier of the Ethernet device.
> > + * @param queue_id
> > + *  The API caller sets the starting Rx queue id in the pointer.
> > + *  If the queue_id is bigger than maximum queue id of the port,
> > + *  it's rewound to 0 so that application can keep calling
> > + *  this function to handle all pending fill_thresh events in the
> > +queues
> > + *  with a simple increment between calls.
> > + *  If a Rx queue has pending fill_thresh event, the pointer is
> > +updated
> > + *  with this Rx queue id; otherwise this pointer's content is
> > + *  unchanged.
> > + * @param fill_thresh
> > + *  The pointer to the fill threshold percentage of Rx queue.
> > + *  If Rx queue with pending fill_thresh event is found, the queue's
> > +fill_thresh
> > + *  percentage is stored in this pointer, otherwise the pointer's
> > + *  content is unchanged.
> > + *
> > + * @return
> > + *   - 1 if a Rx queue with pending fill_thresh event is found.
> > + *   - 0 if no Rx queue with pending fill_thresh event is found.
> > + *   - -EINVAL if queue_id is NULL.
> > + */
> > +__rte_experimental
> > +int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
> > +                              uint8_t *fill_thresh);
> > +
> >   typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t
> count,
> >               void *userdata);
> >
> > @@ -3877,6 +3944,11 @@ enum rte_eth_event_type {
> >       RTE_ETH_EVENT_DESTROY,  /**< port is released */
> >       RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
> >       RTE_ETH_EVENT_FLOW_AGED,/**< New aged-out flows is detected */
> > +     /**
> > +      *  Fill threshold value is exceeded in a queue.
> > +      *  @see rte_eth_rx_fill_thresh_set()
> > +      */
> > +     RTE_ETH_EVENT_RX_FILL_THRESH,
> >       RTE_ETH_EVENT_MAX       /**< max value of this enum */
> >   };
> >
> > diff --git a/lib/ethdev/version.map b/lib/ethdev/version.map index
> > daca785..29b1fe8 100644
> > --- a/lib/ethdev/version.map
> > +++ b/lib/ethdev/version.map
> > @@ -285,6 +285,8 @@ EXPERIMENTAL {
> >       rte_mtr_color_in_protocol_priority_get;
> >       rte_mtr_color_in_protocol_set;
> >       rte_mtr_meter_vlan_table_update;
> > +     rte_eth_rx_fill_thresh_set;
> > +     rte_eth_rx_fill_thresh_query;
> >   };
> >
> >   INTERNAL {
  
Stephen Hemminger June 6, 2022, 3:49 p.m. UTC | #4
On Fri, 3 Jun 2022 15:48:17 +0300
Spike Du <spiked@nvidia.com> wrote:

> Fill threshold describes the fullness of a Rx queue. If the Rx
> queue fullness is above the threshold, the device will trigger the event
> RTE_ETH_EVENT_RX_FILL_THRESH.
> Fill threshold is defined as a percentage of Rx queue size with valid
> value of [0,99].
> Setting fill threshold to 0 means disable it, which is the default.
> Add fill threshold configuration and query driver callbacks in eth_dev_ops.
> Add command line options to support fill_thresh per-rxq configure.
> - Command syntax:
>   set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
> 
> - Example commands:
> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
> testpmd> set port 1 rxq 0 fill_thresh 30  
> 
> To disable fill_thresh on port 1 rxq 0:
> testpmd> set port 1 rxq 0 fill_thresh 0  
> 
> Signed-off-by: Spike Du <spiked@nvidia.com>

Could the name be shortened to just rte_event_rx_thresh?

The eth_ and _fill_ part are redundant.
  
Andrew Rybchenko June 6, 2022, 5:15 p.m. UTC | #5
On 6/6/22 16:16, Spike Du wrote:
> Hi Andrew,
> 	Please see below for "fill threshold" concept, I'm ok with other comments about code.
> 
> Regards,
> Spike.
> 
> 
>> -----Original Message-----
>> From: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>> Sent: Saturday, June 4, 2022 8:46 PM
>> To: Spike Du <spiked@nvidia.com>; Matan Azrad <matan@nvidia.com>;
>> Slava Ovsiienko <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>;
>> NBU-Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>;
>> Wenzhuo Lu <wenzhuo.lu@intel.com>; Beilei Xing <beilei.xing@intel.com>;
>> Bernard Iremonger <bernard.iremonger@intel.com>; Ray Kinsella
>> <mdr@ashroe.eu>; Neil Horman <nhorman@tuxdriver.com>
>> Cc: stephen@networkplumber.org; mb@smartsharesystems.com;
>> dev@dpdk.org; Raslan Darawsheh <rasland@nvidia.com>
>> Subject: Re: [PATCH v4 3/7] ethdev: introduce Rx queue based fill threshold
>>
>> External email: Use caution opening links or attachments
>>
>>
>> On 6/3/22 15:48, Spike Du wrote:
>>> Fill threshold describes the fullness of a Rx queue. If the Rx queue
>>> fullness is above the threshold, the device will trigger the event
>>> RTE_ETH_EVENT_RX_FILL_THRESH.
>>
>> Sorry, I'm not sure that I understand. As far as I know the process to add
>> more Rx buffers to Rx queue is called 'refill' in many drivers. So fill level is a
>> number (or percentage) of free buffers in an Rx queue.
>> If so, fill threshold should be a minimum fill level and below the level we
>> should generate an event.
>>
>> However reading the first paragraph of the description it looks like you mean
>> opposite thing - a number (or percentage) of ready Rx buffers with received
>> packets.
>>
>> I think that the term "fill threshold" is suggested by me, but I did it with mine
>> understanding of the added feature. Now I'm confused.
>>
>> Moreover, I don't understand how "fill threshold" could be in terms of ready
>> Rx buffers. HW simply don't really know when ready Rx buffers are
>> processed by SW. So, HW can't say for sure how many ready Rx buffers are
>> pending. It could be calculated as Rx queue size minus number of free Rx
>> buffers, but it is imprecise. First of all not all Rx descriptors could be used.
>> Second, HW ring size could differ from queue size specified in SW.
>> Queue size specified in SW could just limit maximum number of free Rx
>> buffers provided by the driver.
>>
> 
> Let me use other terms because "fill"/"refill" is also ambiguous to me.
> In a RX ring, there are Rx buffers with received packets, you call it "ready Rx buffers", there is a RTE api rte_eth_rx_queue_count() to get the number,
> It's also called "used descriptors" in the code.
> Also there are Rx buffers provided by SW to allow HW "fill in" received packets, we can call it "usable Rx buffers" (here "usable" means usable for HW).

May be it is better to stick to Rx descriptor status terminology?
Available - Rx descriptor available to HW to put received packet to
Done - Rx descriptor with received packet reported to Sw
Unavailable - other (e.g. gap which cannot be used, or just processed
Done but not refilled (made available to HW)).

> Let's define Rx queue "fullness":
> 	Fullness = ready-Rx-buffers/Rxq-size

i.e. number of DONE descriptors divided by RxQ size

> On the opposite, we have "emptiness"
> 	Emptiness = usable-Rx-buffers/Rxq-size

i.e. number of AVAIL descriptors divided by RxQ size
Note, that AVAIL != RxQ-size - DONE

HW really knows number of available descriptors by its nature.
It is a space between latest done and latest received on refill.

HW does not know which descriptors are DONE, since some which
are DONE before could be already processed by SW, but not yet
made available again.


> Here "fill threshold" describes "fullness", it's not "refill" described in you above words. Because in your words, "refill" is the opposite, it's filling "usable/free Rx buffers", or "emptiness".
> 
> I can only briefly explain how mlx5 works to get LWM, because I'm not a Firmware guy.
> Mlx5 Rx queue is basically RDMA queue. It has two indexes: producer index which increases when HW fills in packet, consumer index which increases when SW consumes the packet.
> The queue size is known when it's created. The fullness is something like (producer_index - consumer_index) (I don't consider in wrap-around here).
> So mlx5 has the way to get the fullness or emptiness in HW or FW.
> Another detail is mlx5 uses the term "LWM"(limit watermark), which describes "emptiness". When usable-Rx-buffers is below LWM, we trigger an event.
> But Thomas think "fullness" is easier to understand, so we use "fullness" in rte APIs and we'll translate it to LWM in mlx5 PMD.

HW simply does not know fullness and therefore can't generate any events
based on it. It is a problem on Rx when there are no available
descriptors, i.e. emptiness.

>>> Fill threshold is defined as a percentage of Rx queue size with valid
>>> value of [0,99].
>>> Setting fill threshold to 0 means disable it, which is the default.
>>> Add fill threshold configuration and query driver callbacks in eth_dev_ops.
>>> Add command line options to support fill_thresh per-rxq configure.
>>> - Command syntax:
>>>     set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
>>>
>>> - Example commands:
>>> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
>>> testpmd> set port 1 rxq 0 fill_thresh 30
>>>
>>> To disable fill_thresh on port 1 rxq 0:
>>> testpmd> set port 1 rxq 0 fill_thresh 0
>>>
>>> Signed-off-by: Spike Du <spiked@nvidia.com>

[snip]
  
Thomas Monjalon June 6, 2022, 9:30 p.m. UTC | #6
It seems we share a common understanding
and we need to agree on a good wording
for the most meaningful API.
Questions inline below:

06/06/2022 19:15, Andrew Rybchenko:
> On 6/6/22 16:16, Spike Du wrote:
> > Hi Andrew,
> > 	Please see below for "fill threshold" concept, I'm ok with other comments about code.
> > 
> > Regards,
> > Spike.
> > 
> > 
> > From: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
> >> On 6/3/22 15:48, Spike Du wrote:
> >>> Fill threshold describes the fullness of a Rx queue. If the Rx queue
> >>> fullness is above the threshold, the device will trigger the event
> >>> RTE_ETH_EVENT_RX_FILL_THRESH.
> >>
> >> Sorry, I'm not sure that I understand. As far as I know the process to add
> >> more Rx buffers to Rx queue is called 'refill' in many drivers. So fill level is a
> >> number (or percentage) of free buffers in an Rx queue.
> >> If so, fill threshold should be a minimum fill level and below the level we
> >> should generate an event.
> >>
> >> However reading the first paragraph of the description it looks like you mean
> >> opposite thing - a number (or percentage) of ready Rx buffers with received
> >> packets.
> >>
> >> I think that the term "fill threshold" is suggested by me, but I did it with mine
> >> understanding of the added feature. Now I'm confused.
> >>
> >> Moreover, I don't understand how "fill threshold" could be in terms of ready
> >> Rx buffers. HW simply don't really know when ready Rx buffers are
> >> processed by SW. So, HW can't say for sure how many ready Rx buffers are
> >> pending. It could be calculated as Rx queue size minus number of free Rx
> >> buffers, but it is imprecise. First of all not all Rx descriptors could be used.
> >> Second, HW ring size could differ queue size specified in SW.
> >> Queue size specified in SW could just limit maximum nubmer of free Rx
> >> buffers provided by the driver.
> >>
> > 
> > Let me use other terms because "fill"/"refill" is also ambiguous to me.
> > In a RX ring, there are Rx buffers with received packets, you call it "ready Rx buffers", there is a RTE api rte_eth_rx_queue_count() to get the number,
> > It's also called "used descriptors" in the code.
> > Also there are Rx buffers provided by SW to allow HW "fill in" received packets, we can call it "usable Rx buffers" (here "usable" means usable for HW).
> 
> May be it is better to stick to Rx descriptor status terminology?
> Available - Rx descriptor available to HW to put received packet to
> Done - Rx descriptor with received packet reported to Sw
> Unavailable - other (e.g. gap which cannot be used or just processed
> Done, but not refilled (made available to HW).
> 
> > Let's define Rx queue "fullness":
> > 	Fullness = ready-Rx-buffers/Rxq-size
> 
> i.e. number of DONE descriptors divided by RxQ size
> 
> > On the opposite, we have "emptiness"
> > 	Emptiness = usable-Rx-buffers/Rxq-size
> 
> i.e. number of AVAIL descriptors divided by RxQ size
> Note, that AVAIL != RxQ-size - DONE
> 
> HW really knows number of available descriptors by its nature.
> It is a space between latest done and latest received on refill.
> 
> HW does not know which descriptors are DONE, since some which
> are DONE before could be already processed by SW, but not yet
> made available again.
> 
> 
> > Here "fill threshold" describes "fullness", it's not "refill" described in you above words. Because in your words, "refill" is the opposite, it's filling "usable/free Rx buffers", or "emptiness".
> > 
> > I can only briefly explain how mlx5 works to get LWM, because I'm not a Firmware guy.
> > Mlx5 Rx queue is basically RDMA queue. It has two indexes: producer index which increases when HW fills in packet, consumer index which increases when SW consumes the packet.
> > The queue size is known when it's created. The fullness is something like (producer_index - consumer_index) (I don't consider in wrap-around here).
> > So mlx5 has the way to get the fullness or emptiness in HW or FW.
> > Another detail is mlx5 uses the term "LWM"(limit watermark), which describes "emptiness". When usable-Rx-buffers is below LWM, we trigger an event.
> > But Thomas think "fullness" is easier to understand, so we use "fullness" in rte APIs and we'll translate it to LWM in mlx5 PMD.

I may be wrong :)

> HW simply does now know fullness and there can't generate any events
> based on it. It is a problem on Rx when there is now available
> descriptors.  I.e. emptiness.

So you think "empty_thresh" would be better?
Or "avail_thresh"?

> >>> Fill threshold is defined as a percentage of Rx queue size with valid
> >>> value of [0,99].
> >>> Setting fill threshold to 0 means disable it, which is the default.
> >>> Add fill threshold configuration and query driver callbacks in eth_dev_ops.
> >>> Add command line options to support fill_thresh per-rxq configure.
> >>> - Command syntax:
> >>>     set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
> >>>
> >>> - Example commands:
> >>> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
> >>> testpmd> set port 1 rxq 0 fill_thresh 30
> >>>
> >>> To disable fill_thresh on port 1 rxq 0:
> >>> testpmd> set port 1 rxq 0 fill_thresh 0
> >>>
> >>> Signed-off-by: Spike Du <spiked@nvidia.com>
  
Spike Du June 7, 2022, 6 a.m. UTC | #7
If you think HW knows "emptiness", and we want to stick to DPDK descriptor terms. 
Can we use the name "avail_thresh?"
When HW sees available descriptors is under this threshold, an event is triggered.

> -----Original Message-----
> From: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
> Sent: Tuesday, June 7, 2022 1:16 AM
> To: Spike Du <spiked@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> Slava Ovsiienko <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>;
> NBU-Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>;
> Wenzhuo Lu <wenzhuo.lu@intel.com>; Beilei Xing <beilei.xing@intel.com>;
> Bernard Iremonger <bernard.iremonger@intel.com>; Ray Kinsella
> <mdr@ashroe.eu>; Neil Horman <nhorman@tuxdriver.com>
> Cc: stephen@networkplumber.org; mb@smartsharesystems.com;
> dev@dpdk.org; Raslan Darawsheh <rasland@nvidia.com>
> Subject: Re: [PATCH v4 3/7] ethdev: introduce Rx queue based fill threshold
> 
> External email: Use caution opening links or attachments
> 
> 
> On 6/6/22 16:16, Spike Du wrote:
> > Hi Andrew,
> >       Please see below for "fill threshold" concept, I'm ok with other
> comments about code.
> >
> > Regards,
> > Spike.
> >
> >
> >> -----Original Message-----
> >> From: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
> >> Sent: Saturday, June 4, 2022 8:46 PM
> >> To: Spike Du <spiked@nvidia.com>; Matan Azrad <matan@nvidia.com>;
> >> Slava Ovsiienko <viacheslavo@nvidia.com>; Ori Kam <orika@nvidia.com>;
> >> NBU-Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>;
> Wenzhuo
> >> Lu <wenzhuo.lu@intel.com>; Beilei Xing <beilei.xing@intel.com>;
> >> Bernard Iremonger <bernard.iremonger@intel.com>; Ray Kinsella
> >> <mdr@ashroe.eu>; Neil Horman <nhorman@tuxdriver.com>
> >> Cc: stephen@networkplumber.org; mb@smartsharesystems.com;
> >> dev@dpdk.org; Raslan Darawsheh <rasland@nvidia.com>
> >> Subject: Re: [PATCH v4 3/7] ethdev: introduce Rx queue based fill
> >> threshold
> >>
> >> External email: Use caution opening links or attachments
> >>
> >>
> >> On 6/3/22 15:48, Spike Du wrote:
> >>> Fill threshold describes the fullness of a Rx queue. If the Rx queue
> >>> fullness is above the threshold, the device will trigger the event
> >>> RTE_ETH_EVENT_RX_FILL_THRESH.
> >>
> >> Sorry, I'm not sure that I understand. As far as I know the process
> >> to add more Rx buffers to Rx queue is called 'refill' in many
> >> drivers. So fill level is a number (or percentage) of free buffers in an Rx
> queue.
> >> If so, fill threashold should be a minimum fill level and below the
> >> level we should generate an event.
> >>
> >> However reading the first paragraph of the descrition it looks like
> >> you mean oposite thing - a number (or percentage) of ready Rx buffers
> >> with received packets.
> >>
> >> I think that the term "fill threshold" is suggested by me, but I did
> >> it with mine understanding of the added feature. Now I'm confused.
> >>
> >> Moreover, I don't understand how "fill threshold" could be in terms
> >> of ready Rx buffers. HW simply don't really know when ready Rx
> >> buffers are processed by SW. So, HW can't say for sure how many ready
> >> Rx buffers are pending. It could be calculated as Rx queue size minus
> >> number of free Rx buffers, but it is imprecise. First of all not all Rx
> descriptors could be used.
> >> Second, HW ring size could differ queue size specified in SW.
> >> Queue size specified in SW could just limit maximum nubmer of free Rx
> >> buffers provided by the driver.
> >>
> >
> > Let me use other terms because "fill"/"refill" is also ambiguous to me.
> > In a RX ring, there are Rx buffers with received packets, you call it
> > "ready Rx buffers", there is a RTE api rte_eth_rx_queue_count() to get the
> number, It's also called "used descriptors" in the code.
> > Also there are Rx buffers provided by SW to allow HW "fill in" received
> packets, we can call it "usable Rx buffers" (here "usable" means usable for
> HW).
> 
> May be it is better to stick to Rx descriptor status terminology?
> Available - Rx descriptor available to HW to put received packet to Done - Rx
> descriptor with received packet reported to Sw Unavailable - other (e.g. gap
> which cannot be used or just processed Done, but not refilled (made
> available to HW).
> 
> > Let's define Rx queue "fullness":
> >       Fullness = ready-Rx-buffers/Rxq-size
> 
> i.e. number of DONE descriptors divided by RxQ size
> 
> > On the opposite, we have "emptiness"
> >       Emptiness = usable-Rx-buffers/Rxq-size
> 
> i.e. number of AVAIL descriptors divided by RxQ size Note, that AVAIL !=
> RxQ-size - DONE
> 
> HW really knows number of available descriptors by its nature.
> It is a space between latest done and latest received on refill.
> 
> HW does not know which descriptors are DONE, since some which are DONE
> before could be already processed by SW, but not yet made available again.
> 
> 
> > Here "fill threshold" describes "fullness", it's not "refill" described in you
> above words. Because in your words, "refill" is the opposite, it's filling
> "usable/free Rx buffers", or "emptiness".
> >
> > I can only briefly explain how mlx5 works to get LWM, because I'm not a
> Firmware guy.
> > Mlx5 Rx queue is basically RDMA queue. It has two indexes: producer index
> which increases when HW fills in packet, consumer index which increases
> when SW consumes the packet.
> > The queue size is known when it's created. The fullness is something like
> (producer_index - consumer_index) (I don't consider in wrap-around here).
> > So mlx5 has the way to get the fullness or emptiness in HW or FW.
> > Another detail is mlx5 uses the term "LWM"(limit watermark), which
> describes "emptiness". When usable-Rx-buffers is below LWM, we trigger an
> event.
> > But Thomas think "fullness" is easier to understand, so we use "fullness" in
> rte APIs and we'll translate it to LWM in mlx5 PMD.
> 
> HW simply does now know fullness and there can't generate any events
> based on it. It is a problem on Rx when there is now available descriptors.  I.e.
> emptiness.
> 
> >>> Fill threshold is defined as a percentage of Rx queue size with
> >>> valid value of [0,99].
> >>> Setting fill threshold to 0 means disable it, which is the default.
> >>> Add fill threshold configuration and query driver callbacks in
> eth_dev_ops.
> >>> Add command line options to support fill_thresh per-rxq configure.
> >>> - Command syntax:
> >>>     set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
> >>>
> >>> - Example commands:
> >>> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
> >>> testpmd> set port 1 rxq 0 fill_thresh 30
> >>>
> >>> To disable fill_thresh on port 1 rxq 0:
> >>> testpmd> set port 1 rxq 0 fill_thresh 0
> >>>
> >>> Signed-off-by: Spike Du <spiked@nvidia.com>
> 
> [snip]
  
Andrew Rybchenko June 7, 2022, 8:02 a.m. UTC | #8
On 6/7/22 00:30, Thomas Monjalon wrote:
> It seems we share a common understanding
> and we need to agree on a good wording
> for the most meaningful API.
> Questions inline below:
> 
> 06/06/2022 19:15, Andrew Rybchenko:
>> On 6/6/22 16:16, Spike Du wrote:
>>> Hi Andrew,
>>> 	Please see below for "fill threshold" concept, I'm ok with other comments about code.
>>>
>>> Regards,
>>> Spike.
>>>
>>>
>>> From: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>>> On 6/3/22 15:48, Spike Du wrote:
>>>>> Fill threshold describes the fullness of a Rx queue. If the Rx queue
>>>>> fullness is above the threshold, the device will trigger the event
>>>>> RTE_ETH_EVENT_RX_FILL_THRESH.
>>>>
>>>> Sorry, I'm not sure that I understand. As far as I know the process to add
>>>> more Rx buffers to Rx queue is called 'refill' in many drivers. So fill level is a
>>>> number (or percentage) of free buffers in an Rx queue.
>>>> If so, fill threashold should be a minimum fill level and below the level we
>>>> should generate an event.
>>>>
>>>> However reading the first paragraph of the descrition it looks like you mean
>>>> oposite thing - a number (or percentage) of ready Rx buffers with received
>>>> packets.
>>>>
>>>> I think that the term "fill threshold" is suggested by me, but I did it with mine
>>>> understanding of the added feature. Now I'm confused.
>>>>
>>>> Moreover, I don't understand how "fill threshold" could be in terms of ready
>>>> Rx buffers. HW simply don't really know when ready Rx buffers are
>>>> processed by SW. So, HW can't say for sure how many ready Rx buffers are
>>>> pending. It could be calculated as Rx queue size minus number of free Rx
>>>> buffers, but it is imprecise. First of all not all Rx descriptors could be used.
>>>> Second, HW ring size could differ queue size specified in SW.
>>>> Queue size specified in SW could just limit maximum nubmer of free Rx
>>>> buffers provided by the driver.
>>>>
>>>
>>> Let me use other terms because "fill"/"refill" is also ambiguous to me.
>>> In a RX ring, there are Rx buffers with received packets, you call it "ready Rx buffers", there is a RTE api rte_eth_rx_queue_count() to get the number,
>>> It's also called "used descriptors" in the code.
>>> Also there are Rx buffers provided by SW to allow HW "fill in" received packets, we can call it "usable Rx buffers" (here "usable" means usable for HW).
>>
>> May be it is better to stick to Rx descriptor status terminology?
>> Available - Rx descriptor available to HW to put received packet to
>> Done - Rx descriptor with received packet reported to Sw
>> Unavailable - other (e.g. gap which cannot be used or just processed
>> Done, but not refilled (made available to HW).
>>
>>> Let's define Rx queue "fullness":
>>> 	Fullness = ready-Rx-buffers/Rxq-size
>>
>> i.e. number of DONE descriptors divided by RxQ size
>>
>>> On the opposite, we have "emptiness"
>>> 	Emptiness = usable-Rx-buffers/Rxq-size
>>
>> i.e. number of AVAIL descriptors divided by RxQ size
>> Note, that AVAIL != RxQ-size - DONE
>>
>> HW really knows number of available descriptors by its nature.
>> It is a space between latest done and latest received on refill.
>>
>> HW does not know which descriptors are DONE, since some which
>> are DONE before could be already processed by SW, but not yet
>> made available again.
>>
>>
>>> Here "fill threshold" describes "fullness", it's not "refill" described in you above words. Because in your words, "refill" is the opposite, it's filling "usable/free Rx buffers", or "emptiness".
>>>
>>> I can only briefly explain how mlx5 works to get LWM, because I'm not a Firmware guy.
>>> Mlx5 Rx queue is basically RDMA queue. It has two indexes: producer index which increases when HW fills in packet, consumer index which increases when SW consumes the packet.
>>> The queue size is known when it's created. The fullness is something like (producer_index - consumer_index) (I don't consider in wrap-around here).
>>> So mlx5 has the way to get the fullness or emptiness in HW or FW.
>>> Another detail is mlx5 uses the term "LWM"(limit watermark), which describes "emptiness". When usable-Rx-buffers is below LWM, we trigger an event.
>>> But Thomas think "fullness" is easier to understand, so we use "fullness" in rte APIs and we'll translate it to LWM in mlx5 PMD.
> 
> I may be wrong :)
> 
>> HW simply does now know fullness and there can't generate any events
>> based on it. It is a problem on Rx when there is now available
>> descriptors.  I.e. emptiness.
> 
> So you think "empty_thresh" would be better?
> Or "avail_thresh"?

I'd go for "avail_thresh" since it is consistent with descriptor status
API terminology.

One moment I thought that there is a problem with rx_free_threshold, but
finally realize that there is no problem, since it is in terms of free
descriptors which could be made available (refilled).

>>>>> Fill threshold is defined as a percentage of Rx queue size with valid
>>>>> value of [0,99].
>>>>> Setting fill threshold to 0 means disable it, which is the default.
>>>>> Add fill threshold configuration and query driver callbacks in eth_dev_ops.
>>>>> Add command line options to support fill_thresh per-rxq configure.
>>>>> - Command syntax:
>>>>>      set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>
>>>>>
>>>>> - Example commands:
>>>>> To configure fill_thresh as 30% of rxq size on port 1 rxq 0:
>>>>> testpmd> set port 1 rxq 0 fill_thresh 30
>>>>>
>>>>> To disable fill_thresh on port 1 rxq 0:
>>>>> testpmd> set port 1 rxq 0 fill_thresh 0
>>>>>
>>>>> Signed-off-by: Spike Du <spiked@nvidia.com>
> 
> 
>
  

Patch

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 0410bad..918581e 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -17823,6 +17823,73 @@  struct cmd_show_port_flow_transfer_proxy_result {
 	}
 };
 
+/* *** SET FILL THRESHOLD FOR A RXQ OF A PORT *** */
+struct cmd_rxq_fill_thresh_result {
+	cmdline_fixed_string_t set;
+	cmdline_fixed_string_t port;
+	uint16_t port_num;
+	cmdline_fixed_string_t rxq;
+	uint16_t rxq_num;
+	cmdline_fixed_string_t fill_thresh;
+	uint16_t fill_thresh_num;
+};
+
+static void cmd_rxq_fill_thresh_parsed(void *parsed_result,
+		__rte_unused struct cmdline *cl,
+		__rte_unused void *data)
+{
+	struct cmd_rxq_fill_thresh_result *res = parsed_result;
+	int ret = 0;
+
+	if ((strcmp(res->set, "set") == 0) && (strcmp(res->port, "port") == 0)
+	    && (strcmp(res->rxq, "rxq") == 0)
+	    && (strcmp(res->fill_thresh, "fill_thresh") == 0))
+		ret = set_rxq_fill_thresh(res->port_num, res->rxq_num,
+				  res->fill_thresh_num);
+	if (ret < 0)
+		printf("rxq_fill_thresh_cmd error: (%s)\n", strerror(-ret));
+
+}
+
+cmdline_parse_token_string_t cmd_rxq_fill_thresh_set =
+	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				set, "set");
+cmdline_parse_token_string_t cmd_rxq_fill_thresh_port =
+	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				port, "port");
+cmdline_parse_token_num_t cmd_rxq_fill_thresh_portnum =
+	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				port_num, RTE_UINT16);
+cmdline_parse_token_string_t cmd_rxq_fill_thresh_rxq =
+	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				rxq, "rxq");
+cmdline_parse_token_num_t cmd_rxq_fill_thresh_rxqnum =
+	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				rxq_num, RTE_UINT8);
+cmdline_parse_token_string_t cmd_rxq_fill_thresh_fill_thresh =
+	TOKEN_STRING_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				fill_thresh, "fill_thresh");
+cmdline_parse_token_num_t cmd_rxq_fill_thresh_fill_threshnum =
+	TOKEN_NUM_INITIALIZER(struct cmd_rxq_fill_thresh_result,
+				fill_thresh_num, RTE_UINT16);
+
+cmdline_parse_inst_t cmd_rxq_fill_thresh = {
+	.f = cmd_rxq_fill_thresh_parsed,
+	.data = (void *)0,
+	.help_str = "set port <port_id> rxq <rxq_id> fill_thresh <fill_thresh_num>"
+		"Set fill_thresh for rxq on port_id",
+	.tokens = {
+		(void *)&cmd_rxq_fill_thresh_set,
+		(void *)&cmd_rxq_fill_thresh_port,
+		(void *)&cmd_rxq_fill_thresh_portnum,
+		(void *)&cmd_rxq_fill_thresh_rxq,
+		(void *)&cmd_rxq_fill_thresh_rxqnum,
+		(void *)&cmd_rxq_fill_thresh_fill_thresh,
+		(void *)&cmd_rxq_fill_thresh_fill_threshnum,
+		NULL,
+	},
+};
+
 /* ******************************************************************************** */
 
 /* list of instructions */
@@ -18110,6 +18177,7 @@  struct cmd_show_port_flow_transfer_proxy_result {
 	(cmdline_parse_inst_t *)&cmd_show_capability,
 	(cmdline_parse_inst_t *)&cmd_set_flex_is_pattern,
 	(cmdline_parse_inst_t *)&cmd_set_flex_spec_pattern,
+	(cmdline_parse_inst_t *)&cmd_rxq_fill_thresh,
 	NULL,
 };
 
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 1b1e738..d0c519b 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -6342,3 +6342,24 @@  struct igb_ring_desc_16_bytes {
 		printf("  %s\n", buf);
 	}
 }
+
+int
+set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx, uint16_t fill_thresh)
+{
+	struct rte_eth_link link;
+	int ret;
+
+	if (port_id_is_invalid(port_id, ENABLED_WARN))
+		return -EINVAL;
+	ret = eth_link_get_nowait_print_err(port_id, &link);
+	if (ret < 0)
+		return -EINVAL;
+	if (fill_thresh > 99)
+		return -EINVAL;
+	ret = rte_eth_rx_fill_thresh_set(port_id, queue_idx, fill_thresh);
+
+	if (ret)
+		return ret;
+	return 0;
+}
+
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 767765d..1209230 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -420,6 +420,7 @@  struct fwd_engine * fwd_engines[] = {
 	[RTE_ETH_EVENT_NEW] = "device probed",
 	[RTE_ETH_EVENT_DESTROY] = "device released",
 	[RTE_ETH_EVENT_FLOW_AGED] = "flow aged",
+	[RTE_ETH_EVENT_RX_FILL_THRESH] = "rxq fill threshold reached",
 	[RTE_ETH_EVENT_MAX] = NULL,
 };
 
@@ -3616,6 +3617,10 @@  struct pmd_test_command {
 eth_event_callback(portid_t port_id, enum rte_eth_event_type type, void *param,
 		  void *ret_param)
 {
+	struct rte_eth_dev_info dev_info;
+	uint16_t rxq_id;
+	uint8_t fill_thresh;
+	int ret;
 	RTE_SET_USED(param);
 	RTE_SET_USED(ret_param);
 
@@ -3647,6 +3652,19 @@  struct pmd_test_command {
 		ports[port_id].port_status = RTE_PORT_CLOSED;
 		printf("Port %u is closed\n", port_id);
 		break;
+	case RTE_ETH_EVENT_RX_FILL_THRESH:
+		ret = rte_eth_dev_info_get(port_id, &dev_info);
+		if (ret != 0)
+			break;
+		/* fill_thresh query API rewinds rxq_id, no need to check max rxq num. */
+		for (rxq_id = 0; ; rxq_id++) {
+			ret = rte_eth_rx_fill_thresh_query(port_id, &rxq_id, &fill_thresh);
+			if (ret <= 0)
+				break;
+			printf("Received fill_thresh event, port:%d rxq_id:%d\n",
+			       port_id, rxq_id);
+		}
+		break;
 	default:
 		break;
 	}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 78a5f4e..c7a144e 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -1173,6 +1173,8 @@  uint16_t tx_pkt_set_dynf(uint16_t port_id, __rte_unused uint16_t queue,
 void flex_item_create(portid_t port_id, uint16_t flex_id, const char *filename);
 void flex_item_destroy(portid_t port_id, uint16_t flex_id);
 void port_flex_item_flush(portid_t port_id);
+int set_rxq_fill_thresh(portid_t port_id, uint16_t queue_idx,
+			uint16_t fill_thresh);
 
 extern int flow_parse(const char *src, void *result, unsigned int size,
 		      struct rte_flow_attr **attr,
diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
index 69d9dc2..7ef7dba 100644
--- a/lib/ethdev/ethdev_driver.h
+++ b/lib/ethdev/ethdev_driver.h
@@ -470,6 +470,23 @@  typedef int (*eth_rx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_rxconf *rx_conf,
 				    struct rte_mempool *mb_pool);
 
+/**
+ * @internal Set Rx queue fill threshold.
+ * @see rte_eth_rx_fill_thresh_set()
+ */
+typedef int (*eth_rx_queue_fill_thresh_set_t)(struct rte_eth_dev *dev,
+				      uint16_t rx_queue_id,
+				      uint8_t fill_thresh);
+
+/**
+ * @internal Query queue fill threshold event.
+ * @see rte_eth_rx_fill_thresh_query()
+ */
+
+typedef int (*eth_rx_queue_fill_thresh_query_t)(struct rte_eth_dev *dev,
+					uint16_t *rx_queue_id,
+					uint8_t *fill_thresh);
+
 /** @internal Setup a transmit queue of an Ethernet device. */
 typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    uint16_t tx_queue_id,
@@ -1168,6 +1185,11 @@  struct eth_dev_ops {
 	/** Priority flow control queue configure */
 	priority_flow_ctrl_queue_config_t priority_flow_ctrl_queue_config;
 
+	/** Set Rx queue fill threshold. */
+	eth_rx_queue_fill_thresh_set_t rx_queue_fill_thresh_set;
+	/** Query Rx queue fill threshold event. */
+	eth_rx_queue_fill_thresh_query_t rx_queue_fill_thresh_query;
+
 	/** Set Unicast Table Array */
 	eth_uc_hash_table_set_t    uc_hash_table_set;
 	/** Set Unicast hash bitmap */
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index a175867..69a1f75 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -4424,6 +4424,58 @@  int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx,
 							queue_idx, tx_rate));
 }
 
+int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
+			       uint8_t fill_thresh)
+{
+	struct rte_eth_dev *dev;
+	struct rte_eth_dev_info dev_info;
+	int ret;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+	dev = &rte_eth_devices[port_id];
+
+	ret = rte_eth_dev_info_get(port_id, &dev_info);
+	if (ret != 0)
+		return ret;
+
+	if (queue_id > dev_info.max_rx_queues) {
+		RTE_ETHDEV_LOG(ERR,
+			"Set queue fill thresh: port %u: invalid queue ID=%u.\n",
+			port_id, queue_id);
+		return -EINVAL;
+	}
+
+	if (fill_thresh > 99)
+		return -EINVAL;
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_fill_thresh_set, -ENOTSUP);
+	return eth_err(port_id, (*dev->dev_ops->rx_queue_fill_thresh_set)(dev,
+							     queue_id, fill_thresh));
+}
+
+int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
+				 uint8_t *fill_thresh)
+{
+	struct rte_eth_dev_info dev_info;
+	struct rte_eth_dev *dev;
+	int ret;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+	dev = &rte_eth_devices[port_id];
+
+	ret = rte_eth_dev_info_get(port_id, &dev_info);
+	if (ret != 0)
+		return ret;
+
+	if (queue_id == NULL)
+		return -EINVAL;
+	if (*queue_id >= dev_info.max_rx_queues)
+		*queue_id = 0;
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_fill_thresh_query, -ENOTSUP);
+	return eth_err(port_id, (*dev->dev_ops->rx_queue_fill_thresh_query)(dev,
+							     queue_id, fill_thresh));
+}
+
 RTE_INIT(eth_dev_init_fp_ops)
 {
 	uint32_t i;
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index 04225bb..d44e5da 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -1931,6 +1931,14 @@  struct rte_eth_rxq_info {
 	uint8_t queue_state;        /**< one of RTE_ETH_QUEUE_STATE_*. */
 	uint16_t nb_desc;           /**< configured number of RXDs. */
 	uint16_t rx_buf_size;       /**< hardware receive buffer size. */
+	/**
+	 * Per-queue Rx fill threshold defined as percentage of Rx queue
+	 * size. If Rx queue receives traffic higher than this percentage,
+	 * the event RTE_ETH_EVENT_RX_FILL_THRESH is triggered.
+	 * Value 0 means threshold monitoring is disabled, no event is
+	 * triggered.
+	 */
+	uint8_t fill_thresh;
 } __rte_cache_min_aligned;
 
 /**
@@ -3672,6 +3680,65 @@  int rte_eth_dev_set_vlan_ether_type(uint16_t port_id,
  */
 int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Set Rx queue based fill threshold.
+ *
+ * @param port_id
+ *  The port identifier of the Ethernet device.
+ * @param queue_id
+ *  The index of the receive queue.
+ * @param fill_thresh
+ *  The fill threshold percentage of Rx queue size which describes
+ *  the fullness of Rx queue. If the Rx queue fullness is above it,
+ *  the device will trigger the event RTE_ETH_EVENT_RX_FILL_THRESH.
+ *  [1-99] to set a new fill threshold.
+ *  0 to disable threshold monitoring.
+ *
+ * @return
+ *   - 0 if successful.
+ *   - negative if failed.
+ */
+__rte_experimental
+int rte_eth_rx_fill_thresh_set(uint16_t port_id, uint16_t queue_id,
+			       uint8_t fill_thresh);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Query Rx queue based fill threshold event.
+ * The function queries all queues in the port circularly until a
+ * pending fill_thresh event is found or all queues have been checked.
+ *
+ * @param port_id
+ *  The port identifier of the Ethernet device.
+ * @param queue_id
+ *  The API caller sets the starting Rx queue id in the pointer.
+ *  If the queue_id is bigger than maximum queue id of the port,
+ *  it's rewound to 0 so that the application can keep calling
+ *  this function to handle all pending fill_thresh events in the queues
+ *  with a simple increment between calls.
+ *  If a Rx queue has pending fill_thresh event, the pointer is updated
+ *  with this Rx queue id; otherwise this pointer's content is
+ *  unchanged.
+ * @param fill_thresh
+ *  The pointer to the fill threshold percentage of Rx queue.
+ *  If Rx queue with pending fill_thresh event is found, the queue's fill_thresh
+ *  percentage is stored in this pointer, otherwise the pointer's
+ *  content is unchanged.
+ *
+ * @return
+ *   - 1 if a Rx queue with pending fill_thresh event is found.
+ *   - 0 if no Rx queue with pending fill_thresh event is found.
+ *   - -EINVAL if queue_id is NULL.
+ */
+__rte_experimental
+int rte_eth_rx_fill_thresh_query(uint16_t port_id, uint16_t *queue_id,
+				 uint8_t *fill_thresh);
+
 typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
 		void *userdata);
 
@@ -3877,6 +3944,11 @@  enum rte_eth_event_type {
 	RTE_ETH_EVENT_DESTROY,  /**< port is released */
 	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
 	RTE_ETH_EVENT_FLOW_AGED,/**< New aged-out flows is detected */
+	/**
+	 *  Fill threshold value is exceeded in a queue.
+	 *  @see rte_eth_rx_fill_thresh_set()
+	 */
+	RTE_ETH_EVENT_RX_FILL_THRESH,
 	RTE_ETH_EVENT_MAX       /**< max value of this enum */
 };
 
diff --git a/lib/ethdev/version.map b/lib/ethdev/version.map
index daca785..29b1fe8 100644
--- a/lib/ethdev/version.map
+++ b/lib/ethdev/version.map
@@ -285,6 +285,8 @@  EXPERIMENTAL {
 	rte_mtr_color_in_protocol_priority_get;
 	rte_mtr_color_in_protocol_set;
 	rte_mtr_meter_vlan_table_update;
+	rte_eth_rx_fill_thresh_set;
+	rte_eth_rx_fill_thresh_query;
 };
 
 INTERNAL {