[01/13] ethdev: support setup function for hairpin queue

Message ID 1569479349-36962-2-git-send-email-orika@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Series: add hairpin feature

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Ori Kam Sept. 26, 2019, 6:28 a.m. UTC
  This commit introduces the RX/TX hairpin setup functions.

Hairpin queues are RX/TX queues that are used by the NIC in order to offload
wire-to-wire traffic.

Each hairpin queue is bound to one or more queues of the other type.
For example, a TX hairpin queue should be bound to at least one RX hairpin
queue and vice versa.

Signed-off-by: Ori Kam <orika@mellanox.com>
---
 lib/librte_ethdev/rte_ethdev.c           | 213 +++++++++++++++++++++++++++++++
 lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
 lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
 lib/librte_ethdev/rte_ethdev_version.map |   4 +
 4 files changed, 380 insertions(+)
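
For illustration, a minimal usage sketch of the functions added by this patch. The wrapper
function name, the port id, queue index, ring size and the single-port binding are arbitrary
example values, not requirements of the API:

#include <rte_ethdev.h>
#include <rte_memory.h>

/* Example only: bind Rx hairpin queue 1 and Tx hairpin queue 1 of port 0
 * to each other.  Port id, queue index and ring size are placeholders.
 */
static int
setup_hairpin_example(void)
{
	struct rte_eth_hairpin_conf rx_hairpin_conf = {
		.peer_n = 1,
		.peers[0] = { .port = 0, .queue = 1 }, /* peer Tx queue */
	};
	struct rte_eth_hairpin_conf tx_hairpin_conf = {
		.peer_n = 1,
		.peers[0] = { .port = 0, .queue = 1 }, /* peer Rx queue */
	};
	int ret;

	ret = rte_eth_rx_hairpin_queue_setup(0 /* port */, 1 /* Rx queue */,
					     512 /* nb_rx_desc */,
					     SOCKET_ID_ANY,
					     NULL /* default rx_conf */,
					     &rx_hairpin_conf);
	if (ret != 0)
		return ret;

	return rte_eth_tx_hairpin_queue_setup(0 /* port */, 1 /* Tx queue */,
					      512 /* nb_tx_desc */,
					      SOCKET_ID_ANY,
					      NULL /* default tx_conf */,
					      &tx_hairpin_conf);
}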
  

Comments

Andrew Rybchenko Sept. 26, 2019, 12:18 p.m. UTC | #1
On 9/26/19 9:28 AM, Ori Kam wrote:
> This commit introduce the RX/TX hairpin setup function.

RX/TX should be Rx/Tx here and everywhere below.

> Hairpin is RX/TX queue that is used by the nic in order to offload
> wire to wire traffic.
>
> Each hairpin queue is binded to one or more queues from other type.
> For example TX hairpin queue should be binded to at least 1 RX hairpin
> queue and vice versa.

How should an application find out that hairpin queues are supported?
How many?
How should an application find out which ports/queues could be used for pinning?
Is a hair-pinning domain at the device level sufficient to expose limitations?

> Signed-off-by: Ori Kam <orika@mellanox.com>
> ---
>   lib/librte_ethdev/rte_ethdev.c           | 213 +++++++++++++++++++++++++++++++
>   lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
>   lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
>   lib/librte_ethdev/rte_ethdev_version.map |   4 +
>   4 files changed, 380 insertions(+)
>
> diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
> index 30b0c78..4021f38 100644
> --- a/lib/librte_ethdev/rte_ethdev.c
> +++ b/lib/librte_ethdev/rte_ethdev.c
> @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
>   }
>   
>   int
> +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
> +			       uint16_t nb_rx_desc, unsigned int socket_id,
> +			       const struct rte_eth_rxconf *rx_conf,
> +			       const struct rte_eth_hairpin_conf *hairpin_conf)

The code below duplicates rte_eth_rx_queue_setup() a lot, which is very
bad from a maintenance point of view. Similar problem with Tx hairpin
queue setup.

> +{
> +	int ret;
> +	struct rte_eth_dev *dev;
> +	struct rte_eth_dev_info dev_info;
> +	struct rte_eth_rxconf local_conf;
> +	void **rxq;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +
> +	dev = &rte_eth_devices[port_id];
> +	if (rx_queue_id >= dev->data->nb_rx_queues) {
> +		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id);
> +		return -EINVAL;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_hairpin_queue_setup,
> +				-ENOTSUP);
> +
> +	rte_eth_dev_info_get(port_id, &dev_info);
> +
> +	/* Use default specified by driver, if nb_rx_desc is zero */
> +	if (nb_rx_desc == 0) {
> +		nb_rx_desc = dev_info.default_rxportconf.ring_size;
> +		/* If driver default is also zero, fall back on EAL default */
> +		if (nb_rx_desc == 0)
> +			nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
> +	}
> +
> +	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
> +			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
> +			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
> +
> +		RTE_ETHDEV_LOG(ERR,
> +			       "Invalid value for nb_rx_desc(=%hu), should be: "
> +			       "<= %hu, >= %hu, and a product of %hu\n",
> +			nb_rx_desc, dev_info.rx_desc_lim.nb_max,
> +			dev_info.rx_desc_lim.nb_min,
> +			dev_info.rx_desc_lim.nb_align);
> +		return -EINVAL;
> +	}
> +
> +	if (dev->data->dev_started &&
> +		!(dev_info.dev_capa &
> +			RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
> +		return -EBUSY;
> +
> +	if (dev->data->dev_started &&
> +		(dev->data->rx_queue_state[rx_queue_id] !=
> +			RTE_ETH_QUEUE_STATE_STOPPED))
> +		return -EBUSY;
> +
> +	rxq = dev->data->rx_queues;
> +	if (rxq[rx_queue_id]) {
> +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_release,
> +					-ENOTSUP);
> +		(*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]);
> +		rxq[rx_queue_id] = NULL;
> +	}
> +
> +	if (rx_conf == NULL)
> +		rx_conf = &dev_info.default_rxconf;
> +
> +	local_conf = *rx_conf;
> +
> +	/*
> +	 * If an offloading has already been enabled in
> +	 * rte_eth_dev_configure(), it has been enabled on all queues,
> +	 * so there is no need to enable it in this queue again.
> +	 * The local_conf.offloads input to underlying PMD only carries
> +	 * those offloadings which are only enabled on this queue and
> +	 * not enabled on all queues.
> +	 */
> +	local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads;
> +
> +	/*
> +	 * New added offloadings for this queue are those not enabled in
> +	 * rte_eth_dev_configure() and they must be per-queue type.
> +	 * A pure per-port offloading can't be enabled on a queue while
> +	 * disabled on another queue. A pure per-port offloading can't
> +	 * be enabled for any queue as new added one if it hasn't been
> +	 * enabled in rte_eth_dev_configure().
> +	 */
> +	if ((local_conf.offloads & dev_info.rx_queue_offload_capa) !=
> +	     local_conf.offloads) {
> +		RTE_ETHDEV_LOG(ERR,
> +			"Ethdev port_id=%d rx_queue_id=%d, "
> +			"new added offloads 0x%"PRIx64" must be "
> +			"within per-queue offload capabilities "
> +			"0x%"PRIx64" in %s()\n",
> +			port_id, rx_queue_id, local_conf.offloads,
> +			dev_info.rx_queue_offload_capa,
> +			__func__);
> +		return -EINVAL;
> +	}
> +
> +	ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev, rx_queue_id,
> +						      nb_rx_desc, socket_id,
> +						      &local_conf,
> +						      hairpin_conf);
> +
> +	return eth_err(port_id, ret);
> +}
> +
> +int
>   rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
>   		       uint16_t nb_tx_desc, unsigned int socket_id,
>   		       const struct rte_eth_txconf *tx_conf)
> @@ -1799,6 +1908,110 @@ struct rte_eth_dev *
>   		       tx_queue_id, nb_tx_desc, socket_id, &local_conf));
>   }
>   
> +int
> +rte_eth_tx_hairpin_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
> +			       uint16_t nb_tx_desc, unsigned int socket_id,
> +			       const struct rte_eth_txconf *tx_conf,
> +			       const struct rte_eth_hairpin_conf *hairpin_conf)
> +{
> +	struct rte_eth_dev *dev;
> +	struct rte_eth_dev_info dev_info;
> +	struct rte_eth_txconf local_conf;
> +	void **txq;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +
> +	dev = &rte_eth_devices[port_id];
> +	if (tx_queue_id >= dev->data->nb_tx_queues) {
> +		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id);
> +		return -EINVAL;
> +	}
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_hairpin_queue_setup,
> +				-ENOTSUP);
> +
> +	rte_eth_dev_info_get(port_id, &dev_info);
> +
> +	/* Use default specified by driver, if nb_tx_desc is zero */
> +	if (nb_tx_desc == 0) {
> +		nb_tx_desc = dev_info.default_txportconf.ring_size;
> +		/* If driver default is zero, fall back on EAL default */
> +		if (nb_tx_desc == 0)
> +			nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
> +	}
> +	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
> +	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
> +	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
> +		RTE_ETHDEV_LOG(ERR,
> +			       "Invalid value for nb_tx_desc(=%hu), "
> +			       "should be: <= %hu, >= %hu, and a product of "
> +			       " %hu\n",
> +			       nb_tx_desc, dev_info.tx_desc_lim.nb_max,
> +			       dev_info.tx_desc_lim.nb_min,
> +			       dev_info.tx_desc_lim.nb_align);
> +		return -EINVAL;
> +	}
> +
> +	if (dev->data->dev_started &&
> +		!(dev_info.dev_capa &
> +		  RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP))
> +		return -EBUSY;
> +
> +	if (dev->data->dev_started &&
> +		(dev->data->tx_queue_state[tx_queue_id] !=
> +		 RTE_ETH_QUEUE_STATE_STOPPED))
> +		return -EBUSY;
> +
> +	txq = dev->data->tx_queues;
> +	if (txq[tx_queue_id]) {
> +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_release,
> +					-ENOTSUP);
> +		(*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]);
> +		txq[tx_queue_id] = NULL;
> +	}
> +
> +	if (tx_conf == NULL)
> +		tx_conf = &dev_info.default_txconf;
> +
> +	local_conf = *tx_conf;
> +
> +	/*
> +	 * If an offloading has already been enabled in
> +	 * rte_eth_dev_configure(), it has been enabled on all queues,
> +	 * so there is no need to enable it in this queue again.
> +	 * The local_conf.offloads input to underlying PMD only carries
> +	 * those offloadings which are only enabled on this queue and
> +	 * not enabled on all queues.
> +	 */
> +	local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads;
> +
> +	/*
> +	 * New added offloadings for this queue are those not enabled in
> +	 * rte_eth_dev_configure() and they must be per-queue type.
> +	 * A pure per-port offloading can't be enabled on a queue while
> +	 * disabled on another queue. A pure per-port offloading can't
> +	 * be enabled for any queue as new added one if it hasn't been
> +	 * enabled in rte_eth_dev_configure().
> +	 */
> +	if ((local_conf.offloads & dev_info.tx_queue_offload_capa) !=
> +	     local_conf.offloads) {
> +		RTE_ETHDEV_LOG(ERR,
> +			       "Ethdev port_id=%d tx_queue_id=%d, new added "
> +			       "offloads 0x%"PRIx64" must be within "
> +			       "per-queue offload capabilities 0x%"PRIx64" "
> +			       "in %s()\n",
> +			       port_id, tx_queue_id, local_conf.offloads,
> +			       dev_info.tx_queue_offload_capa,
> +			       __func__);
> +		return -EINVAL;
> +	}
> +
> +	return eth_err(port_id, (*dev->dev_ops->tx_hairpin_queue_setup)
> +		       (dev, tx_queue_id, nb_tx_desc, socket_id, &local_conf,
> +			hairpin_conf));
> +}
> +
>   void
>   rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t unsent,
>   		void *userdata __rte_unused)
> diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
> index 475dbda..b3b1597 100644
> --- a/lib/librte_ethdev/rte_ethdev.h
> +++ b/lib/librte_ethdev/rte_ethdev.h
> @@ -803,6 +803,30 @@ struct rte_eth_txconf {
>   	uint64_t offloads;
>   };
>   
> +#define RTE_ETH_MAX_HAIRPIN_PEERS 32
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * A structure used to hold hairpin peer data.
> + */
> +struct rte_eth_hairpin_peer {
> +	uint16_t port; /**< Peer port. */
> +	uint16_t queue; /**< Peer queue. */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * A structure used to configure hairpin binding.
> + */
> +struct rte_eth_hairpin_conf {
> +	uint16_t peer_n; /**< The number of peers. */
> +	struct rte_eth_hairpin_peer peers[RTE_ETH_MAX_HAIRPIN_PEERS];
> +};
> +
>   /**
>    * A structure contains information about HW descriptor ring limitations.
>    */
> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
>   		struct rte_mempool *mb_pool);
>   
>   /**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Allocate and set up a hairpin receive queue for an Ethernet device.
> + *
> + * The function set up the selected queue to be used in hairpin.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + * @param rx_queue_id
> + *   The index of the receive queue to set up.
> + *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
> + *   to rte_eth_dev_configure().

May any Rx queue be set up as a hairpin queue?
Can it still be used for regular traffic?

> + * @param nb_rx_desc
> + *   The number of receive descriptors to allocate for the receive ring.

Does it still make sense for hairpin queue?

> + * @param socket_id
> + *   The *socket_id* argument is the socket identifier in case of NUMA.
> + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
> + *   the DMA memory allocated for the receive descriptors of the ring.

Is it still required to be provided for hairpin Rx queue?

> + * @param rx_conf
> + *   The pointer to the configuration data to be used for the receive queue.
> + *   NULL value is allowed, in which case default RX configuration
> + *   will be used.
> + *   The *rx_conf* structure contains an *rx_thresh* structure with the values
> + *   of the Prefetch, Host, and Write-Back threshold registers of the receive
> + *   ring.
> + *   In addition it contains the hardware offloads features to activate using
> + *   the DEV_RX_OFFLOAD_* flags.
> + *   If an offloading set in rx_conf->offloads
> + *   hasn't been set in the input argument eth_conf->rxmode.offloads
> + *   to rte_eth_dev_configure(), it is a new added offloading, it must be
> + *   per-queue type and it is enabled for the queue.
> + *   No need to repeat any bit in rx_conf->offloads which has already been
> + *   enabled in rte_eth_dev_configure() at port level. An offloading enabled
> + *   at port level can't be disabled at queue level.

Which offloads still make sense in the case of a hairpin Rx queue?
What about thresholds, drop enable?

> + * @param hairpin_conf
> + *   The pointer to the hairpin binding configuration.
> + * @return
> + *   - 0: Success, receive queue correctly set up.
> + *   - -EINVAL: The size of network buffers which can be allocated from the
> + *      memory pool does not fit the various buffer sizes allowed by the
> + *      device controller.
> + *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
> + *      allocate network memory buffers from the memory pool when
> + *      initializing receive descriptors.
> + */
> +__rte_experimental
> +int rte_eth_rx_hairpin_queue_setup
> +	(uint16_t port_id, uint16_t rx_queue_id,
> +	 uint16_t nb_rx_desc, unsigned int socket_id,
> +	 const struct rte_eth_rxconf *rx_conf,
> +	 const struct rte_eth_hairpin_conf *hairpin_conf);
> +
> +/**
>    * Allocate and set up a transmit queue for an Ethernet device.
>    *
>    * @param port_id
> @@ -1821,6 +1899,73 @@ int rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
>   		const struct rte_eth_txconf *tx_conf);
>   
>   /**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Allocate and set up a transmit hairpin queue for an Ethernet device.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + * @param tx_queue_id
> + *   The index of the transmit queue to set up.
> + *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
> + *   to rte_eth_dev_configure().

May any Tx queue be set up as a hairpin queue?

> + * @param nb_tx_desc
> + *   The number of transmit descriptors to allocate for the transmit ring.

Is it really required for hairpin queue? Are min/max/align limits still 
the same?

> + * @param socket_id
> + *   The *socket_id* argument is the socket identifier in case of NUMA.
> + *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
> + *   the DMA memory allocated for the transmit descriptors of the ring.

Does it still make sense for Tx hairpin queue?

> + * @param tx_conf
> + *   The pointer to the configuration data to be used for the transmit queue.
> + *   NULL value is allowed, in which case default RX configuration
> + *   will be used.
> + *   The *tx_conf* structure contains the following data:
> + *   - The *tx_thresh* structure with the values of the Prefetch, Host, and
> + *     Write-Back threshold registers of the transmit ring.
> + *     When setting Write-Back threshold to the value greater then zero,
> + *     *tx_rs_thresh* value should be explicitly set to one.
> + *   - The *tx_free_thresh* value indicates the [minimum] number of network
> + *     buffers that must be pending in the transmit ring to trigger their
> + *     [implicit] freeing by the driver transmit function.
> + *   - The *tx_rs_thresh* value indicates the [minimum] number of transmit
> + *     descriptors that must be pending in the transmit ring before setting the
> + *     RS bit on a descriptor by the driver transmit function.
> + *     The *tx_rs_thresh* value should be less or equal then
> + *     *tx_free_thresh* value, and both of them should be less then
> + *     *nb_tx_desc* - 3.

I'm not sure that everything above makes sense for hairpin Tx queue.

> + *   - The *txq_flags* member contains flags to pass to the TX queue setup
> + *     function to configure the behavior of the TX queue. This should be set
> + *     to 0 if no special configuration is required.
> + *     This API is obsolete and will be deprecated. Applications
> + *     should set it to ETH_TXQ_FLAGS_IGNORE and use
> + *     the offloads field below.

There has been no txq_flags for a long time already, so I'm wondering when
this was copied from rte_eth_tx_queue_setup().

> + *   - The *offloads* member contains Tx offloads to be enabled.
> + *     If an offloading set in tx_conf->offloads
> + *     hasn't been set in the input argument eth_conf->txmode.offloads
> + *     to rte_eth_dev_configure(), it is a new added offloading, it must be
> + *     per-queue type and it is enabled for the queue.
> + *     No need to repeat any bit in tx_conf->offloads which has already been
> + *     enabled in rte_eth_dev_configure() at port level. An offloading enabled
> + *     at port level can't be disabled at queue level.

Which offloads do really make sense and valid to use for hairpin Tx queues?
Do we need separate caps for hairpin offloads?

> + *
> + *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0 forces
> + *     the transmit function to use default values.
> + * @param hairpin_conf
> + *   The hairpin binding configuration.
> + *
> + * @return
> + *   - 0: Success, the transmit queue is correctly set up.
> + *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
> + */
> +__rte_experimental
> +int rte_eth_tx_hairpin_queue_setup
> +	(uint16_t port_id, uint16_t tx_queue_id,
> +	 uint16_t nb_tx_desc, unsigned int socket_id,
> +	 const struct rte_eth_txconf *tx_conf,
> +	 const struct rte_eth_hairpin_conf *hairpin_conf);
> +
> +/**
>    * Return the NUMA socket to which an Ethernet device is connected
>    *
>    * @param port_id
>

[snip]
  
Ori Kam Sept. 26, 2019, 3:58 p.m. UTC | #2
Hi Andrew,
Thanks for your comments PSB.
 
> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> On 9/26/19 9:28 AM, Ori Kam wrote:
> > This commit introduce the RX/TX hairpin setup function.
>
> > RX/TX should be Rx/Tx here and everywhere below.
> >
> > Hairpin is RX/TX queue that is used by the nic in order to offload
> > wire to wire traffic.
> >
> > Each hairpin queue is binded to one or more queues from other type.
> > For example TX hairpin queue should be binded to at least 1 RX hairpin
> > queue and vice versa.
>
> How should application find out that hairpin queues are supported?

It should be stated in the DPDK release notes when a manufacturer adds support for this.
In addition, if the application tries to set up a hairpin queue and it fails, then depending on
the error it can mean that hairpin is not supported.

> How many?

There is no limit to the number of hairpin queues; from the application's point of view, all queues can be hairpin queues.

> How should application find out which ports/queues could be used for
> pining?

All ports and queues can be supported. If the application requests an invalid combination
(for example, in the current Mellanox implementation, binding between two ports), the setup
function will fail.

If you would like, I can add a capability for this, but there are too many options. For example,
the number of queues and the binding limitations would all be very hard to declare.


> Is hair-pinning domain on device level sufficient to expose limitations?
>
I'm sorry but I don’t understand your question.

> > Signed-off-by: Ori Kam <orika@mellanox.com>
> > ---
> >   lib/librte_ethdev/rte_ethdev.c           | 213
>> +++++++++++++++++++++++++++++++
> >   lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
> >   lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
> >   lib/librte_ethdev/rte_ethdev_version.map |   4 +
> >   4 files changed, 380 insertions(+)
> >
> > diff --git a/lib/librte_ethdev/rte_ethdev.c
> > b/lib/librte_ethdev/rte_ethdev.c index 30b0c78..4021f38 100644
> > --- a/lib/librte_ethdev/rte_ethdev.c
> > +++ b/lib/librte_ethdev/rte_ethdev.c
> > @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
> >   }
> >
> >   int
> > +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t
> > rx_queue_id,
> > +			       uint16_t nb_rx_desc, unsigned int socket_id,
> > +			       const struct rte_eth_rxconf *rx_conf,
> > +			       const struct rte_eth_hairpin_conf *hairpin_conf)
>
> > Below code duplicates rte_eth_rx_queue_setup() a lot and it is very bad
> > from maintenance point of view. Similar problem with Tx hairpin queue
> > setup.
> >

I'm aware of that. The reasons I chose this approach are (the same goes for Tx):
1. Reusing the current setup function was considered; the issues with that are:
     * It breaks the API.
     * It would carry extra parameters: for example, the mempool will not be used
       for hairpin, and the hairpin configuration will not be used for a normal queue.
       It is possible to use a struct, but that is again an API break and some fields
       are not used (see the sketch below).
     * We are just starting with hairpin; most likely there will be modifications, so
       it is better to have a different function.
     * A separate function makes the application understand that this is a different
       kind of queue, which shouldn't be used directly by the application.
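
For illustration of the trade-off above, a hypothetical combined entry point (not part of this
patch; the name rte_eth_rx_queue_setup_ex is made up) would have to carry both the mempool and
the hairpin configuration, leaving one of them unused in every call:

/*
 * Hypothetical alternative, NOT proposed by this patch: a single Rx setup
 * function where either mb_pool (regular queue) or hairpin_conf (hairpin
 * queue) is passed as NULL.  Shown only to illustrate why a dedicated
 * rte_eth_rx_hairpin_queue_setup() was preferred.
 */
int rte_eth_rx_queue_setup_ex(uint16_t port_id, uint16_t rx_queue_id,
			      uint16_t nb_rx_desc, unsigned int socket_id,
			      const struct rte_eth_rxconf *rx_conf,
			      struct rte_mempool *mb_pool,
			      const struct rte_eth_hairpin_conf *hairpin_conf);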

> > +{
> > +	int ret;
> > +	struct rte_eth_dev *dev;
> > +	struct rte_eth_dev_info dev_info;
> > +	struct rte_eth_rxconf local_conf;
> > +	void **rxq;
> > +
> > +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > +
> > +	dev = &rte_eth_devices[port_id];
> > +	if (rx_queue_id >= dev->data->nb_rx_queues) {
> > +		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n",
> rx_queue_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -
> ENOTSUP);
> > +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >rx_hairpin_queue_setup,
> > +				-ENOTSUP);
> > +
> > +	rte_eth_dev_info_get(port_id, &dev_info);
> > +
> > +	/* Use default specified by driver, if nb_rx_desc is zero */
> > +	if (nb_rx_desc == 0) {
> > +		nb_rx_desc = dev_info.default_rxportconf.ring_size;
> > +		/* If driver default is also zero, fall back on EAL default */
> > +		if (nb_rx_desc == 0)
> > +			nb_rx_desc =
> RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
> > +	}
> > +
> > +	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
> > +			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
> > +			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
> > +
> > +		RTE_ETHDEV_LOG(ERR,
> > +			       "Invalid value for nb_rx_desc(=%hu), should be: "
> > +			       "<= %hu, >= %hu, and a product of %hu\n",
> > +			nb_rx_desc, dev_info.rx_desc_lim.nb_max,
> > +			dev_info.rx_desc_lim.nb_min,
> > +			dev_info.rx_desc_lim.nb_align);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (dev->data->dev_started &&
> > +		!(dev_info.dev_capa &
> > +
> 	RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
> > +		return -EBUSY;
> > +
> > +	if (dev->data->dev_started &&
> > +		(dev->data->rx_queue_state[rx_queue_id] !=
> > +			RTE_ETH_QUEUE_STATE_STOPPED))
> > +		return -EBUSY;
> > +
> > +	rxq = dev->data->rx_queues;
> > +	if (rxq[rx_queue_id]) {
> > +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >rx_queue_release,
> > +					-ENOTSUP);
> > +		(*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]);
> > +		rxq[rx_queue_id] = NULL;
> > +	}
> > +
> > +	if (rx_conf == NULL)
> > +		rx_conf = &dev_info.default_rxconf;
> > +
> > +	local_conf = *rx_conf;
> > +
> > +	/*
> > +	 * If an offloading has already been enabled in
> > +	 * rte_eth_dev_configure(), it has been enabled on all queues,
> > +	 * so there is no need to enable it in this queue again.
> > +	 * The local_conf.offloads input to underlying PMD only carries
> > +	 * those offloadings which are only enabled on this queue and
> > +	 * not enabled on all queues.
> > +	 */
> > +	local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads;
> > +
> > +	/*
> > +	 * New added offloadings for this queue are those not enabled in
> > +	 * rte_eth_dev_configure() and they must be per-queue type.
> > +	 * A pure per-port offloading can't be enabled on a queue while
> > +	 * disabled on another queue. A pure per-port offloading can't
> > +	 * be enabled for any queue as new added one if it hasn't been
> > +	 * enabled in rte_eth_dev_configure().
> > +	 */
> > +	if ((local_conf.offloads & dev_info.rx_queue_offload_capa) !=
> > +	     local_conf.offloads) {
> > +		RTE_ETHDEV_LOG(ERR,
> > +			"Ethdev port_id=%d rx_queue_id=%d, "
> > +			"new added offloads 0x%"PRIx64" must be "
> > +			"within per-queue offload capabilities "
> > +			"0x%"PRIx64" in %s()\n",
> > +			port_id, rx_queue_id, local_conf.offloads,
> > +			dev_info.rx_queue_offload_capa,
> > +			__func__);
> > +		return -EINVAL;
> > +	}
> > +
> > +	ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev,
> rx_queue_id,
> > +						      nb_rx_desc, socket_id,
> > +						      &local_conf,
> > +						      hairpin_conf);
> > +
> > +	return eth_err(port_id, ret);
> > +}
> > +
> > +int
> >   rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
> >   		       uint16_t nb_tx_desc, unsigned int socket_id,
> >   		       const struct rte_eth_txconf *tx_conf) @@ -1799,6
> +1908,110
> > @@ struct rte_eth_dev *
> >   		       tx_queue_id, nb_tx_desc, socket_id, &local_conf));
> >   }
> >
> > +int
> > +rte_eth_tx_hairpin_queue_setup(uint16_t port_id, uint16_t
> tx_queue_id,
> > +			       uint16_t nb_tx_desc, unsigned int socket_id,
> > +			       const struct rte_eth_txconf *tx_conf,
> > +			       const struct rte_eth_hairpin_conf *hairpin_conf)
> {
> > +	struct rte_eth_dev *dev;
> > +	struct rte_eth_dev_info dev_info;
> > +	struct rte_eth_txconf local_conf;
> > +	void **txq;
> > +
> > +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> > +
> > +	dev = &rte_eth_devices[port_id];
> > +	if (tx_queue_id >= dev->data->nb_tx_queues) {
> > +		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n",
> tx_queue_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -
> ENOTSUP);
> > +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >tx_hairpin_queue_setup,
> > +				-ENOTSUP);
> > +
> > +	rte_eth_dev_info_get(port_id, &dev_info);
> > +
> > +	/* Use default specified by driver, if nb_tx_desc is zero */
> > +	if (nb_tx_desc == 0) {
> > +		nb_tx_desc = dev_info.default_txportconf.ring_size;
> > +		/* If driver default is zero, fall back on EAL default */
> > +		if (nb_tx_desc == 0)
> > +			nb_tx_desc =
> RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
> > +	}
> > +	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
> > +	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
> > +	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
> > +		RTE_ETHDEV_LOG(ERR,
> > +			       "Invalid value for nb_tx_desc(=%hu), "
> > +			       "should be: <= %hu, >= %hu, and a product of "
> > +			       " %hu\n",
> > +			       nb_tx_desc, dev_info.tx_desc_lim.nb_max,
> > +			       dev_info.tx_desc_lim.nb_min,
> > +			       dev_info.tx_desc_lim.nb_align);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (dev->data->dev_started &&
> > +		!(dev_info.dev_capa &
> > +		  RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP))
> > +		return -EBUSY;
> > +
> > +	if (dev->data->dev_started &&
> > +		(dev->data->tx_queue_state[tx_queue_id] !=
> > +		 RTE_ETH_QUEUE_STATE_STOPPED))
> > +		return -EBUSY;
> > +
> > +	txq = dev->data->tx_queues;
> > +	if (txq[tx_queue_id]) {
> > +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >tx_queue_release,
> > +					-ENOTSUP);
> > +		(*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]);
> > +		txq[tx_queue_id] = NULL;
> > +	}
> > +
> > +	if (tx_conf == NULL)
> > +		tx_conf = &dev_info.default_txconf;
> > +
> > +	local_conf = *tx_conf;
> > +
> > +	/*
> > +	 * If an offloading has already been enabled in
> > +	 * rte_eth_dev_configure(), it has been enabled on all queues,
> > +	 * so there is no need to enable it in this queue again.
> > +	 * The local_conf.offloads input to underlying PMD only carries
> > +	 * those offloadings which are only enabled on this queue and
> > +	 * not enabled on all queues.
> > +	 */
> > +	local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads;
> > +
> > +	/*
> > +	 * New added offloadings for this queue are those not enabled in
> > +	 * rte_eth_dev_configure() and they must be per-queue type.
> > +	 * A pure per-port offloading can't be enabled on a queue while
> > +	 * disabled on another queue. A pure per-port offloading can't
> > +	 * be enabled for any queue as new added one if it hasn't been
> > +	 * enabled in rte_eth_dev_configure().
> > +	 */
> > +	if ((local_conf.offloads & dev_info.tx_queue_offload_capa) !=
> > +	     local_conf.offloads) {
> > +		RTE_ETHDEV_LOG(ERR,
> > +			       "Ethdev port_id=%d tx_queue_id=%d, new
> added "
> > +			       "offloads 0x%"PRIx64" must be within "
> > +			       "per-queue offload capabilities 0x%"PRIx64" "
> > +			       "in %s()\n",
> > +			       port_id, tx_queue_id, local_conf.offloads,
> > +			       dev_info.tx_queue_offload_capa,
> > +			       __func__);
> > +		return -EINVAL;
> > +	}
> > +
> > +	return eth_err(port_id, (*dev->dev_ops->tx_hairpin_queue_setup)
> > +		       (dev, tx_queue_id, nb_tx_desc, socket_id, &local_conf,
> > +			hairpin_conf));
> > +}
> > +
> >   void
> >   rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t
> unsent,
> >   		void *userdata __rte_unused)
> > diff --git a/lib/librte_ethdev/rte_ethdev.h
> > b/lib/librte_ethdev/rte_ethdev.h index 475dbda..b3b1597 100644
> > --- a/lib/librte_ethdev/rte_ethdev.h
> > +++ b/lib/librte_ethdev/rte_ethdev.h
> > @@ -803,6 +803,30 @@ struct rte_eth_txconf {
> >   	uint64_t offloads;
> >   };
> >
> > +#define RTE_ETH_MAX_HAIRPIN_PEERS 32
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> > +notice
> > + *
> > + * A structure used to hold hairpin peer data.
> > + */
> > +struct rte_eth_hairpin_peer {
> > +	uint16_t port; /**< Peer port. */
> > +	uint16_t queue; /**< Peer queue. */
> > +};
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> > +notice
> > + *
> > + * A structure used to configure hairpin binding.
> > + */
> > +struct rte_eth_hairpin_conf {
> > +	uint16_t peer_n; /**< The number of peers. */
> > +	struct rte_eth_hairpin_peer peers[RTE_ETH_MAX_HAIRPIN_PEERS];
> };
> > +
> >   /**
> >    * A structure contains information about HW descriptor ring limitations.
> >    */
> > @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id,
> uint16_t rx_queue_id,
> >   		struct rte_mempool *mb_pool);
> >
> >   /**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> > + notice
> > + *
> > + * Allocate and set up a hairpin receive queue for an Ethernet device.
> > + *
> > + * The function set up the selected queue to be used in hairpin.
> > + *
> > + * @param port_id
> > + *   The port identifier of the Ethernet device.
> > + * @param rx_queue_id
> > + *   The index of the receive queue to set up.
> > + *   The value must be in the range [0, nb_rx_queue - 1] previously
> supplied
> > + *   to rte_eth_dev_configure().
>
> Is any Rx queue may be setup as hairpin queue?
> Can it be still used for regular traffic?
>

No, if a queue is used as a hairpin queue it can't be used for normal traffic.
This is also why I like the idea of two different functions: it creates
this distinction.

> > + * @param nb_rx_desc
> > + *   The number of receive descriptors to allocate for the receive ring.
>
> Does it still make sense for hairpin queue?
>

Yes, since it can affect memory size used by the device, and can affect performance.

> > + * @param socket_id
> > + *   The *socket_id* argument is the socket identifier in case of NUMA.
> > + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint
> for
> > + *   the DMA memory allocated for the receive descriptors of the ring.
>
> Is it still required to be provided for hairpin Rx queue?
>

Yes, it is needed to allocate internal PMD structures, but we can remove it if pressed.

> > + * @param rx_conf
> > + *   The pointer to the configuration data to be used for the receive
> queue.
> > + *   NULL value is allowed, in which case default RX configuration
> > + *   will be used.
> > + *   The *rx_conf* structure contains an *rx_thresh* structure with the
> values
> > + *   of the Prefetch, Host, and Write-Back threshold registers of the
> receive
> > + *   ring.
> > + *   In addition it contains the hardware offloads features to activate using
> > + *   the DEV_RX_OFFLOAD_* flags.
> > + *   If an offloading set in rx_conf->offloads
> > + *   hasn't been set in the input argument eth_conf->rxmode.offloads
> > + *   to rte_eth_dev_configure(), it is a new added offloading, it must be
> > + *   per-queue type and it is enabled for the queue.
> > + *   No need to repeat any bit in rx_conf->offloads which has already been
> > + *   enabled in rte_eth_dev_configure() at port level. An offloading
> enabled
> > + *   at port level can't be disabled at queue level.
>
> Which offloads still make sense in the case of hairpin Rx queue?
> What about threshhods, drop enable?
>

Drop and thresholds make sense; for example, the application can state that,
in case of back pressure, packets should start being dropped in order not to
affect the entire NIC.
Regarding offloads, mainly VLAN strip or VLAN insert, but those can also
be expressed with rte_flow.
Future offloads like QoS or others may be shared, though.

> > + * @param hairpin_conf
> > + *   The pointer to the hairpin binding configuration.
> > + * @return
> > + *   - 0: Success, receive queue correctly set up.
> > + *   - -EINVAL: The size of network buffers which can be allocated from the
> > + *      memory pool does not fit the various buffer sizes allowed by the
> > + *      device controller.
> > + *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
> > + *      allocate network memory buffers from the memory pool when
> > + *      initializing receive descriptors.
> > + */
> > +__rte_experimental
> > +int rte_eth_rx_hairpin_queue_setup
> > +	(uint16_t port_id, uint16_t rx_queue_id,
> > +	 uint16_t nb_rx_desc, unsigned int socket_id,
> > +	 const struct rte_eth_rxconf *rx_conf,
> > +	 const struct rte_eth_hairpin_conf *hairpin_conf);
> > +
> > +/**
> >    * Allocate and set up a transmit queue for an Ethernet device.
> >    *
> >    * @param port_id
> > @@ -1821,6 +1899,73 @@ int rte_eth_tx_queue_setup(uint16_t port_id,
> uint16_t tx_queue_id,
> >   		const struct rte_eth_txconf *tx_conf);
> >
> >   /**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> > + notice
> > + *
> > + * Allocate and set up a transmit hairpin queue for an Ethernet device.
> > + *
> > + * @param port_id
> > + *   The port identifier of the Ethernet device.
> > + * @param tx_queue_id
> > + *   The index of the transmit queue to set up.
> > + *   The value must be in the range [0, nb_tx_queue - 1] previously
> supplied
> > + *   to rte_eth_dev_configure().
>
> Is any Tx queue may be setup as hairpin queue?
>

Yes just like any Rx queue.

> > + * @param nb_tx_desc
> > + *   The number of transmit descriptors to allocate for the transmit ring.
>
> Is it really required for hairpin queue? Are min/max/align limits still the
> same?
>
The number of descriptors can affect memory and performance.
Regarding min/max/align, I guess this depends on the implementation in the NIC.

> > + * @param socket_id
> > + *   The *socket_id* argument is the socket identifier in case of NUMA.
> > + *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
> > + *   the DMA memory allocated for the transmit descriptors of the ring.
>
> Does it still make sense for Tx hairpin queue?
>

Same as for Rx: it is used for internal PMD structures, but maybe other
NICs can make use of it.

> > + * @param tx_conf
> > + *   The pointer to the configuration data to be used for the transmit
> queue.
> > + *   NULL value is allowed, in which case default RX configuration
> > + *   will be used.
> > + *   The *tx_conf* structure contains the following data:
> > + *   - The *tx_thresh* structure with the values of the Prefetch, Host, and
> > + *     Write-Back threshold registers of the transmit ring.
> > + *     When setting Write-Back threshold to the value greater then zero,
> > + *     *tx_rs_thresh* value should be explicitly set to one.
> > + *   - The *tx_free_thresh* value indicates the [minimum] number of
> network
> > + *     buffers that must be pending in the transmit ring to trigger their
> > + *     [implicit] freeing by the driver transmit function.
> > + *   - The *tx_rs_thresh* value indicates the [minimum] number of
> transmit
> > + *     descriptors that must be pending in the transmit ring before setting
> the
> > + *     RS bit on a descriptor by the driver transmit function.
> > + *     The *tx_rs_thresh* value should be less or equal then
> > + *     *tx_free_thresh* value, and both of them should be less then
> > + *     *nb_tx_desc* - 3.
>
> I'm not sure that everything above makes sense for hairpin Tx queue.
>

You are right, not all of them make sense, but since I don't know other NICs
I prefer to pass those values along in case they are needed.
If you wish, I can change the documentation.

> > + *   - The *txq_flags* member contains flags to pass to the TX queue
> setup
> > + *     function to configure the behavior of the TX queue. This should be
> set
> > + *     to 0 if no special configuration is required.
> > + *     This API is obsolete and will be deprecated. Applications
> > + *     should set it to ETH_TXQ_FLAGS_IGNORE and use
> > + *     the offloads field below.
>
> There is no txq_flags for a long time already. So, I'm wondering when it was
> copies from rte_eth_tx_queue_setup().
>
My bad, this was copied from 17.11; will fix.

> > + *   - The *offloads* member contains Tx offloads to be enabled.
> > + *     If an offloading set in tx_conf->offloads
> > + *     hasn't been set in the input argument eth_conf->txmode.offloads
> > + *     to rte_eth_dev_configure(), it is a new added offloading, it must be
> > + *     per-queue type and it is enabled for the queue.
> > + *     No need to repeat any bit in tx_conf->offloads which has already
> been
> > + *     enabled in rte_eth_dev_configure() at port level. An offloading
> enabled
> > + *     at port level can't be disabled at queue level.
>
> Which offloads do really make sense and valid to use for hairpin Tx queues?
> Do we need separate caps for hairpin offloads?
>
I'm sure that we will need caps, for example for QoS, but I don't know which ones yet.


> > + *
> > + *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0
> forces
> > + *     the transmit function to use default values.
> > + * @param hairpin_conf
> > + *   The hairpin binding configuration.
> > + *
> > + * @return
> > + *   - 0: Success, the transmit queue is correctly set up.
> > + *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
> > + */
> > +__rte_experimental
> > +int rte_eth_tx_hairpin_queue_setup
> > +	(uint16_t port_id, uint16_t tx_queue_id,
> > +	 uint16_t nb_tx_desc, unsigned int socket_id,
> > +	 const struct rte_eth_txconf *tx_conf,
> > +	 const struct rte_eth_hairpin_conf *hairpin_conf);
> > +
> > +/**
> >    * Return the NUMA socket to which an Ethernet device is connected
> >    *
> >    * @param port_id
> >
>
> [snip]
  
Andrew Rybchenko Sept. 26, 2019, 5:24 p.m. UTC | #3
On 9/26/19 6:58 PM, Ori Kam wrote:
> Hi Andrew,
> Thanks for your comments PSB.
>   
>> -----Original Message-----
>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>> On 9/26/19 9:28 AM, Ori Kam wrote:
>>> This commit introduce the RX/TX hairpin setup function.
>>> RX/TX should be Rx/Tx here and everywhere below.
>>>
>>> Hairpin is RX/TX queue that is used by the nic in order to offload
>>> wire to wire traffic.
>>>
>>> Each hairpin queue is binded to one or more queues from other type.
>>> For example TX hairpin queue should be binded to at least 1 RX hairpin
>>> queue and vice versa.
>> How should application find out that hairpin queues are supported?
> It should be stated in the release note of the DPDK, when manufacture adds support for this.
> In addition if the application try to set hairpin queue and it fails it can mean depending on the
> error that the hairpin is not supported.

I'm talking about dev_info-like information. Documentation is nice, but it
is not very useful for implementing an application which works with NICs
from different vendors.

>> How many?
> There is no limit to the number of hairpin queues from application all queues can be hairpin queues.

I'm pretty sure that it could be vendor specific.

>> How should application find out which ports/queues could be used for
>> pining?
> All ports and queues can be supported, if the application request invalid combination, for example
> in current Mellanox implementation binding between two ports then the setup function will  fail.
>
> If you would like I can add capability for this, but there are too many options. For example number
> of queues, binding limitations all of those will be very hard to declare.
>
>
>> Is hair-pinning domain on device level sufficient to expose limitations?
>>
> I'm sorry but I don’t understand your question.

I was just trying to imagine how we could say that we can hairpin one
port's Rx queues to another port's Tx queues.
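
As an illustration only of the dev_info-like reporting being discussed (this structure and
function are hypothetical and not part of this patch), a driver could expose hairpin limits
along these lines:

/*
 * Hypothetical capability structure, not part of this patch; sketched only
 * to show what "dev_info-like information" for hairpin could look like.
 */
struct rte_eth_hairpin_cap {
	uint16_t max_nb_queues; /**< Max number of hairpin queues per port. */
	uint16_t max_rx_2_tx;   /**< Max number of Tx peers of one Rx queue. */
	uint16_t max_tx_2_rx;   /**< Max number of Rx peers of one Tx queue. */
	uint16_t max_nb_desc;   /**< Max number of descriptors per queue. */
};

__rte_experimental
int rte_eth_dev_hairpin_capability_get(uint16_t port_id,
				       struct rte_eth_hairpin_cap *cap);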

>>> Signed-off-by: Ori Kam <orika@mellanox.com>
>>> ---
>>>    lib/librte_ethdev/rte_ethdev.c           | 213
>>> +++++++++++++++++++++++++++++++
>>>    lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
>>>    lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
>>>    lib/librte_ethdev/rte_ethdev_version.map |   4 +
>>>    4 files changed, 380 insertions(+)
>>>
>>> diff --git a/lib/librte_ethdev/rte_ethdev.c
>>> b/lib/librte_ethdev/rte_ethdev.c index 30b0c78..4021f38 100644
>>> --- a/lib/librte_ethdev/rte_ethdev.c
>>> +++ b/lib/librte_ethdev/rte_ethdev.c
>>> @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
>>>    }
>>>
>>>    int
>>> +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t
>>> rx_queue_id,
>>> +			       uint16_t nb_rx_desc, unsigned int socket_id,
>>> +			       const struct rte_eth_rxconf *rx_conf,
>>> +			       const struct rte_eth_hairpin_conf *hairpin_conf)
>>> Below code duplicates rte_eth_rx_queue_setup() a lot and it is very bad
>>> from maintenance point of view. Similar problem with Tx hairpin queue
>>> setup.
>>>
> I'm aware of that. The reasons I choose it are: (same goes to Tx)
> 1. use the same function approach, meaning to use the current  setup function
>      the issues with this are:
>       * API break.
>       * It will have extra parameters, for example mempool will not be used
>          for hairpin and hairpin configuration will not be used for normal queue.
>          It is possible to use a struct but again API break and some fields are not used.
>       * we are just starting with the hairpin, most likely there will be modification so
>           it is better to have a different function.
>       * From application he undertand that this is a different kind of queue, which shouldn't be
>           used by the application.

It does not excuse duplicating so much code below. If we had separate
dev_info-like limitations for hairpin, it would make sense, but I hope that
it would still be possible to avoid code duplication.
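
One possible shape for that, sketched here only as an illustration (the helper name and the
exact split are made up, not something this patch or ethdev currently provides), would be a
common internal helper used by both the regular and the hairpin Rx setup paths:

/*
 * Hypothetical internal helper shared by rte_eth_rx_queue_setup() and
 * rte_eth_rx_hairpin_queue_setup().  It would hold the checks both paths
 * currently duplicate: dev_info retrieval, descriptor-count limits, the
 * started-state checks, releasing a previously set up queue and the
 * per-queue offload consistency check, filling *local_conf on success.
 */
static int
eth_rx_queue_setup_common(struct rte_eth_dev *dev, uint16_t port_id,
			  uint16_t rx_queue_id, uint16_t *nb_rx_desc,
			  const struct rte_eth_rxconf *rx_conf,
			  struct rte_eth_rxconf *local_conf,
			  struct rte_eth_dev_info *dev_info);

int
rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
			       uint16_t nb_rx_desc, unsigned int socket_id,
			       const struct rte_eth_rxconf *rx_conf,
			       const struct rte_eth_hairpin_conf *hairpin_conf)
{
	struct rte_eth_dev *dev;
	struct rte_eth_dev_info dev_info;
	struct rte_eth_rxconf local_conf;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
	dev = &rte_eth_devices[port_id];
	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_hairpin_queue_setup,
				-ENOTSUP);

	ret = eth_rx_queue_setup_common(dev, port_id, rx_queue_id, &nb_rx_desc,
					rx_conf, &local_conf, &dev_info);
	if (ret != 0)
		return ret;

	ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev, rx_queue_id,
						      nb_rx_desc, socket_id,
						      &local_conf,
						      hairpin_conf);
	return eth_err(port_id, ret);
}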

>>> +{
>>> +	int ret;
>>> +	struct rte_eth_dev *dev;
>>> +	struct rte_eth_dev_info dev_info;
>>> +	struct rte_eth_rxconf local_conf;
>>> +	void **rxq;
>>> +
>>> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>>> +
>>> +	dev = &rte_eth_devices[port_id];
>>> +	if (rx_queue_id >= dev->data->nb_rx_queues) {
>>> +		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n",
>> rx_queue_id);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -
>> ENOTSUP);
>>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
>>> rx_hairpin_queue_setup,
>>> +				-ENOTSUP);
>>> +
>>> +	rte_eth_dev_info_get(port_id, &dev_info);
>>> +
>>> +	/* Use default specified by driver, if nb_rx_desc is zero */
>>> +	if (nb_rx_desc == 0) {
>>> +		nb_rx_desc = dev_info.default_rxportconf.ring_size;
>>> +		/* If driver default is also zero, fall back on EAL default */
>>> +		if (nb_rx_desc == 0)
>>> +			nb_rx_desc =
>> RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
>>> +	}
>>> +
>>> +	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
>>> +			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
>>> +			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
>>> +
>>> +		RTE_ETHDEV_LOG(ERR,
>>> +			       "Invalid value for nb_rx_desc(=%hu), should be: "
>>> +			       "<= %hu, >= %hu, and a product of %hu\n",
>>> +			nb_rx_desc, dev_info.rx_desc_lim.nb_max,
>>> +			dev_info.rx_desc_lim.nb_min,
>>> +			dev_info.rx_desc_lim.nb_align);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (dev->data->dev_started &&
>>> +		!(dev_info.dev_capa &
>>> +
>> 	RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
>>> +		return -EBUSY;
>>> +
>>> +	if (dev->data->dev_started &&
>>> +		(dev->data->rx_queue_state[rx_queue_id] !=
>>> +			RTE_ETH_QUEUE_STATE_STOPPED))
>>> +		return -EBUSY;
>>> +
>>> +	rxq = dev->data->rx_queues;
>>> +	if (rxq[rx_queue_id]) {
>>> +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
>>> rx_queue_release,
>>> +					-ENOTSUP);
>>> +		(*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]);
>>> +		rxq[rx_queue_id] = NULL;
>>> +	}
>>> +
>>> +	if (rx_conf == NULL)
>>> +		rx_conf = &dev_info.default_rxconf;
>>> +
>>> +	local_conf = *rx_conf;
>>> +
>>> +	/*
>>> +	 * If an offloading has already been enabled in
>>> +	 * rte_eth_dev_configure(), it has been enabled on all queues,
>>> +	 * so there is no need to enable it in this queue again.
>>> +	 * The local_conf.offloads input to underlying PMD only carries
>>> +	 * those offloadings which are only enabled on this queue and
>>> +	 * not enabled on all queues.
>>> +	 */
>>> +	local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads;
>>> +
>>> +	/*
>>> +	 * New added offloadings for this queue are those not enabled in
>>> +	 * rte_eth_dev_configure() and they must be per-queue type.
>>> +	 * A pure per-port offloading can't be enabled on a queue while
>>> +	 * disabled on another queue. A pure per-port offloading can't
>>> +	 * be enabled for any queue as new added one if it hasn't been
>>> +	 * enabled in rte_eth_dev_configure().
>>> +	 */
>>> +	if ((local_conf.offloads & dev_info.rx_queue_offload_capa) !=
>>> +	     local_conf.offloads) {
>>> +		RTE_ETHDEV_LOG(ERR,
>>> +			"Ethdev port_id=%d rx_queue_id=%d, "
>>> +			"new added offloads 0x%"PRIx64" must be "
>>> +			"within per-queue offload capabilities "
>>> +			"0x%"PRIx64" in %s()\n",
>>> +			port_id, rx_queue_id, local_conf.offloads,
>>> +			dev_info.rx_queue_offload_capa,
>>> +			__func__);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev,
>> rx_queue_id,
>>> +						      nb_rx_desc, socket_id,
>>> +						      &local_conf,
>>> +						      hairpin_conf);
>>> +
>>> +	return eth_err(port_id, ret);
>>> +}
>>> +
>>> +int
>>>    rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
>>>    		       uint16_t nb_tx_desc, unsigned int socket_id,
>>>    		       const struct rte_eth_txconf *tx_conf) @@ -1799,6
>> +1908,110
>>> @@ struct rte_eth_dev *
>>>    		       tx_queue_id, nb_tx_desc, socket_id, &local_conf));
>>>    }
>>>
>>> +int
>>> +rte_eth_tx_hairpin_queue_setup(uint16_t port_id, uint16_t
>> tx_queue_id,
>>> +			       uint16_t nb_tx_desc, unsigned int socket_id,
>>> +			       const struct rte_eth_txconf *tx_conf,
>>> +			       const struct rte_eth_hairpin_conf *hairpin_conf)
>> {
>>> +	struct rte_eth_dev *dev;
>>> +	struct rte_eth_dev_info dev_info;
>>> +	struct rte_eth_txconf local_conf;
>>> +	void **txq;
>>> +
>>> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>>> +
>>> +	dev = &rte_eth_devices[port_id];
>>> +	if (tx_queue_id >= dev->data->nb_tx_queues) {
>>> +		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n",
>> tx_queue_id);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -
>> ENOTSUP);
>>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
>>> tx_hairpin_queue_setup,
>>> +				-ENOTSUP);
>>> +
>>> +	rte_eth_dev_info_get(port_id, &dev_info);
>>> +
>>> +	/* Use default specified by driver, if nb_tx_desc is zero */
>>> +	if (nb_tx_desc == 0) {
>>> +		nb_tx_desc = dev_info.default_txportconf.ring_size;
>>> +		/* If driver default is zero, fall back on EAL default */
>>> +		if (nb_tx_desc == 0)
>>> +			nb_tx_desc =
>> RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
>>> +	}
>>> +	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
>>> +	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
>>> +	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
>>> +		RTE_ETHDEV_LOG(ERR,
>>> +			       "Invalid value for nb_tx_desc(=%hu), "
>>> +			       "should be: <= %hu, >= %hu, and a product of "
>>> +			       " %hu\n",
>>> +			       nb_tx_desc, dev_info.tx_desc_lim.nb_max,
>>> +			       dev_info.tx_desc_lim.nb_min,
>>> +			       dev_info.tx_desc_lim.nb_align);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (dev->data->dev_started &&
>>> +		!(dev_info.dev_capa &
>>> +		  RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP))
>>> +		return -EBUSY;
>>> +
>>> +	if (dev->data->dev_started &&
>>> +		(dev->data->tx_queue_state[tx_queue_id] !=
>>> +		 RTE_ETH_QUEUE_STATE_STOPPED))
>>> +		return -EBUSY;
>>> +
>>> +	txq = dev->data->tx_queues;
>>> +	if (txq[tx_queue_id]) {
>>> +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
>>> tx_queue_release,
>>> +					-ENOTSUP);
>>> +		(*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]);
>>> +		txq[tx_queue_id] = NULL;
>>> +	}
>>> +
>>> +	if (tx_conf == NULL)
>>> +		tx_conf = &dev_info.default_txconf;
>>> +
>>> +	local_conf = *tx_conf;
>>> +
>>> +	/*
>>> +	 * If an offloading has already been enabled in
>>> +	 * rte_eth_dev_configure(), it has been enabled on all queues,
>>> +	 * so there is no need to enable it in this queue again.
>>> +	 * The local_conf.offloads input to underlying PMD only carries
>>> +	 * those offloadings which are only enabled on this queue and
>>> +	 * not enabled on all queues.
>>> +	 */
>>> +	local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads;
>>> +
>>> +	/*
>>> +	 * New added offloadings for this queue are those not enabled in
>>> +	 * rte_eth_dev_configure() and they must be per-queue type.
>>> +	 * A pure per-port offloading can't be enabled on a queue while
>>> +	 * disabled on another queue. A pure per-port offloading can't
>>> +	 * be enabled for any queue as new added one if it hasn't been
>>> +	 * enabled in rte_eth_dev_configure().
>>> +	 */
>>> +	if ((local_conf.offloads & dev_info.tx_queue_offload_capa) !=
>>> +	     local_conf.offloads) {
>>> +		RTE_ETHDEV_LOG(ERR,
>>> +			       "Ethdev port_id=%d tx_queue_id=%d, new
>> added "
>>> +			       "offloads 0x%"PRIx64" must be within "
>>> +			       "per-queue offload capabilities 0x%"PRIx64" "
>>> +			       "in %s()\n",
>>> +			       port_id, tx_queue_id, local_conf.offloads,
>>> +			       dev_info.tx_queue_offload_capa,
>>> +			       __func__);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	return eth_err(port_id, (*dev->dev_ops->tx_hairpin_queue_setup)
>>> +		       (dev, tx_queue_id, nb_tx_desc, socket_id, &local_conf,
>>> +			hairpin_conf));
>>> +}
>>> +
>>>    void
>>>    rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t
>> unsent,
>>>    		void *userdata __rte_unused)
>>> diff --git a/lib/librte_ethdev/rte_ethdev.h
>>> b/lib/librte_ethdev/rte_ethdev.h index 475dbda..b3b1597 100644
>>> --- a/lib/librte_ethdev/rte_ethdev.h
>>> +++ b/lib/librte_ethdev/rte_ethdev.h
>>> @@ -803,6 +803,30 @@ struct rte_eth_txconf {
>>>    	uint64_t offloads;
>>>    };
>>>
>>> +#define RTE_ETH_MAX_HAIRPIN_PEERS 32
>>> +
>>> +/**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>>> +notice
>>> + *
>>> + * A structure used to hold hairpin peer data.
>>> + */
>>> +struct rte_eth_hairpin_peer {
>>> +	uint16_t port; /**< Peer port. */
>>> +	uint16_t queue; /**< Peer queue. */
>>> +};
>>> +
>>> +/**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>>> +notice
>>> + *
>>> + * A structure used to configure hairpin binding.
>>> + */
>>> +struct rte_eth_hairpin_conf {
>>> +	uint16_t peer_n; /**< The number of peers. */
>>> +	struct rte_eth_hairpin_peer peers[RTE_ETH_MAX_HAIRPIN_PEERS];
>> };
>>> +
>>>    /**
>>>     * A structure contains information about HW descriptor ring limitations.
>>>     */
>>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id,
>> uint16_t rx_queue_id,
>>>    		struct rte_mempool *mb_pool);
>>>
>>>    /**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>>> + notice
>>> + *
>>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
>>> + *
>>> + * The function set up the selected queue to be used in hairpin.
>>> + *
>>> + * @param port_id
>>> + *   The port identifier of the Ethernet device.
>>> + * @param rx_queue_id
>>> + *   The index of the receive queue to set up.
>>> + *   The value must be in the range [0, nb_rx_queue - 1] previously
>> supplied
>>> + *   to rte_eth_dev_configure().
>> Is any Rx queue may be setup as hairpin queue?
>> Can it be still used for regular traffic?
>>
> No if a queue is used as hairpin it can't be used for normal traffic.
> This is also why I like the idea of two different functions, in order to create
> This distinction.

If so, do we need at least debug-level checks in Tx/Rx burst functions?
Is it required to patch the rte_flow RSS action to ensure that Rx queues of
only one kind are specified?
What about attempts to add Rx/Tx callbacks for hairpin queues?

>>> + * @param nb_rx_desc
>>> + *   The number of receive descriptors to allocate for the receive ring.
>> Does it still make sense for hairpin queue?
>>
> Yes, since it can affect memory size used by the device, and can affect performance.
>
>>> + * @param socket_id
>>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
>>> + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint
>> for
>>> + *   the DMA memory allocated for the receive descriptors of the ring.
>> Is it still required to be provided for hairpin Rx queue?
>>
> Yes, for internal PMD structures to be allocated, but we can if pressed remove it.
>
>>> + * @param rx_conf
>>> + *   The pointer to the configuration data to be used for the receive
>> queue.
>>> + *   NULL value is allowed, in which case default RX configuration
>>> + *   will be used.
>>> + *   The *rx_conf* structure contains an *rx_thresh* structure with the
>> values
>>> + *   of the Prefetch, Host, and Write-Back threshold registers of the
>> receive
>>> + *   ring.
>>> + *   In addition it contains the hardware offloads features to activate using
>>> + *   the DEV_RX_OFFLOAD_* flags.
>>> + *   If an offloading set in rx_conf->offloads
>>> + *   hasn't been set in the input argument eth_conf->rxmode.offloads
>>> + *   to rte_eth_dev_configure(), it is a new added offloading, it must be
>>> + *   per-queue type and it is enabled for the queue.
>>> + *   No need to repeat any bit in rx_conf->offloads which has already been
>>> + *   enabled in rte_eth_dev_configure() at port level. An offloading
>> enabled
>>> + *   at port level can't be disabled at queue level.
>> Which offloads still make sense in the case of hairpin Rx queue?
>> What about threshhods, drop enable?
>>
> Drop and thresholds make sense, for example the application can state that,
> in case of back pressure to start dropping packets in order not to affect the
> entire nic.
> regarding offloads mainly vlan strip or vlan insert but those can also
> be used in rte_flow.
> But future offloads like QoS or other maybe shared.

I'm not a fan of dead parameters which are added just to use
the same structure. It raises too many questions on maintenance.
Also I don't like the idea of sharing hairpin and regular offloads.
Maybe it is OK to share the namespace (still unsure), but the capabilities
are definitely different and some regular offloads are simply not
applicable to the hairpin case.

>>> + * @param hairpin_conf
>>> + *   The pointer to the hairpin binding configuration.
>>> + * @return
>>> + *   - 0: Success, receive queue correctly set up.
>>> + *   - -EINVAL: The size of network buffers which can be allocated from the
>>> + *      memory pool does not fit the various buffer sizes allowed by the
>>> + *      device controller.
>>> + *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
>>> + *      allocate network memory buffers from the memory pool when
>>> + *      initializing receive descriptors.
>>> + */
>>> +__rte_experimental
>>> +int rte_eth_rx_hairpin_queue_setup
>>> +	(uint16_t port_id, uint16_t rx_queue_id,
>>> +	 uint16_t nb_rx_desc, unsigned int socket_id,
>>> +	 const struct rte_eth_rxconf *rx_conf,
>>> +	 const struct rte_eth_hairpin_conf *hairpin_conf);
>>> +
>>> +/**
>>>     * Allocate and set up a transmit queue for an Ethernet device.
>>>     *
>>>     * @param port_id
>>> @@ -1821,6 +1899,73 @@ int rte_eth_tx_queue_setup(uint16_t port_id,
>> uint16_t tx_queue_id,
>>>    		const struct rte_eth_txconf *tx_conf);
>>>
>>>    /**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>>> + notice
>>> + *
>>> + * Allocate and set up a transmit hairpin queue for an Ethernet device.
>>> + *
>>> + * @param port_id
>>> + *   The port identifier of the Ethernet device.
>>> + * @param tx_queue_id
>>> + *   The index of the transmit queue to set up.
>>> + *   The value must be in the range [0, nb_tx_queue - 1] previously
>> supplied
>>> + *   to rte_eth_dev_configure().
>> Is any Tx queue may be setup as hairpin queue?
>>
> Yes just like any Rx queue.
>
>>> + * @param nb_tx_desc
>>> + *   The number of transmit descriptors to allocate for the transmit ring.
>> Is it really required for hairpin queue? Are min/max/align limits still the
>> same?
>>
> The number of descriptors can effect memory and performance.
> Regarding min/max/align I guess this depends on the implementation in the nic.

Again, it looks like separate dev_info-like information.

>>> + * @param socket_id
>>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
>>> + *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
>>> + *   the DMA memory allocated for the transmit descriptors of the ring.
>> Does it still make sense for Tx hairpin queue?
>>
> Same as for the RX, it is used for internal PMD structures, but maybe on
> other nics they can use this.
>
>>> + * @param tx_conf
>>> + *   The pointer to the configuration data to be used for the transmit
>> queue.
>>> + *   NULL value is allowed, in which case default RX configuration
>>> + *   will be used.
>>> + *   The *tx_conf* structure contains the following data:
>>> + *   - The *tx_thresh* structure with the values of the Prefetch, Host, and
>>> + *     Write-Back threshold registers of the transmit ring.
>>> + *     When setting Write-Back threshold to the value greater then zero,
>>> + *     *tx_rs_thresh* value should be explicitly set to one.
>>> + *   - The *tx_free_thresh* value indicates the [minimum] number of
>> network
>>> + *     buffers that must be pending in the transmit ring to trigger their
>>> + *     [implicit] freeing by the driver transmit function.
>>> + *   - The *tx_rs_thresh* value indicates the [minimum] number of
>> transmit
>>> + *     descriptors that must be pending in the transmit ring before setting
>> the
>>> + *     RS bit on a descriptor by the driver transmit function.
>>> + *     The *tx_rs_thresh* value should be less or equal then
>>> + *     *tx_free_thresh* value, and both of them should be less then
>>> + *     *nb_tx_desc* - 3.
>> I'm not sure that everything above makes sense for hairpin Tx queue.
>>
> You are right not all of them make sense,
> But since I don't know other nics I prefer to give them those values, if they need them.
> If you wish I can change the documentation.

Dead parameters are not nice.

>>> + *   - The *txq_flags* member contains flags to pass to the TX queue
>> setup
>>> + *     function to configure the behavior of the TX queue. This should be
>> set
>>> + *     to 0 if no special configuration is required.
>>> + *     This API is obsolete and will be deprecated. Applications
>>> + *     should set it to ETH_TXQ_FLAGS_IGNORE and use
>>> + *     the offloads field below.
>> There is no txq_flags for a long time already. So, I'm wondering when it was
>> copies from rte_eth_tx_queue_setup().
>>
> My bad from 17.11. will fix.
>
>>> + *   - The *offloads* member contains Tx offloads to be enabled.
>>> + *     If an offloading set in tx_conf->offloads
>>> + *     hasn't been set in the input argument eth_conf->txmode.offloads
>>> + *     to rte_eth_dev_configure(), it is a new added offloading, it must be
>>> + *     per-queue type and it is enabled for the queue.
>>> + *     No need to repeat any bit in tx_conf->offloads which has already
>> been
>>> + *     enabled in rte_eth_dev_configure() at port level. An offloading
>> enabled
>>> + *     at port level can't be disabled at queue level.
>> Which offloads do really make sense and valid to use for hairpin Tx queues?
>> Do we need separate caps for hairpin offloads?
>>
> I'm sure that we will need caps for example QoS but I don't know which yet.

Same as Rx.

>>> + *
>>> + *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0
>> forces
>>> + *     the transmit function to use default values.
>>> + * @param hairpin_conf
>>> + *   The hairpin binding configuration.
>>> + *
>>> + * @return
>>> + *   - 0: Success, the transmit queue is correctly set up.
>>> + *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
>>> + */
>>> +__rte_experimental
>>> +int rte_eth_tx_hairpin_queue_setup
>>> +	(uint16_t port_id, uint16_t tx_queue_id,
>>> +	 uint16_t nb_tx_desc, unsigned int socket_id,
>>> +	 const struct rte_eth_txconf *tx_conf,
>>> +	 const struct rte_eth_hairpin_conf *hairpin_conf);
>>> +
>>> +/**
>>>     * Return the NUMA socket to which an Ethernet device is connected
>>>     *
>>>     * @param port_id
>>>
>> [snip]
  
Ori Kam Sept. 28, 2019, 3:19 p.m. UTC | #4
Hi Andrew.
PSB

> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Thursday, September 26, 2019 8:24 PM
> To: Ori Kam <orika@mellanox.com>; Thomas Monjalon
> <thomas@monjalon.net>; Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: dev@dpdk.org; jingjing.wu@intel.com; stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH 01/13] ethdev: support setup function for
> hairpin queue
> 
> On 9/26/19 6:58 PM, Ori Kam wrote:
> > Hi Andrew,
> > Thanks for your comments PSB.
> >
> >> -----Original Message-----
> >> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >> On 9/26/19 9:28 AM, Ori Kam wrote:
> >>> This commit introduce the RX/TX hairpin setup function.
> >>> RX/TX should be Rx/Tx here and everywhere below.
> >>>
> >>> Hairpin is RX/TX queue that is used by the nic in order to offload
> >>> wire to wire traffic.
> >>>
> >>> Each hairpin queue is binded to one or more queues from other type.
> >>> For example TX hairpin queue should be binded to at least 1 RX hairpin
> >>> queue and vice versa.
> >> How should application find out that hairpin queues are supported?
> > It should be stated in the release note of the DPDK, when manufacture adds
> support for this.
> > In addition if the application try to set hairpin queue and it fails it can mean
> depending on the
> > error that the hairpin is not supported.
> 
> I'm talking about dev_info-like information. Documentation is nice, but
> it is not
> very useful to implement application which works with NICs from
> different vendors.
> 

What if we add a get hairpin capabilities function?
We could report the max number of queues, whether the nic supports 1:n connections,
which offloads are supported and so on. So basically create a new set of capabilities
for hairpin; I think this will also address the other concerns that you have.
What do you think?
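
Something along these lines, as a pure sketch (none of these names exist in
the patch yet, and the field set is still to be defined):

/* Hypothetical hairpin capability reporting, illustration only. */
struct rte_eth_hairpin_cap {
	uint16_t max_nb_queues; /* max number of hairpin queues per port */
	uint16_t max_rx_2_tx;   /* max Tx queues one Rx queue can bind to */
	uint16_t max_tx_2_rx;   /* max Rx queues one Tx queue can bind to */
	uint16_t max_nb_desc;   /* max descriptors per hairpin queue */
	uint16_t port_binding;  /* non-zero if hairpin between two ports is supported */
};

__rte_experimental
int rte_eth_dev_hairpin_capability_get(uint16_t port_id,
				       struct rte_eth_hairpin_cap *cap);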
  
> >> How many?
> > There is no limit to the number of hairpin queues from application all queues
> can be hairpin queues.
> 
> I'm pretty sure that it could be vendor specific.
>

Please see my answer above.
 
> >> How should application find out which ports/queues could be used for
> >> pining?
> > All ports and queues can be supported, if the application request invalid
> combination, for example
> > in current Mellanox implementation binding between two ports then the
> setup function will  fail.
> >
> > If you would like I can add capability for this, but there are too many options.
> For example number
> > of queues, binding limitations all of those will be very hard to declare.
> >
> >
> >> Is hair-pinning domain on device level sufficient to expose limitations?
> >>
> > I'm sorry but I don’t understand your question.
> 
> I was just trying to imagine how we could  say that we can hairpin
> one port Rx queues to another port Tx queues.
>

Like I suggested above, if I add a capability function we could have
a field that says port binding is supported, or something else along this line.
 
> >>> Signed-off-by: Ori Kam <orika@mellanox.com>
> >>> ---
> >>>    lib/librte_ethdev/rte_ethdev.c           | 213
> >>> +++++++++++++++++++++++++++++++
> >>>    lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
> >>>    lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
> >>>    lib/librte_ethdev/rte_ethdev_version.map |   4 +
> >>>    4 files changed, 380 insertions(+)
> >>>
> >>> diff --git a/lib/librte_ethdev/rte_ethdev.c
> >>> b/lib/librte_ethdev/rte_ethdev.c index 30b0c78..4021f38 100644
> >>> --- a/lib/librte_ethdev/rte_ethdev.c
> >>> +++ b/lib/librte_ethdev/rte_ethdev.c
> >>> @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
> >>>    }
> >>>
> >>>    int
> >>> +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t
> >>> rx_queue_id,
> >>> +			       uint16_t nb_rx_desc, unsigned int socket_id,
> >>> +			       const struct rte_eth_rxconf *rx_conf,
> >>> +			       const struct rte_eth_hairpin_conf *hairpin_conf)
> >>> Below code duplicates rte_eth_rx_queue_setup() a lot and it is very bad
> >>> from maintenance point of view. Similar problem with Tx hairpin queue
> >>> setup.
> >>>
> > I'm aware of that. The reasons I choose it are: (same goes to Tx)
> > 1. use the same function approach, meaning to use the current  setup
> function
> >      the issues with this are:
> >       * API break.
> >       * It will have extra parameters, for example mempool will not be used
> >          for hairpin and hairpin configuration will not be used for normal queue.
> >          It is possible to use a struct but again API break and some fields are not
> used.
> >       * we are just starting with the hairpin, most likely there will be
> modification so
> >           it is better to have a different function.
> >       * From application he undertand that this is a different kind of queue,
> which shouldn't be
> >           used by the application.
> 
> It does not excuse to duplicate so much code below. If we have separate
> dev_info-like limitations for hairpin, it would make sense, but I hope that
> it would be still possible to avoid code duplication.
> 

We can start with the most basic implementation, which will mean that the function
will be almost empty. When other vendors or Mellanox require some additional
check or code, they will be able to decide whether to add new code to this function,
or extract the shared code from the standard function into a helper and
use that helper in both setup functions.
What do you think?

> >>> +{
> >>> +	int ret;
> >>> +	struct rte_eth_dev *dev;
> >>> +	struct rte_eth_dev_info dev_info;
> >>> +	struct rte_eth_rxconf local_conf;
> >>> +	void **rxq;
> >>> +
> >>> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> >>> +
> >>> +	dev = &rte_eth_devices[port_id];
> >>> +	if (rx_queue_id >= dev->data->nb_rx_queues) {
> >>> +		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n",
> >> rx_queue_id);
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -
> >> ENOTSUP);
> >>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >>> rx_hairpin_queue_setup,
> >>> +				-ENOTSUP);
> >>> +
> >>> +	rte_eth_dev_info_get(port_id, &dev_info);
> >>> +
> >>> +	/* Use default specified by driver, if nb_rx_desc is zero */
> >>> +	if (nb_rx_desc == 0) {
> >>> +		nb_rx_desc = dev_info.default_rxportconf.ring_size;
> >>> +		/* If driver default is also zero, fall back on EAL default */
> >>> +		if (nb_rx_desc == 0)
> >>> +			nb_rx_desc =
> >> RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
> >>> +	}
> >>> +
> >>> +	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
> >>> +			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
> >>> +			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
> >>> +
> >>> +		RTE_ETHDEV_LOG(ERR,
> >>> +			       "Invalid value for nb_rx_desc(=%hu), should be: "
> >>> +			       "<= %hu, >= %hu, and a product of %hu\n",
> >>> +			nb_rx_desc, dev_info.rx_desc_lim.nb_max,
> >>> +			dev_info.rx_desc_lim.nb_min,
> >>> +			dev_info.rx_desc_lim.nb_align);
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	if (dev->data->dev_started &&
> >>> +		!(dev_info.dev_capa &
> >>> +
> >> 	RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
> >>> +		return -EBUSY;
> >>> +
> >>> +	if (dev->data->dev_started &&
> >>> +		(dev->data->rx_queue_state[rx_queue_id] !=
> >>> +			RTE_ETH_QUEUE_STATE_STOPPED))
> >>> +		return -EBUSY;
> >>> +
> >>> +	rxq = dev->data->rx_queues;
> >>> +	if (rxq[rx_queue_id]) {
> >>> +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >>> rx_queue_release,
> >>> +					-ENOTSUP);
> >>> +		(*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]);
> >>> +		rxq[rx_queue_id] = NULL;
> >>> +	}
> >>> +
> >>> +	if (rx_conf == NULL)
> >>> +		rx_conf = &dev_info.default_rxconf;
> >>> +
> >>> +	local_conf = *rx_conf;
> >>> +
> >>> +	/*
> >>> +	 * If an offloading has already been enabled in
> >>> +	 * rte_eth_dev_configure(), it has been enabled on all queues,
> >>> +	 * so there is no need to enable it in this queue again.
> >>> +	 * The local_conf.offloads input to underlying PMD only carries
> >>> +	 * those offloadings which are only enabled on this queue and
> >>> +	 * not enabled on all queues.
> >>> +	 */
> >>> +	local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads;
> >>> +
> >>> +	/*
> >>> +	 * New added offloadings for this queue are those not enabled in
> >>> +	 * rte_eth_dev_configure() and they must be per-queue type.
> >>> +	 * A pure per-port offloading can't be enabled on a queue while
> >>> +	 * disabled on another queue. A pure per-port offloading can't
> >>> +	 * be enabled for any queue as new added one if it hasn't been
> >>> +	 * enabled in rte_eth_dev_configure().
> >>> +	 */
> >>> +	if ((local_conf.offloads & dev_info.rx_queue_offload_capa) !=
> >>> +	     local_conf.offloads) {
> >>> +		RTE_ETHDEV_LOG(ERR,
> >>> +			"Ethdev port_id=%d rx_queue_id=%d, "
> >>> +			"new added offloads 0x%"PRIx64" must be "
> >>> +			"within per-queue offload capabilities "
> >>> +			"0x%"PRIx64" in %s()\n",
> >>> +			port_id, rx_queue_id, local_conf.offloads,
> >>> +			dev_info.rx_queue_offload_capa,
> >>> +			__func__);
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev,
> >> rx_queue_id,
> >>> +						      nb_rx_desc, socket_id,
> >>> +						      &local_conf,
> >>> +						      hairpin_conf);
> >>> +
> >>> +	return eth_err(port_id, ret);
> >>> +}
> >>> +
> >>> +int
> >>>    rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
> >>>    		       uint16_t nb_tx_desc, unsigned int socket_id,
> >>>    		       const struct rte_eth_txconf *tx_conf) @@ -1799,6
> >> +1908,110
> >>> @@ struct rte_eth_dev *
> >>>    		       tx_queue_id, nb_tx_desc, socket_id, &local_conf));
> >>>    }
> >>>
> >>> +int
> >>> +rte_eth_tx_hairpin_queue_setup(uint16_t port_id, uint16_t
> >> tx_queue_id,
> >>> +			       uint16_t nb_tx_desc, unsigned int socket_id,
> >>> +			       const struct rte_eth_txconf *tx_conf,
> >>> +			       const struct rte_eth_hairpin_conf *hairpin_conf)
> >> {
> >>> +	struct rte_eth_dev *dev;
> >>> +	struct rte_eth_dev_info dev_info;
> >>> +	struct rte_eth_txconf local_conf;
> >>> +	void **txq;
> >>> +
> >>> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> >>> +
> >>> +	dev = &rte_eth_devices[port_id];
> >>> +	if (tx_queue_id >= dev->data->nb_tx_queues) {
> >>> +		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n",
> >> tx_queue_id);
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -
> >> ENOTSUP);
> >>> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >>> tx_hairpin_queue_setup,
> >>> +				-ENOTSUP);
> >>> +
> >>> +	rte_eth_dev_info_get(port_id, &dev_info);
> >>> +
> >>> +	/* Use default specified by driver, if nb_tx_desc is zero */
> >>> +	if (nb_tx_desc == 0) {
> >>> +		nb_tx_desc = dev_info.default_txportconf.ring_size;
> >>> +		/* If driver default is zero, fall back on EAL default */
> >>> +		if (nb_tx_desc == 0)
> >>> +			nb_tx_desc =
> >> RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
> >>> +	}
> >>> +	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
> >>> +	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
> >>> +	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
> >>> +		RTE_ETHDEV_LOG(ERR,
> >>> +			       "Invalid value for nb_tx_desc(=%hu), "
> >>> +			       "should be: <= %hu, >= %hu, and a product of "
> >>> +			       " %hu\n",
> >>> +			       nb_tx_desc, dev_info.tx_desc_lim.nb_max,
> >>> +			       dev_info.tx_desc_lim.nb_min,
> >>> +			       dev_info.tx_desc_lim.nb_align);
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	if (dev->data->dev_started &&
> >>> +		!(dev_info.dev_capa &
> >>> +		  RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP))
> >>> +		return -EBUSY;
> >>> +
> >>> +	if (dev->data->dev_started &&
> >>> +		(dev->data->tx_queue_state[tx_queue_id] !=
> >>> +		 RTE_ETH_QUEUE_STATE_STOPPED))
> >>> +		return -EBUSY;
> >>> +
> >>> +	txq = dev->data->tx_queues;
> >>> +	if (txq[tx_queue_id]) {
> >>> +		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops-
> >>> tx_queue_release,
> >>> +					-ENOTSUP);
> >>> +		(*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]);
> >>> +		txq[tx_queue_id] = NULL;
> >>> +	}
> >>> +
> >>> +	if (tx_conf == NULL)
> >>> +		tx_conf = &dev_info.default_txconf;
> >>> +
> >>> +	local_conf = *tx_conf;
> >>> +
> >>> +	/*
> >>> +	 * If an offloading has already been enabled in
> >>> +	 * rte_eth_dev_configure(), it has been enabled on all queues,
> >>> +	 * so there is no need to enable it in this queue again.
> >>> +	 * The local_conf.offloads input to underlying PMD only carries
> >>> +	 * those offloadings which are only enabled on this queue and
> >>> +	 * not enabled on all queues.
> >>> +	 */
> >>> +	local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads;
> >>> +
> >>> +	/*
> >>> +	 * New added offloadings for this queue are those not enabled in
> >>> +	 * rte_eth_dev_configure() and they must be per-queue type.
> >>> +	 * A pure per-port offloading can't be enabled on a queue while
> >>> +	 * disabled on another queue. A pure per-port offloading can't
> >>> +	 * be enabled for any queue as new added one if it hasn't been
> >>> +	 * enabled in rte_eth_dev_configure().
> >>> +	 */
> >>> +	if ((local_conf.offloads & dev_info.tx_queue_offload_capa) !=
> >>> +	     local_conf.offloads) {
> >>> +		RTE_ETHDEV_LOG(ERR,
> >>> +			       "Ethdev port_id=%d tx_queue_id=%d, new
> >> added "
> >>> +			       "offloads 0x%"PRIx64" must be within "
> >>> +			       "per-queue offload capabilities 0x%"PRIx64" "
> >>> +			       "in %s()\n",
> >>> +			       port_id, tx_queue_id, local_conf.offloads,
> >>> +			       dev_info.tx_queue_offload_capa,
> >>> +			       __func__);
> >>> +		return -EINVAL;
> >>> +	}
> >>> +
> >>> +	return eth_err(port_id, (*dev->dev_ops->tx_hairpin_queue_setup)
> >>> +		       (dev, tx_queue_id, nb_tx_desc, socket_id, &local_conf,
> >>> +			hairpin_conf));
> >>> +}
> >>> +
> >>>    void
> >>>    rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t
> >> unsent,
> >>>    		void *userdata __rte_unused)
> >>> diff --git a/lib/librte_ethdev/rte_ethdev.h
> >>> b/lib/librte_ethdev/rte_ethdev.h index 475dbda..b3b1597 100644
> >>> --- a/lib/librte_ethdev/rte_ethdev.h
> >>> +++ b/lib/librte_ethdev/rte_ethdev.h
> >>> @@ -803,6 +803,30 @@ struct rte_eth_txconf {
> >>>    	uint64_t offloads;
> >>>    };
> >>>
> >>> +#define RTE_ETH_MAX_HAIRPIN_PEERS 32
> >>> +
> >>> +/**
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> >>> +notice
> >>> + *
> >>> + * A structure used to hold hairpin peer data.
> >>> + */
> >>> +struct rte_eth_hairpin_peer {
> >>> +	uint16_t port; /**< Peer port. */
> >>> +	uint16_t queue; /**< Peer queue. */
> >>> +};
> >>> +
> >>> +/**
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> >>> +notice
> >>> + *
> >>> + * A structure used to configure hairpin binding.
> >>> + */
> >>> +struct rte_eth_hairpin_conf {
> >>> +	uint16_t peer_n; /**< The number of peers. */
> >>> +	struct rte_eth_hairpin_peer peers[RTE_ETH_MAX_HAIRPIN_PEERS];
> >> };
> >>> +
> >>>    /**
> >>>     * A structure contains information about HW descriptor ring limitations.
> >>>     */
> >>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id,
> >> uint16_t rx_queue_id,
> >>>    		struct rte_mempool *mb_pool);
> >>>
> >>>    /**
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> >>> + notice
> >>> + *
> >>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
> >>> + *
> >>> + * The function set up the selected queue to be used in hairpin.
> >>> + *
> >>> + * @param port_id
> >>> + *   The port identifier of the Ethernet device.
> >>> + * @param rx_queue_id
> >>> + *   The index of the receive queue to set up.
> >>> + *   The value must be in the range [0, nb_rx_queue - 1] previously
> >> supplied
> >>> + *   to rte_eth_dev_configure().
> >> Is any Rx queue may be setup as hairpin queue?
> >> Can it be still used for regular traffic?
> >>
> > No if a queue is used as hairpin it can't be used for normal traffic.
> > This is also why I like the idea of two different functions, in order to create
> > This distinction.
> 
> If so, do we need at least debug-level checks in Tx/Rx burst functions?
> Is it required to patch rte flow RSS action to ensure that Rx queues of
> only one kind are specified?
> What about attempt to add Rx/Tx callbacks for hairpin queues?
> 

I think the checks should be done at the PMD level, since from a high level the
queues are the same. Callbacks for Rx/Tx don't make sense, since the idea is to
bypass the CPU.
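
Just to make the intended usage concrete, here is a minimal sketch using the
prototypes from this patch (it assumes the port was already configured with at
least two Rx and two Tx queues via rte_eth_dev_configure(); the queue index
and descriptor count are arbitrary):

#include <rte_ethdev.h>

/* Bind Rx hairpin queue 1 and Tx hairpin queue 1 of the same port to each
 * other, so traffic received on the Rx queue goes back to the wire through
 * the Tx queue without touching the CPU.
 */
static int
setup_hairpin_pair(uint16_t port_id)
{
	struct rte_eth_hairpin_conf conf = { .peer_n = 1 };
	int ret;

	/* The Rx hairpin queue peers with the Tx hairpin queue. */
	conf.peers[0].port = port_id;
	conf.peers[0].queue = 1;
	ret = rte_eth_rx_hairpin_queue_setup(port_id, 1, 512, SOCKET_ID_ANY,
					     NULL, &conf);
	if (ret != 0)
		return ret;

	/* The Tx hairpin queue peers back with the Rx hairpin queue. */
	conf.peers[0].queue = 1;
	return rte_eth_tx_hairpin_queue_setup(port_id, 1, 512, SOCKET_ID_ANY,
					      NULL, &conf);
}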

> >>> + * @param nb_rx_desc
> >>> + *   The number of receive descriptors to allocate for the receive ring.
> >> Does it still make sense for hairpin queue?
> >>
> > Yes, since it can affect memory size used by the device, and can affect
> performance.
> >
> >>> + * @param socket_id
> >>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
> >>> + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint
> >> for
> >>> + *   the DMA memory allocated for the receive descriptors of the ring.
> >> Is it still required to be provided for hairpin Rx queue?
> >>
> > Yes, for internal PMD structures to be allocated, but we can if pressed
> remove it.
> >
> >>> + * @param rx_conf
> >>> + *   The pointer to the configuration data to be used for the receive
> >> queue.
> >>> + *   NULL value is allowed, in which case default RX configuration
> >>> + *   will be used.
> >>> + *   The *rx_conf* structure contains an *rx_thresh* structure with the
> >> values
> >>> + *   of the Prefetch, Host, and Write-Back threshold registers of the
> >> receive
> >>> + *   ring.
> >>> + *   In addition it contains the hardware offloads features to activate using
> >>> + *   the DEV_RX_OFFLOAD_* flags.
> >>> + *   If an offloading set in rx_conf->offloads
> >>> + *   hasn't been set in the input argument eth_conf->rxmode.offloads
> >>> + *   to rte_eth_dev_configure(), it is a new added offloading, it must be
> >>> + *   per-queue type and it is enabled for the queue.
> >>> + *   No need to repeat any bit in rx_conf->offloads which has already been
> >>> + *   enabled in rte_eth_dev_configure() at port level. An offloading
> >> enabled
> >>> + *   at port level can't be disabled at queue level.
> >> Which offloads still make sense in the case of hairpin Rx queue?
> >> What about threshhods, drop enable?
> >>
> > Drop and thresholds make sense, for example the application can state that,
> > in case of back pressure to start dropping packets in order not to affect the
> > entire nic.
> > regarding offloads mainly vlan strip or vlan insert but those can also
> > be used in rte_flow.
> > But future offloads like QoS or other maybe shared.
> 
> I'm not a fan of dead parameters which are added just to use
> the same structure. It raises too many questions on maintenance.
> Also I don't like idea to share hairpin and regular offloads.
> May be it is OK to share namespace (still unsure), but capabilities
> are definitely different and some regular offloads are simply not
> applicable to hairpin case.
> 
I agree with you; I think that my suggestion above (new caps for hairpin)
solves this issue. Do you agree?
I will remove the rte_eth_txconf and only have the hairpin_conf with some new
fields, same for the Rx. Is that O.K.?
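
Roughly what I have in mind for the Rx side, sketch only (whether socket_id
stays is still open, and any hairpin-specific knobs such as thresholds or a
drop policy would move into rte_eth_hairpin_conf):

__rte_experimental
int rte_eth_rx_hairpin_queue_setup
	(uint16_t port_id, uint16_t rx_queue_id,
	 uint16_t nb_rx_desc, unsigned int socket_id,
	 const struct rte_eth_hairpin_conf *hairpin_conf);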


> >>> + * @param hairpin_conf
> >>> + *   The pointer to the hairpin binding configuration.
> >>> + * @return
> >>> + *   - 0: Success, receive queue correctly set up.
> >>> + *   - -EINVAL: The size of network buffers which can be allocated from the
> >>> + *      memory pool does not fit the various buffer sizes allowed by the
> >>> + *      device controller.
> >>> + *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
> >>> + *      allocate network memory buffers from the memory pool when
> >>> + *      initializing receive descriptors.
> >>> + */
> >>> +__rte_experimental
> >>> +int rte_eth_rx_hairpin_queue_setup
> >>> +	(uint16_t port_id, uint16_t rx_queue_id,
> >>> +	 uint16_t nb_rx_desc, unsigned int socket_id,
> >>> +	 const struct rte_eth_rxconf *rx_conf,
> >>> +	 const struct rte_eth_hairpin_conf *hairpin_conf);
> >>> +
> >>> +/**
> >>>     * Allocate and set up a transmit queue for an Ethernet device.
> >>>     *
> >>>     * @param port_id
> >>> @@ -1821,6 +1899,73 @@ int rte_eth_tx_queue_setup(uint16_t port_id,
> >> uint16_t tx_queue_id,
> >>>    		const struct rte_eth_txconf *tx_conf);
> >>>
> >>>    /**
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> >>> + notice
> >>> + *
> >>> + * Allocate and set up a transmit hairpin queue for an Ethernet device.
> >>> + *
> >>> + * @param port_id
> >>> + *   The port identifier of the Ethernet device.
> >>> + * @param tx_queue_id
> >>> + *   The index of the transmit queue to set up.
> >>> + *   The value must be in the range [0, nb_tx_queue - 1] previously
> >> supplied
> >>> + *   to rte_eth_dev_configure().
> >> Is any Tx queue may be setup as hairpin queue?
> >>
> > Yes just like any Rx queue.
> >
> >>> + * @param nb_tx_desc
> >>> + *   The number of transmit descriptors to allocate for the transmit ring.
> >> Is it really required for hairpin queue? Are min/max/align limits still the
> >> same?
> >>
> > The number of descriptors can effect memory and performance.
> > Regarding min/max/align I guess this depends on the implementation in the
> nic.
> 
> Again, it looks like separate dev_info-like information.
> 

Please see comments above.

> >>> + * @param socket_id
> >>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
> >>> + *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
> >>> + *   the DMA memory allocated for the transmit descriptors of the ring.
> >> Does it still make sense for Tx hairpin queue?
> >>
> > Same as for the RX, it is used for internal PMD structures, but maybe on
> > other nics they can use this.
> >
> >>> + * @param tx_conf
> >>> + *   The pointer to the configuration data to be used for the transmit
> >> queue.
> >>> + *   NULL value is allowed, in which case default RX configuration
> >>> + *   will be used.
> >>> + *   The *tx_conf* structure contains the following data:
> >>> + *   - The *tx_thresh* structure with the values of the Prefetch, Host, and
> >>> + *     Write-Back threshold registers of the transmit ring.
> >>> + *     When setting Write-Back threshold to the value greater then zero,
> >>> + *     *tx_rs_thresh* value should be explicitly set to one.
> >>> + *   - The *tx_free_thresh* value indicates the [minimum] number of
> >> network
> >>> + *     buffers that must be pending in the transmit ring to trigger their
> >>> + *     [implicit] freeing by the driver transmit function.
> >>> + *   - The *tx_rs_thresh* value indicates the [minimum] number of
> >> transmit
> >>> + *     descriptors that must be pending in the transmit ring before setting
> >> the
> >>> + *     RS bit on a descriptor by the driver transmit function.
> >>> + *     The *tx_rs_thresh* value should be less or equal then
> >>> + *     *tx_free_thresh* value, and both of them should be less then
> >>> + *     *nb_tx_desc* - 3.
> >> I'm not sure that everything above makes sense for hairpin Tx queue.
> >>
> > You are right not all of them make sense,
> > But since I don't know other nics I prefer to give them those values, if they
> need them.
> > If you wish I can change the documentation.
> 
> Dead parameters are not nice.
>

See comments above.
 
> >>> + *   - The *txq_flags* member contains flags to pass to the TX queue
> >> setup
> >>> + *     function to configure the behavior of the TX queue. This should be
> >> set
> >>> + *     to 0 if no special configuration is required.
> >>> + *     This API is obsolete and will be deprecated. Applications
> >>> + *     should set it to ETH_TXQ_FLAGS_IGNORE and use
> >>> + *     the offloads field below.
> >> There is no txq_flags for a long time already. So, I'm wondering when it was
> >> copies from rte_eth_tx_queue_setup().
> >>
> > My bad from 17.11. will fix.
> >
> >>> + *   - The *offloads* member contains Tx offloads to be enabled.
> >>> + *     If an offloading set in tx_conf->offloads
> >>> + *     hasn't been set in the input argument eth_conf->txmode.offloads
> >>> + *     to rte_eth_dev_configure(), it is a new added offloading, it must be
> >>> + *     per-queue type and it is enabled for the queue.
> >>> + *     No need to repeat any bit in tx_conf->offloads which has already
> >> been
> >>> + *     enabled in rte_eth_dev_configure() at port level. An offloading
> >> enabled
> >>> + *     at port level can't be disabled at queue level.
> >> Which offloads do really make sense and valid to use for hairpin Tx queues?
> >> Do we need separate caps for hairpin offloads?
> >>
> > I'm sure that we will need caps for example QoS but I don't know which yet.
> 
> Same as Rx.
>

Agreed, please see comments above.
 
> >>> + *
> >>> + *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0
> >> forces
> >>> + *     the transmit function to use default values.
> >>> + * @param hairpin_conf
> >>> + *   The hairpin binding configuration.
> >>> + *
> >>> + * @return
> >>> + *   - 0: Success, the transmit queue is correctly set up.
> >>> + *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
> >>> + */
> >>> +__rte_experimental
> >>> +int rte_eth_tx_hairpin_queue_setup
> >>> +	(uint16_t port_id, uint16_t tx_queue_id,
> >>> +	 uint16_t nb_tx_desc, unsigned int socket_id,
> >>> +	 const struct rte_eth_txconf *tx_conf,
> >>> +	 const struct rte_eth_hairpin_conf *hairpin_conf);
> >>> +
> >>> +/**
> >>>     * Return the NUMA socket to which an Ethernet device is connected
> >>>     *
> >>>     * @param port_id
> >>>
> >> [snip]

Thanks,
Ori
  
Andrew Rybchenko Sept. 29, 2019, 12:10 p.m. UTC | #5
Hi Ori,

On 9/28/19 6:19 PM, Ori Kam wrote:
> Hi Andrew.
> PSB
>
>> -----Original Message-----
>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>> Sent: Thursday, September 26, 2019 8:24 PM
>> To: Ori Kam <orika@mellanox.com>; Thomas Monjalon
>> <thomas@monjalon.net>; Ferruh Yigit <ferruh.yigit@intel.com>
>> Cc: dev@dpdk.org; jingjing.wu@intel.com; stephen@networkplumber.org
>> Subject: Re: [dpdk-dev] [PATCH 01/13] ethdev: support setup function for
>> hairpin queue
>>
>> On 9/26/19 6:58 PM, Ori Kam wrote:
>>> Hi Andrew,
>>> Thanks for your comments PSB.
>>>
>>>> -----Original Message-----
>>>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>>>> On 9/26/19 9:28 AM, Ori Kam wrote:
>>>>> This commit introduce the RX/TX hairpin setup function.
>>>>> RX/TX should be Rx/Tx here and everywhere below.
>>>>>
>>>>> Hairpin is RX/TX queue that is used by the nic in order to offload
>>>>> wire to wire traffic.
>>>>>
>>>>> Each hairpin queue is binded to one or more queues from other type.
>>>>> For example TX hairpin queue should be binded to at least 1 RX hairpin
>>>>> queue and vice versa.
>>>> How should application find out that hairpin queues are supported?
>>> It should be stated in the release note of the DPDK, when manufacture adds
>> support for this.
>>> In addition if the application try to set hairpin queue and it fails it can mean
>> depending on the
>>> error that the hairpin is not supported.
>> I'm talking about dev_info-like information. Documentation is nice, but
>> it is not
>> very useful to implement application which works with NICs from
>> different vendors.
>>
> What if we add get hairpin capabilities function.
> We could have,  the max number of queues, if the nic support 1:n connection,
> which offloads are supported and so on. So basically create a new set of capabilities
> for hairpin this I think will also remove other concern that you have.
> What do you think?

Yes, I think an API to report capabilities would be useful.
It should also be used in the setup functions in order to check, at the
generic level, that the setup request is OK vs the caps.
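
For example, something like this at the start of rte_eth_rx_hairpin_queue_setup()
(field names follow the capability structure sketched earlier in the thread,
illustration only):

	struct rte_eth_hairpin_cap cap;
	int ret;

	ret = rte_eth_dev_hairpin_capability_get(port_id, &cap);
	if (ret != 0)
		return ret;
	if (nb_rx_desc > cap.max_nb_desc ||
	    hairpin_conf->peer_n > cap.max_rx_2_tx) {
		RTE_ETHDEV_LOG(ERR,
			       "Hairpin setup request exceeds reported capabilities\n");
		return -EINVAL;
	}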

>>>> How many?
>>> There is no limit to the number of hairpin queues from application all queues
>> can be hairpin queues.
>>
>> I'm pretty sure that it could be vendor specific.
>>
> Please see my answer above.
>   
>>>> How should application find out which ports/queues could be used for
>>>> pining?
>>> All ports and queues can be supported, if the application request invalid
>> combination, for example
>>> in current Mellanox implementation binding between two ports then the
>> setup function will  fail.
>>> If you would like I can add capability for this, but there are too many options.
>> For example number
>>> of queues, binding limitations all of those will be very hard to declare.
>>>
>>>
>>>> Is hair-pinning domain on device level sufficient to expose limitations?
>>>>
>>> I'm sorry but I don’t understand your question.
>> I was just trying to imagine how we could  say that we can hairpin
>> one port Rx queues to another port Tx queues.
>>
> Like I suggested above if I will add a capability function we could have
> a field that says port_binidng supported, or something else, along this line.

Not sure that I understand, but I'll take a look when submitted.

>>>>> Signed-off-by: Ori Kam <orika@mellanox.com>
>>>>> ---
>>>>>     lib/librte_ethdev/rte_ethdev.c           | 213
>>>>> +++++++++++++++++++++++++++++++
>>>>>     lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
>>>>>     lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
>>>>>     lib/librte_ethdev/rte_ethdev_version.map |   4 +
>>>>>     4 files changed, 380 insertions(+)
>>>>>
>>>>> diff --git a/lib/librte_ethdev/rte_ethdev.c
>>>>> b/lib/librte_ethdev/rte_ethdev.c index 30b0c78..4021f38 100644
>>>>> --- a/lib/librte_ethdev/rte_ethdev.c
>>>>> +++ b/lib/librte_ethdev/rte_ethdev.c
>>>>> @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
>>>>>     }
>>>>>
>>>>>     int
>>>>> +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t
>>>>> rx_queue_id,
>>>>> +			       uint16_t nb_rx_desc, unsigned int socket_id,
>>>>> +			       const struct rte_eth_rxconf *rx_conf,
>>>>> +			       const struct rte_eth_hairpin_conf *hairpin_conf)
>>>>> Below code duplicates rte_eth_rx_queue_setup() a lot and it is very bad
>>>>> from maintenance point of view. Similar problem with Tx hairpin queue
>>>>> setup.
>>>>>
>>> I'm aware of that. The reasons I choose it are: (same goes to Tx)
>>> 1. use the same function approach, meaning to use the current  setup
>> function
>>>       the issues with this are:
>>>        * API break.
>>>        * It will have extra parameters, for example mempool will not be used
>>>           for hairpin and hairpin configuration will not be used for normal queue.
>>>           It is possible to use a struct but again API break and some fields are not
>> used.
>>>        * we are just starting with the hairpin, most likely there will be
>> modification so
>>>            it is better to have a different function.
>>>        * From application he undertand that this is a different kind of queue,
>> which shouldn't be
>>>            used by the application.
>> It does not excuse to duplicate so much code below. If we have separate
>> dev_info-like limitations for hairpin, it would make sense, but I hope that
>> it would be still possible to avoid code duplication.
>>
> We can start with the most basic implementation, which will mean that the function
> will almost be empty, when other vendors or Mellanox will require some additional
> test or code they will be able to decide if to add new code to he function, or
> extract the shared code from the standard function to a specific function, and
> use this function in both setup functions.
> What do you think?

Let's try and take a look at the code.

[snip]

>>>>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id,
>>>> uint16_t rx_queue_id,
>>>>>     		struct rte_mempool *mb_pool);
>>>>>
>>>>>     /**
>>>>> + * @warning
>>>>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>>>>> + notice
>>>>> + *
>>>>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
>>>>> + *
>>>>> + * The function set up the selected queue to be used in hairpin.
>>>>> + *
>>>>> + * @param port_id
>>>>> + *   The port identifier of the Ethernet device.
>>>>> + * @param rx_queue_id
>>>>> + *   The index of the receive queue to set up.
>>>>> + *   The value must be in the range [0, nb_rx_queue - 1] previously
>>>> supplied
>>>>> + *   to rte_eth_dev_configure().
>>>> Is any Rx queue may be setup as hairpin queue?
>>>> Can it be still used for regular traffic?
>>>>
>>> No if a queue is used as hairpin it can't be used for normal traffic.
>>> This is also why I like the idea of two different functions, in order to create
>>> This distinction.
>> If so, do we need at least debug-level checks in Tx/Rx burst functions?
>> Is it required to patch rte flow RSS action to ensure that Rx queues of
>> only one kind are specified?
>> What about attempt to add Rx/Tx callbacks for hairpin queues?
>>
> I think the checks should be done in PMD level. Since from high level they are the
> same.

Sorry, I don't understand why. If something could be checked at the generic
level, it should be done there to avoid duplication in all drivers.

> Call backs for Rx/Tx doesn't make sense, since the idea is to bypass the CPU.

If so, I think rte_eth_add_tx_callback() should be patched to return an
error if the specified queue is a hairpin queue. Same for Rx.
Any other cases?
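
I mean something like this near the top of rte_eth_add_tx_callback(),
illustration only -- it assumes some way to recognise a hairpin queue (a
hypothetical helper here), which is exactly the open question:

	/* hypothetical helper, name and mechanism still to be defined */
	if (rte_eth_dev_is_tx_hairpin_queue(&rte_eth_devices[port_id],
					    queue_id)) {
		rte_errno = EINVAL;
		return NULL;
	}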

>>>>> + * @param nb_rx_desc
>>>>> + *   The number of receive descriptors to allocate for the receive ring.
>>>> Does it still make sense for hairpin queue?
>>>>
>>> Yes, since it can affect memory size used by the device, and can affect
>> performance.
>>>>> + * @param socket_id
>>>>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
>>>>> + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint
>>>> for
>>>>> + *   the DMA memory allocated for the receive descriptors of the ring.
>>>> Is it still required to be provided for hairpin Rx queue?
>>>>
>>> Yes, for internal PMD structures to be allocated, but we can if pressed
>> remove it.
>>>>> + * @param rx_conf
>>>>> + *   The pointer to the configuration data to be used for the receive
>>>> queue.
>>>>> + *   NULL value is allowed, in which case default RX configuration
>>>>> + *   will be used.
>>>>> + *   The *rx_conf* structure contains an *rx_thresh* structure with the
>>>> values
>>>>> + *   of the Prefetch, Host, and Write-Back threshold registers of the
>>>> receive
>>>>> + *   ring.
>>>>> + *   In addition it contains the hardware offloads features to activate using
>>>>> + *   the DEV_RX_OFFLOAD_* flags.
>>>>> + *   If an offloading set in rx_conf->offloads
>>>>> + *   hasn't been set in the input argument eth_conf->rxmode.offloads
>>>>> + *   to rte_eth_dev_configure(), it is a new added offloading, it must be
>>>>> + *   per-queue type and it is enabled for the queue.
>>>>> + *   No need to repeat any bit in rx_conf->offloads which has already been
>>>>> + *   enabled in rte_eth_dev_configure() at port level. An offloading
>>>> enabled
>>>>> + *   at port level can't be disabled at queue level.
>>>> Which offloads still make sense in the case of hairpin Rx queue?
>>>> What about threshhods, drop enable?
>>>>
>>> Drop and thresholds make sense, for example the application can state that,
>>> in case of back pressure to start dropping packets in order not to affect the
>>> entire nic.
>>> regarding offloads mainly vlan strip or vlan insert but those can also
>>> be used in rte_flow.
>>> But future offloads like QoS or other maybe shared.
>> I'm not a fan of dead parameters which are added just to use
>> the same structure. It raises too many questions on maintenance.
>> Also I don't like idea to share hairpin and regular offloads.
>> May be it is OK to share namespace (still unsure), but capabilities
>> are definitely different and some regular offloads are simply not
>> applicable to hairpin case.
>>
> I agree with you I think that my suggestion above (new caps for hairpin)
> solve this issue. Do you agree?
> I will remove the rte_eth_txconf and only hae the hairpin_conf with some new
> fields, same for the Rx, is that O.K.?

I think it would be better to keep only the parameters which are actually used.
Anyway, it is an experimental API and we can add missing parameters
when required.

[snip]

Thanks,
Andrew.
  
Ori Kam Oct. 2, 2019, 12:19 p.m. UTC | #6
Hi Andrew,

Sorry it took me some time to respond (I'm on vacation 😊).
I think we are in agreement on most points. The only open issue is the
checks, so please see my comments below.
As soon as we reach an understanding on this issue, I will start working on V2.

Thanks,
Ori
 
> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Sunday, September 29, 2019 3:11 PM
> To: Ori Kam <orika@mellanox.com>; Thomas Monjalon
> <thomas@monjalon.net>; Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: dev@dpdk.org; jingjing.wu@intel.com; stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH 01/13] ethdev: support setup function for
> hairpin queue
> 
> Hi Ori,
> 
> On 9/28/19 6:19 PM, Ori Kam wrote:
> > Hi Andrew.
> > PSB
> >
> >> -----Original Message-----
> >> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >> Sent: Thursday, September 26, 2019 8:24 PM
> >> To: Ori Kam <orika@mellanox.com>; Thomas Monjalon
> >> <thomas@monjalon.net>; Ferruh Yigit <ferruh.yigit@intel.com>
> >> Cc: dev@dpdk.org; jingjing.wu@intel.com; stephen@networkplumber.org
> >> Subject: Re: [dpdk-dev] [PATCH 01/13] ethdev: support setup function for
> >> hairpin queue
> >>
> >> On 9/26/19 6:58 PM, Ori Kam wrote:
> >>> Hi Andrew,
> >>> Thanks for your comments PSB.
> >>>
> >>>> -----Original Message-----
> >>>> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >>>> On 9/26/19 9:28 AM, Ori Kam wrote:
> >>>>> This commit introduce the RX/TX hairpin setup function.
> >>>>> RX/TX should be Rx/Tx here and everywhere below.
> >>>>>
> >>>>> Hairpin is RX/TX queue that is used by the nic in order to offload
> >>>>> wire to wire traffic.
> >>>>>
> >>>>> Each hairpin queue is binded to one or more queues from other type.
> >>>>> For example TX hairpin queue should be binded to at least 1 RX hairpin
> >>>>> queue and vice versa.
> >>>> How should application find out that hairpin queues are supported?
> >>> It should be stated in the release note of the DPDK, when manufacture
> adds
> >> support for this.
> >>> In addition if the application try to set hairpin queue and it fails it can
> mean
> >> depending on the
> >>> error that the hairpin is not supported.
> >> I'm talking about dev_info-like information. Documentation is nice, but
> >> it is not
> >> very useful to implement application which works with NICs from
> >> different vendors.
> >>
> > What if we add get hairpin capabilities function.
> > We could have,  the max number of queues, if the nic support 1:n connection,
> > which offloads are supported and so on. So basically create a new set of
> capabilities
> > for hairpin this I think will also remove other concern that you have.
> > What do you think?
> 
> Yes, I think an API to report capabilities would be useful.
> It should be also used in setup functions in order to do checks on
> generic level that setup request is OK vs caps.
> 

Will be in my next version.

> >>>> How many?
> >>> There is no limit to the number of hairpin queues from application all
> queues
> >> can be hairpin queues.
> >>
> >> I'm pretty sure that it could be vendor specific.
> >>
> > Please see my answer above.
> >
> >>>> How should application find out which ports/queues could be used for
> >>>> pining?
> >>> All ports and queues can be supported, if the application request invalid
> >> combination, for example
> >>> in current Mellanox implementation binding between two ports then the
> >> setup function will  fail.
> >>> If you would like I can add capability for this, but there are too many
> options.
> >> For example number
> >>> of queues, binding limitations all of those will be very hard to declare.
> >>>
> >>>
> >>>> Is hair-pinning domain on device level sufficient to expose limitations?
> >>>>
> >>> I'm sorry but I don’t understand your question.
> >> I was just trying to imagine how we could  say that we can hairpin
> >> one port Rx queues to another port Tx queues.
> >>
> > Like I suggested above if I will add a capability function we could have
> > a field that says port_binidng supported, or something else, along this line.
> 
> Not sure that I understand, but I'll take a look when submitted.
> 

Thanks.

> >>>>> Signed-off-by: Ori Kam <orika@mellanox.com>
> >>>>> ---
> >>>>>     lib/librte_ethdev/rte_ethdev.c           | 213
> >>>>> +++++++++++++++++++++++++++++++
> >>>>>     lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
> >>>>>     lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
> >>>>>     lib/librte_ethdev/rte_ethdev_version.map |   4 +
> >>>>>     4 files changed, 380 insertions(+)
> >>>>>
> >>>>> diff --git a/lib/librte_ethdev/rte_ethdev.c
> >>>>> b/lib/librte_ethdev/rte_ethdev.c index 30b0c78..4021f38 100644
> >>>>> --- a/lib/librte_ethdev/rte_ethdev.c
> >>>>> +++ b/lib/librte_ethdev/rte_ethdev.c
> >>>>> @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
> >>>>>     }
> >>>>>
> >>>>>     int
> >>>>> +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t
> >>>>> rx_queue_id,
> >>>>> +			       uint16_t nb_rx_desc, unsigned int
> socket_id,
> >>>>> +			       const struct rte_eth_rxconf *rx_conf,
> >>>>> +			       const struct rte_eth_hairpin_conf
> *hairpin_conf)
> >>>>> Below code duplicates rte_eth_rx_queue_setup() a lot and it is very bad
> >>>>> from maintenance point of view. Similar problem with Tx hairpin queue
> >>>>> setup.
> >>>>>
> >>> I'm aware of that. The reasons I choose it are: (same goes to Tx)
> >>> 1. use the same function approach, meaning to use the current  setup
> >> function
> >>>       the issues with this are:
> >>>        * API break.
> >>>        * It will have extra parameters, for example mempool will not be used
> >>>           for hairpin and hairpin configuration will not be used for normal
> queue.
> >>>           It is possible to use a struct but again API break and some fields are
> not
> >> used.
> >>>        * we are just starting with the hairpin, most likely there will be
> >> modification so
> >>>            it is better to have a different function.
> >>>        * From application he undertand that this is a different kind of queue,
> >> which shouldn't be
> >>>            used by the application.
> >> It does not excuse to duplicate so much code below. If we have separate
> >> dev_info-like limitations for hairpin, it would make sense, but I hope that
> >> it would be still possible to avoid code duplication.
> >>
> > We can start with the most basic implementation, which will mean that the
> function
> > will almost be empty, when other vendors or Mellanox will require some
> additional
> > test or code they will be able to decide if to add new code to he function, or
> > extract the shared code from the standard function to a specific function, and
> > use this function in both setup functions.
> > What do you think?
> 
> Let's try and take a look at the code.
>

Thanks, 

 
> [snip]
> 
> >>>>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t
> port_id,
> >>>> uint16_t rx_queue_id,
> >>>>>     		struct rte_mempool *mb_pool);
> >>>>>
> >>>>>     /**
> >>>>> + * @warning
> >>>>> + * @b EXPERIMENTAL: this API may change, or be removed, without
> prior
> >>>>> + notice
> >>>>> + *
> >>>>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
> >>>>> + *
> >>>>> + * The function set up the selected queue to be used in hairpin.
> >>>>> + *
> >>>>> + * @param port_id
> >>>>> + *   The port identifier of the Ethernet device.
> >>>>> + * @param rx_queue_id
> >>>>> + *   The index of the receive queue to set up.
> >>>>> + *   The value must be in the range [0, nb_rx_queue - 1] previously
> >>>> supplied
> >>>>> + *   to rte_eth_dev_configure().
> >>>> Is any Rx queue may be setup as hairpin queue?
> >>>> Can it be still used for regular traffic?
> >>>>
> >>> No if a queue is used as hairpin it can't be used for normal traffic.
> >>> This is also why I like the idea of two different functions, in order to create
> >>> This distinction.
> >> If so, do we need at least debug-level checks in Tx/Rx burst functions?
> >> Is it required to patch rte flow RSS action to ensure that Rx queues of
> >> only one kind are specified?
> >> What about attempt to add Rx/Tx callbacks for hairpin queues?
> >>
> > I think the checks should be done in PMD level. Since from high level they are
> the
> > same.
> 
> Sorry, I don't understand why. If something could be checked on generic
> level,
> it should be done to avoid duplication in all drivers.
> 

The issue with this approach is that at the ethdev level we don't know anything about the queue.
This means we will need to add an extra function to query the queue type for each PMD.
We could also assume that if no get-type function exists in the PMD, then the queue is always a standard queue.
So my suggestion, if you would like to move the checks, is to add a queue type enum at the ethdev level, and add
a function call to query the queue type. What do you think?
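
Something like this (all names hypothetical):

enum rte_eth_queue_type {
	RTE_ETH_QUEUE_TYPE_STANDARD, /* regular Rx/Tx queue */
	RTE_ETH_QUEUE_TYPE_HAIRPIN,  /* hairpin queue, not accessed by SW */
};

/* Optional dev_op; if a PMD does not implement it, ethdev assumes every
 * queue is a standard queue.
 */
typedef enum rte_eth_queue_type (*eth_rx_queue_type_get_t)
	(struct rte_eth_dev *dev, uint16_t rx_queue_id);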

> > Call backs for Rx/Tx doesn't make sense, since the idea is to bypass the CPU.
> 
> If so, I think rte_eth_add_tx_callback() should be patched to return an
> error
> if specified queue is hairpin. Same for Rx.
> Any other cases?
> 

Same answer as above.

> >>>>> + * @param nb_rx_desc
> >>>>> + *   The number of receive descriptors to allocate for the receive ring.
> >>>> Does it still make sense for hairpin queue?
> >>>>
> >>> Yes, since it can affect memory size used by the device, and can affect
> >> performance.
> >>>>> + * @param socket_id
> >>>>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
> >>>>> + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint
> >>>> for
> >>>>> + *   the DMA memory allocated for the receive descriptors of the ring.
> >>>> Is it still required to be provided for hairpin Rx queue?
> >>>>
> >>> Yes, for internal PMD structures to be allocated, but we can if pressed
> >> remove it.
> >>>>> + * @param rx_conf
> >>>>> + *   The pointer to the configuration data to be used for the receive
> >>>> queue.
> >>>>> + *   NULL value is allowed, in which case default RX configuration
> >>>>> + *   will be used.
> >>>>> + *   The *rx_conf* structure contains an *rx_thresh* structure with the
> >>>> values
> >>>>> + *   of the Prefetch, Host, and Write-Back threshold registers of the
> >>>> receive
> >>>>> + *   ring.
> >>>>> + *   In addition it contains the hardware offloads features to activate
> using
> >>>>> + *   the DEV_RX_OFFLOAD_* flags.
> >>>>> + *   If an offloading set in rx_conf->offloads
> >>>>> + *   hasn't been set in the input argument eth_conf->rxmode.offloads
> >>>>> + *   to rte_eth_dev_configure(), it is a new added offloading, it must be
> >>>>> + *   per-queue type and it is enabled for the queue.
> >>>>> + *   No need to repeat any bit in rx_conf->offloads which has already
> been
> >>>>> + *   enabled in rte_eth_dev_configure() at port level. An offloading
> >>>> enabled
> >>>>> + *   at port level can't be disabled at queue level.
> >>>> Which offloads still make sense in the case of hairpin Rx queue?
> >>>> What about threshhods, drop enable?
> >>>>
> >>> Drop and thresholds make sense, for example the application can state
> that,
> >>> in case of back pressure to start dropping packets in order not to affect the
> >>> entire nic.
> >>> regarding offloads mainly vlan strip or vlan insert but those can also
> >>> be used in rte_flow.
> >>> But future offloads like QoS or other maybe shared.
> >> I'm not a fan of dead parameters which are added just to use
> >> the same structure. It raises too many questions on maintenance.
> >> Also I don't like idea to share hairpin and regular offloads.
> >> May be it is OK to share namespace (still unsure), but capabilities
> >> are definitely different and some regular offloads are simply not
> >> applicable to hairpin case.
> >>
> > I agree with you I think that my suggestion above (new caps for hairpin)
> > solve this issue. Do you agree?
> > I will remove the rte_eth_txconf and only hae the hairpin_conf with some
> new
> > fields, same for the Rx, is that O.K.?
> 
> I think it would be better to keep only used parameters.
> Anyway, it is experimental API and we can add missing parameters
> when required.
> 

Agree.

> [snip]
> 
> Thanks,
> Andrew.
  
Andrew Rybchenko Oct. 3, 2019, 1:26 p.m. UTC | #7
Hi Ori,

@Thomas, @Ferruh, please, see question below.

On 10/2/19 3:19 PM, Ori Kam wrote:
> Hi Andrew,
>
> Sorry it took me some time to responded, (I'm on vacation 😊)
> I think we are in most cases in agreement. The only open issue is the
> checks so please see my comments below.
> As soon as we get to understanding about this issue, I will start working on V2.
>
> Thanks,
> Ori

[snip]

>>>>>>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id,
>>>>>> uint16_t rx_queue_id,
>>>>>>>      		struct rte_mempool *mb_pool);
>>>>>>>
>>>>>>>      /**
>>>>>>> + * @warning
>>>>>>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>>>>>>> + notice
>>>>>>> + *
>>>>>>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
>>>>>>> + *
>>>>>>> + * The function set up the selected queue to be used in hairpin.
>>>>>>> + *
>>>>>>> + * @param port_id
>>>>>>> + *   The port identifier of the Ethernet device.
>>>>>>> + * @param rx_queue_id
>>>>>>> + *   The index of the receive queue to set up.
>>>>>>> + *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
>>>>>>> + *   to rte_eth_dev_configure().
>>>>>> Is any Rx queue may be setup as hairpin queue?
>>>>>> Can it be still used for regular traffic?
>>>>>>
>>>>> No if a queue is used as hairpin it can't be used for normal traffic.
>>>>> This is also why I like the idea of two different functions, in order to create
>>>>> This distinction.
>>>> If so, do we need at least debug-level checks in Tx/Rx burst functions?
>>>> Is it required to patch rte flow RSS action to ensure that Rx queues of
>>>> only one kind are specified?
>>>> What about attempt to add Rx/Tx callbacks for hairpin queues?
>>>>
> >>> I think the checks should be done at the PMD level, since from a high
> >>> level they are the same.
> >> Sorry, I don't understand why. If something can be checked at the generic
> >> level, it should be done there to avoid duplication in all drivers.
> The issue with this approach is that at the ethdev level we don't know anything about the queue.
> This will mean that we will need to add extra functions to query the queue type for each PMD.
> We could also assume that if no get-type function exists in the PMD then the queue is always a standard queue.
> So my suggestion, if you would like to move the checks, is to add a queue type enum at the ethdev level, and add
> a function call to query the queue type. What do you think?

I would consider using dev_data rx_queue_state and tx_queue_state to
keep the information, so that it is directly available without extra function
calls. Or add extra information. dev_data is internal, so it does not look
like a problem. What do you think?
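
For example, something along these lines (a sketch only; the new state value
and the helper name are hypothetical):

    /* Hypothetical third queue state, next to STOPPED and STARTED. */
    #define RTE_ETH_QUEUE_STATE_HAIRPIN 2

    /* rte_eth_rx_hairpin_queue_setup() would set this state once the PMD
     * callback succeeds, so the generic layer can tell the queue kinds
     * apart without asking the PMD. */
    static int
    eth_dev_is_rx_hairpin_queue(struct rte_eth_dev *dev, uint16_t queue_id)
    {
        return dev->data->rx_queue_state[queue_id] ==
               RTE_ETH_QUEUE_STATE_HAIRPIN;
    }

rte_eth_add_rx_callback(), debug-level checks in rte_eth_rx_burst() and the
rte_flow RSS validation could then reuse the same helper (with a
tx_queue_state equivalent on the Tx side).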

> >>> Callbacks for Rx/Tx don't make sense, since the idea is to bypass the CPU.
> >> If so, I think rte_eth_add_tx_callback() should be patched to return an
> >> error if the specified queue is hairpin. Same for Rx.
>> Any other cases?
>>
> Same answer as above.

[snip]

Andrew.
  
Ori Kam Oct. 3, 2019, 5:46 p.m. UTC | #8
Hi Andrew,

@Thomas Monjalon, @Ferruh Yigit

Please comment if you have any issues with my answer.

Thanks,
Ori

> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Thursday, October 3, 2019 4:26 PM
> To: Ori Kam <orika@mellanox.com>; Thomas Monjalon
> <thomas@monjalon.net>; Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: dev@dpdk.org; jingjing.wu@intel.com; stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH 01/13] ethdev: support setup function for
> hairpin queue
> 
> Hi Ori,
> 
> @Thomas, @Ferruh, please, see question below.
> 
> On 10/2/19 3:19 PM, Ori Kam wrote:
> > Hi Andrew,
> >
> > Sorry it took me some time to respond (I'm on vacation 😊).
> > I think we are in agreement on most points. The only open issue is the
> > checks, so please see my comments below.
> > As soon as we reach an understanding on this issue, I will start working on V2.
> >
> > Thanks,
> > Ori
> 
> [snip]
> 
> >>>>>>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t
> port_id,
> >>>>>> uint16_t rx_queue_id,
> >>>>>>>      		struct rte_mempool *mb_pool);
> >>>>>>>
> >>>>>>>      /**
> >>>>>>> + * @warning
> >>>>>>> + * @b EXPERIMENTAL: this API may change, or be removed, without
> prior
> >>>>>>> + notice
> >>>>>>> + *
> >>>>>>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
> >>>>>>> + *
> >>>>>>> + * The function set up the selected queue to be used in hairpin.
> >>>>>>> + *
> >>>>>>> + * @param port_id
> >>>>>>> + *   The port identifier of the Ethernet device.
> >>>>>>> + * @param rx_queue_id
> >>>>>>> + *   The index of the receive queue to set up.
> >>>>>>> + *   The value must be in the range [0, nb_rx_queue - 1] previously
> supplied
> >>>>>>> + *   to rte_eth_dev_configure().
> >>>>>> Is any Rx queue may be setup as hairpin queue?
> >>>>>> Can it be still used for regular traffic?
> >>>>>>
> >>>>> No, if a queue is used as hairpin it can't be used for normal traffic.
> >>>>> This is also why I like the idea of two different functions, in order
> >>>>> to create this distinction.
> >>>> If so, do we need at least debug-level checks in Tx/Rx burst functions?
> >>>> Is it required to patch rte flow RSS action to ensure that Rx queues of
> >>>> only one kind are specified?
> >>>> What about attempt to add Rx/Tx callbacks for hairpin queues?
> >>>>
> >>> I think the checks should be done at the PMD level, since from a high
> >>> level they are the same.
> >> Sorry, I don't understand why. If something can be checked at the generic
> >> level, it should be done there to avoid duplication in all drivers.
> > The issue with this approach is that at the ethdev level we don't know
> > anything about the queue.
> > This will mean that we will need to add extra functions to query the queue
> > type for each PMD.
> > We could also assume that if no get-type function exists in the PMD then the
> > queue is always a standard queue.
> > So my suggestion, if you would like to move the checks, is to add a queue type
> > enum at the ethdev level, and add a function call to query the queue type.
> > What do you think?
> 
> I would consider using dev_data rx_queue_state and tx_queue_state to
> keep the information, so that it is directly available without extra function
> calls. Or add extra information. dev_data is internal, so it does not look
> like a problem. What do you think?
> 

I like the new state idea; it will save some memory compared to adding a new
queue type field in the dev_data. It will also avoid an extra ABI change.
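
For example (a sketch only, reusing the hypothetical hairpin state value from
above):

    /* In rte_eth_add_tx_callback(), reject hairpin queues up front. */
    if (rte_eth_devices[port_id].data->tx_queue_state[queue_id] ==
        RTE_ETH_QUEUE_STATE_HAIRPIN) {
        rte_errno = EINVAL;
        return NULL;
    }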

> >>> Callbacks for Rx/Tx don't make sense, since the idea is to bypass the CPU.
> >> If so, I think rte_eth_add_tx_callback() should be patched to return an
> >> error if the specified queue is hairpin. Same for Rx.
> >> Any other cases?
> >>
> > Same answer as above.
> 
> [snip]
> 
> Andrew.
  
Ray Kinsella Oct. 3, 2019, 6:39 p.m. UTC | #9
Hi

On 26/09/2019 13:18, Andrew Rybchenko wrote:
> On 9/26/19 9:28 AM, Ori Kam wrote:
>> This commit introduce the RX/TX hairpin setup function.
> 
> RX/TX should be Rx/Tx here and everywhere below.
> 
>> Hairpin is RX/TX queue that is used by the nic in order to offload
>> wire to wire traffic.
>>
>> Each hairpin queue is binded to one or more queues from other type.
>> For example TX hairpin queue should be binded to at least 1 RX hairpin
>> queue and vice versa.
> 
> How should application find out that hairpin queues are supported?

You might want to look at the patch "[dpdk-dev] [PATCH v2 0/4] get Rx/Tx
packet burst mode information" from Haiyue Wang, where he adds an
information bitmask to describe the capabilities of the PMD.
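
For instance, one option would be a device capability bit that applications
check before configuring hairpin queues (a sketch only; the flag name below is
hypothetical and not part of this patch):

    /* Hypothetical capability bit, alongside the existing
     * RTE_ETH_DEV_CAPA_RUNTIME_RX/TX_QUEUE_SETUP flags. */
    #define RTE_ETH_DEV_CAPA_HAIRPIN 0x00000004

    static int
    port_supports_hairpin(uint16_t port_id)
    {
        struct rte_eth_dev_info dev_info;

        rte_eth_dev_info_get(port_id, &dev_info);
        return !!(dev_info.dev_capa & RTE_ETH_DEV_CAPA_HAIRPIN);
    }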

Ray K

> How many?
> How should application find out which ports/queues could be used for
> pining?
> Is hair-pinning domain on device level sufficient to expose limitations?
> 
>> Signed-off-by: Ori Kam <orika@mellanox.com>
>> ---
>>   lib/librte_ethdev/rte_ethdev.c           | 213
>> +++++++++++++++++++++++++++++++
>>   lib/librte_ethdev/rte_ethdev.h           | 145 +++++++++++++++++++++
>>   lib/librte_ethdev/rte_ethdev_core.h      |  18 +++
>>   lib/librte_ethdev/rte_ethdev_version.map |   4 +
>>   4 files changed, 380 insertions(+)
>>
>> diff --git a/lib/librte_ethdev/rte_ethdev.c
>> b/lib/librte_ethdev/rte_ethdev.c
>> index 30b0c78..4021f38 100644
>> --- a/lib/librte_ethdev/rte_ethdev.c
>> +++ b/lib/librte_ethdev/rte_ethdev.c
>> @@ -1701,6 +1701,115 @@ struct rte_eth_dev *
>>   }
>>     int
>> +rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
>> +                   uint16_t nb_rx_desc, unsigned int socket_id,
>> +                   const struct rte_eth_rxconf *rx_conf,
>> +                   const struct rte_eth_hairpin_conf *hairpin_conf)
> 
> Below code duplicates rte_eth_rx_queue_setup() a lot and it is very
> bad from maintenance point of view. Similar problem with Tx hairpin
> queue setup.
> 
>> +{
>> +    int ret;
>> +    struct rte_eth_dev *dev;
>> +    struct rte_eth_dev_info dev_info;
>> +    struct rte_eth_rxconf local_conf;
>> +    void **rxq;
>> +
>> +    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>> +
>> +    dev = &rte_eth_devices[port_id];
>> +    if (rx_queue_id >= dev->data->nb_rx_queues) {
>> +        RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id);
>> +        return -EINVAL;
>> +    }
>> +
>> +    RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
>> +    RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_hairpin_queue_setup,
>> +                -ENOTSUP);
>> +
>> +    rte_eth_dev_info_get(port_id, &dev_info);
>> +
>> +    /* Use default specified by driver, if nb_rx_desc is zero */
>> +    if (nb_rx_desc == 0) {
>> +        nb_rx_desc = dev_info.default_rxportconf.ring_size;
>> +        /* If driver default is also zero, fall back on EAL default */
>> +        if (nb_rx_desc == 0)
>> +            nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
>> +    }
>> +
>> +    if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
>> +            nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
>> +            nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
>> +
>> +        RTE_ETHDEV_LOG(ERR,
>> +                   "Invalid value for nb_rx_desc(=%hu), should be: "
>> +                   "<= %hu, >= %hu, and a product of %hu\n",
>> +            nb_rx_desc, dev_info.rx_desc_lim.nb_max,
>> +            dev_info.rx_desc_lim.nb_min,
>> +            dev_info.rx_desc_lim.nb_align);
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (dev->data->dev_started &&
>> +        !(dev_info.dev_capa &
>> +            RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
>> +        return -EBUSY;
>> +
>> +    if (dev->data->dev_started &&
>> +        (dev->data->rx_queue_state[rx_queue_id] !=
>> +            RTE_ETH_QUEUE_STATE_STOPPED))
>> +        return -EBUSY;
>> +
>> +    rxq = dev->data->rx_queues;
>> +    if (rxq[rx_queue_id]) {
>> +        RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_release,
>> +                    -ENOTSUP);
>> +        (*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]);
>> +        rxq[rx_queue_id] = NULL;
>> +    }
>> +
>> +    if (rx_conf == NULL)
>> +        rx_conf = &dev_info.default_rxconf;
>> +
>> +    local_conf = *rx_conf;
>> +
>> +    /*
>> +     * If an offloading has already been enabled in
>> +     * rte_eth_dev_configure(), it has been enabled on all queues,
>> +     * so there is no need to enable it in this queue again.
>> +     * The local_conf.offloads input to underlying PMD only carries
>> +     * those offloadings which are only enabled on this queue and
>> +     * not enabled on all queues.
>> +     */
>> +    local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads;
>> +
>> +    /*
>> +     * New added offloadings for this queue are those not enabled in
>> +     * rte_eth_dev_configure() and they must be per-queue type.
>> +     * A pure per-port offloading can't be enabled on a queue while
>> +     * disabled on another queue. A pure per-port offloading can't
>> +     * be enabled for any queue as new added one if it hasn't been
>> +     * enabled in rte_eth_dev_configure().
>> +     */
>> +    if ((local_conf.offloads & dev_info.rx_queue_offload_capa) !=
>> +         local_conf.offloads) {
>> +        RTE_ETHDEV_LOG(ERR,
>> +            "Ethdev port_id=%d rx_queue_id=%d, "
>> +            "new added offloads 0x%"PRIx64" must be "
>> +            "within per-queue offload capabilities "
>> +            "0x%"PRIx64" in %s()\n",
>> +            port_id, rx_queue_id, local_conf.offloads,
>> +            dev_info.rx_queue_offload_capa,
>> +            __func__);
>> +        return -EINVAL;
>> +    }
>> +
>> +    ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev, rx_queue_id,
>> +                              nb_rx_desc, socket_id,
>> +                              &local_conf,
>> +                              hairpin_conf);
>> +
>> +    return eth_err(port_id, ret);
>> +}
>> +
>> +int
>>   rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
>>                  uint16_t nb_tx_desc, unsigned int socket_id,
>>                  const struct rte_eth_txconf *tx_conf)
>> @@ -1799,6 +1908,110 @@ struct rte_eth_dev *
>>                  tx_queue_id, nb_tx_desc, socket_id, &local_conf));
>>   }
>>   +int
>> +rte_eth_tx_hairpin_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
>> +                   uint16_t nb_tx_desc, unsigned int socket_id,
>> +                   const struct rte_eth_txconf *tx_conf,
>> +                   const struct rte_eth_hairpin_conf *hairpin_conf)
>> +{
>> +    struct rte_eth_dev *dev;
>> +    struct rte_eth_dev_info dev_info;
>> +    struct rte_eth_txconf local_conf;
>> +    void **txq;
>> +
>> +    RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>> +
>> +    dev = &rte_eth_devices[port_id];
>> +    if (tx_queue_id >= dev->data->nb_tx_queues) {
>> +        RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id);
>> +        return -EINVAL;
>> +    }
>> +
>> +    RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
>> +    RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_hairpin_queue_setup,
>> +                -ENOTSUP);
>> +
>> +    rte_eth_dev_info_get(port_id, &dev_info);
>> +
>> +    /* Use default specified by driver, if nb_tx_desc is zero */
>> +    if (nb_tx_desc == 0) {
>> +        nb_tx_desc = dev_info.default_txportconf.ring_size;
>> +        /* If driver default is zero, fall back on EAL default */
>> +        if (nb_tx_desc == 0)
>> +            nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
>> +    }
>> +    if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
>> +        nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
>> +        nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
>> +        RTE_ETHDEV_LOG(ERR,
>> +                   "Invalid value for nb_tx_desc(=%hu), "
>> +                   "should be: <= %hu, >= %hu, and a product of "
>> +                   " %hu\n",
>> +                   nb_tx_desc, dev_info.tx_desc_lim.nb_max,
>> +                   dev_info.tx_desc_lim.nb_min,
>> +                   dev_info.tx_desc_lim.nb_align);
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (dev->data->dev_started &&
>> +        !(dev_info.dev_capa &
>> +          RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP))
>> +        return -EBUSY;
>> +
>> +    if (dev->data->dev_started &&
>> +        (dev->data->tx_queue_state[tx_queue_id] !=
>> +         RTE_ETH_QUEUE_STATE_STOPPED))
>> +        return -EBUSY;
>> +
>> +    txq = dev->data->tx_queues;
>> +    if (txq[tx_queue_id]) {
>> +        RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_release,
>> +                    -ENOTSUP);
>> +        (*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]);
>> +        txq[tx_queue_id] = NULL;
>> +    }
>> +
>> +    if (tx_conf == NULL)
>> +        tx_conf = &dev_info.default_txconf;
>> +
>> +    local_conf = *tx_conf;
>> +
>> +    /*
>> +     * If an offloading has already been enabled in
>> +     * rte_eth_dev_configure(), it has been enabled on all queues,
>> +     * so there is no need to enable it in this queue again.
>> +     * The local_conf.offloads input to underlying PMD only carries
>> +     * those offloadings which are only enabled on this queue and
>> +     * not enabled on all queues.
>> +     */
>> +    local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads;
>> +
>> +    /*
>> +     * New added offloadings for this queue are those not enabled in
>> +     * rte_eth_dev_configure() and they must be per-queue type.
>> +     * A pure per-port offloading can't be enabled on a queue while
>> +     * disabled on another queue. A pure per-port offloading can't
>> +     * be enabled for any queue as new added one if it hasn't been
>> +     * enabled in rte_eth_dev_configure().
>> +     */
>> +    if ((local_conf.offloads & dev_info.tx_queue_offload_capa) !=
>> +         local_conf.offloads) {
>> +        RTE_ETHDEV_LOG(ERR,
>> +                   "Ethdev port_id=%d tx_queue_id=%d, new added "
>> +                   "offloads 0x%"PRIx64" must be within "
>> +                   "per-queue offload capabilities 0x%"PRIx64" "
>> +                   "in %s()\n",
>> +                   port_id, tx_queue_id, local_conf.offloads,
>> +                   dev_info.tx_queue_offload_capa,
>> +                   __func__);
>> +        return -EINVAL;
>> +    }
>> +
>> +    return eth_err(port_id, (*dev->dev_ops->tx_hairpin_queue_setup)
>> +               (dev, tx_queue_id, nb_tx_desc, socket_id, &local_conf,
>> +            hairpin_conf));
>> +}
>> +
>>   void
>>   rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t
>> unsent,
>>           void *userdata __rte_unused)
>> diff --git a/lib/librte_ethdev/rte_ethdev.h
>> b/lib/librte_ethdev/rte_ethdev.h
>> index 475dbda..b3b1597 100644
>> --- a/lib/librte_ethdev/rte_ethdev.h
>> +++ b/lib/librte_ethdev/rte_ethdev.h
>> @@ -803,6 +803,30 @@ struct rte_eth_txconf {
>>       uint64_t offloads;
>>   };
>>   +#define RTE_ETH_MAX_HAIRPIN_PEERS 32
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>> notice
>> + *
>> + * A structure used to hold hairpin peer data.
>> + */
>> +struct rte_eth_hairpin_peer {
>> +    uint16_t port; /**< Peer port. */
>> +    uint16_t queue; /**< Peer queue. */
>> +};
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>> notice
>> + *
>> + * A structure used to configure hairpin binding.
>> + */
>> +struct rte_eth_hairpin_conf {
>> +    uint16_t peer_n; /**< The number of peers. */
>> +    struct rte_eth_hairpin_peer peers[RTE_ETH_MAX_HAIRPIN_PEERS];
>> +};
>> +
>>   /**
>>    * A structure contains information about HW descriptor ring
>> limitations.
>>    */
>> @@ -1769,6 +1793,60 @@ int rte_eth_rx_queue_setup(uint16_t port_id,
>> uint16_t rx_queue_id,
>>           struct rte_mempool *mb_pool);
>>     /**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>> notice
>> + *
>> + * Allocate and set up a hairpin receive queue for an Ethernet device.
>> + *
>> + * The function set up the selected queue to be used in hairpin.
>> + *
>> + * @param port_id
>> + *   The port identifier of the Ethernet device.
>> + * @param rx_queue_id
>> + *   The index of the receive queue to set up.
>> + *   The value must be in the range [0, nb_rx_queue - 1] previously
>> supplied
>> + *   to rte_eth_dev_configure().
> 
> Is any Rx queue may be setup as hairpin queue?
> Can it be still used for regular traffic?
> 
>> + * @param nb_rx_desc
>> + *   The number of receive descriptors to allocate for the receive ring.
> 
> Does it still make sense for hairpin queue?
> 
>> + * @param socket_id
>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
>> + *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
>> + *   the DMA memory allocated for the receive descriptors of the ring.
> 
> Is it still required to be provided for hairpin Rx queue?
> 
>> + * @param rx_conf
>> + *   The pointer to the configuration data to be used for the receive
>> queue.
>> + *   NULL value is allowed, in which case default RX configuration
>> + *   will be used.
>> + *   The *rx_conf* structure contains an *rx_thresh* structure with
>> the values
>> + *   of the Prefetch, Host, and Write-Back threshold registers of the
>> receive
>> + *   ring.
>> + *   In addition it contains the hardware offloads features to
>> activate using
>> + *   the DEV_RX_OFFLOAD_* flags.
>> + *   If an offloading set in rx_conf->offloads
>> + *   hasn't been set in the input argument eth_conf->rxmode.offloads
>> + *   to rte_eth_dev_configure(), it is a new added offloading, it
>> must be
>> + *   per-queue type and it is enabled for the queue.
>> + *   No need to repeat any bit in rx_conf->offloads which has already
>> been
>> + *   enabled in rte_eth_dev_configure() at port level. An offloading
>> enabled
>> + *   at port level can't be disabled at queue level.
> 
> Which offloads still make sense in the case of hairpin Rx queue?
> What about thresholds, drop enable?
> 
>> + * @param hairpin_conf
>> + *   The pointer to the hairpin binding configuration.
>> + * @return
>> + *   - 0: Success, receive queue correctly set up.
>> + *   - -EINVAL: The size of network buffers which can be allocated
>> from the
>> + *      memory pool does not fit the various buffer sizes allowed by the
>> + *      device controller.
>> + *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
>> + *      allocate network memory buffers from the memory pool when
>> + *      initializing receive descriptors.
>> + */
>> +__rte_experimental
>> +int rte_eth_rx_hairpin_queue_setup
>> +    (uint16_t port_id, uint16_t rx_queue_id,
>> +     uint16_t nb_rx_desc, unsigned int socket_id,
>> +     const struct rte_eth_rxconf *rx_conf,
>> +     const struct rte_eth_hairpin_conf *hairpin_conf);
>> +
>> +/**
>>    * Allocate and set up a transmit queue for an Ethernet device.
>>    *
>>    * @param port_id
>> @@ -1821,6 +1899,73 @@ int rte_eth_tx_queue_setup(uint16_t port_id,
>> uint16_t tx_queue_id,
>>           const struct rte_eth_txconf *tx_conf);
>>     /**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change, or be removed, without prior
>> notice
>> + *
>> + * Allocate and set up a transmit hairpin queue for an Ethernet device.
>> + *
>> + * @param port_id
>> + *   The port identifier of the Ethernet device.
>> + * @param tx_queue_id
>> + *   The index of the transmit queue to set up.
>> + *   The value must be in the range [0, nb_tx_queue - 1] previously
>> supplied
>> + *   to rte_eth_dev_configure().
> 
> Is any Tx queue may be setup as hairpin queue?
> 
>> + * @param nb_tx_desc
>> + *   The number of transmit descriptors to allocate for the transmit
>> ring.
> 
> Is it really required for hairpin queue? Are min/max/align limits still
> the same?
> 
>> + * @param socket_id
>> + *   The *socket_id* argument is the socket identifier in case of NUMA.
>> + *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
>> + *   the DMA memory allocated for the transmit descriptors of the ring.
> 
> Does it still make sense for Tx hairpin queue?
> 
>> + * @param tx_conf
>> + *   The pointer to the configuration data to be used for the
>> transmit queue.
>> + *   NULL value is allowed, in which case default RX configuration
>> + *   will be used.
>> + *   The *tx_conf* structure contains the following data:
>> + *   - The *tx_thresh* structure with the values of the Prefetch,
>> Host, and
>> + *     Write-Back threshold registers of the transmit ring.
>> + *     When setting Write-Back threshold to the value greater then zero,
>> + *     *tx_rs_thresh* value should be explicitly set to one.
>> + *   - The *tx_free_thresh* value indicates the [minimum] number of
>> network
>> + *     buffers that must be pending in the transmit ring to trigger
>> their
>> + *     [implicit] freeing by the driver transmit function.
>> + *   - The *tx_rs_thresh* value indicates the [minimum] number of
>> transmit
>> + *     descriptors that must be pending in the transmit ring before
>> setting the
>> + *     RS bit on a descriptor by the driver transmit function.
>> + *     The *tx_rs_thresh* value should be less or equal then
>> + *     *tx_free_thresh* value, and both of them should be less then
>> + *     *nb_tx_desc* - 3.
> 
> I'm not sure that everything above makes sense for hairpin Tx queue.
> 
>> + *   - The *txq_flags* member contains flags to pass to the TX queue
>> setup
>> + *     function to configure the behavior of the TX queue. This
>> should be set
>> + *     to 0 if no special configuration is required.
>> + *     This API is obsolete and will be deprecated. Applications
>> + *     should set it to ETH_TXQ_FLAGS_IGNORE and use
>> + *     the offloads field below.
> 
> There is no txq_flags for a long time already. So, I'm wondering when it was
> copied from rte_eth_tx_queue_setup().
> 
>> + *   - The *offloads* member contains Tx offloads to be enabled.
>> + *     If an offloading set in tx_conf->offloads
>> + *     hasn't been set in the input argument eth_conf->txmode.offloads
>> + *     to rte_eth_dev_configure(), it is a new added offloading, it
>> must be
>> + *     per-queue type and it is enabled for the queue.
>> + *     No need to repeat any bit in tx_conf->offloads which has
>> already been
>> + *     enabled in rte_eth_dev_configure() at port level. An
>> offloading enabled
>> + *     at port level can't be disabled at queue level.
> 
> Which offloads really make sense and are valid to use for hairpin Tx queues?
> Do we need separate caps for hairpin offloads?
> 
>> + *
>> + *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to
>> 0 forces
>> + *     the transmit function to use default values.
>> + * @param hairpin_conf
>> + *   The hairpin binding configuration.
>> + *
>> + * @return
>> + *   - 0: Success, the transmit queue is correctly set up.
>> + *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
>> + */
>> +__rte_experimental
>> +int rte_eth_tx_hairpin_queue_setup
>> +    (uint16_t port_id, uint16_t tx_queue_id,
>> +     uint16_t nb_tx_desc, unsigned int socket_id,
>> +     const struct rte_eth_txconf *tx_conf,
>> +     const struct rte_eth_hairpin_conf *hairpin_conf);
>> +
>> +/**
>>    * Return the NUMA socket to which an Ethernet device is connected
>>    *
>>    * @param port_id
>>
> 
> [snip]
>
  

Patch

diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
index 30b0c78..4021f38 100644
--- a/lib/librte_ethdev/rte_ethdev.c
+++ b/lib/librte_ethdev/rte_ethdev.c
@@ -1701,6 +1701,115 @@  struct rte_eth_dev *
 }
 
 int
+rte_eth_rx_hairpin_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
+			       uint16_t nb_rx_desc, unsigned int socket_id,
+			       const struct rte_eth_rxconf *rx_conf,
+			       const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+	int ret;
+	struct rte_eth_dev *dev;
+	struct rte_eth_dev_info dev_info;
+	struct rte_eth_rxconf local_conf;
+	void **rxq;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+	dev = &rte_eth_devices[port_id];
+	if (rx_queue_id >= dev->data->nb_rx_queues) {
+		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", rx_queue_id);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_hairpin_queue_setup,
+				-ENOTSUP);
+
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	/* Use default specified by driver, if nb_rx_desc is zero */
+	if (nb_rx_desc == 0) {
+		nb_rx_desc = dev_info.default_rxportconf.ring_size;
+		/* If driver default is also zero, fall back on EAL default */
+		if (nb_rx_desc == 0)
+			nb_rx_desc = RTE_ETH_DEV_FALLBACK_RX_RINGSIZE;
+	}
+
+	if (nb_rx_desc > dev_info.rx_desc_lim.nb_max ||
+			nb_rx_desc < dev_info.rx_desc_lim.nb_min ||
+			nb_rx_desc % dev_info.rx_desc_lim.nb_align != 0) {
+
+		RTE_ETHDEV_LOG(ERR,
+			       "Invalid value for nb_rx_desc(=%hu), should be: "
+			       "<= %hu, >= %hu, and a product of %hu\n",
+			nb_rx_desc, dev_info.rx_desc_lim.nb_max,
+			dev_info.rx_desc_lim.nb_min,
+			dev_info.rx_desc_lim.nb_align);
+		return -EINVAL;
+	}
+
+	if (dev->data->dev_started &&
+		!(dev_info.dev_capa &
+			RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
+		return -EBUSY;
+
+	if (dev->data->dev_started &&
+		(dev->data->rx_queue_state[rx_queue_id] !=
+			RTE_ETH_QUEUE_STATE_STOPPED))
+		return -EBUSY;
+
+	rxq = dev->data->rx_queues;
+	if (rxq[rx_queue_id]) {
+		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_release,
+					-ENOTSUP);
+		(*dev->dev_ops->rx_queue_release)(rxq[rx_queue_id]);
+		rxq[rx_queue_id] = NULL;
+	}
+
+	if (rx_conf == NULL)
+		rx_conf = &dev_info.default_rxconf;
+
+	local_conf = *rx_conf;
+
+	/*
+	 * If an offloading has already been enabled in
+	 * rte_eth_dev_configure(), it has been enabled on all queues,
+	 * so there is no need to enable it in this queue again.
+	 * The local_conf.offloads input to underlying PMD only carries
+	 * those offloadings which are only enabled on this queue and
+	 * not enabled on all queues.
+	 */
+	local_conf.offloads &= ~dev->data->dev_conf.rxmode.offloads;
+
+	/*
+	 * New added offloadings for this queue are those not enabled in
+	 * rte_eth_dev_configure() and they must be per-queue type.
+	 * A pure per-port offloading can't be enabled on a queue while
+	 * disabled on another queue. A pure per-port offloading can't
+	 * be enabled for any queue as new added one if it hasn't been
+	 * enabled in rte_eth_dev_configure().
+	 */
+	if ((local_conf.offloads & dev_info.rx_queue_offload_capa) !=
+	     local_conf.offloads) {
+		RTE_ETHDEV_LOG(ERR,
+			"Ethdev port_id=%d rx_queue_id=%d, "
+			"new added offloads 0x%"PRIx64" must be "
+			"within per-queue offload capabilities "
+			"0x%"PRIx64" in %s()\n",
+			port_id, rx_queue_id, local_conf.offloads,
+			dev_info.rx_queue_offload_capa,
+			__func__);
+		return -EINVAL;
+	}
+
+	ret = (*dev->dev_ops->rx_hairpin_queue_setup)(dev, rx_queue_id,
+						      nb_rx_desc, socket_id,
+						      &local_conf,
+						      hairpin_conf);
+
+	return eth_err(port_id, ret);
+}
+
+int
 rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
 		       uint16_t nb_tx_desc, unsigned int socket_id,
 		       const struct rte_eth_txconf *tx_conf)
@@ -1799,6 +1908,110 @@  struct rte_eth_dev *
 		       tx_queue_id, nb_tx_desc, socket_id, &local_conf));
 }
 
+int
+rte_eth_tx_hairpin_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
+			       uint16_t nb_tx_desc, unsigned int socket_id,
+			       const struct rte_eth_txconf *tx_conf,
+			       const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+	struct rte_eth_dev *dev;
+	struct rte_eth_dev_info dev_info;
+	struct rte_eth_txconf local_conf;
+	void **txq;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+	dev = &rte_eth_devices[port_id];
+	if (tx_queue_id >= dev->data->nb_tx_queues) {
+		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", tx_queue_id);
+		return -EINVAL;
+	}
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_hairpin_queue_setup,
+				-ENOTSUP);
+
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	/* Use default specified by driver, if nb_tx_desc is zero */
+	if (nb_tx_desc == 0) {
+		nb_tx_desc = dev_info.default_txportconf.ring_size;
+		/* If driver default is zero, fall back on EAL default */
+		if (nb_tx_desc == 0)
+			nb_tx_desc = RTE_ETH_DEV_FALLBACK_TX_RINGSIZE;
+	}
+	if (nb_tx_desc > dev_info.tx_desc_lim.nb_max ||
+	    nb_tx_desc < dev_info.tx_desc_lim.nb_min ||
+	    nb_tx_desc % dev_info.tx_desc_lim.nb_align != 0) {
+		RTE_ETHDEV_LOG(ERR,
+			       "Invalid value for nb_tx_desc(=%hu), "
+			       "should be: <= %hu, >= %hu, and a product of "
+			       " %hu\n",
+			       nb_tx_desc, dev_info.tx_desc_lim.nb_max,
+			       dev_info.tx_desc_lim.nb_min,
+			       dev_info.tx_desc_lim.nb_align);
+		return -EINVAL;
+	}
+
+	if (dev->data->dev_started &&
+		!(dev_info.dev_capa &
+		  RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP))
+		return -EBUSY;
+
+	if (dev->data->dev_started &&
+		(dev->data->tx_queue_state[tx_queue_id] !=
+		 RTE_ETH_QUEUE_STATE_STOPPED))
+		return -EBUSY;
+
+	txq = dev->data->tx_queues;
+	if (txq[tx_queue_id]) {
+		RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_release,
+					-ENOTSUP);
+		(*dev->dev_ops->tx_queue_release)(txq[tx_queue_id]);
+		txq[tx_queue_id] = NULL;
+	}
+
+	if (tx_conf == NULL)
+		tx_conf = &dev_info.default_txconf;
+
+	local_conf = *tx_conf;
+
+	/*
+	 * If an offloading has already been enabled in
+	 * rte_eth_dev_configure(), it has been enabled on all queues,
+	 * so there is no need to enable it in this queue again.
+	 * The local_conf.offloads input to underlying PMD only carries
+	 * those offloadings which are only enabled on this queue and
+	 * not enabled on all queues.
+	 */
+	local_conf.offloads &= ~dev->data->dev_conf.txmode.offloads;
+
+	/*
+	 * New added offloadings for this queue are those not enabled in
+	 * rte_eth_dev_configure() and they must be per-queue type.
+	 * A pure per-port offloading can't be enabled on a queue while
+	 * disabled on another queue. A pure per-port offloading can't
+	 * be enabled for any queue as new added one if it hasn't been
+	 * enabled in rte_eth_dev_configure().
+	 */
+	if ((local_conf.offloads & dev_info.tx_queue_offload_capa) !=
+	     local_conf.offloads) {
+		RTE_ETHDEV_LOG(ERR,
+			       "Ethdev port_id=%d tx_queue_id=%d, new added "
+			       "offloads 0x%"PRIx64" must be within "
+			       "per-queue offload capabilities 0x%"PRIx64" "
+			       "in %s()\n",
+			       port_id, tx_queue_id, local_conf.offloads,
+			       dev_info.tx_queue_offload_capa,
+			       __func__);
+		return -EINVAL;
+	}
+
+	return eth_err(port_id, (*dev->dev_ops->tx_hairpin_queue_setup)
+		       (dev, tx_queue_id, nb_tx_desc, socket_id, &local_conf,
+			hairpin_conf));
+}
+
 void
 rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t unsent,
 		void *userdata __rte_unused)
diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
index 475dbda..b3b1597 100644
--- a/lib/librte_ethdev/rte_ethdev.h
+++ b/lib/librte_ethdev/rte_ethdev.h
@@ -803,6 +803,30 @@  struct rte_eth_txconf {
 	uint64_t offloads;
 };
 
+#define RTE_ETH_MAX_HAIRPIN_PEERS 32
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * A structure used to hold hairpin peer data.
+ */
+struct rte_eth_hairpin_peer {
+	uint16_t port; /**< Peer port. */
+	uint16_t queue; /**< Peer queue. */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * A structure used to configure hairpin binding.
+ */
+struct rte_eth_hairpin_conf {
+	uint16_t peer_n; /**< The number of peers. */
+	struct rte_eth_hairpin_peer peers[RTE_ETH_MAX_HAIRPIN_PEERS];
+};
+
 /**
  * A structure contains information about HW descriptor ring limitations.
  */
@@ -1769,6 +1793,60 @@  int rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 		struct rte_mempool *mb_pool);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Allocate and set up a hairpin receive queue for an Ethernet device.
+ *
+ * The function sets up the selected queue to be used in hairpin.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param rx_queue_id
+ *   The index of the receive queue to set up.
+ *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param nb_rx_desc
+ *   The number of receive descriptors to allocate for the receive ring.
+ * @param socket_id
+ *   The *socket_id* argument is the socket identifier in case of NUMA.
+ *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
+ *   the DMA memory allocated for the receive descriptors of the ring.
+ * @param rx_conf
+ *   The pointer to the configuration data to be used for the receive queue.
+ *   NULL value is allowed, in which case default RX configuration
+ *   will be used.
+ *   The *rx_conf* structure contains an *rx_thresh* structure with the values
+ *   of the Prefetch, Host, and Write-Back threshold registers of the receive
+ *   ring.
+ *   In addition it contains the hardware offloads features to activate using
+ *   the DEV_RX_OFFLOAD_* flags.
+ *   If an offloading set in rx_conf->offloads
+ *   hasn't been set in the input argument eth_conf->rxmode.offloads
+ *   to rte_eth_dev_configure(), it is a new added offloading, it must be
+ *   per-queue type and it is enabled for the queue.
+ *   No need to repeat any bit in rx_conf->offloads which has already been
+ *   enabled in rte_eth_dev_configure() at port level. An offloading enabled
+ *   at port level can't be disabled at queue level.
+ * @param hairpin_conf
+ *   The pointer to the hairpin binding configuration.
+ * @return
+ *   - 0: Success, receive queue correctly set up.
+ *   - -EINVAL: The size of network buffers which can be allocated from the
+ *      memory pool does not fit the various buffer sizes allowed by the
+ *      device controller.
+ *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
+ *      allocate network memory buffers from the memory pool when
+ *      initializing receive descriptors.
+ */
+__rte_experimental
+int rte_eth_rx_hairpin_queue_setup
+	(uint16_t port_id, uint16_t rx_queue_id,
+	 uint16_t nb_rx_desc, unsigned int socket_id,
+	 const struct rte_eth_rxconf *rx_conf,
+	 const struct rte_eth_hairpin_conf *hairpin_conf);
+
+/**
  * Allocate and set up a transmit queue for an Ethernet device.
  *
  * @param port_id
@@ -1821,6 +1899,73 @@  int rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
 		const struct rte_eth_txconf *tx_conf);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Allocate and set up a transmit hairpin queue for an Ethernet device.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param tx_queue_id
+ *   The index of the transmit queue to set up.
+ *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param nb_tx_desc
+ *   The number of transmit descriptors to allocate for the transmit ring.
+ * @param socket_id
+ *   The *socket_id* argument is the socket identifier in case of NUMA.
+ *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
+ *   the DMA memory allocated for the transmit descriptors of the ring.
+ * @param tx_conf
+ *   The pointer to the configuration data to be used for the transmit queue.
+ *   NULL value is allowed, in which case default RX configuration
+ *   will be used.
+ *   The *tx_conf* structure contains the following data:
+ *   - The *tx_thresh* structure with the values of the Prefetch, Host, and
+ *     Write-Back threshold registers of the transmit ring.
+ *     When setting Write-Back threshold to a value greater than zero,
+ *     *tx_rs_thresh* value should be explicitly set to one.
+ *   - The *tx_free_thresh* value indicates the [minimum] number of network
+ *     buffers that must be pending in the transmit ring to trigger their
+ *     [implicit] freeing by the driver transmit function.
+ *   - The *tx_rs_thresh* value indicates the [minimum] number of transmit
+ *     descriptors that must be pending in the transmit ring before setting the
+ *     RS bit on a descriptor by the driver transmit function.
+ *     The *tx_rs_thresh* value should be less than or equal to the
+ *     *tx_free_thresh* value, and both of them should be less than
+ *     *nb_tx_desc* - 3.
+ *   - The *txq_flags* member contains flags to pass to the TX queue setup
+ *     function to configure the behavior of the TX queue. This should be set
+ *     to 0 if no special configuration is required.
+ *     This API is obsolete and will be deprecated. Applications
+ *     should set it to ETH_TXQ_FLAGS_IGNORE and use
+ *     the offloads field below.
+ *   - The *offloads* member contains Tx offloads to be enabled.
+ *     If an offloading set in tx_conf->offloads
+ *     hasn't been set in the input argument eth_conf->txmode.offloads
+ *     to rte_eth_dev_configure(), it is a new added offloading, it must be
+ *     per-queue type and it is enabled for the queue.
+ *     No need to repeat any bit in tx_conf->offloads which has already been
+ *     enabled in rte_eth_dev_configure() at port level. An offloading enabled
+ *     at port level can't be disabled at queue level.
+ *
+ *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0 forces
+ *     the transmit function to use default values.
+ * @param hairpin_conf
+ *   The hairpin binding configuration.
+ *
+ * @return
+ *   - 0: Success, the transmit queue is correctly set up.
+ *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
+ */
+__rte_experimental
+int rte_eth_tx_hairpin_queue_setup
+	(uint16_t port_id, uint16_t tx_queue_id,
+	 uint16_t nb_tx_desc, unsigned int socket_id,
+	 const struct rte_eth_txconf *tx_conf,
+	 const struct rte_eth_hairpin_conf *hairpin_conf);
+
+/**
  * Return the NUMA socket to which an Ethernet device is connected
  *
  * @param port_id
diff --git a/lib/librte_ethdev/rte_ethdev_core.h b/lib/librte_ethdev/rte_ethdev_core.h
index 2394b32..bc40708 100644
--- a/lib/librte_ethdev/rte_ethdev_core.h
+++ b/lib/librte_ethdev/rte_ethdev_core.h
@@ -126,6 +126,13 @@  typedef int (*eth_rx_queue_setup_t)(struct rte_eth_dev *dev,
 				    struct rte_mempool *mb_pool);
 /**< @internal Set up a receive queue of an Ethernet device. */
 
+typedef int (*eth_rx_hairpin_queue_setup_t)
+	(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+	 uint16_t nb_rx_desc, unsigned int socket_id,
+	 const struct rte_eth_rxconf *rx_conf,
+	 const struct rte_eth_hairpin_conf *hairpin_conf);
+/**< @internal Set up a receive hairpin queue of an Ethernet device. */
+
 typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    uint16_t tx_queue_id,
 				    uint16_t nb_tx_desc,
@@ -133,6 +140,13 @@  typedef int (*eth_tx_queue_setup_t)(struct rte_eth_dev *dev,
 				    const struct rte_eth_txconf *tx_conf);
 /**< @internal Setup a transmit queue of an Ethernet device. */
 
+typedef int (*eth_tx_hairpin_queue_setup_t)
+	(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+	 uint16_t nb_tx_desc, unsigned int socket_id,
+	 const struct rte_eth_txconf *tx_conf,
+	 const struct rte_eth_hairpin_conf *hairpin_conf);
+/**< @internal Setup a transmit hairpin queue of an Ethernet device. */
+
 typedef int (*eth_rx_enable_intr_t)(struct rte_eth_dev *dev,
 				    uint16_t rx_queue_id);
 /**< @internal Enable interrupt of a receive queue of an Ethernet device. */
@@ -433,6 +447,8 @@  struct eth_dev_ops {
 	eth_queue_start_t          tx_queue_start;/**< Start TX for a queue. */
 	eth_queue_stop_t           tx_queue_stop; /**< Stop TX for a queue. */
 	eth_rx_queue_setup_t       rx_queue_setup;/**< Set up device RX queue. */
+	eth_rx_hairpin_queue_setup_t rx_hairpin_queue_setup;
+	/**< Set up device RX hairpin queue. */
 	eth_queue_release_t        rx_queue_release; /**< Release RX queue. */
 	eth_rx_queue_count_t       rx_queue_count;
 	/**< Get the number of used RX descriptors. */
@@ -444,6 +460,8 @@  struct eth_dev_ops {
 	eth_rx_enable_intr_t       rx_queue_intr_enable;  /**< Enable Rx queue interrupt. */
 	eth_rx_disable_intr_t      rx_queue_intr_disable; /**< Disable Rx queue interrupt. */
 	eth_tx_queue_setup_t       tx_queue_setup;/**< Set up device TX queue. */
+	eth_tx_hairpin_queue_setup_t tx_hairpin_queue_setup;
+	/**< Set up device TX hairpin queue. */
 	eth_queue_release_t        tx_queue_release; /**< Release TX queue. */
 	eth_tx_done_cleanup_t      tx_done_cleanup;/**< Free tx ring mbufs */
 
diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map
index 6df42a4..99e05fe 100644
--- a/lib/librte_ethdev/rte_ethdev_version.map
+++ b/lib/librte_ethdev/rte_ethdev_version.map
@@ -283,4 +283,8 @@  EXPERIMENTAL {
 
 	# added in 19.08
 	rte_eth_read_clock;
+
+	# added in 19.11
+	rte_eth_rx_hairpin_queue_setup;
+	rte_eth_tx_hairpin_queue_setup;
 };
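
For reference, a minimal application-side sketch of how the two setup functions
added by this patch would be used to bind one Rx hairpin queue to one Tx
hairpin queue. The queue indices and ring size are arbitrary, and the sketch
assumes rte_eth_dev_configure() has already reserved enough Rx/Tx queues:

    #include <rte_ethdev.h>
    #include <rte_memory.h>

    /* Bind Rx hairpin queue 'rxq' to Tx hairpin queue 'txq' on 'port_id'. */
    static int
    setup_hairpin_pair(uint16_t port_id, uint16_t rxq, uint16_t txq)
    {
        struct rte_eth_hairpin_conf conf = { .peer_n = 1 };
        int ret;

        /* The Rx hairpin queue names the Tx hairpin queue as its peer. */
        conf.peers[0].port = port_id;
        conf.peers[0].queue = txq;
        ret = rte_eth_rx_hairpin_queue_setup(port_id, rxq, 512, SOCKET_ID_ANY,
                                             NULL, &conf);
        if (ret != 0)
            return ret;

        /* The Tx hairpin queue peers back with the Rx hairpin queue. */
        conf.peers[0].queue = rxq;
        return rte_eth_tx_hairpin_queue_setup(port_id, txq, 512, SOCKET_ID_ANY,
                                              NULL, &conf);
    }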