diff mbox series

[v5,4/7] ethdev: copy fast-path API into separate structure

Message ID 20211007112750.25526-5-konstantin.ananyev@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers show
Series hide eth dev related structures | expand

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Ananyev, Konstantin Oct. 7, 2021, 11:27 a.m. UTC
Copy public function pointers (rx_pkt_burst(), etc.) and related
pointers to internal data from rte_eth_dev structure into a
separate flat array. That array will remain in a public header.
The intention here is to make rte_eth_dev and related structures internal.
That should allow future possible changes to core eth_dev structures
to be transparent to the user and help to avoid ABI/API breakages.
The plan is to keep minimal part of data from rte_eth_dev public,
so we still can use inline functions for fast-path calls
(like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
The whole idea beyond this new schema:
1. PMDs keep to setup fast-path function pointers and related data
   inside rte_eth_dev struct in the same way they did it before.
2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
   (for secondary process) we call eth_dev_fp_ops_setup, which
   copies these function and data pointers into rte_eth_fp_ops[port_id].
3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
   we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
   into some dummy values.
4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
   flat array to call PMD specific functions.
That approach should allow us to make rte_eth_devices[] private
without introducing regression and help to avoid changes in drivers code.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
 lib/ethdev/ethdev_private.h  |  7 +++++
 lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
 lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
 4 files changed, 141 insertions(+)

Comments

fengchengwen Oct. 9, 2021, 12:05 p.m. UTC | #1
On 2021/10/7 19:27, Konstantin Ananyev wrote:
> Copy public function pointers (rx_pkt_burst(), etc.) and related
> pointers to internal data from rte_eth_dev structure into a
> separate flat array. That array will remain in a public header.
> The intention here is to make rte_eth_dev and related structures internal.
> That should allow future possible changes to core eth_dev structures
> to be transparent to the user and help to avoid ABI/API breakages.
> The plan is to keep minimal part of data from rte_eth_dev public,
> so we still can use inline functions for fast-path calls
> (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
> The whole idea beyond this new schema:
> 1. PMDs keep to setup fast-path function pointers and related data
>    inside rte_eth_dev struct in the same way they did it before.
> 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
>    (for secondary process) we call eth_dev_fp_ops_setup, which
>    copies these function and data pointers into rte_eth_fp_ops[port_id].
> 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
>    we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
>    into some dummy values.
> 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
>    flat array to call PMD specific functions.
> That approach should allow us to make rte_eth_devices[] private
> without introducing regression and help to avoid changes in drivers code.
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> ---
>  lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
>  lib/ethdev/ethdev_private.h  |  7 +++++
>  lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
>  lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 141 insertions(+)
> 
> diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
> index 012cf73ca2..3eeda6e9f9 100644
> --- a/lib/ethdev/ethdev_private.c
> +++ b/lib/ethdev/ethdev_private.c
> @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
>  		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
>  	return str == NULL ? -1 : 0;
>  }
> +
> +static uint16_t
> +dummy_eth_rx_burst(__rte_unused void *rxq,
> +		__rte_unused struct rte_mbuf **rx_pkts,
> +		__rte_unused uint16_t nb_pkts)
> +{
> +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
> +	rte_errno = ENOTSUP;
> +	return 0;
> +}
> +
> +static uint16_t
> +dummy_eth_tx_burst(__rte_unused void *txq,
> +		__rte_unused struct rte_mbuf **tx_pkts,
> +		__rte_unused uint16_t nb_pkts)
> +{
> +	RTE_ETHDEV_LOG(ERR, "tx_pkt_burst for unconfigured port\n");
> +	rte_errno = ENOTSUP;
> +	return 0;
> +}
> +
> +void
> +eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo)

The port_id parameter is preferable, this will hide rte_eth_fp_ops as much as possible.

> +{
> +	static void *dummy_data[RTE_MAX_QUEUES_PER_PORT];
> +	static const struct rte_eth_fp_ops dummy_ops = {
> +		.rx_pkt_burst = dummy_eth_rx_burst,
> +		.tx_pkt_burst = dummy_eth_tx_burst,
> +		.rxq = {.data = dummy_data, .clbk = dummy_data,},
> +		.txq = {.data = dummy_data, .clbk = dummy_data,},
> +	};
> +
> +	*fpo = dummy_ops;
> +}
> +
> +void
> +eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
> +		const struct rte_eth_dev *dev)

Because fp_ops and eth_dev is a one-to-one correspondence. It's better only use
port_id parameter.

> +{
> +	fpo->rx_pkt_burst = dev->rx_pkt_burst;
> +	fpo->tx_pkt_burst = dev->tx_pkt_burst;
> +	fpo->tx_pkt_prepare = dev->tx_pkt_prepare;
> +	fpo->rx_queue_count = dev->rx_queue_count;
> +	fpo->rx_descriptor_status = dev->rx_descriptor_status;
> +	fpo->tx_descriptor_status = dev->tx_descriptor_status;
> +
> +	fpo->rxq.data = dev->data->rx_queues;
> +	fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs;
> +
> +	fpo->txq.data = dev->data->tx_queues;
> +	fpo->txq.clbk = (void **)(uintptr_t)dev->pre_tx_burst_cbs;
> +}
> diff --git a/lib/ethdev/ethdev_private.h b/lib/ethdev/ethdev_private.h
> index 3724429577..5721be7bdc 100644
> --- a/lib/ethdev/ethdev_private.h
> +++ b/lib/ethdev/ethdev_private.h
> @@ -26,4 +26,11 @@ eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
>  /* Parse devargs value for representor parameter. */
>  int rte_eth_devargs_parse_representor_ports(char *str, void *data);
>  
> +/* reset eth fast-path API to dummy values */
> +void eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo);
> +
> +/* setup eth fast-path API to ethdev values */
> +void eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
> +		const struct rte_eth_dev *dev);

Some drivers control the transmit/receive function during operation. E.g.
for hns3 driver, when detect reset, primary process will set rx/tx burst to dummy, after
process reset, primary process will set the correct rx/tx burst. During this process, the
send and receive threads are still working, but the bursts they call are changed. So:
1. it is recommended that trace be deleted from the dummy function.
2. public the eth_dev_fp_ops_reset/setup interface for driver usage.

> +
>  #endif /* _ETH_PRIVATE_H_ */
> diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
> index c8abda6dd7..9f7a0cbb8c 100644
> --- a/lib/ethdev/rte_ethdev.c
> +++ b/lib/ethdev/rte_ethdev.c
> @@ -44,6 +44,9 @@
>  static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data";
>  struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];
>  
> +/* public fast-path API */
> +struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
> +
>  /* spinlock for eth device callbacks */
>  static rte_spinlock_t eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
>  
> @@ -578,6 +581,8 @@ rte_eth_dev_release_port(struct rte_eth_dev *eth_dev)
>  		rte_eth_dev_callback_process(eth_dev,
>  				RTE_ETH_EVENT_DESTROY, NULL);
>  
> +	eth_dev_fp_ops_reset(rte_eth_fp_ops + eth_dev->data->port_id);
> +
>  	rte_spinlock_lock(&eth_dev_shared_data->ownership_lock);
>  
>  	eth_dev->state = RTE_ETH_DEV_UNUSED;
> @@ -1787,6 +1792,9 @@ rte_eth_dev_start(uint16_t port_id)
>  		(*dev->dev_ops->link_update)(dev, 0);
>  	}
>  
> +	/* expose selection of PMD fast-path functions */
> +	eth_dev_fp_ops_setup(rte_eth_fp_ops + port_id, dev);
> +
>  	rte_ethdev_trace_start(port_id);
>  	return 0;
>  }
> @@ -1809,6 +1817,9 @@ rte_eth_dev_stop(uint16_t port_id)
>  		return 0;
>  	}
>  
> +	/* point fast-path functions to dummy ones */
> +	eth_dev_fp_ops_reset(rte_eth_fp_ops + port_id);
> +
>  	dev->data->dev_started = 0;
>  	ret = (*dev->dev_ops->dev_stop)(dev);
>  	rte_ethdev_trace_stop(port_id, ret);
> @@ -4567,6 +4578,14 @@ rte_eth_mirror_rule_reset(uint16_t port_id, uint8_t rule_id)
>  	return eth_err(port_id, (*dev->dev_ops->mirror_rule_reset)(dev, rule_id));
>  }
>  
> +RTE_INIT(eth_dev_init_fp_ops)
> +{
> +	uint32_t i;
> +
> +	for (i = 0; i != RTE_DIM(rte_eth_fp_ops); i++)
> +		eth_dev_fp_ops_reset(rte_eth_fp_ops + i);
> +}
> +
>  RTE_INIT(eth_dev_init_cb_lists)
>  {
>  	uint16_t i;
> @@ -4735,6 +4754,14 @@ rte_eth_dev_probing_finish(struct rte_eth_dev *dev)
>  	if (dev == NULL)
>  		return;
>  
> +	/*
> +	 * for secondary process, at that point we expect device
> +	 * to be already 'usable', so shared data and all function pointers
> +	 * for fast-path devops have to be setup properly inside rte_eth_dev.
> +	 */
> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
> +		eth_dev_fp_ops_setup(rte_eth_fp_ops + dev->data->port_id, dev);
> +
>  	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_NEW, NULL);
>  
>  	dev->state = RTE_ETH_DEV_ATTACHED;
> diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
> index 51cd68de94..d5853dff86 100644
> --- a/lib/ethdev/rte_ethdev_core.h
> +++ b/lib/ethdev/rte_ethdev_core.h
> @@ -50,6 +50,61 @@ typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset);
>  typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset);
>  /**< @internal Check the status of a Tx descriptor */
>  
> +/**
> + * @internal
> + * Structure used to hold opaque pointers to internal ethdev Rx/Tx
> + * queues data.
> + * The main purpose to expose these pointers at all - allow compiler
> + * to fetch this data for fast-path ethdev inline functions in advance.
> + */
> +struct rte_ethdev_qdata {
> +	void **data;
> +	/**< points to array of internal queue data pointers */
> +	void **clbk;
> +	/**< points to array of queue callback data pointers */
> +};
> +
> +/**
> + * @internal
> + * fast-path ethdev functions and related data are hold in a flat array.
> + * One entry per ethdev.
> + * On 64-bit systems contents of this structure occupy exactly two 64B lines.
> + * On 32-bit systems contents of this structure fits into one 64B line.
> + */
> +struct rte_eth_fp_ops {
> +
> +	/**
> +	 * Rx fast-path functions and related data.
> +	 * 64-bit systems: occupies first 64B line
> +	 */
> +	eth_rx_burst_t rx_pkt_burst;
> +	/**< PMD receive function. */
> +	eth_rx_queue_count_t rx_queue_count;
> +	/**< Get the number of used RX descriptors. */
> +	eth_rx_descriptor_status_t rx_descriptor_status;
> +	/**< Check the status of a Rx descriptor. */
> +	struct rte_ethdev_qdata rxq;
> +	/**< Rx queues data. */
> +	uintptr_t reserved1[3];
> +
> +	/**
> +	 * Tx fast-path functions and related data.
> +	 * 64-bit systems: occupies second 64B line
> +	 */
> +	eth_tx_burst_t tx_pkt_burst;

Why not place rx_pkt_burst/tx_pkt_burst/rxq /txq to the first cacheline ?
Other function, e.g. rx_queue_count/descriptor_status are low frequency call functions.

> +	/**< PMD transmit function. */
> +	eth_tx_prep_t tx_pkt_prepare;
> +	/**< PMD transmit prepare function. */
> +	eth_tx_descriptor_status_t tx_descriptor_status;
> +	/**< Check the status of a Tx descriptor. */
> +	struct rte_ethdev_qdata txq;
> +	/**< Tx queues data. */
> +	uintptr_t reserved2[3];
> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
> +
>  
>  /**
>   * @internal
>
fengchengwen Oct. 11, 2021, 1:18 a.m. UTC | #2
Sorry to self-reply.

I think it's better the 'struct rte_eth_dev *dev' hold a pointer to the
'struct rte_eth_fp_ops', e.g.

	struct rte_eth_dev {
		struct rte_eth_fp_ops *fp_ops;
		...  // other field
	}

The eth framework set the pointer in the rte_eth_dev_pci_allocate(), and driver fill
corresponding callback:
	dev->fp_ops->rx_pkt_burst = xxx_recv_pkts;
	dev->fp_ops->tx_pkt_burst = xxx_xmit_pkts;
	...

In this way, the behavior of the primary and secondary processes can be unified, which
is basically the same as that of the original process.


On 2021/10/9 20:05, fengchengwen wrote:
> On 2021/10/7 19:27, Konstantin Ananyev wrote:
>> Copy public function pointers (rx_pkt_burst(), etc.) and related
>> pointers to internal data from rte_eth_dev structure into a
>> separate flat array. That array will remain in a public header.
>> The intention here is to make rte_eth_dev and related structures internal.
>> That should allow future possible changes to core eth_dev structures
>> to be transparent to the user and help to avoid ABI/API breakages.
>> The plan is to keep minimal part of data from rte_eth_dev public,
>> so we still can use inline functions for fast-path calls
>> (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
>> The whole idea beyond this new schema:
>> 1. PMDs keep to setup fast-path function pointers and related data
>>    inside rte_eth_dev struct in the same way they did it before.
>> 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
>>    (for secondary process) we call eth_dev_fp_ops_setup, which
>>    copies these function and data pointers into rte_eth_fp_ops[port_id].
>> 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
>>    we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
>>    into some dummy values.
>> 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
>>    flat array to call PMD specific functions.
>> That approach should allow us to make rte_eth_devices[] private
>> without introducing regression and help to avoid changes in drivers code.
>>
>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>> ---
>>  lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
>>  lib/ethdev/ethdev_private.h  |  7 +++++
>>  lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
>>  lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
>>  4 files changed, 141 insertions(+)
>>
>> diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
>> index 012cf73ca2..3eeda6e9f9 100644
>> --- a/lib/ethdev/ethdev_private.c
>> +++ b/lib/ethdev/ethdev_private.c
>> @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
>>  		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
>>  	return str == NULL ? -1 : 0;
>>  }
>> +
>> +static uint16_t
>> +dummy_eth_rx_burst(__rte_unused void *rxq,
>> +		__rte_unused struct rte_mbuf **rx_pkts,
>> +		__rte_unused uint16_t nb_pkts)
>> +{
>> +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
>> +	rte_errno = ENOTSUP;
>> +	return 0;
>> +}
>> +
>> +static uint16_t
>> +dummy_eth_tx_burst(__rte_unused void *txq,
>> +		__rte_unused struct rte_mbuf **tx_pkts,
>> +		__rte_unused uint16_t nb_pkts)
>> +{
>> +	RTE_ETHDEV_LOG(ERR, "tx_pkt_burst for unconfigured port\n");
>> +	rte_errno = ENOTSUP;
>> +	return 0;
>> +}
>> +
>> +void
>> +eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo)
> 
> The port_id parameter is preferable, this will hide rte_eth_fp_ops as much as possible.
> 
>> +{
>> +	static void *dummy_data[RTE_MAX_QUEUES_PER_PORT];
>> +	static const struct rte_eth_fp_ops dummy_ops = {
>> +		.rx_pkt_burst = dummy_eth_rx_burst,
>> +		.tx_pkt_burst = dummy_eth_tx_burst,
>> +		.rxq = {.data = dummy_data, .clbk = dummy_data,},
>> +		.txq = {.data = dummy_data, .clbk = dummy_data,},
>> +	};
>> +
>> +	*fpo = dummy_ops;
>> +}
>> +
>> +void
>> +eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
>> +		const struct rte_eth_dev *dev)
> 
> Because fp_ops and eth_dev is a one-to-one correspondence. It's better only use
> port_id parameter.
> 
>> +{
>> +	fpo->rx_pkt_burst = dev->rx_pkt_burst;
>> +	fpo->tx_pkt_burst = dev->tx_pkt_burst;
>> +	fpo->tx_pkt_prepare = dev->tx_pkt_prepare;
>> +	fpo->rx_queue_count = dev->rx_queue_count;
>> +	fpo->rx_descriptor_status = dev->rx_descriptor_status;
>> +	fpo->tx_descriptor_status = dev->tx_descriptor_status;
>> +
>> +	fpo->rxq.data = dev->data->rx_queues;
>> +	fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs;
>> +
>> +	fpo->txq.data = dev->data->tx_queues;
>> +	fpo->txq.clbk = (void **)(uintptr_t)dev->pre_tx_burst_cbs;
>> +}
>> diff --git a/lib/ethdev/ethdev_private.h b/lib/ethdev/ethdev_private.h
>> index 3724429577..5721be7bdc 100644
>> --- a/lib/ethdev/ethdev_private.h
>> +++ b/lib/ethdev/ethdev_private.h
>> @@ -26,4 +26,11 @@ eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
>>  /* Parse devargs value for representor parameter. */
>>  int rte_eth_devargs_parse_representor_ports(char *str, void *data);
>>  
>> +/* reset eth fast-path API to dummy values */
>> +void eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo);
>> +
>> +/* setup eth fast-path API to ethdev values */
>> +void eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
>> +		const struct rte_eth_dev *dev);
> 
> Some drivers control the transmit/receive function during operation. E.g.
> for hns3 driver, when detect reset, primary process will set rx/tx burst to dummy, after
> process reset, primary process will set the correct rx/tx burst. During this process, the
> send and receive threads are still working, but the bursts they call are changed. So:
> 1. it is recommended that trace be deleted from the dummy function.
> 2. public the eth_dev_fp_ops_reset/setup interface for driver usage.
> 
>> +
>>  #endif /* _ETH_PRIVATE_H_ */
>> diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
>> index c8abda6dd7..9f7a0cbb8c 100644
>> --- a/lib/ethdev/rte_ethdev.c
>> +++ b/lib/ethdev/rte_ethdev.c
>> @@ -44,6 +44,9 @@
>>  static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data";
>>  struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];
>>  
>> +/* public fast-path API */
>> +struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
>> +
>>  /* spinlock for eth device callbacks */
>>  static rte_spinlock_t eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
>>  
>> @@ -578,6 +581,8 @@ rte_eth_dev_release_port(struct rte_eth_dev *eth_dev)
>>  		rte_eth_dev_callback_process(eth_dev,
>>  				RTE_ETH_EVENT_DESTROY, NULL);
>>  
>> +	eth_dev_fp_ops_reset(rte_eth_fp_ops + eth_dev->data->port_id);
>> +
>>  	rte_spinlock_lock(&eth_dev_shared_data->ownership_lock);
>>  
>>  	eth_dev->state = RTE_ETH_DEV_UNUSED;
>> @@ -1787,6 +1792,9 @@ rte_eth_dev_start(uint16_t port_id)
>>  		(*dev->dev_ops->link_update)(dev, 0);
>>  	}
>>  
>> +	/* expose selection of PMD fast-path functions */
>> +	eth_dev_fp_ops_setup(rte_eth_fp_ops + port_id, dev);
>> +
>>  	rte_ethdev_trace_start(port_id);
>>  	return 0;
>>  }
>> @@ -1809,6 +1817,9 @@ rte_eth_dev_stop(uint16_t port_id)
>>  		return 0;
>>  	}
>>  
>> +	/* point fast-path functions to dummy ones */
>> +	eth_dev_fp_ops_reset(rte_eth_fp_ops + port_id);
>> +
>>  	dev->data->dev_started = 0;
>>  	ret = (*dev->dev_ops->dev_stop)(dev);
>>  	rte_ethdev_trace_stop(port_id, ret);
>> @@ -4567,6 +4578,14 @@ rte_eth_mirror_rule_reset(uint16_t port_id, uint8_t rule_id)
>>  	return eth_err(port_id, (*dev->dev_ops->mirror_rule_reset)(dev, rule_id));
>>  }
>>  
>> +RTE_INIT(eth_dev_init_fp_ops)
>> +{
>> +	uint32_t i;
>> +
>> +	for (i = 0; i != RTE_DIM(rte_eth_fp_ops); i++)
>> +		eth_dev_fp_ops_reset(rte_eth_fp_ops + i);
>> +}
>> +
>>  RTE_INIT(eth_dev_init_cb_lists)
>>  {
>>  	uint16_t i;
>> @@ -4735,6 +4754,14 @@ rte_eth_dev_probing_finish(struct rte_eth_dev *dev)
>>  	if (dev == NULL)
>>  		return;
>>  
>> +	/*
>> +	 * for secondary process, at that point we expect device
>> +	 * to be already 'usable', so shared data and all function pointers
>> +	 * for fast-path devops have to be setup properly inside rte_eth_dev.
>> +	 */
>> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
>> +		eth_dev_fp_ops_setup(rte_eth_fp_ops + dev->data->port_id, dev);
>> +
>>  	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_NEW, NULL);
>>  
>>  	dev->state = RTE_ETH_DEV_ATTACHED;
>> diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
>> index 51cd68de94..d5853dff86 100644
>> --- a/lib/ethdev/rte_ethdev_core.h
>> +++ b/lib/ethdev/rte_ethdev_core.h
>> @@ -50,6 +50,61 @@ typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset);
>>  typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset);
>>  /**< @internal Check the status of a Tx descriptor */
>>  
>> +/**
>> + * @internal
>> + * Structure used to hold opaque pointers to internal ethdev Rx/Tx
>> + * queues data.
>> + * The main purpose to expose these pointers at all - allow compiler
>> + * to fetch this data for fast-path ethdev inline functions in advance.
>> + */
>> +struct rte_ethdev_qdata {
>> +	void **data;
>> +	/**< points to array of internal queue data pointers */
>> +	void **clbk;
>> +	/**< points to array of queue callback data pointers */
>> +};
>> +
>> +/**
>> + * @internal
>> + * fast-path ethdev functions and related data are hold in a flat array.
>> + * One entry per ethdev.
>> + * On 64-bit systems contents of this structure occupy exactly two 64B lines.
>> + * On 32-bit systems contents of this structure fits into one 64B line.
>> + */
>> +struct rte_eth_fp_ops {
>> +
>> +	/**
>> +	 * Rx fast-path functions and related data.
>> +	 * 64-bit systems: occupies first 64B line
>> +	 */
>> +	eth_rx_burst_t rx_pkt_burst;
>> +	/**< PMD receive function. */
>> +	eth_rx_queue_count_t rx_queue_count;
>> +	/**< Get the number of used RX descriptors. */
>> +	eth_rx_descriptor_status_t rx_descriptor_status;
>> +	/**< Check the status of a Rx descriptor. */
>> +	struct rte_ethdev_qdata rxq;
>> +	/**< Rx queues data. */
>> +	uintptr_t reserved1[3];
>> +
>> +	/**
>> +	 * Tx fast-path functions and related data.
>> +	 * 64-bit systems: occupies second 64B line
>> +	 */
>> +	eth_tx_burst_t tx_pkt_burst;
> 
> Why not place rx_pkt_burst/tx_pkt_burst/rxq /txq to the first cacheline ?
> Other function, e.g. rx_queue_count/descriptor_status are low frequency call functions.
> 
>> +	/**< PMD transmit function. */
>> +	eth_tx_prep_t tx_pkt_prepare;
>> +	/**< PMD transmit prepare function. */
>> +	eth_tx_descriptor_status_t tx_descriptor_status;
>> +	/**< Check the status of a Tx descriptor. */
>> +	struct rte_ethdev_qdata txq;
>> +	/**< Tx queues data. */
>> +	uintptr_t reserved2[3];
>> +
>> +} __rte_cache_aligned;
>> +
>> +extern struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
>> +
>>  
>>  /**
>>   * @internal
>>
> 
> 
> .
>
Andrew Rybchenko Oct. 11, 2021, 8:25 a.m. UTC | #3
On 10/7/21 2:27 PM, Konstantin Ananyev wrote:
> Copy public function pointers (rx_pkt_burst(), etc.) and related
> pointers to internal data from rte_eth_dev structure into a
> separate flat array. That array will remain in a public header.
> The intention here is to make rte_eth_dev and related structures internal.
> That should allow future possible changes to core eth_dev structures
> to be transparent to the user and help to avoid ABI/API breakages.
> The plan is to keep minimal part of data from rte_eth_dev public,
> so we still can use inline functions for fast-path calls
> (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
> The whole idea beyond this new schema:
> 1. PMDs keep to setup fast-path function pointers and related data
>    inside rte_eth_dev struct in the same way they did it before.
> 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
>    (for secondary process) we call eth_dev_fp_ops_setup, which
>    copies these function and data pointers into rte_eth_fp_ops[port_id].
> 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
>    we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
>    into some dummy values.
> 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
>    flat array to call PMD specific functions.
> That approach should allow us to make rte_eth_devices[] private
> without introducing regression and help to avoid changes in drivers code.
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

Overall LGTM, few nits below.

> ---
>  lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
>  lib/ethdev/ethdev_private.h  |  7 +++++
>  lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
>  lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 141 insertions(+)
> 
> diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
> index 012cf73ca2..3eeda6e9f9 100644
> --- a/lib/ethdev/ethdev_private.c
> +++ b/lib/ethdev/ethdev_private.c
> @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
>  		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
>  	return str == NULL ? -1 : 0;
>  }
> +
> +static uint16_t
> +dummy_eth_rx_burst(__rte_unused void *rxq,
> +		__rte_unused struct rte_mbuf **rx_pkts,
> +		__rte_unused uint16_t nb_pkts)
> +{
> +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");

May be "unconfigured" -> "stopped" ? Or "non-started" ?

> +	rte_errno = ENOTSUP;
> +	return 0;
> +}
> +
> +static uint16_t
> +dummy_eth_tx_burst(__rte_unused void *txq,
> +		__rte_unused struct rte_mbuf **tx_pkts,
> +		__rte_unused uint16_t nb_pkts)
> +{
> +	RTE_ETHDEV_LOG(ERR, "tx_pkt_burst for unconfigured port\n");

May be "unconfigured" -> "stopped" ?

> +	rte_errno = ENOTSUP;
> +	return 0;
> +}
> +
> +void
> +eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo)
> +{
> +	static void *dummy_data[RTE_MAX_QUEUES_PER_PORT];
> +	static const struct rte_eth_fp_ops dummy_ops = {
> +		.rx_pkt_burst = dummy_eth_rx_burst,
> +		.tx_pkt_burst = dummy_eth_tx_burst,
> +		.rxq = {.data = dummy_data, .clbk = dummy_data,},
> +		.txq = {.data = dummy_data, .clbk = dummy_data,},
> +	};
> +
> +	*fpo = dummy_ops;
> +}
> +
> +void
> +eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
> +		const struct rte_eth_dev *dev)
> +{
> +	fpo->rx_pkt_burst = dev->rx_pkt_burst;
> +	fpo->tx_pkt_burst = dev->tx_pkt_burst;
> +	fpo->tx_pkt_prepare = dev->tx_pkt_prepare;
> +	fpo->rx_queue_count = dev->rx_queue_count;
> +	fpo->rx_descriptor_status = dev->rx_descriptor_status;
> +	fpo->tx_descriptor_status = dev->tx_descriptor_status;
> +
> +	fpo->rxq.data = dev->data->rx_queues;
> +	fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs;
> +
> +	fpo->txq.data = dev->data->tx_queues;
> +	fpo->txq.clbk = (void **)(uintptr_t)dev->pre_tx_burst_cbs;
> +}
> diff --git a/lib/ethdev/ethdev_private.h b/lib/ethdev/ethdev_private.h
> index 3724429577..5721be7bdc 100644
> --- a/lib/ethdev/ethdev_private.h
> +++ b/lib/ethdev/ethdev_private.h
> @@ -26,4 +26,11 @@ eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
>  /* Parse devargs value for representor parameter. */
>  int rte_eth_devargs_parse_representor_ports(char *str, void *data);
>  
> +/* reset eth fast-path API to dummy values */
> +void eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo);
> +
> +/* setup eth fast-path API to ethdev values */
> +void eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
> +		const struct rte_eth_dev *dev);
> +
>  #endif /* _ETH_PRIVATE_H_ */
> diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
> index c8abda6dd7..9f7a0cbb8c 100644
> --- a/lib/ethdev/rte_ethdev.c
> +++ b/lib/ethdev/rte_ethdev.c
> @@ -44,6 +44,9 @@
>  static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data";
>  struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];
>  
> +/* public fast-path API */

Shoudn't it be a doxygen style comment?

> +struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
> +
>  /* spinlock for eth device callbacks */
>  static rte_spinlock_t eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
>  

[snip]

> diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
> index 51cd68de94..d5853dff86 100644
> --- a/lib/ethdev/rte_ethdev_core.h
> +++ b/lib/ethdev/rte_ethdev_core.h
> @@ -50,6 +50,61 @@ typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset);
>  typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset);
>  /**< @internal Check the status of a Tx descriptor */
>  
> +/**
> + * @internal
> + * Structure used to hold opaque pointers to internal ethdev Rx/Tx
> + * queues data.
> + * The main purpose to expose these pointers at all - allow compiler
> + * to fetch this data for fast-path ethdev inline functions in advance.
> + */
> +struct rte_ethdev_qdata {
> +	void **data;
> +	/**< points to array of internal queue data pointers */

Please, put the documentation on the same like or just
put documentation before the documented member.

> +	void **clbk;
> +	/**< points to array of queue callback data pointers */
> +};
> +
> +/**
> + * @internal
> + * fast-path ethdev functions and related data are hold in a flat array.
> + * One entry per ethdev.
> + * On 64-bit systems contents of this structure occupy exactly two 64B lines.
> + * On 32-bit systems contents of this structure fits into one 64B line.
> + */
> +struct rte_eth_fp_ops {
> +
> +	/**
> +	 * Rx fast-path functions and related data.
> +	 * 64-bit systems: occupies first 64B line
> +	 */

As I understand the above comment is for a group of below
fields. If so, Doxygen annocation for member groups should
be used.

> +	eth_rx_burst_t rx_pkt_burst;
> +	/**< PMD receive function. */

May I ask to avoid usage of documentation after member in a
separate line. It makes sense to if it is located in the same
line, but otherwise, it should be simply put before the
documented member.

[snip]
Andrew Rybchenko Oct. 11, 2021, 8:35 a.m. UTC | #4
On 10/9/21 3:05 PM, fengchengwen wrote:
> On 2021/10/7 19:27, Konstantin Ananyev wrote:
>> Copy public function pointers (rx_pkt_burst(), etc.) and related
>> pointers to internal data from rte_eth_dev structure into a
>> separate flat array. That array will remain in a public header.
>> The intention here is to make rte_eth_dev and related structures internal.
>> That should allow future possible changes to core eth_dev structures
>> to be transparent to the user and help to avoid ABI/API breakages.
>> The plan is to keep minimal part of data from rte_eth_dev public,
>> so we still can use inline functions for fast-path calls
>> (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
>> The whole idea beyond this new schema:
>> 1. PMDs keep to setup fast-path function pointers and related data
>>    inside rte_eth_dev struct in the same way they did it before.
>> 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
>>    (for secondary process) we call eth_dev_fp_ops_setup, which
>>    copies these function and data pointers into rte_eth_fp_ops[port_id].
>> 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
>>    we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
>>    into some dummy values.
>> 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
>>    flat array to call PMD specific functions.
>> That approach should allow us to make rte_eth_devices[] private
>> without introducing regression and help to avoid changes in drivers code.
>>
>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>> ---
>>  lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
>>  lib/ethdev/ethdev_private.h  |  7 +++++
>>  lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
>>  lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
>>  4 files changed, 141 insertions(+)
>>
>> diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
>> index 012cf73ca2..3eeda6e9f9 100644
>> --- a/lib/ethdev/ethdev_private.c
>> +++ b/lib/ethdev/ethdev_private.c
>> @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
>>  		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
>>  	return str == NULL ? -1 : 0;
>>  }
>> +
>> +static uint16_t
>> +dummy_eth_rx_burst(__rte_unused void *rxq,
>> +		__rte_unused struct rte_mbuf **rx_pkts,
>> +		__rte_unused uint16_t nb_pkts)
>> +{
>> +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
>> +	rte_errno = ENOTSUP;
>> +	return 0;
>> +}
>> +
>> +static uint16_t
>> +dummy_eth_tx_burst(__rte_unused void *txq,
>> +		__rte_unused struct rte_mbuf **tx_pkts,
>> +		__rte_unused uint16_t nb_pkts)
>> +{
>> +	RTE_ETHDEV_LOG(ERR, "tx_pkt_burst for unconfigured port\n");
>> +	rte_errno = ENOTSUP;
>> +	return 0;
>> +}
>> +
>> +void
>> +eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo)
> 
> The port_id parameter is preferable, this will hide rte_eth_fp_ops as much as possible.

Sorry, but I see no point to hide it inside ethdev.
Of course, prototype should be reconsidered if we make
it ethdev-internal API available for drivers.
If so, I agree that the parameter should be port_id.

[snip]

>> diff --git a/lib/ethdev/ethdev_private.h b/lib/ethdev/ethdev_private.h
>> index 3724429577..5721be7bdc 100644
>> --- a/lib/ethdev/ethdev_private.h
>> +++ b/lib/ethdev/ethdev_private.h
>> @@ -26,4 +26,11 @@ eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
>>  /* Parse devargs value for representor parameter. */
>>  int rte_eth_devargs_parse_representor_ports(char *str, void *data);
>>  
>> +/* reset eth fast-path API to dummy values */
>> +void eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo);
>> +
>> +/* setup eth fast-path API to ethdev values */
>> +void eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
>> +		const struct rte_eth_dev *dev);
> 
> Some drivers control the transmit/receive function during operation. E.g.
> for hns3 driver, when detect reset, primary process will set rx/tx burst to dummy, after
> process reset, primary process will set the correct rx/tx burst. During this process, the
> send and receive threads are still working, but the bursts they call are changed. So:
> 1. it is recommended that trace be deleted from the dummy function.
> 2. public the eth_dev_fp_ops_reset/setup interface for driver usage.

Good point.

[snip]

>> diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
>> index 51cd68de94..d5853dff86 100644
>> --- a/lib/ethdev/rte_ethdev_core.h
>> +++ b/lib/ethdev/rte_ethdev_core.h
>> @@ -50,6 +50,61 @@ typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset);
>>  typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset);
>>  /**< @internal Check the status of a Tx descriptor */
>>  
>> +/**
>> + * @internal
>> + * Structure used to hold opaque pointers to internal ethdev Rx/Tx
>> + * queues data.
>> + * The main purpose to expose these pointers at all - allow compiler
>> + * to fetch this data for fast-path ethdev inline functions in advance.
>> + */
>> +struct rte_ethdev_qdata {
>> +	void **data;
>> +	/**< points to array of internal queue data pointers */
>> +	void **clbk;
>> +	/**< points to array of queue callback data pointers */
>> +};
>> +
>> +/**
>> + * @internal
>> + * fast-path ethdev functions and related data are hold in a flat array.
>> + * One entry per ethdev.
>> + * On 64-bit systems contents of this structure occupy exactly two 64B lines.
>> + * On 32-bit systems contents of this structure fits into one 64B line.
>> + */
>> +struct rte_eth_fp_ops {
>> +
>> +	/**
>> +	 * Rx fast-path functions and related data.
>> +	 * 64-bit systems: occupies first 64B line
>> +	 */
>> +	eth_rx_burst_t rx_pkt_burst;
>> +	/**< PMD receive function. */
>> +	eth_rx_queue_count_t rx_queue_count;
>> +	/**< Get the number of used RX descriptors. */
>> +	eth_rx_descriptor_status_t rx_descriptor_status;
>> +	/**< Check the status of a Rx descriptor. */
>> +	struct rte_ethdev_qdata rxq;
>> +	/**< Rx queues data. */
>> +	uintptr_t reserved1[3];
>> +
>> +	/**
>> +	 * Tx fast-path functions and related data.
>> +	 * 64-bit systems: occupies second 64B line
>> +	 */
>> +	eth_tx_burst_t tx_pkt_burst;
> 
> Why not place rx_pkt_burst/tx_pkt_burst/rxq /txq to the first cacheline ?
> Other function, e.g. rx_queue_count/descriptor_status are low frequency call functions.

+1 Very good question
If so, tx_pkt_prepare should be on the first cache-line
as well.

>> +	/**< PMD transmit function. */
>> +	eth_tx_prep_t tx_pkt_prepare;
>> +	/**< PMD transmit prepare function. */
>> +	eth_tx_descriptor_status_t tx_descriptor_status;
>> +	/**< Check the status of a Tx descriptor. */
>> +	struct rte_ethdev_qdata txq;
>> +	/**< Tx queues data. */
>> +	uintptr_t reserved2[3];
>> +
>> +} __rte_cache_aligned;
>> +
>> +extern struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
>> +
>>  
>>  /**
>>   * @internal
>>
Andrew Rybchenko Oct. 11, 2021, 8:39 a.m. UTC | #5
On 10/11/21 4:18 AM, fengchengwen wrote:
> Sorry to self-reply.
> 
> I think it's better the 'struct rte_eth_dev *dev' hold a pointer to the
> 'struct rte_eth_fp_ops', e.g.
> 
> 	struct rte_eth_dev {
> 		struct rte_eth_fp_ops *fp_ops;
> 		...  // other field
> 	}
> 
> The eth framework set the pointer in the rte_eth_dev_pci_allocate(), and driver fill
> corresponding callback:
> 	dev->fp_ops->rx_pkt_burst = xxx_recv_pkts;
> 	dev->fp_ops->tx_pkt_burst = xxx_xmit_pkts;
> 	...
> 
> In this way, the behavior of the primary and secondary processes can be unified, which
> is basically the same as that of the original process.

Frankly speaking I see huge value in scheme suggested by
Konstantin to care about ops in stopped state on ethdev value.
So, I like approach in the patch more.
Ananyev, Konstantin Oct. 11, 2021, 3:15 p.m. UTC | #6
> > Copy public function pointers (rx_pkt_burst(), etc.) and related
> > pointers to internal data from rte_eth_dev structure into a
> > separate flat array. That array will remain in a public header.
> > The intention here is to make rte_eth_dev and related structures internal.
> > That should allow future possible changes to core eth_dev structures
> > to be transparent to the user and help to avoid ABI/API breakages.
> > The plan is to keep minimal part of data from rte_eth_dev public,
> > so we still can use inline functions for fast-path calls
> > (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
> > The whole idea beyond this new schema:
> > 1. PMDs keep to setup fast-path function pointers and related data
> >    inside rte_eth_dev struct in the same way they did it before.
> > 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
> >    (for secondary process) we call eth_dev_fp_ops_setup, which
> >    copies these function and data pointers into rte_eth_fp_ops[port_id].
> > 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
> >    we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
> >    into some dummy values.
> > 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
> >    flat array to call PMD specific functions.
> > That approach should allow us to make rte_eth_devices[] private
> > without introducing regression and help to avoid changes in drivers code.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > ---
> >  lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
> >  lib/ethdev/ethdev_private.h  |  7 +++++
> >  lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
> >  lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
> >  4 files changed, 141 insertions(+)
> >
> > diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
> > index 012cf73ca2..3eeda6e9f9 100644
> > --- a/lib/ethdev/ethdev_private.c
> > +++ b/lib/ethdev/ethdev_private.c
> > @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
> >  		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
> >  	return str == NULL ? -1 : 0;
> >  }
> > +
> > +static uint16_t
> > +dummy_eth_rx_burst(__rte_unused void *rxq,
> > +		__rte_unused struct rte_mbuf **rx_pkts,
> > +		__rte_unused uint16_t nb_pkts)
> > +{
> > +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
> > +	rte_errno = ENOTSUP;
> > +	return 0;
> > +}
> > +
> > +static uint16_t
> > +dummy_eth_tx_burst(__rte_unused void *txq,
> > +		__rte_unused struct rte_mbuf **tx_pkts,
> > +		__rte_unused uint16_t nb_pkts)
> > +{
> > +	RTE_ETHDEV_LOG(ERR, "tx_pkt_burst for unconfigured port\n");
> > +	rte_errno = ENOTSUP;
> > +	return 0;
> > +}
> > +
> > +void
> > +eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo)
> 
> The port_id parameter is preferable, this will hide rte_eth_fp_ops as much as possible.

Why do we need to hide it here?
rte_eth_fp_ops is a public structure, and it is a helper function that
just resets fields of this structure to some predefined dummy values.
Nice and simple, so I prefer to keep it like that. 

> 
> > +{
> > +	static void *dummy_data[RTE_MAX_QUEUES_PER_PORT];
> > +	static const struct rte_eth_fp_ops dummy_ops = {
> > +		.rx_pkt_burst = dummy_eth_rx_burst,
> > +		.tx_pkt_burst = dummy_eth_tx_burst,
> > +		.rxq = {.data = dummy_data, .clbk = dummy_data,},
> > +		.txq = {.data = dummy_data, .clbk = dummy_data,},
> > +	};
> > +
> > +	*fpo = dummy_ops;
> > +}
> > +
> > +void
> > +eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
> > +		const struct rte_eth_dev *dev)
> 
> Because fp_ops and eth_dev is a one-to-one correspondence. It's better only use
> port_id parameter.

Same as above:
All this internal helper function does - copies some fields from one structure to another.
Both structures are visible by ethdev layer.
No point to add extra assumptions and complexity here. 

> 
> > +{
> > +	fpo->rx_pkt_burst = dev->rx_pkt_burst;
> > +	fpo->tx_pkt_burst = dev->tx_pkt_burst;
> > +	fpo->tx_pkt_prepare = dev->tx_pkt_prepare;
> > +	fpo->rx_queue_count = dev->rx_queue_count;
> > +	fpo->rx_descriptor_status = dev->rx_descriptor_status;
> > +	fpo->tx_descriptor_status = dev->tx_descriptor_status;
> > +
> > +	fpo->rxq.data = dev->data->rx_queues;
> > +	fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs;
> > +
> > +	fpo->txq.data = dev->data->tx_queues;
> > +	fpo->txq.clbk = (void **)(uintptr_t)dev->pre_tx_burst_cbs;
> > +}
> > diff --git a/lib/ethdev/ethdev_private.h b/lib/ethdev/ethdev_private.h
> > index 3724429577..5721be7bdc 100644
> > --- a/lib/ethdev/ethdev_private.h
> > +++ b/lib/ethdev/ethdev_private.h
> > @@ -26,4 +26,11 @@ eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
> >  /* Parse devargs value for representor parameter. */
> >  int rte_eth_devargs_parse_representor_ports(char *str, void *data);
> >
> > +/* reset eth fast-path API to dummy values */
> > +void eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo);
> > +
> > +/* setup eth fast-path API to ethdev values */
> > +void eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
> > +		const struct rte_eth_dev *dev);
> 
> Some drivers control the transmit/receive function during operation. E.g.
> for hns3 driver, when detect reset, primary process will set rx/tx burst to dummy, after
> process reset, primary process will set the correct rx/tx burst. During this process, the
> send and receive threads are still working, but the bursts they call are changed. So:

This text above is a bit too cryptic for me...
Are you saying that your driver changes rte_eth_dev.rx_pkt_burst(/ tx_pkt_burst) on the fly
(after dev_start() and before dev_stop())?
If so, then generally speaking, it is a bad idea.
While it might works for some limited scenarios, right now it is not supported by ethdev framework,
and might introduce a lot of problems. 

> 1. it is recommended that trace be deleted from the dummy function.

You are talking about:
RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
right?
Dummy function is supposed to be set only when device is not able to do RX/TX properly
(not attached, or attached but not configured, or attached and configured, but not started).
Obviously if app calls rx/tx_burst for such port it is a major issue, that should be flagged imemdiatelly.
So I believe having log here makes a perfect sense here. 

> 2. public the eth_dev_fp_ops_reset/setup interface for driver usage.

You mean move their declarations into ethdev_driver.h?
I suppose that could be done, but still wonder why driver would need to
call these functions directly?
 
> > +
> >  #endif /* _ETH_PRIVATE_H_ */
> > diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
> > index c8abda6dd7..9f7a0cbb8c 100644
> > --- a/lib/ethdev/rte_ethdev.c
> > +++ b/lib/ethdev/rte_ethdev.c
> > @@ -44,6 +44,9 @@
> >  static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data";
> >  struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];
> >
> > +/* public fast-path API */
> > +struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
> > +
> >  /* spinlock for eth device callbacks */
> >  static rte_spinlock_t eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
> >
> > @@ -578,6 +581,8 @@ rte_eth_dev_release_port(struct rte_eth_dev *eth_dev)
> >  		rte_eth_dev_callback_process(eth_dev,
> >  				RTE_ETH_EVENT_DESTROY, NULL);
> >
> > +	eth_dev_fp_ops_reset(rte_eth_fp_ops + eth_dev->data->port_id);
> > +
> >  	rte_spinlock_lock(&eth_dev_shared_data->ownership_lock);
> >
> >  	eth_dev->state = RTE_ETH_DEV_UNUSED;
> > @@ -1787,6 +1792,9 @@ rte_eth_dev_start(uint16_t port_id)
> >  		(*dev->dev_ops->link_update)(dev, 0);
> >  	}
> >
> > +	/* expose selection of PMD fast-path functions */
> > +	eth_dev_fp_ops_setup(rte_eth_fp_ops + port_id, dev);
> > +
> >  	rte_ethdev_trace_start(port_id);
> >  	return 0;
> >  }
> > @@ -1809,6 +1817,9 @@ rte_eth_dev_stop(uint16_t port_id)
> >  		return 0;
> >  	}
> >
> > +	/* point fast-path functions to dummy ones */
> > +	eth_dev_fp_ops_reset(rte_eth_fp_ops + port_id);
> > +
> >  	dev->data->dev_started = 0;
> >  	ret = (*dev->dev_ops->dev_stop)(dev);
> >  	rte_ethdev_trace_stop(port_id, ret);
> > @@ -4567,6 +4578,14 @@ rte_eth_mirror_rule_reset(uint16_t port_id, uint8_t rule_id)
> >  	return eth_err(port_id, (*dev->dev_ops->mirror_rule_reset)(dev, rule_id));
> >  }
> >
> > +RTE_INIT(eth_dev_init_fp_ops)
> > +{
> > +	uint32_t i;
> > +
> > +	for (i = 0; i != RTE_DIM(rte_eth_fp_ops); i++)
> > +		eth_dev_fp_ops_reset(rte_eth_fp_ops + i);
> > +}
> > +
> >  RTE_INIT(eth_dev_init_cb_lists)
> >  {
> >  	uint16_t i;
> > @@ -4735,6 +4754,14 @@ rte_eth_dev_probing_finish(struct rte_eth_dev *dev)
> >  	if (dev == NULL)
> >  		return;
> >
> > +	/*
> > +	 * for secondary process, at that point we expect device
> > +	 * to be already 'usable', so shared data and all function pointers
> > +	 * for fast-path devops have to be setup properly inside rte_eth_dev.
> > +	 */
> > +	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
> > +		eth_dev_fp_ops_setup(rte_eth_fp_ops + dev->data->port_id, dev);
> > +
> >  	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_NEW, NULL);
> >
> >  	dev->state = RTE_ETH_DEV_ATTACHED;
> > diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
> > index 51cd68de94..d5853dff86 100644
> > --- a/lib/ethdev/rte_ethdev_core.h
> > +++ b/lib/ethdev/rte_ethdev_core.h
> > @@ -50,6 +50,61 @@ typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset);
> >  typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset);
> >  /**< @internal Check the status of a Tx descriptor */
> >
> > +/**
> > + * @internal
> > + * Structure used to hold opaque pointers to internal ethdev Rx/Tx
> > + * queues data.
> > + * The main purpose to expose these pointers at all - allow compiler
> > + * to fetch this data for fast-path ethdev inline functions in advance.
> > + */
> > +struct rte_ethdev_qdata {
> > +	void **data;
> > +	/**< points to array of internal queue data pointers */
> > +	void **clbk;
> > +	/**< points to array of queue callback data pointers */
> > +};
> > +
> > +/**
> > + * @internal
> > + * fast-path ethdev functions and related data are hold in a flat array.
> > + * One entry per ethdev.
> > + * On 64-bit systems contents of this structure occupy exactly two 64B lines.
> > + * On 32-bit systems contents of this structure fits into one 64B line.
> > + */
> > +struct rte_eth_fp_ops {
> > +
> > +	/**
> > +	 * Rx fast-path functions and related data.
> > +	 * 64-bit systems: occupies first 64B line
> > +	 */
> > +	eth_rx_burst_t rx_pkt_burst;
> > +	/**< PMD receive function. */
> > +	eth_rx_queue_count_t rx_queue_count;
> > +	/**< Get the number of used RX descriptors. */
> > +	eth_rx_descriptor_status_t rx_descriptor_status;
> > +	/**< Check the status of a Rx descriptor. */
> > +	struct rte_ethdev_qdata rxq;
> > +	/**< Rx queues data. */
> > +	uintptr_t reserved1[3];
> > +
> > +	/**
> > +	 * Tx fast-path functions and related data.
> > +	 * 64-bit systems: occupies second 64B line
> > +	 */
> > +	eth_tx_burst_t tx_pkt_burst;
> 
> Why not place rx_pkt_burst/tx_pkt_burst/rxq /txq to the first cacheline ?
> Other function, e.g. rx_queue_count/descriptor_status are low frequency call functions.

I suppose you are talking about layout like that:
struct rte_eth_fp_ops {
   /* first 64B line */
   rx_pkt_burst;
   tx_pkt_burst;
   tx_pkt_prepare;
   struct rte_ethdev_qdata rxq;
   struct rte_ethdev_qdata txq;
   reserved1[1];
   /* second 64B line */
  ...
};

I thought about such ability, even tried it, but I didn't see any performance gain.
From other side current layout seems better to me from structural point:
it is more uniform and easy to extend in future (both RX and TX data occupies
separate 64B line, each have equal rom for extension).    
 
> > +	/**< PMD transmit function. */
> > +	eth_tx_prep_t tx_pkt_prepare;
> > +	/**< PMD transmit prepare function. */
> > +	eth_tx_descriptor_status_t tx_descriptor_status;
> > +	/**< Check the status of a Tx descriptor. */
> > +	struct rte_ethdev_qdata txq;
> > +	/**< Tx queues data. */
> > +	uintptr_t reserved2[3];
> > +
> > +} __rte_cache_aligned;
> > +
> > +extern struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
> > +
> >
> >  /**
> >   * @internal
> >
Ananyev, Konstantin Oct. 11, 2021, 3:24 p.m. UTC | #7
> 
> Sorry to self-reply.
> 
> I think it's better the 'struct rte_eth_dev *dev' hold a pointer to the
> 'struct rte_eth_fp_ops', e.g.
> 
> 	struct rte_eth_dev {
> 		struct rte_eth_fp_ops *fp_ops;
> 		...  // other field
> 	}
> 
> The eth framework set the pointer in the rte_eth_dev_pci_allocate(), and driver fill
> corresponding callback:
> 	dev->fp_ops->rx_pkt_burst = xxx_recv_pkts;
> 	dev->fp_ops->tx_pkt_burst = xxx_xmit_pkts;
> 	...
> 
> In this way, the behavior of the primary and secondary processes can be unified, which
> is basically the same as that of the original process.

I don't think it is a good thing to do, as it nullifies one of the main thing of this approach:
fp_ops[] points to actual rx/tx functions and data only when PMD is really ready to do rx/tx
(device is properly configured and started), in other cases it points to dummy stubs.
As another drawback it will mean code changes in aach and every driver.
Ananyev, Konstantin Oct. 11, 2021, 4:52 p.m. UTC | #8
> On 10/7/21 2:27 PM, Konstantin Ananyev wrote:
> > Copy public function pointers (rx_pkt_burst(), etc.) and related
> > pointers to internal data from rte_eth_dev structure into a
> > separate flat array. That array will remain in a public header.
> > The intention here is to make rte_eth_dev and related structures internal.
> > That should allow future possible changes to core eth_dev structures
> > to be transparent to the user and help to avoid ABI/API breakages.
> > The plan is to keep minimal part of data from rte_eth_dev public,
> > so we still can use inline functions for fast-path calls
> > (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
> > The whole idea beyond this new schema:
> > 1. PMDs keep to setup fast-path function pointers and related data
> >    inside rte_eth_dev struct in the same way they did it before.
> > 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
> >    (for secondary process) we call eth_dev_fp_ops_setup, which
> >    copies these function and data pointers into rte_eth_fp_ops[port_id].
> > 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
> >    we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
> >    into some dummy values.
> > 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
> >    flat array to call PMD specific functions.
> > That approach should allow us to make rte_eth_devices[] private
> > without introducing regression and help to avoid changes in drivers code.
> >
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> 
> Overall LGTM, few nits below.
> 
> > ---
> >  lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
> >  lib/ethdev/ethdev_private.h  |  7 +++++
> >  lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
> >  lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
> >  4 files changed, 141 insertions(+)
> >
> > diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
> > index 012cf73ca2..3eeda6e9f9 100644
> > --- a/lib/ethdev/ethdev_private.c
> > +++ b/lib/ethdev/ethdev_private.c
> > @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
> >  		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
> >  	return str == NULL ? -1 : 0;
> >  }
> > +
> > +static uint16_t
> > +dummy_eth_rx_burst(__rte_unused void *rxq,
> > +		__rte_unused struct rte_mbuf **rx_pkts,
> > +		__rte_unused uint16_t nb_pkts)
> > +{
> > +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
> 
> May be "unconfigured" -> "stopped" ? Or "non-started" ?

Yes, it can be configured but not started.
So 'not started' seems like a better wording here.
Another option probably: 'not ready'.
What people think?

...

> 
> > +	rte_errno = ENOTSUP;
> > +	return 0;
> > +}
> > +
> > +struct rte_eth_fp_ops {
> > +
> > +	/**
> > +	 * Rx fast-path functions and related data.
> > +	 * 64-bit systems: occupies first 64B line
> > +	 */
> 
> As I understand the above comment is for a group of below
> fields. If so, Doxygen annocation for member groups should
> be used.

Ok, and how to do it?
Andrew Rybchenko Oct. 11, 2021, 5:22 p.m. UTC | #9
On 10/11/21 7:52 PM, Ananyev, Konstantin wrote:
> 
> 
>> On 10/7/21 2:27 PM, Konstantin Ananyev wrote:
>>> Copy public function pointers (rx_pkt_burst(), etc.) and related
>>> pointers to internal data from rte_eth_dev structure into a
>>> separate flat array. That array will remain in a public header.
>>> The intention here is to make rte_eth_dev and related structures internal.
>>> That should allow future possible changes to core eth_dev structures
>>> to be transparent to the user and help to avoid ABI/API breakages.
>>> The plan is to keep minimal part of data from rte_eth_dev public,
>>> so we still can use inline functions for fast-path calls
>>> (like rte_eth_rx_burst(), etc.) to avoid/minimize slowdown.
>>> The whole idea beyond this new schema:
>>> 1. PMDs keep to setup fast-path function pointers and related data
>>>     inside rte_eth_dev struct in the same way they did it before.
>>> 2. Inside rte_eth_dev_start() and inside rte_eth_dev_probing_finish()
>>>     (for secondary process) we call eth_dev_fp_ops_setup, which
>>>     copies these function and data pointers into rte_eth_fp_ops[port_id].
>>> 3. Inside rte_eth_dev_stop() and inside rte_eth_dev_release_port()
>>>     we call eth_dev_fp_ops_reset(), which resets rte_eth_fp_ops[port_id]
>>>     into some dummy values.
>>> 4. fast-path ethdev API (rte_eth_rx_burst(), etc.) will use that new
>>>     flat array to call PMD specific functions.
>>> That approach should allow us to make rte_eth_devices[] private
>>> without introducing regression and help to avoid changes in drivers code.
>>>
>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>
>> Overall LGTM, few nits below.
>>
>>> ---
>>>   lib/ethdev/ethdev_private.c  | 52 ++++++++++++++++++++++++++++++++++
>>>   lib/ethdev/ethdev_private.h  |  7 +++++
>>>   lib/ethdev/rte_ethdev.c      | 27 ++++++++++++++++++
>>>   lib/ethdev/rte_ethdev_core.h | 55 ++++++++++++++++++++++++++++++++++++
>>>   4 files changed, 141 insertions(+)
>>>
>>> diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
>>> index 012cf73ca2..3eeda6e9f9 100644
>>> --- a/lib/ethdev/ethdev_private.c
>>> +++ b/lib/ethdev/ethdev_private.c
>>> @@ -174,3 +174,55 @@ rte_eth_devargs_parse_representor_ports(char *str, void *data)
>>>   		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
>>>   	return str == NULL ? -1 : 0;
>>>   }
>>> +
>>> +static uint16_t
>>> +dummy_eth_rx_burst(__rte_unused void *rxq,
>>> +		__rte_unused struct rte_mbuf **rx_pkts,
>>> +		__rte_unused uint16_t nb_pkts)
>>> +{
>>> +	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
>>
>> May be "unconfigured" -> "stopped" ? Or "non-started" ?
> 
> Yes, it can be configured but not started.
> So 'not started' seems like a better wording here.
> Another option probably: 'not ready'.
> What people think?

Taking into account that some PMds would like to set dummy
pointers in some specifics conditions, I think "not ready"
is the best option here.

> 
> ...
> 
>>
>>> +	rte_errno = ENOTSUP;
>>> +	return 0;
>>> +}
>>> +
>>> +struct rte_eth_fp_ops {
>>> +
>>> +	/**
>>> +	 * Rx fast-path functions and related data.
>>> +	 * 64-bit systems: occupies first 64B line
>>> +	 */
>>
>> As I understand the above comment is for a group of below
>> fields. If so, Doxygen annocation for member groups should
>> be used.
> 
> Ok, and how to do it?
> 

See [1]

[1] https://www.doxygen.nl/manual/grouping.html#memgroup
diff mbox series

Patch

diff --git a/lib/ethdev/ethdev_private.c b/lib/ethdev/ethdev_private.c
index 012cf73ca2..3eeda6e9f9 100644
--- a/lib/ethdev/ethdev_private.c
+++ b/lib/ethdev/ethdev_private.c
@@ -174,3 +174,55 @@  rte_eth_devargs_parse_representor_ports(char *str, void *data)
 		RTE_LOG(ERR, EAL, "wrong representor format: %s\n", str);
 	return str == NULL ? -1 : 0;
 }
+
+static uint16_t
+dummy_eth_rx_burst(__rte_unused void *rxq,
+		__rte_unused struct rte_mbuf **rx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	RTE_ETHDEV_LOG(ERR, "rx_pkt_burst for unconfigured port\n");
+	rte_errno = ENOTSUP;
+	return 0;
+}
+
+static uint16_t
+dummy_eth_tx_burst(__rte_unused void *txq,
+		__rte_unused struct rte_mbuf **tx_pkts,
+		__rte_unused uint16_t nb_pkts)
+{
+	RTE_ETHDEV_LOG(ERR, "tx_pkt_burst for unconfigured port\n");
+	rte_errno = ENOTSUP;
+	return 0;
+}
+
+void
+eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo)
+{
+	static void *dummy_data[RTE_MAX_QUEUES_PER_PORT];
+	static const struct rte_eth_fp_ops dummy_ops = {
+		.rx_pkt_burst = dummy_eth_rx_burst,
+		.tx_pkt_burst = dummy_eth_tx_burst,
+		.rxq = {.data = dummy_data, .clbk = dummy_data,},
+		.txq = {.data = dummy_data, .clbk = dummy_data,},
+	};
+
+	*fpo = dummy_ops;
+}
+
+void
+eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
+		const struct rte_eth_dev *dev)
+{
+	fpo->rx_pkt_burst = dev->rx_pkt_burst;
+	fpo->tx_pkt_burst = dev->tx_pkt_burst;
+	fpo->tx_pkt_prepare = dev->tx_pkt_prepare;
+	fpo->rx_queue_count = dev->rx_queue_count;
+	fpo->rx_descriptor_status = dev->rx_descriptor_status;
+	fpo->tx_descriptor_status = dev->tx_descriptor_status;
+
+	fpo->rxq.data = dev->data->rx_queues;
+	fpo->rxq.clbk = (void **)(uintptr_t)dev->post_rx_burst_cbs;
+
+	fpo->txq.data = dev->data->tx_queues;
+	fpo->txq.clbk = (void **)(uintptr_t)dev->pre_tx_burst_cbs;
+}
diff --git a/lib/ethdev/ethdev_private.h b/lib/ethdev/ethdev_private.h
index 3724429577..5721be7bdc 100644
--- a/lib/ethdev/ethdev_private.h
+++ b/lib/ethdev/ethdev_private.h
@@ -26,4 +26,11 @@  eth_find_device(const struct rte_eth_dev *_start, rte_eth_cmp_t cmp,
 /* Parse devargs value for representor parameter. */
 int rte_eth_devargs_parse_representor_ports(char *str, void *data);
 
+/* reset eth fast-path API to dummy values */
+void eth_dev_fp_ops_reset(struct rte_eth_fp_ops *fpo);
+
+/* setup eth fast-path API to ethdev values */
+void eth_dev_fp_ops_setup(struct rte_eth_fp_ops *fpo,
+		const struct rte_eth_dev *dev);
+
 #endif /* _ETH_PRIVATE_H_ */
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index c8abda6dd7..9f7a0cbb8c 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -44,6 +44,9 @@ 
 static const char *MZ_RTE_ETH_DEV_DATA = "rte_eth_dev_data";
 struct rte_eth_dev rte_eth_devices[RTE_MAX_ETHPORTS];
 
+/* public fast-path API */
+struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
+
 /* spinlock for eth device callbacks */
 static rte_spinlock_t eth_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
 
@@ -578,6 +581,8 @@  rte_eth_dev_release_port(struct rte_eth_dev *eth_dev)
 		rte_eth_dev_callback_process(eth_dev,
 				RTE_ETH_EVENT_DESTROY, NULL);
 
+	eth_dev_fp_ops_reset(rte_eth_fp_ops + eth_dev->data->port_id);
+
 	rte_spinlock_lock(&eth_dev_shared_data->ownership_lock);
 
 	eth_dev->state = RTE_ETH_DEV_UNUSED;
@@ -1787,6 +1792,9 @@  rte_eth_dev_start(uint16_t port_id)
 		(*dev->dev_ops->link_update)(dev, 0);
 	}
 
+	/* expose selection of PMD fast-path functions */
+	eth_dev_fp_ops_setup(rte_eth_fp_ops + port_id, dev);
+
 	rte_ethdev_trace_start(port_id);
 	return 0;
 }
@@ -1809,6 +1817,9 @@  rte_eth_dev_stop(uint16_t port_id)
 		return 0;
 	}
 
+	/* point fast-path functions to dummy ones */
+	eth_dev_fp_ops_reset(rte_eth_fp_ops + port_id);
+
 	dev->data->dev_started = 0;
 	ret = (*dev->dev_ops->dev_stop)(dev);
 	rte_ethdev_trace_stop(port_id, ret);
@@ -4567,6 +4578,14 @@  rte_eth_mirror_rule_reset(uint16_t port_id, uint8_t rule_id)
 	return eth_err(port_id, (*dev->dev_ops->mirror_rule_reset)(dev, rule_id));
 }
 
+RTE_INIT(eth_dev_init_fp_ops)
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(rte_eth_fp_ops); i++)
+		eth_dev_fp_ops_reset(rte_eth_fp_ops + i);
+}
+
 RTE_INIT(eth_dev_init_cb_lists)
 {
 	uint16_t i;
@@ -4735,6 +4754,14 @@  rte_eth_dev_probing_finish(struct rte_eth_dev *dev)
 	if (dev == NULL)
 		return;
 
+	/*
+	 * for secondary process, at that point we expect device
+	 * to be already 'usable', so shared data and all function pointers
+	 * for fast-path devops have to be setup properly inside rte_eth_dev.
+	 */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		eth_dev_fp_ops_setup(rte_eth_fp_ops + dev->data->port_id, dev);
+
 	rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_NEW, NULL);
 
 	dev->state = RTE_ETH_DEV_ATTACHED;
diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
index 51cd68de94..d5853dff86 100644
--- a/lib/ethdev/rte_ethdev_core.h
+++ b/lib/ethdev/rte_ethdev_core.h
@@ -50,6 +50,61 @@  typedef int (*eth_rx_descriptor_status_t)(void *rxq, uint16_t offset);
 typedef int (*eth_tx_descriptor_status_t)(void *txq, uint16_t offset);
 /**< @internal Check the status of a Tx descriptor */
 
+/**
+ * @internal
+ * Structure used to hold opaque pointers to internal ethdev Rx/Tx
+ * queues data.
+ * The main purpose to expose these pointers at all - allow compiler
+ * to fetch this data for fast-path ethdev inline functions in advance.
+ */
+struct rte_ethdev_qdata {
+	void **data;
+	/**< points to array of internal queue data pointers */
+	void **clbk;
+	/**< points to array of queue callback data pointers */
+};
+
+/**
+ * @internal
+ * fast-path ethdev functions and related data are hold in a flat array.
+ * One entry per ethdev.
+ * On 64-bit systems contents of this structure occupy exactly two 64B lines.
+ * On 32-bit systems contents of this structure fits into one 64B line.
+ */
+struct rte_eth_fp_ops {
+
+	/**
+	 * Rx fast-path functions and related data.
+	 * 64-bit systems: occupies first 64B line
+	 */
+	eth_rx_burst_t rx_pkt_burst;
+	/**< PMD receive function. */
+	eth_rx_queue_count_t rx_queue_count;
+	/**< Get the number of used RX descriptors. */
+	eth_rx_descriptor_status_t rx_descriptor_status;
+	/**< Check the status of a Rx descriptor. */
+	struct rte_ethdev_qdata rxq;
+	/**< Rx queues data. */
+	uintptr_t reserved1[3];
+
+	/**
+	 * Tx fast-path functions and related data.
+	 * 64-bit systems: occupies second 64B line
+	 */
+	eth_tx_burst_t tx_pkt_burst;
+	/**< PMD transmit function. */
+	eth_tx_prep_t tx_pkt_prepare;
+	/**< PMD transmit prepare function. */
+	eth_tx_descriptor_status_t tx_descriptor_status;
+	/**< Check the status of a Tx descriptor. */
+	struct rte_ethdev_qdata txq;
+	/**< Tx queues data. */
+	uintptr_t reserved2[3];
+
+} __rte_cache_aligned;
+
+extern struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
+
 
 /**
  * @internal