[v1,4/7] power: remove thread safety from PMD power API's

Message ID f0ea799f2908da104850c3858eea4fd059d38496.1622548381.git.anatoly.burakov@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series Enhancements for PMD power management |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Burakov, Anatoly June 1, 2021, noon UTC
  Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like an RCU for this use case, but absent
a pressing need for thread safety we'll go the easy way and just
mandate that the APIs are to be called when all affected ports are
stopped, and document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/power/meson.build          |   3 +
 lib/power/rte_power_pmd_mgmt.c | 106 ++++++++-------------------------
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 3 files changed, 35 insertions(+), 80 deletions(-)
  

Comments

Ananyev, Konstantin June 22, 2021, 9:13 a.m. UTC | #1
> Currently, we expect that only one callback can be active at any given
> moment, for a particular queue configuration, which is relatively easy
> to implement in a thread-safe way. However, we're about to add support
> for multiple queues per lcore, which will greatly increase the
> possibility of various race conditions.
> 
> We could have used something like an RCU for this use case, but absent
> of a pressing need for thread safety we'll go the easy way and just
> mandate that the API's are to be called when all affected ports are
> stopped, and document this limitation. This greatly simplifies the
> `rte_power_monitor`-related code.

I think you need to update the release notes (RN) with that too.
Another thing - do you really need the whole port stopped?
From what I understand - you work on queues, so it is enough for you
that related RX queue is stopped.
So, to make things a bit more robust, in pmgmt_queue_enable/disable 
you can call rte_eth_rx_queue_info_get() and check queue state.
 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>  lib/power/meson.build          |   3 +
>  lib/power/rte_power_pmd_mgmt.c | 106 ++++++++-------------------------
>  lib/power/rte_power_pmd_mgmt.h |   6 ++
>  3 files changed, 35 insertions(+), 80 deletions(-)
> 
> diff --git a/lib/power/meson.build b/lib/power/meson.build
> index c1097d32f1..4f6a242364 100644
> --- a/lib/power/meson.build
> +++ b/lib/power/meson.build
> @@ -21,4 +21,7 @@ headers = files(
>          'rte_power_pmd_mgmt.h',
>          'rte_power_guest_channel.h',
>  )
> +if cc.has_argument('-Wno-cast-qual')
> +    cflags += '-Wno-cast-qual'
> +endif
>  deps += ['timer', 'ethdev']
> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
> index db03cbf420..0707c60a4f 100644
> --- a/lib/power/rte_power_pmd_mgmt.c
> +++ b/lib/power/rte_power_pmd_mgmt.c
> @@ -40,8 +40,6 @@ struct pmd_queue_cfg {
>  	/**< Callback mode for this queue */
>  	const struct rte_eth_rxtx_callback *cur_cb;
>  	/**< Callback instance */
> -	volatile bool umwait_in_progress;
> -	/**< are we currently sleeping? */
>  	uint64_t empty_poll_stats;
>  	/**< Number of empty polls */
>  } __rte_cache_aligned;
> @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
>  			struct rte_power_monitor_cond pmc;
>  			uint16_t ret;
> 
> -			/*
> -			 * we might get a cancellation request while being
> -			 * inside the callback, in which case the wakeup
> -			 * wouldn't work because it would've arrived too early.
> -			 *
> -			 * to get around this, we notify the other thread that
> -			 * we're sleeping, so that it can spin until we're done.
> -			 * unsolicited wakeups are perfectly safe.
> -			 */
> -			q_conf->umwait_in_progress = true;
> -
> -			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> -
> -			/* check if we need to cancel sleep */
> -			if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> -				/* use monitoring condition to sleep */
> -				ret = rte_eth_get_monitor_addr(port_id, qidx,
> -						&pmc);
> -				if (ret == 0)
> -					rte_power_monitor(&pmc, UINT64_MAX);
> -			}
> -			q_conf->umwait_in_progress = false;
> -
> -			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> +			/* use monitoring condition to sleep */
> +			ret = rte_eth_get_monitor_addr(port_id, qidx,
> +					&pmc);
> +			if (ret == 0)
> +				rte_power_monitor(&pmc, UINT64_MAX);
>  		}
>  	} else
>  		q_conf->empty_poll_stats = 0;
> @@ -183,6 +162,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>  {
>  	struct pmd_queue_cfg *queue_cfg;
>  	struct rte_eth_dev_info info;
> +	rte_rx_callback_fn clb;
>  	int ret;
> 
>  	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> @@ -232,17 +212,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>  			ret = -ENOTSUP;
>  			goto end;
>  		}
> -		/* initialize data before enabling the callback */
> -		queue_cfg->empty_poll_stats = 0;
> -		queue_cfg->cb_mode = mode;
> -		queue_cfg->umwait_in_progress = false;
> -		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> -
> -		/* ensure we update our state before callback starts */
> -		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> -
> -		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> -				clb_umwait, NULL);
> +		clb = clb_umwait;
>  		break;
>  	}
>  	case RTE_POWER_MGMT_TYPE_SCALE:
> @@ -269,16 +239,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>  			ret = -ENOTSUP;
>  			goto end;
>  		}
> -		/* initialize data before enabling the callback */
> -		queue_cfg->empty_poll_stats = 0;
> -		queue_cfg->cb_mode = mode;
> -		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> -
> -		/* this is not necessary here, but do it anyway */
> -		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> -
> -		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
> -				queue_id, clb_scale_freq, NULL);
> +		clb = clb_scale_freq;
>  		break;
>  	}
>  	case RTE_POWER_MGMT_TYPE_PAUSE:
> @@ -286,18 +247,21 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>  		if (global_data.tsc_per_us == 0)
>  			calc_tsc();
> 
> -		/* initialize data before enabling the callback */
> -		queue_cfg->empty_poll_stats = 0;
> -		queue_cfg->cb_mode = mode;
> -		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> -
> -		/* this is not necessary here, but do it anyway */
> -		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> -
> -		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> -				clb_pause, NULL);
> +		clb = clb_pause;
>  		break;
> +	default:
> +		RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
> +		ret = -EINVAL;
> +		goto end;
>  	}
> +
> +	/* initialize data before enabling the callback */
> +	queue_cfg->empty_poll_stats = 0;
> +	queue_cfg->cb_mode = mode;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> +	queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +			clb, NULL);
> +
>  	ret = 0;
>  end:
>  	return ret;
> @@ -323,27 +287,8 @@ rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
>  	/* stop any callbacks from progressing */
>  	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> 
> -	/* ensure we update our state before continuing */
> -	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> -
>  	switch (queue_cfg->cb_mode) {
> -	case RTE_POWER_MGMT_TYPE_MONITOR:
> -	{
> -		bool exit = false;
> -		do {
> -			/*
> -			 * we may request cancellation while the other thread
> -			 * has just entered the callback but hasn't started
> -			 * sleeping yet, so keep waking it up until we know it's
> -			 * done sleeping.
> -			 */
> -			if (queue_cfg->umwait_in_progress)
> -				rte_power_monitor_wakeup(lcore_id);
> -			else
> -				exit = true;
> -		} while (!exit);
> -	}
> -	/* fall-through */
> +	case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
>  	case RTE_POWER_MGMT_TYPE_PAUSE:
>  		rte_eth_remove_rx_callback(port_id, queue_id,
>  				queue_cfg->cur_cb);
> @@ -356,10 +301,11 @@ rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
>  		break;
>  	}
>  	/*
> -	 * we don't free the RX callback here because it is unsafe to do so
> -	 * unless we know for a fact that all data plane threads have stopped.
> +	 * the API doc mandates that the user stops all processing on affected
> +	 * ports before calling any of these API's, so we can assume that the
> +	 * callbacks can be freed. we're intentionally casting away const-ness.
>  	 */
> -	queue_cfg->cur_cb = NULL;
> +	rte_free((void *)queue_cfg->cur_cb);
> 
>  	return 0;
>  }
> diff --git a/lib/power/rte_power_pmd_mgmt.h b/lib/power/rte_power_pmd_mgmt.h
> index 7a0ac24625..7557f5d7e1 100644
> --- a/lib/power/rte_power_pmd_mgmt.h
> +++ b/lib/power/rte_power_pmd_mgmt.h
> @@ -43,6 +43,9 @@ enum rte_power_pmd_mgmt_type {
>   *
>   * @note This function is not thread-safe.
>   *
> + * @warning This function must be called when all affected Ethernet ports are
> + *   stopped and no Rx/Tx is in progress!
> + *
>   * @param lcore_id
>   *   The lcore the Rx queue will be polled from.
>   * @param port_id
> @@ -69,6 +72,9 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
>   *
>   * @note This function is not thread-safe.
>   *
> + * @warning This function must be called when all affected Ethernet ports are
> + *   stopped and no Rx/Tx is in progress!
> + *
>   * @param lcore_id
>   *   The lcore the Rx queue is polled from.
>   * @param port_id
> --
> 2.25.1
  
Burakov, Anatoly June 23, 2021, 9:46 a.m. UTC | #2
On 22-Jun-21 10:13 AM, Ananyev, Konstantin wrote:
> 
>> Currently, we expect that only one callback can be active at any given
>> moment, for a particular queue configuration, which is relatively easy
>> to implement in a thread-safe way. However, we're about to add support
>> for multiple queues per lcore, which will greatly increase the
>> possibility of various race conditions.
>>
>> We could have used something like an RCU for this use case, but absent
>> of a pressing need for thread safety we'll go the easy way and just
>> mandate that the API's are to be called when all affected ports are
>> stopped, and document this limitation. This greatly simplifies the
>> `rte_power_monitor`-related code.
> 
> I think you need to update RN too with that.

Yep, will fix.

> Another thing - do you really need the whole port stopped?
>  From what I understand - you work on queues, so it is enough for you
> that related RX queue is stopped.
> So, to make things a bit more robust, in pmgmt_queue_enable/disable
> you can call rte_eth_rx_queue_info_get() and check queue state.

We work on queues, but the data is per-lcore not per-queue, and it is 
potentially used by multiple queues, so checking one specific queue is 
not going to be enough. We could check all queues that were registered 
so far with the power library, maybe that'll work better?

> 
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>   lib/power/meson.build          |   3 +
>>   lib/power/rte_power_pmd_mgmt.c | 106 ++++++++-------------------------
>>   lib/power/rte_power_pmd_mgmt.h |   6 ++
>>   3 files changed, 35 insertions(+), 80 deletions(-)
>>
>> diff --git a/lib/power/meson.build b/lib/power/meson.build
>> index c1097d32f1..4f6a242364 100644
>> --- a/lib/power/meson.build
>> +++ b/lib/power/meson.build
>> @@ -21,4 +21,7 @@ headers = files(
>>           'rte_power_pmd_mgmt.h',
>>           'rte_power_guest_channel.h',
>>   )
>> +if cc.has_argument('-Wno-cast-qual')
>> +    cflags += '-Wno-cast-qual'
>> +endif
>>   deps += ['timer', 'ethdev']
>> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
>> index db03cbf420..0707c60a4f 100644
>> --- a/lib/power/rte_power_pmd_mgmt.c
>> +++ b/lib/power/rte_power_pmd_mgmt.c
>> @@ -40,8 +40,6 @@ struct pmd_queue_cfg {
>>        /**< Callback mode for this queue */
>>        const struct rte_eth_rxtx_callback *cur_cb;
>>        /**< Callback instance */
>> -     volatile bool umwait_in_progress;
>> -     /**< are we currently sleeping? */
>>        uint64_t empty_poll_stats;
>>        /**< Number of empty polls */
>>   } __rte_cache_aligned;
>> @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
>>                        struct rte_power_monitor_cond pmc;
>>                        uint16_t ret;
>>
>> -                     /*
>> -                      * we might get a cancellation request while being
>> -                      * inside the callback, in which case the wakeup
>> -                      * wouldn't work because it would've arrived too early.
>> -                      *
>> -                      * to get around this, we notify the other thread that
>> -                      * we're sleeping, so that it can spin until we're done.
>> -                      * unsolicited wakeups are perfectly safe.
>> -                      */
>> -                     q_conf->umwait_in_progress = true;
>> -
>> -                     rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
>> -
>> -                     /* check if we need to cancel sleep */
>> -                     if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
>> -                             /* use monitoring condition to sleep */
>> -                             ret = rte_eth_get_monitor_addr(port_id, qidx,
>> -                                             &pmc);
>> -                             if (ret == 0)
>> -                                     rte_power_monitor(&pmc, UINT64_MAX);
>> -                     }
>> -                     q_conf->umwait_in_progress = false;
>> -
>> -                     rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
>> +                     /* use monitoring condition to sleep */
>> +                     ret = rte_eth_get_monitor_addr(port_id, qidx,
>> +                                     &pmc);
>> +                     if (ret == 0)
>> +                             rte_power_monitor(&pmc, UINT64_MAX);
>>                }
>>        } else
>>                q_conf->empty_poll_stats = 0;
>> @@ -183,6 +162,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>>   {
>>        struct pmd_queue_cfg *queue_cfg;
>>        struct rte_eth_dev_info info;
>> +     rte_rx_callback_fn clb;
>>        int ret;
>>
>>        RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>> @@ -232,17 +212,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>>                        ret = -ENOTSUP;
>>                        goto end;
>>                }
>> -             /* initialize data before enabling the callback */
>> -             queue_cfg->empty_poll_stats = 0;
>> -             queue_cfg->cb_mode = mode;
>> -             queue_cfg->umwait_in_progress = false;
>> -             queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
>> -
>> -             /* ensure we update our state before callback starts */
>> -             rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
>> -
>> -             queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> -                             clb_umwait, NULL);
>> +             clb = clb_umwait;
>>                break;
>>        }
>>        case RTE_POWER_MGMT_TYPE_SCALE:
>> @@ -269,16 +239,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>>                        ret = -ENOTSUP;
>>                        goto end;
>>                }
>> -             /* initialize data before enabling the callback */
>> -             queue_cfg->empty_poll_stats = 0;
>> -             queue_cfg->cb_mode = mode;
>> -             queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
>> -
>> -             /* this is not necessary here, but do it anyway */
>> -             rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
>> -
>> -             queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
>> -                             queue_id, clb_scale_freq, NULL);
>> +             clb = clb_scale_freq;
>>                break;
>>        }
>>        case RTE_POWER_MGMT_TYPE_PAUSE:
>> @@ -286,18 +247,21 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>>                if (global_data.tsc_per_us == 0)
>>                        calc_tsc();
>>
>> -             /* initialize data before enabling the callback */
>> -             queue_cfg->empty_poll_stats = 0;
>> -             queue_cfg->cb_mode = mode;
>> -             queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
>> -
>> -             /* this is not necessary here, but do it anyway */
>> -             rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
>> -
>> -             queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> -                             clb_pause, NULL);
>> +             clb = clb_pause;
>>                break;
>> +     default:
>> +             RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
>> +             ret = -EINVAL;
>> +             goto end;
>>        }
>> +
>> +     /* initialize data before enabling the callback */
>> +     queue_cfg->empty_poll_stats = 0;
>> +     queue_cfg->cb_mode = mode;
>> +     queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
>> +     queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +                     clb, NULL);
>> +
>>        ret = 0;
>>   end:
>>        return ret;
>> @@ -323,27 +287,8 @@ rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
>>        /* stop any callbacks from progressing */
>>        queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
>>
>> -     /* ensure we update our state before continuing */
>> -     rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
>> -
>>        switch (queue_cfg->cb_mode) {
>> -     case RTE_POWER_MGMT_TYPE_MONITOR:
>> -     {
>> -             bool exit = false;
>> -             do {
>> -                     /*
>> -                      * we may request cancellation while the other thread
>> -                      * has just entered the callback but hasn't started
>> -                      * sleeping yet, so keep waking it up until we know it's
>> -                      * done sleeping.
>> -                      */
>> -                     if (queue_cfg->umwait_in_progress)
>> -                             rte_power_monitor_wakeup(lcore_id);
>> -                     else
>> -                             exit = true;
>> -             } while (!exit);
>> -     }
>> -     /* fall-through */
>> +     case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
>>        case RTE_POWER_MGMT_TYPE_PAUSE:
>>                rte_eth_remove_rx_callback(port_id, queue_id,
>>                                queue_cfg->cur_cb);
>> @@ -356,10 +301,11 @@ rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
>>                break;
>>        }
>>        /*
>> -      * we don't free the RX callback here because it is unsafe to do so
>> -      * unless we know for a fact that all data plane threads have stopped.
>> +      * the API doc mandates that the user stops all processing on affected
>> +      * ports before calling any of these API's, so we can assume that the
>> +      * callbacks can be freed. we're intentionally casting away const-ness.
>>         */
>> -     queue_cfg->cur_cb = NULL;
>> +     rte_free((void *)queue_cfg->cur_cb);
>>
>>        return 0;
>>   }
>> diff --git a/lib/power/rte_power_pmd_mgmt.h b/lib/power/rte_power_pmd_mgmt.h
>> index 7a0ac24625..7557f5d7e1 100644
>> --- a/lib/power/rte_power_pmd_mgmt.h
>> +++ b/lib/power/rte_power_pmd_mgmt.h
>> @@ -43,6 +43,9 @@ enum rte_power_pmd_mgmt_type {
>>    *
>>    * @note This function is not thread-safe.
>>    *
>> + * @warning This function must be called when all affected Ethernet ports are
>> + *   stopped and no Rx/Tx is in progress!
>> + *
>>    * @param lcore_id
>>    *   The lcore the Rx queue will be polled from.
>>    * @param port_id
>> @@ -69,6 +72,9 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
>>    *
>>    * @note This function is not thread-safe.
>>    *
>> + * @warning This function must be called when all affected Ethernet ports are
>> + *   stopped and no Rx/Tx is in progress!
>> + *
>>    * @param lcore_id
>>    *   The lcore the Rx queue is polled from.
>>    * @param port_id
>> --
>> 2.25.1
>
  
Ananyev, Konstantin June 23, 2021, 9:52 a.m. UTC | #3
> 
> On 22-Jun-21 10:13 AM, Ananyev, Konstantin wrote:
> >
> >> Currently, we expect that only one callback can be active at any given
> >> moment, for a particular queue configuration, which is relatively easy
> >> to implement in a thread-safe way. However, we're about to add support
> >> for multiple queues per lcore, which will greatly increase the
> >> possibility of various race conditions.
> >>
> >> We could have used something like an RCU for this use case, but absent
> >> of a pressing need for thread safety we'll go the easy way and just
> >> mandate that the API's are to be called when all affected ports are
> >> stopped, and document this limitation. This greatly simplifies the
> >> `rte_power_monitor`-related code.
> >
> > I think you need to update RN too with that.
> 
> Yep, will fix.
> 
> > Another thing - do you really need the whole port stopped?
> >  From what I understand - you work on queues, so it is enough for you
> > that related RX queue is stopped.
> > So, to make things a bit more robust, in pmgmt_queue_enable/disable
> > you can call rte_eth_rx_queue_info_get() and check queue state.
> 
> We work on queues, but the data is per-lcore not per-queue, and it is
> potentially used by multiple queues, so checking one specific queue is
> not going to be enough. We could check all queues that were registered
> so far with the power library, maybe that'll work better?

Yep, that's what I meant: in queue_enable(), check whether that queue is stopped or not.
If not, return -EBUSY/-EAGAIN or so.
Sorry if I wasn't clear the first time.


> 
> >
> >> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >> ---
> >>   lib/power/meson.build          |   3 +
> >>   lib/power/rte_power_pmd_mgmt.c | 106 ++++++++-------------------------
> >>   lib/power/rte_power_pmd_mgmt.h |   6 ++
> >>   3 files changed, 35 insertions(+), 80 deletions(-)
> >>
> >> diff --git a/lib/power/meson.build b/lib/power/meson.build
> >> index c1097d32f1..4f6a242364 100644
> >> --- a/lib/power/meson.build
> >> +++ b/lib/power/meson.build
> >> @@ -21,4 +21,7 @@ headers = files(
> >>           'rte_power_pmd_mgmt.h',
> >>           'rte_power_guest_channel.h',
> >>   )
> >> +if cc.has_argument('-Wno-cast-qual')
> >> +    cflags += '-Wno-cast-qual'
> >> +endif
> >>   deps += ['timer', 'ethdev']
> >> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
> >> index db03cbf420..0707c60a4f 100644
> >> --- a/lib/power/rte_power_pmd_mgmt.c
> >> +++ b/lib/power/rte_power_pmd_mgmt.c
> >> @@ -40,8 +40,6 @@ struct pmd_queue_cfg {
> >>        /**< Callback mode for this queue */
> >>        const struct rte_eth_rxtx_callback *cur_cb;
> >>        /**< Callback instance */
> >> -     volatile bool umwait_in_progress;
> >> -     /**< are we currently sleeping? */
> >>        uint64_t empty_poll_stats;
> >>        /**< Number of empty polls */
> >>   } __rte_cache_aligned;
> >> @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
> >>                        struct rte_power_monitor_cond pmc;
> >>                        uint16_t ret;
> >>
> >> -                     /*
> >> -                      * we might get a cancellation request while being
> >> -                      * inside the callback, in which case the wakeup
> >> -                      * wouldn't work because it would've arrived too early.
> >> -                      *
> >> -                      * to get around this, we notify the other thread that
> >> -                      * we're sleeping, so that it can spin until we're done.
> >> -                      * unsolicited wakeups are perfectly safe.
> >> -                      */
> >> -                     q_conf->umwait_in_progress = true;
> >> -
> >> -                     rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> >> -
> >> -                     /* check if we need to cancel sleep */
> >> -                     if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> >> -                             /* use monitoring condition to sleep */
> >> -                             ret = rte_eth_get_monitor_addr(port_id, qidx,
> >> -                                             &pmc);
> >> -                             if (ret == 0)
> >> -                                     rte_power_monitor(&pmc, UINT64_MAX);
> >> -                     }
> >> -                     q_conf->umwait_in_progress = false;
> >> -
> >> -                     rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> >> +                     /* use monitoring condition to sleep */
> >> +                     ret = rte_eth_get_monitor_addr(port_id, qidx,
> >> +                                     &pmc);
> >> +                     if (ret == 0)
> >> +                             rte_power_monitor(&pmc, UINT64_MAX);
> >>                }
> >>        } else
> >>                q_conf->empty_poll_stats = 0;
> >> @@ -183,6 +162,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
> >>   {
> >>        struct pmd_queue_cfg *queue_cfg;
> >>        struct rte_eth_dev_info info;
> >> +     rte_rx_callback_fn clb;
> >>        int ret;
> >>
> >>        RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> >> @@ -232,17 +212,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
> >>                        ret = -ENOTSUP;
> >>                        goto end;
> >>                }
> >> -             /* initialize data before enabling the callback */
> >> -             queue_cfg->empty_poll_stats = 0;
> >> -             queue_cfg->cb_mode = mode;
> >> -             queue_cfg->umwait_in_progress = false;
> >> -             queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> >> -
> >> -             /* ensure we update our state before callback starts */
> >> -             rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> >> -
> >> -             queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> -                             clb_umwait, NULL);
> >> +             clb = clb_umwait;
> >>                break;
> >>        }
> >>        case RTE_POWER_MGMT_TYPE_SCALE:
> >> @@ -269,16 +239,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
> >>                        ret = -ENOTSUP;
> >>                        goto end;
> >>                }
> >> -             /* initialize data before enabling the callback */
> >> -             queue_cfg->empty_poll_stats = 0;
> >> -             queue_cfg->cb_mode = mode;
> >> -             queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> >> -
> >> -             /* this is not necessary here, but do it anyway */
> >> -             rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> >> -
> >> -             queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
> >> -                             queue_id, clb_scale_freq, NULL);
> >> +             clb = clb_scale_freq;
> >>                break;
> >>        }
> >>        case RTE_POWER_MGMT_TYPE_PAUSE:
> >> @@ -286,18 +247,21 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
> >>                if (global_data.tsc_per_us == 0)
> >>                        calc_tsc();
> >>
> >> -             /* initialize data before enabling the callback */
> >> -             queue_cfg->empty_poll_stats = 0;
> >> -             queue_cfg->cb_mode = mode;
> >> -             queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> >> -
> >> -             /* this is not necessary here, but do it anyway */
> >> -             rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> >> -
> >> -             queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> -                             clb_pause, NULL);
> >> +             clb = clb_pause;
> >>                break;
> >> +     default:
> >> +             RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
> >> +             ret = -EINVAL;
> >> +             goto end;
> >>        }
> >> +
> >> +     /* initialize data before enabling the callback */
> >> +     queue_cfg->empty_poll_stats = 0;
> >> +     queue_cfg->cb_mode = mode;
> >> +     queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> >> +     queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +                     clb, NULL);
> >> +
> >>        ret = 0;
> >>   end:
> >>        return ret;
> >> @@ -323,27 +287,8 @@ rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
> >>        /* stop any callbacks from progressing */
> >>        queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> >>
> >> -     /* ensure we update our state before continuing */
> >> -     rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
> >> -
> >>        switch (queue_cfg->cb_mode) {
> >> -     case RTE_POWER_MGMT_TYPE_MONITOR:
> >> -     {
> >> -             bool exit = false;
> >> -             do {
> >> -                     /*
> >> -                      * we may request cancellation while the other thread
> >> -                      * has just entered the callback but hasn't started
> >> -                      * sleeping yet, so keep waking it up until we know it's
> >> -                      * done sleeping.
> >> -                      */
> >> -                     if (queue_cfg->umwait_in_progress)
> >> -                             rte_power_monitor_wakeup(lcore_id);
> >> -                     else
> >> -                             exit = true;
> >> -             } while (!exit);
> >> -     }
> >> -     /* fall-through */
> >> +     case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
> >>        case RTE_POWER_MGMT_TYPE_PAUSE:
> >>                rte_eth_remove_rx_callback(port_id, queue_id,
> >>                                queue_cfg->cur_cb);
> >> @@ -356,10 +301,11 @@ rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
> >>                break;
> >>        }
> >>        /*
> >> -      * we don't free the RX callback here because it is unsafe to do so
> >> -      * unless we know for a fact that all data plane threads have stopped.
> >> +      * the API doc mandates that the user stops all processing on affected
> >> +      * ports before calling any of these API's, so we can assume that the
> >> +      * callbacks can be freed. we're intentionally casting away const-ness.
> >>         */
> >> -     queue_cfg->cur_cb = NULL;
> >> +     rte_free((void *)queue_cfg->cur_cb);
> >>
> >>        return 0;
> >>   }
> >> diff --git a/lib/power/rte_power_pmd_mgmt.h b/lib/power/rte_power_pmd_mgmt.h
> >> index 7a0ac24625..7557f5d7e1 100644
> >> --- a/lib/power/rte_power_pmd_mgmt.h
> >> +++ b/lib/power/rte_power_pmd_mgmt.h
> >> @@ -43,6 +43,9 @@ enum rte_power_pmd_mgmt_type {
> >>    *
> >>    * @note This function is not thread-safe.
> >>    *
> >> + * @warning This function must be called when all affected Ethernet ports are
> >> + *   stopped and no Rx/Tx is in progress!
> >> + *
> >>    * @param lcore_id
> >>    *   The lcore the Rx queue will be polled from.
> >>    * @param port_id
> >> @@ -69,6 +72,9 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
> >>    *
> >>    * @note This function is not thread-safe.
> >>    *
> >> + * @warning This function must be called when all affected Ethernet ports are
> >> + *   stopped and no Rx/Tx is in progress!
> >> + *
> >>    * @param lcore_id
> >>    *   The lcore the Rx queue is polled from.
> >>    * @param port_id
> >> --
> >> 2.25.1
> >
> 
> 
> --
> Thanks,
> Anatoly
  
Burakov, Anatoly June 25, 2021, 11:52 a.m. UTC | #4
On 23-Jun-21 10:52 AM, Ananyev, Konstantin wrote:
> 
> 
>>
>> On 22-Jun-21 10:13 AM, Ananyev, Konstantin wrote:
>>>
>>>> Currently, we expect that only one callback can be active at any given
>>>> moment, for a particular queue configuration, which is relatively easy
>>>> to implement in a thread-safe way. However, we're about to add support
>>>> for multiple queues per lcore, which will greatly increase the
>>>> possibility of various race conditions.
>>>>
>>>> We could have used something like an RCU for this use case, but absent
>>>> of a pressing need for thread safety we'll go the easy way and just
>>>> mandate that the API's are to be called when all affected ports are
>>>> stopped, and document this limitation. This greatly simplifies the
>>>> `rte_power_monitor`-related code.
>>>
>>> I think you need to update RN too with that.
>>
>> Yep, will fix.
>>
>>> Another thing - do you really need the whole port stopped?
>>>   From what I understand - you work on queues, so it is enough for you
>>> that related RX queue is stopped.
>>> So, to make things a bit more robust, in pmgmt_queue_enable/disable
>>> you can call rte_eth_rx_queue_info_get() and check queue state.
>>
>> We work on queues, but the data is per-lcore not per-queue, and it is
>> potentially used by multiple queues, so checking one specific queue is
>> not going to be enough. We could check all queues that were registered
>> so far with the power library, maybe that'll work better?
> 
> Yep, that's what I mean: on queue_enable() check is that queue stopped or not.
> If not, return -EBUSY/EAGAIN or so/
> Sorry if I wasn't clear at first time.

I think it's still better that all queues are stopped, rather than 
trying to work around the inherently racy implementation. So while I'll 
add the queue stopped checks, I'll still remove all of the thread safety 
stuff from here.
  
Ananyev, Konstantin June 25, 2021, 2:42 p.m. UTC | #5
> >>
> >> On 22-Jun-21 10:13 AM, Ananyev, Konstantin wrote:
> >>>
> >>>> Currently, we expect that only one callback can be active at any given
> >>>> moment, for a particular queue configuration, which is relatively easy
> >>>> to implement in a thread-safe way. However, we're about to add support
> >>>> for multiple queues per lcore, which will greatly increase the
> >>>> possibility of various race conditions.
> >>>>
> >>>> We could have used something like an RCU for this use case, but absent
> >>>> of a pressing need for thread safety we'll go the easy way and just
> >>>> mandate that the API's are to be called when all affected ports are
> >>>> stopped, and document this limitation. This greatly simplifies the
> >>>> `rte_power_monitor`-related code.
> >>>
> >>> I think you need to update RN too with that.
> >>
> >> Yep, will fix.
> >>
> >>> Another thing - do you really need the whole port stopped?
> >>>   From what I understand - you work on queues, so it is enough for you
> >>> that related RX queue is stopped.
> >>> So, to make things a bit more robust, in pmgmt_queue_enable/disable
> >>> you can call rte_eth_rx_queue_info_get() and check queue state.
> >>
> >> We work on queues, but the data is per-lcore not per-queue, and it is
> >> potentially used by multiple queues, so checking one specific queue is
> >> not going to be enough. We could check all queues that were registered
> >> so far with the power library, maybe that'll work better?
> >
> > Yep, that's what I mean: on queue_enable() check is that queue stopped or not.
> > If not, return -EBUSY/EAGAIN or so/
> > Sorry if I wasn't clear at first time.
> 
> I think it's still better that all queues are stopped, rather than
> trying to work around the inherently racy implementation. So while i'll
> add the queue stopped checks, i'll still remove all of the thread safety
> stuff from here.

That's fine by me; all I asked for here is an extra check to make sure the queue is really stopped.
  

Patch

diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@  headers = files(
         'rte_power_pmd_mgmt.h',
         'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+    cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..0707c60a4f 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@  struct pmd_queue_cfg {
 	/**< Callback mode for this queue */
 	const struct rte_eth_rxtx_callback *cur_cb;
 	/**< Callback instance */
-	volatile bool umwait_in_progress;
-	/**< are we currently sleeping? */
 	uint64_t empty_poll_stats;
 	/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@  clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
 			struct rte_power_monitor_cond pmc;
 			uint16_t ret;
 
-			/*
-			 * we might get a cancellation request while being
-			 * inside the callback, in which case the wakeup
-			 * wouldn't work because it would've arrived too early.
-			 *
-			 * to get around this, we notify the other thread that
-			 * we're sleeping, so that it can spin until we're done.
-			 * unsolicited wakeups are perfectly safe.
-			 */
-			q_conf->umwait_in_progress = true;
-
-			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-			/* check if we need to cancel sleep */
-			if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-				/* use monitoring condition to sleep */
-				ret = rte_eth_get_monitor_addr(port_id, qidx,
-						&pmc);
-				if (ret == 0)
-					rte_power_monitor(&pmc, UINT64_MAX);
-			}
-			q_conf->umwait_in_progress = false;
-
-			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+			/* use monitoring condition to sleep */
+			ret = rte_eth_get_monitor_addr(port_id, qidx,
+					&pmc);
+			if (ret == 0)
+				rte_power_monitor(&pmc, UINT64_MAX);
 		}
 	} else
 		q_conf->empty_poll_stats = 0;
@@ -183,6 +162,7 @@  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 {
 	struct pmd_queue_cfg *queue_cfg;
 	struct rte_eth_dev_info info;
+	rte_rx_callback_fn clb;
 	int ret;
 
 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
@@ -232,17 +212,7 @@  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 			ret = -ENOTSUP;
 			goto end;
 		}
-		/* initialize data before enabling the callback */
-		queue_cfg->empty_poll_stats = 0;
-		queue_cfg->cb_mode = mode;
-		queue_cfg->umwait_in_progress = false;
-		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
-
-		/* ensure we update our state before callback starts */
-		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
-				clb_umwait, NULL);
+		clb = clb_umwait;
 		break;
 	}
 	case RTE_POWER_MGMT_TYPE_SCALE:
@@ -269,16 +239,7 @@  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 			ret = -ENOTSUP;
 			goto end;
 		}
-		/* initialize data before enabling the callback */
-		queue_cfg->empty_poll_stats = 0;
-		queue_cfg->cb_mode = mode;
-		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
-
-		/* this is not necessary here, but do it anyway */
-		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
-				queue_id, clb_scale_freq, NULL);
+		clb = clb_scale_freq;
 		break;
 	}
 	case RTE_POWER_MGMT_TYPE_PAUSE:
@@ -286,18 +247,21 @@  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 		if (global_data.tsc_per_us == 0)
 			calc_tsc();
 
-		/* initialize data before enabling the callback */
-		queue_cfg->empty_poll_stats = 0;
-		queue_cfg->cb_mode = mode;
-		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
-
-		/* this is not necessary here, but do it anyway */
-		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
-				clb_pause, NULL);
+		clb = clb_pause;
 		break;
+	default:
+		RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
+		ret = -EINVAL;
+		goto end;
 	}
+
+	/* initialize data before enabling the callback */
+	queue_cfg->empty_poll_stats = 0;
+	queue_cfg->cb_mode = mode;
+	queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+	queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+			clb, NULL);
+
 	ret = 0;
 end:
 	return ret;
@@ -323,27 +287,8 @@  rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
 	/* stop any callbacks from progressing */
 	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
 
-	/* ensure we update our state before continuing */
-	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
 	switch (queue_cfg->cb_mode) {
-	case RTE_POWER_MGMT_TYPE_MONITOR:
-	{
-		bool exit = false;
-		do {
-			/*
-			 * we may request cancellation while the other thread
-			 * has just entered the callback but hasn't started
-			 * sleeping yet, so keep waking it up until we know it's
-			 * done sleeping.
-			 */
-			if (queue_cfg->umwait_in_progress)
-				rte_power_monitor_wakeup(lcore_id);
-			else
-				exit = true;
-		} while (!exit);
-	}
-	/* fall-through */
+	case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
 	case RTE_POWER_MGMT_TYPE_PAUSE:
 		rte_eth_remove_rx_callback(port_id, queue_id,
 				queue_cfg->cur_cb);
@@ -356,10 +301,11 @@  rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
 		break;
 	}
 	/*
-	 * we don't free the RX callback here because it is unsafe to do so
-	 * unless we know for a fact that all data plane threads have stopped.
+	 * the API doc mandates that the user stops all processing on affected
+	 * ports before calling any of these API's, so we can assume that the
+	 * callbacks can be freed. we're intentionally casting away const-ness.
 	 */
-	queue_cfg->cur_cb = NULL;
+	rte_free((void *)queue_cfg->cur_cb);
 
 	return 0;
 }
diff --git a/lib/power/rte_power_pmd_mgmt.h b/lib/power/rte_power_pmd_mgmt.h
index 7a0ac24625..7557f5d7e1 100644
--- a/lib/power/rte_power_pmd_mgmt.h
+++ b/lib/power/rte_power_pmd_mgmt.h
@@ -43,6 +43,9 @@  enum rte_power_pmd_mgmt_type {
  *
  * @note This function is not thread-safe.
  *
+ * @warning This function must be called when all affected Ethernet ports are
+ *   stopped and no Rx/Tx is in progress!
+ *
  * @param lcore_id
  *   The lcore the Rx queue will be polled from.
  * @param port_id
@@ -69,6 +72,9 @@  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
  *
  * @note This function is not thread-safe.
  *
+ * @warning This function must be called when all affected Ethernet ports are
+ *   stopped and no Rx/Tx is in progress!
+ *
  * @param lcore_id
  *   The lcore the Rx queue is polled from.
  * @param port_id