[RFC] random: use per lcore state

Message ID 20230906172013.169846-1-stephen@networkplumber.org (mailing list archive)
State Rejected, archived
Delegated to: David Marchand
Series [RFC] random: use per lcore state

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS

Commit Message

Stephen Hemminger Sept. 6, 2023, 5:20 p.m. UTC
  Move the random number state into thread local storage.
This has several benefits.
 - no false cache sharing from cpu prefetching
 - fixes initialization of random state for non-DPDK threads
 - fixes unsafe usage of random state by non-DPDK threads

The initialization of random number state is done by the
lcore (lazy initialization).

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/eal/common/rte_random.c | 38 +++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)
  

Comments

Morten Brørup Sept. 6, 2023, 5:54 p.m. UTC | #1
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, 6 September 2023 19.20
> 
> Move the random number state into thread local storage.
> This has a several benefits.
>  - no false cache sharing from cpu prefetching
>  - fixes initialization of random state for non-DPDK threads
>  - fixes unsafe usage of random state by non-DPDK threads
> 
> The initialization of random number state is done by the
> lcore (lazy initialization).
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  lib/eal/common/rte_random.c | 38 +++++++++++++++++++------------------
>  1 file changed, 20 insertions(+), 18 deletions(-)
> 
> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> index 53636331a27b..9657adf6ad3b 100644
> --- a/lib/eal/common/rte_random.c
> +++ b/lib/eal/common/rte_random.c
> @@ -19,13 +19,14 @@ struct rte_rand_state {
>  	uint64_t z3;
>  	uint64_t z4;
>  	uint64_t z5;
> -} __rte_cache_aligned;
> +	uint64_t seed;
> +};
> 
> -/* One instance each for every lcore id-equipped thread, and one
> - * additional instance to be shared by all others threads (i.e., all
> - * unregistered non-EAL threads).
> - */
> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> +/* Global random seed */
> +static uint64_t rte_rand_seed;
> +
> +/* Per lcore random state. */
> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
> 
>  static uint32_t
>  __rte_rand_lcg32(uint32_t *seed)
> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct
> rte_rand_state *state)
>  void
>  rte_srand(uint64_t seed)
>  {
> -	unsigned int lcore_id;
> -
> -	/* add lcore_id to seed to avoid having the same sequence */
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> -		__rte_srand_lfsr258(seed + lcore_id,
> &rand_states[lcore_id]);
> +	__atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>  }
> 
>  static __rte_always_inline uint64_t
> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state)
>  static __rte_always_inline
>  struct rte_rand_state *__rte_rand_get_state(void)
>  {
> -	unsigned int idx;
> +	struct rte_rand_state *rand_state =
> &RTE_PER_LCORE(rte_rand_state);
> +	uint64_t seed;
> 
> -	idx = rte_lcore_id();
> +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> +	if (unlikely(seed != rand_state->seed)) {

Please note that rte_rand_seed lives in a completely different cache line than RTE_PER_LCORE(rte_rand_state), so the comparison with rte_rand_seed requires reading one more cache line than the original implementation, which only uses the cache line holding rand_states[idx].

This is in the hot path.

If we could register a per-thread INIT function, the lazy initialization could be avoided, and only one cache line accessed.

> +		rand_state->seed = seed;
> 
> -	/* last instance reserved for unregistered non-EAL threads */
> -	if (unlikely(idx == LCORE_ID_ANY))
> -		idx = RTE_MAX_LCORE;
> +		seed += rte_thread_self().opaque_id;
> +		__rte_srand_lfsr258(seed, rand_state);
> +	}
> 
> -	return &rand_states[idx];
> +	return rand_state;
>  }
> 
>  uint64_t
> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>  {
>  	uint64_t seed;
> 
> -	seed = __rte_random_initial_seed();
> +	do
> +		seed = __rte_random_initial_seed();
> +	while (seed == 0);
> 
>  	rte_srand(seed);
>  }
> --
> 2.39.2
  
Morten Brørup Sept. 6, 2023, 6:16 p.m. UTC | #2
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, 6 September 2023 19.20
> 
> Move the random number state into thread local storage.
> This has a several benefits.
>  - no false cache sharing from cpu prefetching
>  - fixes initialization of random state for non-DPDK threads
>  - fixes unsafe usage of random state by non-DPDK threads
> 
> The initialization of random number state is done by the
> lcore (lazy initialization).
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  lib/eal/common/rte_random.c | 38 +++++++++++++++++++------------------
>  1 file changed, 20 insertions(+), 18 deletions(-)
> 
> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> index 53636331a27b..9657adf6ad3b 100644
> --- a/lib/eal/common/rte_random.c
> +++ b/lib/eal/common/rte_random.c
> @@ -19,13 +19,14 @@ struct rte_rand_state {
>  	uint64_t z3;
>  	uint64_t z4;
>  	uint64_t z5;
> -} __rte_cache_aligned;
> +	uint64_t seed;
> +};
> 
> -/* One instance each for every lcore id-equipped thread, and one
> - * additional instance to be shared by all others threads (i.e., all
> - * unregistered non-EAL threads).
> - */
> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> +/* Global random seed */
> +static uint64_t rte_rand_seed;
> +
> +/* Per lcore random state. */
> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
> 
>  static uint32_t
>  __rte_rand_lcg32(uint32_t *seed)
> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct
> rte_rand_state *state)
>  void
>  rte_srand(uint64_t seed)
>  {
> -	unsigned int lcore_id;
> -
> -	/* add lcore_id to seed to avoid having the same sequence */
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> -		__rte_srand_lfsr258(seed + lcore_id,
> &rand_states[lcore_id]);
> +	__atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>  }
> 
>  static __rte_always_inline uint64_t
> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state)
>  static __rte_always_inline
>  struct rte_rand_state *__rte_rand_get_state(void)
>  {
> -	unsigned int idx;
> +	struct rte_rand_state *rand_state =
> &RTE_PER_LCORE(rte_rand_state);
> +	uint64_t seed;
> 
> -	idx = rte_lcore_id();
> +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> +	if (unlikely(seed != rand_state->seed)) {

Please note that rte_rand_seed lives in a completely different cache line than RTE_PER_LCORE(rte_rand_state), so the comparison with rte_rand_seed requires reading one more cache line than the original implementation, which only uses the cache line holding rand_states[idx].

This is in the hot path.

If we could register a per-thread INIT function, the lazy initialization could be avoided, and only one cache line accessed.

Or, simply replace "uint64_t seed" with "bool initialized" in the rte_rand_state structure, so the lazy init only needs to read rte_rand_seed if rand_state->initialized is false.
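
A minimal sketch of that variant, assuming the rest of the posted patch (rte_rand_seed, __rte_srand_lfsr258(), rte_thread_self()) is in place; the obvious trade-off is that a later rte_srand() call would no longer be noticed by threads that have already initialized:

/* Sketch only: "initialized" flag instead of caching the seed, so the hot
 * path normally touches just the per-thread cache line. Assumes
 * rte_rand_seed and __rte_srand_lfsr258() from the posted patch;
 * needs <stdbool.h>. */
struct rte_rand_state {
	uint64_t z1;
	uint64_t z2;
	uint64_t z3;
	uint64_t z4;
	uint64_t z5;
	bool initialized;
};

static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);

static __rte_always_inline
struct rte_rand_state *__rte_rand_get_state(void)
{
	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);

	if (unlikely(!rand_state->initialized)) {
		/* only the first call in a thread reads rte_rand_seed */
		uint64_t seed = __atomic_load_n(&rte_rand_seed,
						__ATOMIC_RELAXED);

		__rte_srand_lfsr258(seed + rte_thread_self().opaque_id,
				    rand_state);
		rand_state->initialized = true;
	}

	return rand_state;
}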

> +		rand_state->seed = seed;
> 
> -	/* last instance reserved for unregistered non-EAL threads */
> -	if (unlikely(idx == LCORE_ID_ANY))
> -		idx = RTE_MAX_LCORE;
> +		seed += rte_thread_self().opaque_id;
> +		__rte_srand_lfsr258(seed, rand_state);
> +	}
> 
> -	return &rand_states[idx];
> +	return rand_state;
>  }
> 
>  uint64_t
> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>  {
>  	uint64_t seed;
> 
> -	seed = __rte_random_initial_seed();
> +	do
> +		seed = __rte_random_initial_seed();
> +	while (seed == 0);
> 
>  	rte_srand(seed);
>  }
> --
> 2.39.2
  
Stephen Hemminger Sept. 6, 2023, 7:55 p.m. UTC | #3
On Wed,  6 Sep 2023 10:20:13 -0700
Stephen Hemminger <stephen@networkplumber.org> wrote:

>  static __rte_always_inline
>  struct rte_rand_state *__rte_rand_get_state(void)
>  {
> -	unsigned int idx;
> +	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);
> +	uint64_t seed;
>  
> -	idx = rte_lcore_id();
> +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> +	if (unlikely(seed != rand_state->seed)) {
> +		rand_state->seed = seed;
>  
> -	/* last instance reserved for unregistered non-EAL threads */
> -	if (unlikely(idx == LCORE_ID_ANY))
> -		idx = RTE_MAX_LCORE;
> +		seed += rte_thread_self().opaque_id;
> +		__rte_srand_lfsr258(seed, rand_state);
> +	}

Not sure about this.
It would change the semantics of rte_srand so that if passed the same
value across multiple runs, it would still generate different values because
thread_id is not the same.  Using rte_lcore_id() instead would preserve repeatability
but then there would still be a bug if two non-EAL threads used random.
Both threads would get the same sequence of numbers, but that is true
with current code.
  
Mattias Rönnblom Sept. 6, 2023, 8:02 p.m. UTC | #4
On 2023-09-06 19:20, Stephen Hemminger wrote:
> Move the random number state into thread local storage.

Me and Morten discussed TLS versus other alternatives in some other 
thread. The downside of TLS that Morten pointed out, from what I recall, 
is that lazy initialization is *required* (since the number of threads 
is open-ended), and the data ends up in non-huge page memory. It was 
also unclear to me what the memory footprint implications would be, 
would large per-lcore data structures be put in TLS. More specifically, 
if they would be duplicated across all threads, even non-lcore threads.

None of these issues affect rte_random.c's potential usage of TLS 
(except lazy [re-]initialization makes things more complicated).

Preferably, there should be one pattern that is usable across all or at 
least most DPDK modules requiring per-lcore state.

> This has a several benefits.
>   - no false cache sharing from cpu prefetching
>   - fixes initialization of random state for non-DPDK threads

This seems like a non-reason to me. That bug is easily fixed, if it 
isn't already.

>   - fixes unsafe usage of random state by non-DPDK threads
> 

"Makes random number generation MT safe from all threads (including 
unregistered non-EAL threads)."

With current API semantics you may still register an non-EAL thread, to 
get MT safe access to this API, so I guess it's more about being more 
convenient and less error prone, than anything else.

The new MT safety guarantees should be in the API docs as well.

> The initialization of random number state is done by the
> lcore (lazy initialization).
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>   lib/eal/common/rte_random.c | 38 +++++++++++++++++++------------------
>   1 file changed, 20 insertions(+), 18 deletions(-)
> 
> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> index 53636331a27b..9657adf6ad3b 100644
> --- a/lib/eal/common/rte_random.c
> +++ b/lib/eal/common/rte_random.c
> @@ -19,13 +19,14 @@ struct rte_rand_state {
>   	uint64_t z3;
>   	uint64_t z4;
>   	uint64_t z5;
> -} __rte_cache_aligned;
> +	uint64_t seed;
> +};
>   
> -/* One instance each for every lcore id-equipped thread, and one
> - * additional instance to be shared by all others threads (i.e., all
> - * unregistered non-EAL threads).
> - */
> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> +/* Global random seed */
> +static uint64_t rte_rand_seed;
> +
> +/* Per lcore random state. */
> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
>   
>   static uint32_t
>   __rte_rand_lcg32(uint32_t *seed)
> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct rte_rand_state *state)
>   void
>   rte_srand(uint64_t seed)
>   {
> -	unsigned int lcore_id;
> -
> -	/* add lcore_id to seed to avoid having the same sequence */
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> -		__rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]);
> +	__atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>   }
>   
>   static __rte_always_inline uint64_t
> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state)
>   static __rte_always_inline
>   struct rte_rand_state *__rte_rand_get_state(void)
>   {
> -	unsigned int idx;
> +	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);

There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE, to 
cover this usage. Or just use __thread (or _Thread_local?).
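
For what it's worth, RTE_PER_LCORE is already thread-local storage under the hood, so a per-thread name could be a pure alias; a sketch (the RTE_PER_THREAD names are hypothetical, not an existing DPDK API):

/* Hypothetical aliases, not an existing DPDK API: RTE_DEFINE_PER_LCORE
 * already expands to a __thread variable, so "per thread" naming would
 * only clarify intent, not change the storage. */
#include <rte_per_lcore.h>

#define RTE_DEFINE_PER_THREAD(type, name) RTE_DEFINE_PER_LCORE(type, name)
#define RTE_DECLARE_PER_THREAD(type, name) RTE_DECLARE_PER_LCORE(type, name)
#define RTE_PER_THREAD(name) RTE_PER_LCORE(name)

/* rte_random.c would then read: */
static RTE_DEFINE_PER_THREAD(struct rte_rand_state, rte_rand_state);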

> +	uint64_t seed;
>   
> -	idx = rte_lcore_id();
> +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> +	if (unlikely(seed != rand_state->seed)) {
> +		rand_state->seed = seed;
>   

Re-seeding should restart the series, on all lcores. There's nothing 
preventing the user from re-seeding the machinery repeatedly, with the 
same seed. Seems like an unusual, but still valid, use case, if you run 
repeated tests of some sort.

Use a seqlock? :) I guess you need a seed generation number as well 
(e.g., is this the first time you seed with X, or the second one, etc.)
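
A rough sketch of tying re-seeding to a generation number (rte_rand_gen, the "gen" field and the acquire/release pairing are assumptions, not part of the posted patch); every rte_srand() call bumps the counter, so re-seeding with an identical value still restarts each thread's series:

/* Sketch: global seed plus generation counter. Assumes struct
 * rte_rand_state gains a "gen" member, and reuses the posted patch's
 * RTE_PER_LCORE state and __rte_srand_lfsr258(). */
static uint64_t rte_rand_seed;
static uint64_t rte_rand_gen;

void
rte_srand(uint64_t seed)
{
	__atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
	/* release: a reader that sees the new generation also sees the seed */
	__atomic_add_fetch(&rte_rand_gen, 1, __ATOMIC_RELEASE);
}

static __rte_always_inline
struct rte_rand_state *__rte_rand_get_state(void)
{
	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);
	uint64_t gen = __atomic_load_n(&rte_rand_gen, __ATOMIC_ACQUIRE);

	if (unlikely(gen != rand_state->gen)) {
		uint64_t seed = __atomic_load_n(&rte_rand_seed,
						__ATOMIC_RELAXED);

		rand_state->gen = gen;
		__rte_srand_lfsr258(seed + rte_thread_self().opaque_id,
				    rand_state);
	}

	return rand_state;
}

This still isn't a fully consistent snapshot under concurrent re-seeding; a seqlock around (seed, gen) would close that gap.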

> -	/* last instance reserved for unregistered non-EAL threads */
> -	if (unlikely(idx == LCORE_ID_ANY))
> -		idx = RTE_MAX_LCORE;
> +		seed += rte_thread_self().opaque_id;
> +		__rte_srand_lfsr258(seed, rand_state);
> +	}
>   
> -	return &rand_states[idx];
> +	return rand_state;
>   }
>   
>   uint64_t
> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>   {
>   	uint64_t seed;
>   
> -	seed = __rte_random_initial_seed();
> +	do
> +		seed = __rte_random_initial_seed();
> +	while (seed == 0);
>   

Might be worth a comment why seed 0 is not allowed. Alternatively, use 
some other way of signaling __rte_srand_lfsr258() must be called.

>   	rte_srand(seed);
>   }
  
Mattias Rönnblom Sept. 6, 2023, 8:12 p.m. UTC | #5
On 2023-09-06 21:55, Stephen Hemminger wrote:
> On Wed,  6 Sep 2023 10:20:13 -0700
> Stephen Hemminger <stephen@networkplumber.org> wrote:
> 
>>   static __rte_always_inline
>>   struct rte_rand_state *__rte_rand_get_state(void)
>>   {
>> -	unsigned int idx;
>> +	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);
>> +	uint64_t seed;
>>   
>> -	idx = rte_lcore_id();
>> +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
>> +	if (unlikely(seed != rand_state->seed)) {
>> +		rand_state->seed = seed;
>>   
>> -	/* last instance reserved for unregistered non-EAL threads */
>> -	if (unlikely(idx == LCORE_ID_ANY))
>> -		idx = RTE_MAX_LCORE;
>> +		seed += rte_thread_self().opaque_id;
>> +		__rte_srand_lfsr258(seed, rand_state);
>> +	}
> 
> Not sure about this.
> It would change the semantics of rte_srand so that if passed the same
> value across multiple runs, it would still generate different values because
> thread_id is not the same.  Using rte_lcore() instead would cause repeatablity
> but then there would still be a bug if two non-EAL threads used random.

If the non-EAL threads register themselves (and thus acquires a lcore 
id), they will get this kind of repeatability, provided the order of 
registration is the same. A "big if", no doubt.

> Both threads would get the same sequence of numbers, but that is true
> with current code.
  
Stephen Hemminger Sept. 6, 2023, 11 p.m. UTC | #6
On Wed, 6 Sep 2023 22:02:54 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> On 2023-09-06 19:20, Stephen Hemminger wrote:
> > Move the random number state into thread local storage.  
> 
> Me and Morten discussed TLS versus other alternatives in some other 
> thread. The downside of TLS that Morten pointed out, from what I recall, 
> is that lazy initialization is *required* (since the number of threads 
> is open-ended), and the data ends up in non-huge page memory. It was 
> also unclear to me what the memory footprint implications would be, 
> would large per-lcore data structures be put in TLS. More specifically, 
> if they would be duplicated across all threads, even non-lcore threads.

But current method is unsafe on non-lcore threads.
Two non-lcore threads calling rte_rand() will clash on state without
any locking protection.

Also, right now the array is sized at 129 entries to allow for the
maximum number of lcores. When the maximum is increased to 512 or 1024
the problem will get worse.
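
For scale, assuming 64-byte cache lines: each __rte_cache_aligned state occupies a full line, so the array is about 129 * 64 B ≈ 8 KB today and would grow to roughly 64 KB with 1024 lcores plus the shared slot.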
  
Mattias Rönnblom Sept. 8, 2023, 7:04 a.m. UTC | #7
On 2023-09-07 01:00, Stephen Hemminger wrote:
> On Wed, 6 Sep 2023 22:02:54 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
>> On 2023-09-06 19:20, Stephen Hemminger wrote:
>>> Move the random number state into thread local storage.
>>
>> Me and Morten discussed TLS versus other alternatives in some other
>> thread. The downside of TLS that Morten pointed out, from what I recall,
>> is that lazy initialization is *required* (since the number of threads
>> is open-ended), and the data ends up in non-huge page memory. It was
>> also unclear to me what the memory footprint implications would be,
>> would large per-lcore data structures be put in TLS. More specifically,
>> if they would be duplicated across all threads, even non-lcore threads.
> 
> But current method is unsafe on non-lcore threads.
> Two non-lcore threads calling rte_rand() will clash on state without
> any locking protection.
> 

Sure, just like the API docs say, although the documentation use more 
precise terminology.

If you want to extend the API MT safety guarantees, it should come with 
an argument to why this change is needed.

Is this to save the application from calling rte_thread_register() in 
control plane threads? For convenience? Or for being generally less 
error prone?

Another reason might be that the application have many threads (more 
than RTE_LCORE_MAX), so it will run out of lcore ids.

> Also, right now the array is sized at 129 entries to allow for the
> maximum number of lcores. When the maximum is increased to 512 or 1024
> the problem will get worse.

Using TLS will penalize every thread in the process, not only EAL 
threads and registered non-EAL threads, and worse: not only threads that 
are using the API in question.

Every thread will carry the TLS memory around, increasing the process 
memory footprint.

Thread creation will be slower, since TLS memory is allocated *and 
initialized*, lazy user code-level initialization or not.

On my particular Linux x86_64 system, pthread creation overhead looks 
something like:

8 us w/o any user code-level use of TLS
11 us w/ 16 kB of TLS
314 us w/ 2 MB of TLS.

So, whatever you put into TLS, it needs to be small.
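
A rough way to reproduce that kind of measurement (plain C sketch; TLS_BYTES is the knob to vary, the exact numbers are system dependent and include the join overhead):

/* Build with: cc -O2 -pthread tls_bench.c */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define TLS_BYTES (2 * 1024 * 1024)	/* try e.g. 16 kB vs. 2 MB */

static _Thread_local unsigned char tls_blob[TLS_BYTES];

static void *
worker(void *arg)
{
	(void)arg;
	tls_blob[0]++;	/* make sure the TLS object is actually used */
	return NULL;
}

int
main(void)
{
	struct timespec start, end;
	pthread_t tid;

	clock_gettime(CLOCK_MONOTONIC, &start);
	pthread_create(&tid, NULL, worker, NULL);
	pthread_join(tid, NULL);
	clock_gettime(CLOCK_MONOTONIC, &end);

	printf("create+join: %ld ns\n",
	       (end.tv_sec - start.tv_sec) * 1000000000L +
	       (end.tv_nsec - start.tv_nsec));
	return 0;
}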

Putting a large amount of data into TLS will effectively prevent the 
DPDK libraries from being linked into a heavily multi-threaded app, 
regardless if those threads calls into DPDK or not.

Again, this doesn't much affect rte_random.c, but does disqualify TLS as 
a plug-in replacement for the current pattern with a statically 
allocated lcore id-indexed array.
  
Konstantin Ananyev Sept. 9, 2023, 12:13 a.m. UTC | #8
06/09/2023 21:02, Mattias Rönnblom wrote:
> On 2023-09-06 19:20, Stephen Hemminger wrote:
>> Move the random number state into thread local storage.
> 
> Me and Morten discussed TLS versus other alternatives in some other 
> thread. The downside of TLS that Morten pointed out, from what I recall, 
> is that lazy initialization is *required* (since the number of threads 
> is open-ended), and the data ends up in non-huge page memory.

Hmm.. correct me if I am wrong, but with current implementation,
rand state is also in non-huge memory:
static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];


> It was 
> also unclear to me what the memory footprint implications would be, 
> would large per-lcore data structures be put in TLS. More specifically, 
> if they would be duplicated across all threads, even non-lcore threads.
> 
> None of these issues affect rte_random.c's potential usage of TLS 
> (except lazy [re-]initialization makes things more complicated).
> 
> Preferably, there should be one pattern that is usable across all or at 
> least most DPDK modules requiring per-lcore state.
> 
>> This has a several benefits.
>>   - no false cache sharing from cpu prefetching
>>   - fixes initialization of random state for non-DPDK threads
> 
> This seems like a non-reason to me. That bug is easily fixed, if it 
> isn't already.
> 
>>   - fixes unsafe usage of random state by non-DPDK threads
>>
> 
> "Makes random number generation MT safe from all threads (including 
> unregistered non-EAL threads)."
> 
> With current API semantics you may still register an non-EAL thread, to 
> get MT safe access to this API, so I guess it's more about being more 
> convenient and less error prone, than anything else.

I understand that we never guaranteed MT safety for non-EAL threads here,
but as a user of rte_rand() - it would be much more convenient if I could
use it from any thread without worrying whether it is an EAL thread or not.

About TLS usage and re-seeding - can we use some sort of middle ground:
extend rte_rand_state with a generation counter.
Keep a 'master' copy of rte_rand_state that is updated by rte_srand(),
plus TLS copies of rte_rand_state, so rte_rand() can first compare
its generation counter with the master copy to decide
whether it needs to copy the new state from the master or not.


> The new MT safety guarantees should be in the API docs as well.

Yes, it is an extension to the current API, not a fix.

> 
>> The initialization of random number state is done by the
>> lcore (lazy initialization).
>>
>> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>> ---
>>   lib/eal/common/rte_random.c | 38 +++++++++++++++++++------------------
>>   1 file changed, 20 insertions(+), 18 deletions(-)
>>
>> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
>> index 53636331a27b..9657adf6ad3b 100644
>> --- a/lib/eal/common/rte_random.c
>> +++ b/lib/eal/common/rte_random.c
>> @@ -19,13 +19,14 @@ struct rte_rand_state {
>>       uint64_t z3;
>>       uint64_t z4;
>>       uint64_t z5;
>> -} __rte_cache_aligned;
>> +    uint64_t seed;
>> +};
>> -/* One instance each for every lcore id-equipped thread, and one
>> - * additional instance to be shared by all others threads (i.e., all
>> - * unregistered non-EAL threads).
>> - */
>> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
>> +/* Global random seed */
>> +static uint64_t rte_rand_seed;
>> +
>> +/* Per lcore random state. */
>> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
>>   static uint32_t
>>   __rte_rand_lcg32(uint32_t *seed)
>> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct 
>> rte_rand_state *state)
>>   void
>>   rte_srand(uint64_t seed)
>>   {
>> -    unsigned int lcore_id;
>> -
>> -    /* add lcore_id to seed to avoid having the same sequence */
>> -    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
>> -        __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]);
>> +    __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>>   }
>>   static __rte_always_inline uint64_t
>> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state)
>>   static __rte_always_inline
>>   struct rte_rand_state *__rte_rand_get_state(void)
>>   {
>> -    unsigned int idx;
>> +    struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);
> 
> There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE, to 
> cover this usage. Or just use __thread (or _Thread_local?).
> 
>> +    uint64_t seed;
>> -    idx = rte_lcore_id();
>> +    seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
>> +    if (unlikely(seed != rand_state->seed)) {
>> +        rand_state->seed = seed;
> 
> Re-seeding should restart the series, on all lcores. There's nothing 
> preventing the user from re-seeding the machinery repeatedly, with the 
> same seed. Seems like an unusual, but still valid, use case, if you run 
> repeated tests of some sort.
> 
> Use a seqlock? :) I guess you need a seed generation number as well 
> (e.g., is this the first time you seed with X, or the second one, etc.)
> 
>> -    /* last instance reserved for unregistered non-EAL threads */
>> -    if (unlikely(idx == LCORE_ID_ANY))
>> -        idx = RTE_MAX_LCORE;
>> +        seed += rte_thread_self().opaque_id;
>> +        __rte_srand_lfsr258(seed, rand_state);
>> +    }
>> -    return &rand_states[idx];
>> +    return rand_state;
>>   }
>>   uint64_t
>> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>>   {
>>       uint64_t seed;
>> -    seed = __rte_random_initial_seed();
>> +    do
>> +        seed = __rte_random_initial_seed();
>> +    while (seed == 0);
> 
> Might be worth a comment why seed 0 is not allowed. Alternatively, use 
> some other way of signaling __rte_srand_lfsr258() must be called.
> 
>>       rte_srand(seed);
>>   }
  
Mattias Rönnblom Sept. 9, 2023, 6:45 a.m. UTC | #9
On 2023-09-09 02:13, Konstantin Ananyev wrote:
> 06/09/2023 21:02, Mattias Rönnblom wrote:
>> On 2023-09-06 19:20, Stephen Hemminger wrote:
>>> Move the random number state into thread local storage.
>>
>> Me and Morten discussed TLS versus other alternatives in some other 
>> thread. The downside of TLS that Morten pointed out, from what I 
>> recall, is that lazy initialization is *required* (since the number of 
>> threads is open-ended), and the data ends up in non-huge page memory.
> 
> Hmm.. correct me if I am wrong, but with current implementation,
> rand state is also in non-huge memory:
> static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> 

Yes. The current pattern is certainly not perfect.

> 
>> It was also unclear to me what the memory footprint implications would 
>> be, would large per-lcore data structures be put in TLS. More 
>> specifically, if they would be duplicated across all threads, even 
>> non-lcore threads.
>>
>> None of these issues affect rte_random.c's potential usage of TLS 
>> (except lazy [re-]initialization makes things more complicated).
>>
>> Preferably, there should be one pattern that is usable across all or 
>> at least most DPDK modules requiring per-lcore state.
>>
>>> This has a several benefits.
>>>   - no false cache sharing from cpu prefetching
>>>   - fixes initialization of random state for non-DPDK threads
>>
>> This seems like a non-reason to me. That bug is easily fixed, if it 
>> isn't already.
>>
>>>   - fixes unsafe usage of random state by non-DPDK threads
>>>
>>
>> "Makes random number generation MT safe from all threads (including 
>> unregistered non-EAL threads)."
>>
>> With current API semantics you may still register an non-EAL thread, 
>> to get MT safe access to this API, so I guess it's more about being 
>> more convenient and less error prone, than anything else.
> 
> I understand that we never guaranteed MT safety for non-EAL threads here,


Registered non-EAL threads have a lcore id and thus may safely call 
rte_rand(). Multiple unregistered non-EAL threads may not do so, in 
parallel.


> but as a user of rte_rand() - it would be much more convenient, if I can 
> use it
> from any thread wthout worring is it a EAL thread or not.

Sure, especially if it comes for free. The for-free solution has yet to 
reveal itself though.

> 
> About TlS usage and re-seeding - can we use some sort of middle-ground:
> extend rte_rand_state with some gen-counter.
> Make a 'master' copy of rte_rand_state that will be updated by rte_srand(),
> and TLS copies of rte_rand_state, so rte_rand() can fist compare
> its gen-counter value with master copy to decide,
> does it need to copy new state from master or not.
> 

Calling threads shouldn't all produce the same sequence. That would be 
silly and not very random. The generation number should be tied to the seed.

> 
>> The new MT safety guarantees should be in the API docs as well.
> 
> Yes, it is an extension to the current API, not a fix.
> 
>>
>>> The initialization of random number state is done by the
>>> lcore (lazy initialization).
>>>
>>> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>>> ---
>>>   lib/eal/common/rte_random.c | 38 +++++++++++++++++++------------------
>>>   1 file changed, 20 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
>>> index 53636331a27b..9657adf6ad3b 100644
>>> --- a/lib/eal/common/rte_random.c
>>> +++ b/lib/eal/common/rte_random.c
>>> @@ -19,13 +19,14 @@ struct rte_rand_state {
>>>       uint64_t z3;
>>>       uint64_t z4;
>>>       uint64_t z5;
>>> -} __rte_cache_aligned;
>>> +    uint64_t seed;
>>> +};
>>> -/* One instance each for every lcore id-equipped thread, and one
>>> - * additional instance to be shared by all others threads (i.e., all
>>> - * unregistered non-EAL threads).
>>> - */
>>> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
>>> +/* Global random seed */
>>> +static uint64_t rte_rand_seed;
>>> +
>>> +/* Per lcore random state. */
>>> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
>>>   static uint32_t
>>>   __rte_rand_lcg32(uint32_t *seed)
>>> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct 
>>> rte_rand_state *state)
>>>   void
>>>   rte_srand(uint64_t seed)
>>>   {
>>> -    unsigned int lcore_id;
>>> -
>>> -    /* add lcore_id to seed to avoid having the same sequence */
>>> -    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
>>> -        __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]);
>>> +    __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>>>   }
>>>   static __rte_always_inline uint64_t
>>> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state)
>>>   static __rte_always_inline
>>>   struct rte_rand_state *__rte_rand_get_state(void)
>>>   {
>>> -    unsigned int idx;
>>> +    struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);
>>
>> There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE, to 
>> cover this usage. Or just use __thread (or _Thread_local?).
>>
>>> +    uint64_t seed;
>>> -    idx = rte_lcore_id();
>>> +    seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
>>> +    if (unlikely(seed != rand_state->seed)) {
>>> +        rand_state->seed = seed;
>>
>> Re-seeding should restart the series, on all lcores. There's nothing 
>> preventing the user from re-seeding the machinery repeatedly, with the 
>> same seed. Seems like an unusual, but still valid, use case, if you 
>> run repeated tests of some sort.
>>
>> Use a seqlock? :) I guess you need a seed generation number as well 
>> (e.g., is this the first time you seed with X, or the second one, etc.)
>>
>>> -    /* last instance reserved for unregistered non-EAL threads */
>>> -    if (unlikely(idx == LCORE_ID_ANY))
>>> -        idx = RTE_MAX_LCORE;
>>> +        seed += rte_thread_self().opaque_id;
>>> +        __rte_srand_lfsr258(seed, rand_state);
>>> +    }
>>> -    return &rand_states[idx];
>>> +    return rand_state;
>>>   }
>>>   uint64_t
>>> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>>>   {
>>>       uint64_t seed;
>>> -    seed = __rte_random_initial_seed();
>>> +    do
>>> +        seed = __rte_random_initial_seed();
>>> +    while (seed == 0);
>>
>> Might be worth a comment why seed 0 is not allowed. Alternatively, use 
>> some other way of signaling __rte_srand_lfsr258() must be called.
>>
>>>       rte_srand(seed);
>>>   }
>
  
Morten Brørup Sept. 9, 2023, 11:23 a.m. UTC | #10
> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Saturday, 9 September 2023 08.45
> 
> On 2023-09-09 02:13, Konstantin Ananyev wrote:
> > 06/09/2023 21:02, Mattias Rönnblom wrote:
> >> On 2023-09-06 19:20, Stephen Hemminger wrote:
> >>> Move the random number state into thread local storage.
> >>
> >> Me and Morten discussed TLS versus other alternatives in some other
> >> thread. The downside of TLS that Morten pointed out, from what I
> >> recall, is that lazy initialization is *required* (since the number
> of
> >> threads is open-ended), and the data ends up in non-huge page memory.
> >
> > Hmm.. correct me if I am wrong, but with current implementation,
> > rand state is also in non-huge memory:
> > static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> >
> 
> Yes. The current pattern is certainly not perfect.
> 
> >
> >> It was also unclear to me what the memory footprint implications
> would
> >> be, would large per-lcore data structures be put in TLS. More
> >> specifically, if they would be duplicated across all threads, even
> >> non-lcore threads.
> >>
> >> None of these issues affect rte_random.c's potential usage of TLS
> >> (except lazy [re-]initialization makes things more complicated).
> >>
> >> Preferably, there should be one pattern that is usable across all or
> >> at least most DPDK modules requiring per-lcore state.
> >>
> >>> This has a several benefits.
> >>>   - no false cache sharing from cpu prefetching
> >>>   - fixes initialization of random state for non-DPDK threads
> >>
> >> This seems like a non-reason to me. That bug is easily fixed, if it
> >> isn't already.
> >>
> >>>   - fixes unsafe usage of random state by non-DPDK threads
> >>>
> >>
> >> "Makes random number generation MT safe from all threads (including
> >> unregistered non-EAL threads)."
> >>
> >> With current API semantics you may still register an non-EAL thread,
> >> to get MT safe access to this API, so I guess it's more about being
> >> more convenient and less error prone, than anything else.
> >
> > I understand that we never guaranteed MT safety for non-EAL threads
> here,
> 
> 
> Registered non-EAL threads have a lcore id and thus may safely call
> rte_rand(). Multiple unregistered non-EAL threads may not do so, in
> parallel.
> 
> 
> > but as a user of rte_rand() - it would be much more convenient, if I
> can
> > use it
> > from any thread wthout worring is it a EAL thread or not.
> 
> Sure, especially if it comes for free. The for-free solution has yet to
> reveal itself though.

We could offer re-entrant function variants for non-EAL threads:

uint64_t rte_rand_r(struct rte_rand_state * const state);
void rte_srand_r(struct rte_rand_state * const state, uint64_t seed);
uint64_t rte_rand_max_r(struct rte_rand_state * const state, uint64_t upper_bound);
double rte_drand_r(struct rte_rand_state * const state);

For this to work, we would have to make struct rte_rand_state public, and the application would need to allocate it. (At least one instance per thread that uses it, obviously.)
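
A usage sketch under those assumptions (the *_r() prototypes and a public struct rte_rand_state are the proposal above, not an existing DPDK API):

/* Hypothetical usage of the proposed re-entrant variants; assumes
 * rte_random.h is extended with a public struct rte_rand_state and the
 * *_r() prototypes listed above. */
#include <stdint.h>
#include <rte_random.h>

static _Thread_local struct rte_rand_state my_rand_state;

static void
my_thread_init(uint64_t seed)
{
	rte_srand_r(&my_rand_state, seed);
}

static uint64_t
my_random_backoff_ns(void)
{
	/* MT safe in any thread: the state is owned by the caller */
	return rte_rand_max_r(&my_rand_state, 1000000);
}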

> 
> >
> > About TlS usage and re-seeding - can we use some sort of middle-
> ground:
> > extend rte_rand_state with some gen-counter.
> > Make a 'master' copy of rte_rand_state that will be updated by
> rte_srand(),
> > and TLS copies of rte_rand_state, so rte_rand() can fist compare
> > its gen-counter value with master copy to decide,
> > does it need to copy new state from master or not.
> >
> 
> Calling threads shouldn't all produce the same sequence. That would be
> silly and not very random. The generation number should be tied to the
> seed.

I previously thought about seeding...

We are trying to be random, we are not explicitly pseudo-random.

So I came to the conclusion that the ability to reproduce data (typically for verification purposes) is not a requirement here.

> 
> >
> >> The new MT safety guarantees should be in the API docs as well.
> >
> > Yes, it is an extension to the current API, not a fix.
> >
> >>
> >>> The initialization of random number state is done by the
> >>> lcore (lazy initialization).
> >>>
> >>> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> >>> ---
> >>>   lib/eal/common/rte_random.c | 38 +++++++++++++++++++--------------
> ----
> >>>   1 file changed, 20 insertions(+), 18 deletions(-)
> >>>
> >>> diff --git a/lib/eal/common/rte_random.c
> b/lib/eal/common/rte_random.c
> >>> index 53636331a27b..9657adf6ad3b 100644
> >>> --- a/lib/eal/common/rte_random.c
> >>> +++ b/lib/eal/common/rte_random.c
> >>> @@ -19,13 +19,14 @@ struct rte_rand_state {
> >>>       uint64_t z3;
> >>>       uint64_t z4;
> >>>       uint64_t z5;
> >>> -} __rte_cache_aligned;
> >>> +    uint64_t seed;
> >>> +};
> >>> -/* One instance each for every lcore id-equipped thread, and one
> >>> - * additional instance to be shared by all others threads (i.e.,
> all
> >>> - * unregistered non-EAL threads).
> >>> - */
> >>> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> >>> +/* Global random seed */
> >>> +static uint64_t rte_rand_seed;
> >>> +
> >>> +/* Per lcore random state. */
> >>> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
> >>>   static uint32_t
> >>>   __rte_rand_lcg32(uint32_t *seed)
> >>> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct
> >>> rte_rand_state *state)
> >>>   void
> >>>   rte_srand(uint64_t seed)
> >>>   {
> >>> -    unsigned int lcore_id;
> >>> -
> >>> -    /* add lcore_id to seed to avoid having the same sequence */
> >>> -    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> >>> -        __rte_srand_lfsr258(seed + lcore_id,
> &rand_states[lcore_id]);
> >>> +    __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
> >>>   }
> >>>   static __rte_always_inline uint64_t
> >>> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state
> *state)
> >>>   static __rte_always_inline
> >>>   struct rte_rand_state *__rte_rand_get_state(void)
> >>>   {
> >>> -    unsigned int idx;
> >>> +    struct rte_rand_state *rand_state =
> &RTE_PER_LCORE(rte_rand_state);
> >>
> >> There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE,
> to
> >> cover this usage. Or just use __thread (or _Thread_local?).
> >>
> >>> +    uint64_t seed;
> >>> -    idx = rte_lcore_id();
> >>> +    seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> >>> +    if (unlikely(seed != rand_state->seed)) {
> >>> +        rand_state->seed = seed;
> >>
> >> Re-seeding should restart the series, on all lcores. There's nothing
> >> preventing the user from re-seeding the machinery repeatedly, with
> the
> >> same seed. Seems like an unusual, but still valid, use case, if you
> >> run repeated tests of some sort.
> >>
> >> Use a seqlock? :) I guess you need a seed generation number as well
> >> (e.g., is this the first time you seed with X, or the second one,
> etc.)
> >>
> >>> -    /* last instance reserved for unregistered non-EAL threads */
> >>> -    if (unlikely(idx == LCORE_ID_ANY))
> >>> -        idx = RTE_MAX_LCORE;
> >>> +        seed += rte_thread_self().opaque_id;
> >>> +        __rte_srand_lfsr258(seed, rand_state);
> >>> +    }
> >>> -    return &rand_states[idx];
> >>> +    return rand_state;
> >>>   }
> >>>   uint64_t
> >>> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
> >>>   {
> >>>       uint64_t seed;
> >>> -    seed = __rte_random_initial_seed();
> >>> +    do
> >>> +        seed = __rte_random_initial_seed();
> >>> +    while (seed == 0);
> >>
> >> Might be worth a comment why seed 0 is not allowed. Alternatively,
> use
> >> some other way of signaling __rte_srand_lfsr258() must be called.
> >>
> >>>       rte_srand(seed);
> >>>   }
> >
  
Stephen Hemminger Sept. 9, 2023, 11:32 a.m. UTC | #11
On Sat, 9 Sep 2023 08:45:17 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > Hmm.. correct me if I am wrong, but with current implementation,
> > rand state is also in non-huge memory:
> > static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
> >   
> 
> Yes. The current pattern is certainly not perfect.

There is no good reason to put it in hugepage memory.
In fact, hugepage memory is a limited resource and you do want
different rand_states for primary and secondary processes.
  
Konstantin Ananyev Sept. 10, 2023, 1:26 p.m. UTC | #12
09/09/2023 07:45, Mattias Rönnblom wrote:
> On 2023-09-09 02:13, Konstantin Ananyev wrote:
>> 06/09/2023 21:02, Mattias Rönnblom wrote:
>>> On 2023-09-06 19:20, Stephen Hemminger wrote:
>>>> Move the random number state into thread local storage.
>>>
>>> Me and Morten discussed TLS versus other alternatives in some other 
>>> thread. The downside of TLS that Morten pointed out, from what I 
>>> recall, is that lazy initialization is *required* (since the number 
>>> of threads is open-ended), and the data ends up in non-huge page memory.
>>
>> Hmm.. correct me if I am wrong, but with current implementation,
>> rand state is also in non-huge memory:
>> static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
>>
> 
> Yes. The current pattern is certainly not perfect.
> 
>>
>>> It was also unclear to me what the memory footprint implications 
>>> would be, would large per-lcore data structures be put in TLS. More 
>>> specifically, if they would be duplicated across all threads, even 
>>> non-lcore threads.
>>>
>>> None of these issues affect rte_random.c's potential usage of TLS 
>>> (except lazy [re-]initialization makes things more complicated).
>>>
>>> Preferably, there should be one pattern that is usable across all or 
>>> at least most DPDK modules requiring per-lcore state.
>>>
>>>> This has a several benefits.
>>>>   - no false cache sharing from cpu prefetching
>>>>   - fixes initialization of random state for non-DPDK threads
>>>
>>> This seems like a non-reason to me. That bug is easily fixed, if it 
>>> isn't already.
>>>
>>>>   - fixes unsafe usage of random state by non-DPDK threads
>>>>
>>>
>>> "Makes random number generation MT safe from all threads (including 
>>> unregistered non-EAL threads)."
>>>
>>> With current API semantics you may still register an non-EAL thread, 
>>> to get MT safe access to this API, so I guess it's more about being 
>>> more convenient and less error prone, than anything else.
>>
>> I understand that we never guaranteed MT safety for non-EAL threads here,
> 
> 
> Registered non-EAL threads have a lcore id and thus may safely call 
> rte_rand(). 

I am aware of that ability, but for me registering/unregistering a thread
just to call rte_rand() seems like way too much hassle.

> Multiple unregistered non-EAL threads may not do so, in 
> parallel.
> 
> 
>> but as a user of rte_rand() - it would be much more convenient, if I 
>> can use it
>> from any thread wthout worring is it a EAL thread or not.
> 
> Sure, especially if it comes for free. The for-free solution has yet to 
> reveal itself though.
> 
>>
>> About TlS usage and re-seeding - can we use some sort of middle-ground:
>> extend rte_rand_state with some gen-counter.
>> Make a 'master' copy of rte_rand_state that will be updated by 
>> rte_srand(),
>> and TLS copies of rte_rand_state, so rte_rand() can fist compare
>> its gen-counter value with master copy to decide,
>> does it need to copy new state from master or not.
>>
> 
> Calling threads shouldn't all produce the same sequence. That would be 
> silly and not very random. The generation number should be tied to the 
> seed.

Actually, yes, you are right, probably we don't need a master copy of 
rte_rand_state itself.
It seems that just having a 'master' copy of 'seed' value, plus some
counter (to indicate that seed has been updated) is enough here.

> 
>>
>>> The new MT safety guarantees should be in the API docs as well.
>>
>> Yes, it is an extension to the current API, not a fix.
>>
>>>
>>>> The initialization of random number state is done by the
>>>> lcore (lazy initialization).
>>>>
>>>> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>>>> ---
>>>>   lib/eal/common/rte_random.c | 38 
>>>> +++++++++++++++++++------------------
>>>>   1 file changed, 20 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
>>>> index 53636331a27b..9657adf6ad3b 100644
>>>> --- a/lib/eal/common/rte_random.c
>>>> +++ b/lib/eal/common/rte_random.c
>>>> @@ -19,13 +19,14 @@ struct rte_rand_state {
>>>>       uint64_t z3;
>>>>       uint64_t z4;
>>>>       uint64_t z5;
>>>> -} __rte_cache_aligned;
>>>> +    uint64_t seed;
>>>> +};
>>>> -/* One instance each for every lcore id-equipped thread, and one
>>>> - * additional instance to be shared by all others threads (i.e., all
>>>> - * unregistered non-EAL threads).
>>>> - */
>>>> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
>>>> +/* Global random seed */
>>>> +static uint64_t rte_rand_seed;
>>>> +
>>>> +/* Per lcore random state. */
>>>> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
>>>>   static uint32_t
>>>>   __rte_rand_lcg32(uint32_t *seed)
>>>> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct 
>>>> rte_rand_state *state)
>>>>   void
>>>>   rte_srand(uint64_t seed)
>>>>   {
>>>> -    unsigned int lcore_id;
>>>> -
>>>> -    /* add lcore_id to seed to avoid having the same sequence */
>>>> -    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
>>>> -        __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]);
>>>> +    __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>>>>   }
>>>>   static __rte_always_inline uint64_t
>>>> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state *state)
>>>>   static __rte_always_inline
>>>>   struct rte_rand_state *__rte_rand_get_state(void)
>>>>   {
>>>> -    unsigned int idx;
>>>> +    struct rte_rand_state *rand_state = 
>>>> &RTE_PER_LCORE(rte_rand_state);
>>>
>>> There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE, 
>>> to cover this usage. Or just use __thread (or _Thread_local?).
>>>
>>>> +    uint64_t seed;
>>>> -    idx = rte_lcore_id();
>>>> +    seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
>>>> +    if (unlikely(seed != rand_state->seed)) {
>>>> +        rand_state->seed = seed;
>>>
>>> Re-seeding should restart the series, on all lcores. There's nothing 
>>> preventing the user from re-seeding the machinery repeatedly, with 
>>> the same seed. Seems like an unusual, but still valid, use case, if 
>>> you run repeated tests of some sort.
>>>
>>> Use a seqlock? :) I guess you need a seed generation number as well 
>>> (e.g., is this the first time you seed with X, or the second one, etc.)
>>>
>>>> -    /* last instance reserved for unregistered non-EAL threads */
>>>> -    if (unlikely(idx == LCORE_ID_ANY))
>>>> -        idx = RTE_MAX_LCORE;
>>>> +        seed += rte_thread_self().opaque_id;
>>>> +        __rte_srand_lfsr258(seed, rand_state);
>>>> +    }
>>>> -    return &rand_states[idx];
>>>> +    return rand_state;
>>>>   }
>>>>   uint64_t
>>>> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>>>>   {
>>>>       uint64_t seed;
>>>> -    seed = __rte_random_initial_seed();
>>>> +    do
>>>> +        seed = __rte_random_initial_seed();
>>>> +    while (seed == 0);
>>>
>>> Might be worth a comment why seed 0 is not allowed. Alternatively, 
>>> use some other way of signaling __rte_srand_lfsr258() must be called.
>>>
>>>>       rte_srand(seed);
>>>>   }
>>
  
Mattias Rönnblom Sept. 11, 2023, 9 a.m. UTC | #13
On 2023-09-09 13:23, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Saturday, 9 September 2023 08.45
>>
>> On 2023-09-09 02:13, Konstantin Ananyev wrote:
>>> 06/09/2023 21:02, Mattias Rönnblom wrote:
>>>> On 2023-09-06 19:20, Stephen Hemminger wrote:
>>>>> Move the random number state into thread local storage.
>>>>
>>>> Me and Morten discussed TLS versus other alternatives in some other
>>>> thread. The downside of TLS that Morten pointed out, from what I
>>>> recall, is that lazy initialization is *required* (since the number
>> of
>>>> threads is open-ended), and the data ends up in non-huge page memory.
>>>
>>> Hmm.. correct me if I am wrong, but with current implementation,
>>> rand state is also in non-huge memory:
>>> static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
>>>
>>
>> Yes. The current pattern is certainly not perfect.
>>
>>>
>>>> It was also unclear to me what the memory footprint implications
>> would
>>>> be, would large per-lcore data structures be put in TLS. More
>>>> specifically, if they would be duplicated across all threads, even
>>>> non-lcore threads.
>>>>
>>>> None of these issues affect rte_random.c's potential usage of TLS
>>>> (except lazy [re-]initialization makes things more complicated).
>>>>
>>>> Preferably, there should be one pattern that is usable across all or
>>>> at least most DPDK modules requiring per-lcore state.
>>>>
>>>>> This has a several benefits.
>>>>>    - no false cache sharing from cpu prefetching
>>>>>    - fixes initialization of random state for non-DPDK threads
>>>>
>>>> This seems like a non-reason to me. That bug is easily fixed, if it
>>>> isn't already.
>>>>
>>>>>    - fixes unsafe usage of random state by non-DPDK threads
>>>>>
>>>>
>>>> "Makes random number generation MT safe from all threads (including
>>>> unregistered non-EAL threads)."
>>>>
>>>> With current API semantics you may still register an non-EAL thread,
>>>> to get MT safe access to this API, so I guess it's more about being
>>>> more convenient and less error prone, than anything else.
>>>
>>> I understand that we never guaranteed MT safety for non-EAL threads
>> here,
>>
>>
>> Registered non-EAL threads have a lcore id and thus may safely call
>> rte_rand(). Multiple unregistered non-EAL threads may not do so, in
>> parallel.
>>
>>
>>> but as a user of rte_rand() - it would be much more convenient, if I
>> can
>>> use it
>>> from any thread wthout worring is it a EAL thread or not.
>>
>> Sure, especially if it comes for free. The for-free solution has yet to
>> reveal itself though.
> 
> We could offer re-entrant function variants for non-EAL threads:
> 
> uint64_t rte_rand_r(struct rte_rand_state * const state);
> void rte_srand_r(struct rte_rand_state * const state, uint64_t seed);
> uint64_t rte_rand_max_r(struct rte_rand_state * const state, uint64_t upper_bound);
> double rte_drand_r(struct rte_rand_state * const state, void);
> 
> For this to work, we would have to make struct rte_rand_state public, and the application would need to allocate it. (At least one instance per thread that uses it, obviously.)
> 

Yes, and that will come at a pretty severe API complexity cost.

Besides the obvious complexities, it may also lead the user to believe 
the rte_rand() is not MT safe for any thread, since that's how it works 
in glibc (rand() versus rand_r()).

>>
>>>
>>> About TlS usage and re-seeding - can we use some sort of middle-
>> ground:
>>> extend rte_rand_state with some gen-counter.
>>> Make a 'master' copy of rte_rand_state that will be updated by
>> rte_srand(),
>>> and TLS copies of rte_rand_state, so rte_rand() can fist compare
>>> its gen-counter value with master copy to decide,
>>> does it need to copy new state from master or not.
>>>
>>
>> Calling threads shouldn't all produce the same sequence. That would be
>> silly and not very random. The generation number should be tied to the
>> seed.
> 
> I previously thought about seeding...
> 
> We are trying to be random, we are not explicitly pseudo-random.
> 
> So I came to the conclusion that the ability to reproduce data (typically for verification purposes) is not a requirement here.
> 
>>
>>>
>>>> The new MT safety guarantees should be in the API docs as well.
>>>
>>> Yes, it is an extension to the current API, not a fix.
>>>
>>>>
>>>>> The initialization of random number state is done by the
>>>>> lcore (lazy initialization).
>>>>>
>>>>> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>>>>> ---
>>>>>    lib/eal/common/rte_random.c | 38 +++++++++++++++++++--------------
>> ----
>>>>>    1 file changed, 20 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/lib/eal/common/rte_random.c
>> b/lib/eal/common/rte_random.c
>>>>> index 53636331a27b..9657adf6ad3b 100644
>>>>> --- a/lib/eal/common/rte_random.c
>>>>> +++ b/lib/eal/common/rte_random.c
>>>>> @@ -19,13 +19,14 @@ struct rte_rand_state {
>>>>>        uint64_t z3;
>>>>>        uint64_t z4;
>>>>>        uint64_t z5;
>>>>> -} __rte_cache_aligned;
>>>>> +    uint64_t seed;
>>>>> +};
>>>>> -/* One instance each for every lcore id-equipped thread, and one
>>>>> - * additional instance to be shared by all others threads (i.e.,
>> all
>>>>> - * unregistered non-EAL threads).
>>>>> - */
>>>>> -static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
>>>>> +/* Global random seed */
>>>>> +static uint64_t rte_rand_seed;
>>>>> +
>>>>> +/* Per lcore random state. */
>>>>> +static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
>>>>>    static uint32_t
>>>>>    __rte_rand_lcg32(uint32_t *seed)
>>>>> @@ -81,11 +82,7 @@ __rte_srand_lfsr258(uint64_t seed, struct
>>>>> rte_rand_state *state)
>>>>>    void
>>>>>    rte_srand(uint64_t seed)
>>>>>    {
>>>>> -    unsigned int lcore_id;
>>>>> -
>>>>> -    /* add lcore_id to seed to avoid having the same sequence */
>>>>> -    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
>>>>> -        __rte_srand_lfsr258(seed + lcore_id,
>> &rand_states[lcore_id]);
>>>>> +    __atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
>>>>>    }
>>>>>    static __rte_always_inline uint64_t
>>>>> @@ -119,15 +116,18 @@ __rte_rand_lfsr258(struct rte_rand_state
>> *state)
>>>>>    static __rte_always_inline
>>>>>    struct rte_rand_state *__rte_rand_get_state(void)
>>>>>    {
>>>>> -    unsigned int idx;
>>>>> +    struct rte_rand_state *rand_state =
>> &RTE_PER_LCORE(rte_rand_state);
>>>>
>>>> There should really be a RTE_PER_THREAD, an alias to RTE_PER_LCORE,
>> to
>>>> cover this usage. Or just use __thread (or _Thread_local?).
>>>>
>>>>> +    uint64_t seed;
>>>>> -    idx = rte_lcore_id();
>>>>> +    seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
>>>>> +    if (unlikely(seed != rand_state->seed)) {
>>>>> +        rand_state->seed = seed;
>>>>
>>>> Re-seeding should restart the series, on all lcores. There's nothing
>>>> preventing the user from re-seeding the machinery repeatedly, with
>> the
>>>> same seed. Seems like an unusual, but still valid, use case, if you
>>>> run repeated tests of some sort.
>>>>
>>>> Use a seqlock? :) I guess you need a seed generation number as well
>>>> (e.g., is this the first time you seed with X, or the second one,
>> etc.)
>>>>
>>>>> -    /* last instance reserved for unregistered non-EAL threads */
>>>>> -    if (unlikely(idx == LCORE_ID_ANY))
>>>>> -        idx = RTE_MAX_LCORE;
>>>>> +        seed += rte_thread_self().opaque_id;
>>>>> +        __rte_srand_lfsr258(seed, rand_state);
>>>>> +    }
>>>>> -    return &rand_states[idx];
>>>>> +    return rand_state;
>>>>>    }
>>>>>    uint64_t
>>>>> @@ -227,7 +227,9 @@ RTE_INIT(rte_rand_init)
>>>>>    {
>>>>>        uint64_t seed;
>>>>> -    seed = __rte_random_initial_seed();
>>>>> +    do
>>>>> +        seed = __rte_random_initial_seed();
>>>>> +    while (seed == 0);
>>>>
>>>> Might be worth a comment why seed 0 is not allowed. Alternatively,
>> use
>>>> some other way of signaling __rte_srand_lfsr258() must be called.
>>>>
>>>>>        rte_srand(seed);
>>>>>    }
>>>
  
Stephen Hemminger Sept. 11, 2023, 4:02 p.m. UTC | #14
On Mon, 11 Sep 2023 11:00:46 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > uint64_t rte_rand_r(struct rte_rand_state * const state);
> > void rte_srand_r(struct rte_rand_state * const state, uint64_t
> > seed); uint64_t rte_rand_max_r(struct rte_rand_state * const state,
> > uint64_t upper_bound); double rte_drand_r(struct rte_rand_state *
> > const state, void);
> > 
> > For this to work, we would have to make struct rte_rand_state
> > public, and the application would need to allocate it. (At least
> > one instance per thread that uses it, obviously.) 
> 
> Yes, and that will come at a pretty severe API complexity cost.
> 
> Besides the obvious complexities, it may also lead the user to
> believe the rte_rand() is not MT safe for any thread, since that's
> how it works in glibc (rand() versus rand_r()).

Actual rand state implementation could be hidden by having an
allocation/create/new function.
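
For example (all names below are hypothetical, sketching the create/new idea, not an existing API):

/* Hypothetical API sketch: the state stays opaque and the application
 * owns the handle, so no lcore id registration or TLS growth is needed.
 * None of these functions exist in DPDK today. */
#include <stdint.h>
#include <stddef.h>

struct rte_rand_state;					/* opaque */

struct rte_rand_state *rte_rand_state_new(void);	/* allocated and seeded */
void rte_rand_state_free(struct rte_rand_state *state);
uint64_t rte_rand_r(struct rte_rand_state *state);

/* lazy per-thread handle in an unregistered non-EAL thread */
static _Thread_local struct rte_rand_state *my_state;

static uint64_t
my_rand(void)
{
	if (my_state == NULL)
		my_state = rte_rand_state_new();
	return rte_rand_r(my_state);
}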
  
Stephen Hemminger Sept. 11, 2023, 4:04 p.m. UTC | #15
On Wed, 6 Sep 2023 19:54:32 +0200
Morten Brørup <mb@smartsharesystems.com> wrote:

> > 
> > -	idx = rte_lcore_id();
> > +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> > +	if (unlikely(seed != rand_state->seed)) {  
> 
> Please note that rte_rand_seed lives in a completely different cache
> line than RTE_PER_LCORE(rte_rand_state), so the comparison with
> rte_rand_seed requires reading one more cache line than the original
> implementation, which only uses the cache line holding
> rand_states[idx].
> 
> This is in the hot path.
> 
> If we could register a per-thread INIT function, the lazy
> initialization could be avoided, and only one cache line accessed.

Since rte_rand_seed rarely changes, it will get cached by each cpu.
The problem before was that prefetch cache overlap caused false
sharing.
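
The "prefetch cache overlap" here is the hardware pulling a neighbouring
lcore's state line into the local cache along with this lcore's line. Within
the original array-based design that can also be addressed by padding each
entry with an extra cache line, e.g. via the RTE_CACHE_GUARD helper in
rte_common.h, which is the cache guard Morten refers to below. A sketch of
that alternative, not part of this RFC:

#include <rte_common.h>
#include <rte_lcore.h>

/* Keep the per-lcore array, but pad each entry so the next-line
 * prefetcher never drags a neighbour's state into this core's cache.
 */
struct rte_rand_state {
	uint64_t z1;
	uint64_t z2;
	uint64_t z3;
	uint64_t z4;
	uint64_t z5;
	RTE_CACHE_GUARD;	/* one guard cache line after the state */
} __rte_cache_aligned;

static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];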
  
Stephen Hemminger Sept. 11, 2023, 4:06 p.m. UTC | #16
On Fri, 8 Sep 2023 09:04:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > Also, right now the array is sized at 129 entries to allow for the
> > maximum number of lcores. When the maximum is increased to 512 or
> > 1024 the problem will get worse.  
> 
> Using TLS will penalize every thread in the process, not only EAL 
> threads and registered non-EAL threads, and worse: not only threads
> that are using the API in question.
> 
> Every thread will carry the TLS memory around, increasing the process 
> memory footprint.
> 
> Thread creation will be slower, since TLS memory is allocated *and 
> initialized*, lazy user code-level initialization or not.
> 
> On my particular Linux x86_64 system, pthread creation overhead looks 
> something like:
> 
> 8 us w/o any user code-level use of TLS
> 11 us w/ 16 kB of TLS
> 314 us w/ 2 MB of TLS.

Agree that TLS does cause potentially more pages to get allocated on
thread creation, but that argument doesn't make sense here. The rand
state is small, and DPDK applications should not be creating threads
after startup. Thread creation is an expensive set of system calls.
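
The kind of numbers quoted above can be reproduced with a small standalone
program along these lines. It is not DPDK code, it times pthread_create()
plus pthread_join(), and the constants (1000 iterations, a 2 MB
_Thread_local buffer) are arbitrary, so absolute figures are only indicative:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Build with and without -DBIG_TLS (and with -pthread) to compare. */
#ifdef BIG_TLS
static _Thread_local char tls_buf[2 * 1024 * 1024];
#endif

static void *
worker(void *arg)
{
#ifdef BIG_TLS
	tls_buf[0] = 1;	/* touch the TLS area so it is really instantiated */
#endif
	return arg;
}

int
main(void)
{
	struct timespec start, end;
	pthread_t tid;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < 1000; i++) {
		pthread_create(&tid, NULL, worker, NULL);
		pthread_join(tid, NULL);
	}
	clock_gettime(CLOCK_MONOTONIC, &end);

	printf("avg create+join: %.1f us\n",
	       ((end.tv_sec - start.tv_sec) * 1e6 +
		(end.tv_nsec - start.tv_nsec) * 1e-3) / 1000.0);
	return 0;
}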
  
Morten Brørup Sept. 11, 2023, 4:37 p.m. UTC | #17
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Monday, 11 September 2023 17.04
> 
> On Wed, 6 Sep 2023 19:54:32 +0200
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > >
> > > -	idx = rte_lcore_id();
> > > +	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
> > > +	if (unlikely(seed != rand_state->seed)) {
> >
> > Please note that rte_rand_seed lives in a completely different cache
> > line than RTE_PER_LCORE(rte_rand_state), so the comparison with
> > rte_rand_seed requires reading one more cache line than the original
> > implementation, which only uses the cache line holding
> > rand_states[idx].
> >
> > This is in the hot path.
> >
> > If we could register a per-thread INIT function, the lazy
> > initialization could be avoided, and only one cache line accessed.
> 
> Since rte_rand_seed rarely changes, it will get cached by each cpu.

Agreed. And the point I was trying to convey was that this will consume cache on each CPU. Cache is a scarce resource on some hardware.

If reading rte_rand_seed is gated by a (bool) rand_state->initialized flag, the __rte_rand_get_state() function is less likely to evict other data from the cache to make room for rte_rand_seed.

> The problem before was that prefetch cache overlap caused false
> sharing.

Yes, and my patch also wastes a cache line per state, because of the cache guard.

So far, we have used this simple function for a discussion of principles and design patterns... so let me bring something new into the discussion, which is relevant on both the abstract and the practical level:

We could keep using the rte_module_data[RTE_MAX_LCORE] design pattern generally. But we could make an exception for data structures significantly smaller than a cache line. If we use TLS for those, we would avoid gaps in the memory allocated for them, which is even more cache efficient.

Let's assume that (unsigned int) RTE_PER_LCORE(_lcore_id) is currently the only data in TLS. If we add (struct rte_rand_state) RTE_PER_LCORE(rand_state), it will go into the same cache line as RTE_PER_LCORE(_lcore_id), and effectively only cost a fraction of a cache line.
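
A sketch of the gating described above, assuming a small initialized flag in
the per-thread state (fragment in the style of rte_random.c, needs
<stdbool.h>; illustration only). The hot path then touches just this
thread's TLS cache line; the trade-off is that a later rte_srand() no longer
reaches threads that have already seeded themselves, which is the re-seeding
behaviour discussed earlier in the thread:

struct rte_rand_state {
	uint64_t z1, z2, z3, z4, z5;
	bool initialized;
};

static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
static uint64_t rte_rand_seed;

static __rte_always_inline struct rte_rand_state *
__rte_rand_get_state(void)
{
	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);

	/* Hot path: only this thread's TLS cache line is touched. */
	if (unlikely(!rand_state->initialized)) {
		uint64_t seed = __atomic_load_n(&rte_rand_seed,
						__ATOMIC_RELAXED);

		__rte_srand_lfsr258(seed + rte_thread_self().opaque_id,
				    rand_state);
		rand_state->initialized = true;
	}

	return rand_state;
}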
  
Mattias Rönnblom Sept. 11, 2023, 4:53 p.m. UTC | #18
On 2023-09-11 18:06, Stephen Hemminger wrote:
> On Fri, 8 Sep 2023 09:04:29 +0200
> Mattias Rönnblom <hofors@lysator.liu.se> wrote:
> 
>>> Also, right now the array is sized at 129 entries to allow for the
>>> maximum number of lcores. When the maximum is increased to 512 or
>>> 1024 the problem will get worse.
>>
>> Using TLS will penalize every thread in the process, not only EAL
>> threads and registered non-EAL threads, and worse: not only threads
>> that are using the API in question.
>>
>> Every thread will carry the TLS memory around, increasing the process
>> memory footprint.
>>
>> Thread creation will be slower, since TLS memory is allocated *and
>> initialized*, lazy user code-level initialization or not.
>>
>> On my particular Linux x86_64 system, pthread creation overhead looks
>> something like:
>>
>> 8 us w/o any user code-level use of TLS
>> 11 us w/ 16 kB of TLS
>> 314 us w/ 2 MB of TLS.
> 
> Agree that TLS does cause potentially more pages to get allocated on
> thread creation, but that argument doesn't make sense here.

Sure. I was talking about the general concept of replacing per-lcore 
static arrays with TLS.

I find the general applicability of the TLS pattern relevant because it
doesn't make sense to have an ad-hoc, opportunistic way to implement 
essentially the same thing across the DPDK code base.

> The rand
> state is small, and DPDK applications should not be creating threads
> after startup. Thread creation is an expensive set of system calls.

I agree, and I would add that non-EAL threads will likely be few in
number, and should all be registered on creation, to ensure they can
call DPDK APIs which require an lcore id.

That said, if applications do create threads, DPDK shouldn't make
thread creation orders of magnitude slower.
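
As a side note for application writers following this: a non-EAL thread is
registered with rte_thread_register() from rte_lcore.h, after which it has
an lcore id of its own. A minimal sketch:

#include <inttypes.h>
#include <stdio.h>

#include <rte_lcore.h>
#include <rte_random.h>

/* Thread function for an application-created (non-EAL) thread. */
static void *
app_thread(void *arg)
{
	if (rte_thread_register() != 0) {
		printf("cannot register thread with EAL\n");
		return NULL;
	}

	/* rte_lcore_id() now returns a real id, so with the current
	 * array-based implementation this thread no longer shares the
	 * single state slot reserved for unregistered threads.
	 */
	printf("lcore %u: %" PRIu64 "\n", rte_lcore_id(), rte_rand());

	rte_thread_unregister();
	return arg;
}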
  

Patch

diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
index 53636331a27b..9657adf6ad3b 100644
--- a/lib/eal/common/rte_random.c
+++ b/lib/eal/common/rte_random.c
@@ -19,13 +19,14 @@  struct rte_rand_state {
 	uint64_t z3;
 	uint64_t z4;
 	uint64_t z5;
-} __rte_cache_aligned;
+	uint64_t seed;
+};
 
-/* One instance each for every lcore id-equipped thread, and one
- * additional instance to be shared by all others threads (i.e., all
- * unregistered non-EAL threads).
- */
-static struct rte_rand_state rand_states[RTE_MAX_LCORE + 1];
+/* Global random seed */
+static uint64_t rte_rand_seed;
+
+/* Per lcore random state. */
+static RTE_DEFINE_PER_LCORE(struct rte_rand_state, rte_rand_state);
 
 static uint32_t
 __rte_rand_lcg32(uint32_t *seed)
@@ -81,11 +82,7 @@  __rte_srand_lfsr258(uint64_t seed, struct rte_rand_state *state)
 void
 rte_srand(uint64_t seed)
 {
-	unsigned int lcore_id;
-
-	/* add lcore_id to seed to avoid having the same sequence */
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
-		__rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]);
+	__atomic_store_n(&rte_rand_seed, seed, __ATOMIC_RELAXED);
 }
 
 static __rte_always_inline uint64_t
@@ -119,15 +116,18 @@  __rte_rand_lfsr258(struct rte_rand_state *state)
 static __rte_always_inline
 struct rte_rand_state *__rte_rand_get_state(void)
 {
-	unsigned int idx;
+	struct rte_rand_state *rand_state = &RTE_PER_LCORE(rte_rand_state);
+	uint64_t seed;
 
-	idx = rte_lcore_id();
+	seed = __atomic_load_n(&rte_rand_seed, __ATOMIC_RELAXED);
+	if (unlikely(seed != rand_state->seed)) {
+		rand_state->seed = seed;
 
-	/* last instance reserved for unregistered non-EAL threads */
-	if (unlikely(idx == LCORE_ID_ANY))
-		idx = RTE_MAX_LCORE;
+		seed += rte_thread_self().opaque_id;
+		__rte_srand_lfsr258(seed, rand_state);
+	}
 
-	return &rand_states[idx];
+	return rand_state;
 }
 
 uint64_t
@@ -227,7 +227,9 @@  RTE_INIT(rte_rand_init)
 {
 	uint64_t seed;
 
-	seed = __rte_random_initial_seed();
+	do
+		seed = __rte_random_initial_seed();
+	while (seed == 0);
 
 	rte_srand(seed);
 }