[v4,1/3] random: add rte_drand() function

Message ID 20220525203123.277180-2-stephen@networkplumber.org (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series introduce random floating point function |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Stephen Hemminger May 25, 2022, 8:31 p.m. UTC
  The PIE code and other applications can benefit from having a
fast way to get a random floating point value. This new function
is equivalent to drand48() in the standard library.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/test_rand_perf.c              |  7 +++++
 doc/guides/rel_notes/release_22_07.rst |  5 ++++
 lib/eal/common/rte_random.c            | 41 ++++++++++++++++++++++++++
 lib/eal/include/rte_random.h           | 18 +++++++++++
 lib/eal/meson.build                    |  3 ++
 lib/eal/version.map                    |  1 +
 6 files changed, 75 insertions(+)
  

Comments

Ray Kinsella May 26, 2022, 9:56 a.m. UTC | #1
Stephen Hemminger <stephen@networkplumber.org> writes:

> The PIE code and other applications can benefit from having a
> fast way to get a random floating point value. This new function
> is equivalent to drand() in the standard library.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  app/test/test_rand_perf.c              |  7 +++++
>  doc/guides/rel_notes/release_22_07.rst |  5 ++++
>  lib/eal/common/rte_random.c            | 41 ++++++++++++++++++++++++++
>  lib/eal/include/rte_random.h           | 18 +++++++++++
>  lib/eal/meson.build                    |  3 ++
>  lib/eal/version.map                    |  1 +
>  6 files changed, 75 insertions(+)
>
Acked-by: Ray Kinsella <mdr@ashoe.eu>
  
Mattias Rönnblom May 26, 2022, 1:20 p.m. UTC | #2
On 2022-05-25 22:31, Stephen Hemminger wrote:
> The PIE code and other applications can benefit from having a
> fast way to get a random floating point value. This new function
> is equivalent to drand() in the standard library.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>   app/test/test_rand_perf.c              |  7 +++++
>   doc/guides/rel_notes/release_22_07.rst |  5 ++++
>   lib/eal/common/rte_random.c            | 41 ++++++++++++++++++++++++++
>   lib/eal/include/rte_random.h           | 18 +++++++++++
>   lib/eal/meson.build                    |  3 ++
>   lib/eal/version.map                    |  1 +
>   6 files changed, 75 insertions(+)
> 
> diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c
> index fe797ebfa1ca..26fb1d9a586e 100644
> --- a/app/test/test_rand_perf.c
> +++ b/app/test/test_rand_perf.c
> @@ -20,6 +20,7 @@ static volatile uint64_t vsum;
>   
>   enum rand_type {
>   	rand_type_64,
> +	rand_type_float,
>   	rand_type_bounded_best_case,
>   	rand_type_bounded_worst_case
>   };
> @@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type)
>   	switch (rand_type) {
>   	case rand_type_64:
>   		return "Full 64-bit [rte_rand()]";
> +	case rand_type_float:
> +		return "Floating point [rte_drand()]";
>   	case rand_type_bounded_best_case:
>   		return "Bounded average best-case [rte_rand_max()]";
>   	case rand_type_bounded_worst_case:
> @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
>   		case rand_type_64:
>   			sum += rte_rand();
>   			break;
> +		case rand_type_float:
> +			sum += 1000. * rte_drand();

Including this floating point multiplication will lead to an 
overestimation of rte_drand() latency.

You could refactor this function to be a macro, and pass the return type
as a parameter to this macro. I did just that, and on both an AMD
5900X and a Cortex-A72 it didn't add more than ~5%, so I don't think 
it's necessary.

> +			break;
>   		case rand_type_bounded_best_case:
>   			sum += rte_rand_max(BEST_CASE_BOUND);
>   			break;
> @@ -83,6 +89,7 @@ test_rand_perf(void)
>   	printf("Pseudo-random number generation latencies:\n");
>   
>   	test_rand_perf_type(rand_type_64);
> +	test_rand_perf_type(rand_type_float);
>   	test_rand_perf_type(rand_type_bounded_best_case);
>   	test_rand_perf_type(rand_type_bounded_worst_case);
>   
> diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
> index e49cacecefd4..b131ea577226 100644
> --- a/doc/guides/rel_notes/release_22_07.rst
> +++ b/doc/guides/rel_notes/release_22_07.rst
> @@ -104,6 +104,11 @@ New Features
>     * ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
>     * ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
>   
> +* ** Added function get random floating point number.**
> +
> +  Added the function ``rte_drand()`` to provide a pseudo-random
> +  floating point number.
> +
>   
>   Removed Items
>   -------------
> diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> index 4535cc980cec..3dc3484ee655 100644
> --- a/lib/eal/common/rte_random.c
> +++ b/lib/eal/common/rte_random.c
> @@ -6,6 +6,9 @@
>   #include <x86intrin.h>
>   #endif
>   #include <unistd.h>
> +#ifdef RTE_LIBEAL_USE_IEEE754
> +#include <ieee754.h>
> +#endif
>   
>   #include <rte_branch_prediction.h>
>   #include <rte_cycles.h>
> @@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound)
>   	return res;
>   }
>   
> +double
> +rte_drand(void)
> +{
> +	struct rte_rand_state *state = __rte_rand_get_state();
> +	uint64_t rand64 = __rte_rand_lfsr258(state);
> +#ifdef RTE_LIBEAL_USE_IEEE754
> +	union ieee754_double u = {
> +		.ieee = {
> +			.negative = 0,
> +			.exponent = IEEE754_DOUBLE_BIAS,
> +		},
> +	};
> +
> +	/* Take 64 bit random value and put it into the mantissa
> +	 * This uses direct access to IEEE format to avoid doing
> +	 * any direct floating point math here.
> +	 */
> +	u.ieee.mantissa0 = rand64 >> 32;
> +	u.ieee.mantissa1 = rand64;
> +
> +	return u.d - 1.0;
> +#else
> +	/* Slower method requiring floating point divide
> +	 *

Do you know how much slower? I ran rand_perf_test on two of my systems.

                       AMD 5900X     Pi4 (ARM Cortex-A72)
IEEE754 version          12              1.19
Non-IEEE754 version      11              1.16
Naive version*           24              1.16

* (double)rte_rand() / (double)UINT64_MAX

Numbers are TSC cycles/op.

Surprisingly, it seems like the IEEE754 version is slower on both of 
these machines.

Do you have a machine (or a different use case) where the supposedly 
more optimized version actually runs faster?

> +	 * The double mantissa only has 53 bits, so we uniformly mask off the
> +	 * high 11 bits and then floating-point divide by 2^53 to achieve a
> +	 * result in [0, 1).
> +	 *
> +	 * We are not allowed to emit 1.0, so denom must be one greater than
> +	 * the possible range of the preceeding step.
> +	 */
> +	static const uint64_t denom = (uint64_t)1 << 53;

Remove "static const". Surely, this can't make a difference (at least 
not in a positive direction).

> +
> +	rand64 &= denom - 1;
> +	return (double)rand64 / denom;
> +#endif
> +}
> +
>   static uint64_t
>   __rte_random_initial_seed(void)
>   {
> diff --git a/lib/eal/include/rte_random.h b/lib/eal/include/rte_random.h
> index 29f5f1325a30..f6541c2b0f08 100644
> --- a/lib/eal/include/rte_random.h
> +++ b/lib/eal/include/rte_random.h
> @@ -65,6 +65,24 @@ rte_rand(void);
>   uint64_t
>   rte_rand_max(uint64_t upper_bound);
>   
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Generates a pseudo-random floating point number.
> + *
> + * This function returns a nonnegative double-precision floating random
> + * number uniformly distributed over the interval [0.0, 1.0).
> + *
> + * The generator is not cryptographically secure.
> + * If called from lcore threads, this function is thread-safe.
> + *
> + * @return
> + *   A pseudo-random value between 0 and 1.0.
> + */
> +__rte_experimental
> +double rte_drand(void);
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/eal/meson.build b/lib/eal/meson.build
> index 056beb946119..e50524901c98 100644
> --- a/lib/eal/meson.build
> +++ b/lib/eal/meson.build
> @@ -32,3 +32,6 @@ endif
>   if cc.has_function('getentropy', prefix : '#include <unistd.h>')
>       cflags += '-DRTE_LIBEAL_USE_GETENTROPY'
>   endif
> +if cc.has_header_symbol('ieee754.h', 'union ieee754_double')
> +    cflags += '-DRTE_LIBEAL_USE_IEEE754'
> +endif
> diff --git a/lib/eal/version.map b/lib/eal/version.map
> index d49e30bd042f..cfbade9a33e9 100644
> --- a/lib/eal/version.map
> +++ b/lib/eal/version.map
> @@ -422,6 +422,7 @@ EXPERIMENTAL {
>   	rte_intr_type_set;
>   
>   	# added in 22.07
> +	rte_drand;
>   	rte_thread_get_affinity_by_id;
>   	rte_thread_self;
>   	rte_thread_set_affinity_by_id;
  
Stephen Hemminger May 26, 2022, 3:25 p.m. UTC | #3
On Thu, 26 May 2022 15:20:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
> >   		case rand_type_64:
> >   			sum += rte_rand();
> >   			break;
> > +		case rand_type_float:
> > +			sum += 1000. * rte_drand();  
> 
> Including this floating point multiplication will lead to an 
> overestimation of rte_drand() latency.
> 
> You could refactor this function to be a macro, and pass the return type 
> to as a parameter to this macro. I did just that, and on both an AMD 
> 5900X and a Cortex-A72 it didn't add more than ~5%, so I don't think 
> it's necessary.

The test is not doing anything useful with the result.
It is just a way to exercise the code.

Macros are evil, have little or no typechecking and should be avoided.
  
Stephen Hemminger May 26, 2022, 3:28 p.m. UTC | #4
On Thu, 26 May 2022 15:20:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> > +#else
> > +	/* Slower method requiring floating point divide
> > +	 *  
> 
> Do you know how much slower? I ran rand_perf_test on two of my systems.
> 
>                        AMD 5900X     Pi4 (ARM Cortex-A72)
> IEEE754 version          12              1.19
> Non-IEEE754 version      11              1.16
> Naive version*           24              1.16
> 
> * (double)rte_rand() / (double)UINT64_MAX
> 
> Numbers are TSC cycles/op.
> 
> Surprisingly, it seems like the IEEE754 version is slower on both of 
> these machines.
> 
> Do you have a machine (or a different use case) where the supposedly 
> more optimized version actually runs faster?

The direct method is based on the concept used by glibc and others,
and the divide method (including the spelling error) is from FreeBSD.

Be careful with micro benchmarks. A better one would be to call
rte_drand() and compare the result with something, to check whether it is in range.
  
Stephen Hemminger May 26, 2022, 8:19 p.m. UTC | #5
On Thu, 26 May 2022 15:20:29 +0200
Mattias Rönnblom <hofors@lysator.liu.se> wrote:

> On 2022-05-25 22:31, Stephen Hemminger wrote:
> > The PIE code and other applications can benefit from having a
> > fast way to get a random floating point value. This new function
> > is equivalent to drand() in the standard library.
> > 
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> >   app/test/test_rand_perf.c              |  7 +++++
> >   doc/guides/rel_notes/release_22_07.rst |  5 ++++
> >   lib/eal/common/rte_random.c            | 41 ++++++++++++++++++++++++++
> >   lib/eal/include/rte_random.h           | 18 +++++++++++
> >   lib/eal/meson.build                    |  3 ++
> >   lib/eal/version.map                    |  1 +
> >   6 files changed, 75 insertions(+)
> > 
> > diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c
> > index fe797ebfa1ca..26fb1d9a586e 100644
> > --- a/app/test/test_rand_perf.c
> > +++ b/app/test/test_rand_perf.c
> > @@ -20,6 +20,7 @@ static volatile uint64_t vsum;
> >   
> >   enum rand_type {
> >   	rand_type_64,
> > +	rand_type_float,
> >   	rand_type_bounded_best_case,
> >   	rand_type_bounded_worst_case
> >   };
> > @@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type)
> >   	switch (rand_type) {
> >   	case rand_type_64:
> >   		return "Full 64-bit [rte_rand()]";
> > +	case rand_type_float:
> > +		return "Floating point [rte_drand()]";
> >   	case rand_type_bounded_best_case:
> >   		return "Bounded average best-case [rte_rand_max()]";
> >   	case rand_type_bounded_worst_case:
> > @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type)
> >   		case rand_type_64:
> >   			sum += rte_rand();
> >   			break;
> > +		case rand_type_float:
> > +			sum += 1000. * rte_drand();  
> 
> Including this floating point multiplication will lead to an 
> overestimation of rte_drand() latency.
> 
> You could refactor this function to be a macro, and pass the return type 
> to as a parameter to this macro. I did just that, and on both an AMD 
> 5900X and a Cortex-A72 it didn't add more than ~5%, so I don't think 
> it's necessary.
> 
> > +			break;
> >   		case rand_type_bounded_best_case:
> >   			sum += rte_rand_max(BEST_CASE_BOUND);
> >   			break;
> > @@ -83,6 +89,7 @@ test_rand_perf(void)
> >   	printf("Pseudo-random number generation latencies:\n");
> >   
> >   	test_rand_perf_type(rand_type_64);
> > +	test_rand_perf_type(rand_type_float);
> >   	test_rand_perf_type(rand_type_bounded_best_case);
> >   	test_rand_perf_type(rand_type_bounded_worst_case);
> >   
> > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
> > index e49cacecefd4..b131ea577226 100644
> > --- a/doc/guides/rel_notes/release_22_07.rst
> > +++ b/doc/guides/rel_notes/release_22_07.rst
> > @@ -104,6 +104,11 @@ New Features
> >     * ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
> >     * ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
> >   
> > +* ** Added function get random floating point number.**
> > +
> > +  Added the function ``rte_drand()`` to provide a pseudo-random
> > +  floating point number.
> > +
> >   
> >   Removed Items
> >   -------------
> > diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
> > index 4535cc980cec..3dc3484ee655 100644
> > --- a/lib/eal/common/rte_random.c
> > +++ b/lib/eal/common/rte_random.c
> > @@ -6,6 +6,9 @@
> >   #include <x86intrin.h>
> >   #endif
> >   #include <unistd.h>
> > +#ifdef RTE_LIBEAL_USE_IEEE754
> > +#include <ieee754.h>
> > +#endif
> >   
> >   #include <rte_branch_prediction.h>
> >   #include <rte_cycles.h>
> > @@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound)
> >   	return res;
> >   }
> >   
> > +double
> > +rte_drand(void)
> > +{
> > +	struct rte_rand_state *state = __rte_rand_get_state();
> > +	uint64_t rand64 = __rte_rand_lfsr258(state);
> > +#ifdef RTE_LIBEAL_USE_IEEE754
> > +	union ieee754_double u = {
> > +		.ieee = {
> > +			.negative = 0,
> > +			.exponent = IEEE754_DOUBLE_BIAS,
> > +		},
> > +	};
> > +
> > +	/* Take 64 bit random value and put it into the mantissa
> > +	 * This uses direct access to IEEE format to avoid doing
> > +	 * any direct floating point math here.
> > +	 */
> > +	u.ieee.mantissa0 = rand64 >> 32;
> > +	u.ieee.mantissa1 = rand64;
> > +
> > +	return u.d - 1.0;
> > +#else
> > +	/* Slower method requiring floating point divide
> > +	 *  
> 
> Do you know how much slower? I ran rand_perf_test on two of my systems.
> 
>                        AMD 5900X     Pi4 (ARM Cortex-A72)
> IEEE754 version          12              1.19
> Non-IEEE754 version      11              1.16
> Naive version*           24              1.16
> 
> * (double)rte_rand() / (double)UINT64_MAX
> 
> Numbers are TSC cycles/op.

On AMD Ryzen 7, both versions take 9 cycles/op with the rand_perf_autotest,
so it is a toss-up.

The 754 version is:

        ubfx    r1, r1, #0, #20
        orr     r3, r1, #1069547520   << mantissa0
        mov     r2, r0
        orr     r3, r3, #3145728
        vmov.f64        d0, #1.0e+0
        vmov    d16, r2, r3
        vsub.f64        d0, d16, d0   << return u.d - 1.0

Note: the compiler is doing a smart optimization on the divide version.
It knows that since the denominator is a fixed value it can use a multiply.

        vmov    d16, r0, r1
        vmul.f64        d0, d16, d0
  

Patch

diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c
index fe797ebfa1ca..26fb1d9a586e 100644
--- a/app/test/test_rand_perf.c
+++ b/app/test/test_rand_perf.c
@@ -20,6 +20,7 @@  static volatile uint64_t vsum;
 
 enum rand_type {
 	rand_type_64,
+	rand_type_float,
 	rand_type_bounded_best_case,
 	rand_type_bounded_worst_case
 };
@@ -30,6 +31,8 @@  rand_type_desc(enum rand_type rand_type)
 	switch (rand_type) {
 	case rand_type_64:
 		return "Full 64-bit [rte_rand()]";
+	case rand_type_float:
+		return "Floating point [rte_drand()]";
 	case rand_type_bounded_best_case:
 		return "Bounded average best-case [rte_rand_max()]";
 	case rand_type_bounded_worst_case:
@@ -55,6 +58,9 @@  test_rand_perf_type(enum rand_type rand_type)
 		case rand_type_64:
 			sum += rte_rand();
 			break;
+		case rand_type_float:
+			sum += 1000. * rte_drand();
+			break;
 		case rand_type_bounded_best_case:
 			sum += rte_rand_max(BEST_CASE_BOUND);
 			break;
@@ -83,6 +89,7 @@  test_rand_perf(void)
 	printf("Pseudo-random number generation latencies:\n");
 
 	test_rand_perf_type(rand_type_64);
+	test_rand_perf_type(rand_type_float);
 	test_rand_perf_type(rand_type_bounded_best_case);
 	test_rand_perf_type(rand_type_bounded_worst_case);
 
diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
index e49cacecefd4..b131ea577226 100644
--- a/doc/guides/rel_notes/release_22_07.rst
+++ b/doc/guides/rel_notes/release_22_07.rst
@@ -104,6 +104,11 @@  New Features
   * ``RTE_EVENT_QUEUE_ATTR_WEIGHT``
   * ``RTE_EVENT_QUEUE_ATTR_AFFINITY``
 
+* **Added function to get random floating point number.**
+
+  Added the function ``rte_drand()`` to provide a pseudo-random
+  floating point number.
+
 
 Removed Items
 -------------
diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
index 4535cc980cec..3dc3484ee655 100644
--- a/lib/eal/common/rte_random.c
+++ b/lib/eal/common/rte_random.c
@@ -6,6 +6,9 @@ 
 #include <x86intrin.h>
 #endif
 #include <unistd.h>
+#ifdef RTE_LIBEAL_USE_IEEE754
+#include <ieee754.h>
+#endif
 
 #include <rte_branch_prediction.h>
 #include <rte_cycles.h>
@@ -173,6 +176,44 @@  rte_rand_max(uint64_t upper_bound)
 	return res;
 }
 
+double
+rte_drand(void)
+{
+	struct rte_rand_state *state = __rte_rand_get_state();
+	uint64_t rand64 = __rte_rand_lfsr258(state);
+#ifdef RTE_LIBEAL_USE_IEEE754
+	union ieee754_double u = {
+		.ieee = {
+			.negative = 0,
+			.exponent = IEEE754_DOUBLE_BIAS,
+		},
+	};
+
+	/* Take the 64-bit random value and put its low 52 bits into the
+	 * mantissa. This uses direct access to the IEEE 754 format to
+	 * avoid a floating point divide here.
+	 */
+	u.ieee.mantissa0 = rand64 >> 32;
+	u.ieee.mantissa1 = rand64;
+
+	return u.d - 1.0;
+#else
+	/* Slower method requiring floating point divide
+	 *
+	 * The double mantissa only has 53 bits, so we uniformly mask off the
+	 * high 11 bits and then floating-point divide by 2^53 to achieve a
+	 * result in [0, 1).
+	 *
+	 * We are not allowed to emit 1.0, so denom must be one greater than
+	 * the possible range of the preceding step.
+	 */
+	static const uint64_t denom = (uint64_t)1 << 53;
+
+	rand64 &= denom - 1;
+	return (double)rand64 / denom;
+#endif
+}
+
 static uint64_t
 __rte_random_initial_seed(void)
 {
diff --git a/lib/eal/include/rte_random.h b/lib/eal/include/rte_random.h
index 29f5f1325a30..f6541c2b0f08 100644
--- a/lib/eal/include/rte_random.h
+++ b/lib/eal/include/rte_random.h
@@ -65,6 +65,24 @@  rte_rand(void);
 uint64_t
 rte_rand_max(uint64_t upper_bound);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Generates a pseudo-random floating point number.
+ *
+ * This function returns a nonnegative double-precision floating point
+ * random number uniformly distributed over the interval [0.0, 1.0).
+ *
+ * The generator is not cryptographically secure.
+ * If called from lcore threads, this function is thread-safe.
+ *
+ * @return
+ *   A pseudo-random value in the interval [0.0, 1.0).
+ */
+__rte_experimental
+double rte_drand(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/eal/meson.build b/lib/eal/meson.build
index 056beb946119..e50524901c98 100644
--- a/lib/eal/meson.build
+++ b/lib/eal/meson.build
@@ -32,3 +32,6 @@  endif
 if cc.has_function('getentropy', prefix : '#include <unistd.h>')
     cflags += '-DRTE_LIBEAL_USE_GETENTROPY'
 endif
+if cc.has_header_symbol('ieee754.h', 'union ieee754_double')
+    cflags += '-DRTE_LIBEAL_USE_IEEE754'
+endif
diff --git a/lib/eal/version.map b/lib/eal/version.map
index d49e30bd042f..cfbade9a33e9 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -422,6 +422,7 @@  EXPERIMENTAL {
 	rte_intr_type_set;
 
 	# added in 22.07
+	rte_drand;
 	rte_thread_get_affinity_by_id;
 	rte_thread_self;
 	rte_thread_set_affinity_by_id;