[dpdk-dev,v4,3/5] hash: add fallback to software CRC32 implementation

Message ID a875a94021448514bfbcf1409e4303cde50882f3.1416318389.git.e_zhumabekov@sts.kz (mailing list archive)
State Superseded, archived
Headers

Commit Message

Yerden Zhumabekov Nov. 18, 2014, 2:03 p.m. UTC
  Initially, SSE4.2 support is detected via CPUID instruction.

Added rte_hash_crc_set_alg() function to detect and set CRC32
implementation if necessary. SSE4.2 is allowed by default. If it's
not available, fall back to sw implementation.

Best available algorithm is detected upon application startup
through the constructor function rte_hash_crc_try_sse442().

Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
---
 lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)
  

Comments

Neil Horman Nov. 18, 2014, 2:41 p.m. UTC | #1
On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> Initially, SSE4.2 support is detected via CPUID instruction.
> 
> Added rte_hash_crc_set_alg() function to detect and set CRC32
> implementation if necessary. SSE4.2 is allowed by default. If it's
> not available, fall back to sw implementation.
> 
> Best available algorithm is detected upon application startup
> through the constructor function rte_hash_crc_try_sse442().
> 
> Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
> ---
>  lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 51 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
> index 15f687a..332ed99 100644
> --- a/lib/librte_hash/rte_hash_crc.h
> +++ b/lib/librte_hash/rte_hash_crc.h
> @@ -45,7 +45,11 @@ extern "C" {
>  #endif
>  
>  #include <stdint.h>
> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
>  #include <nmmintrin.h>
> +#endif
> +#include <rte_cpuflags.h>
> +#include <rte_branch_prediction.h>
>  
>  /* Lookup tables for software implementation of CRC32C */
>  static uint32_t crc32c_tables[8][256] = {{
> @@ -363,8 +367,41 @@ crc32c_2words(uint64_t data, uint32_t init_val)
>  	return crc;
>  }
>  
> +enum crc32_alg_t {
> +	CRC32_SW = 0,
> +	CRC32_SSE42
> +};
> +
> +static enum crc32_alg_t crc32_alg;
> +
> +/**
> + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
> + * calculation.
> + *
> + * @param flag
> + *   unsigned integer flag
> + *   - (CRC32_SW) Don't use SSE4.2 intrinsics
> + *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
> + */
> +static inline void
> +rte_hash_crc_set_alg(enum crc32_alg_t alg)
> +{
> +	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
> +	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
> +	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
> +}
> +
> +/* Best available algorithm is detected via CPUID instruction */
> +static inline void __attribute__((constructor))
> +rte_hash_crc_try_sse42(void)
> +{
> +	rte_hash_crc_set_alg(CRC32_SSE42);
> +}
> +
>  /**
>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> + * Fall back to software crc32 implementation in case SSE4.2 is
> + * not supported
>   *
>   * @param data
>   *   Data to perform hash on.
> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
>  static inline uint32_t
>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
>  {
> -	return _mm_crc32_u32(init_val, data);
> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> +	if (likely(crc32_alg == CRC32_SSE42))
> +		return _mm_crc32_u32(init_val, data);
> +#endif

you don't really need these ifdefs here anymore given that you have a
constructor to do the algorithm selection.  In fact you need to remove them, in
the event you build on a system that doesn't support SSE42, but run on a system
that does.

Neil
  
Yerden Zhumabekov Nov. 18, 2014, 3:06 p.m. UTC | #2
18.11.2014 20:41, Neil Horman пишет:
> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
>> Initially, SSE4.2 support is detected via CPUID instruction.
>>
>> Added rte_hash_crc_set_alg() function to detect and set CRC32
>> implementation if necessary. SSE4.2 is allowed by default. If it's
>> not available, fall back to sw implementation.
>>
>> Best available algorithm is detected upon application startup
>> through the constructor function rte_hash_crc_try_sse442().
>>
>> Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
>> ---
>>  lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 51 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
>> index 15f687a..332ed99 100644
>> --- a/lib/librte_hash/rte_hash_crc.h
>> +++ b/lib/librte_hash/rte_hash_crc.h
>> @@ -45,7 +45,11 @@ extern "C" {
>>  #endif
>>  
>>  #include <stdint.h>
>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
>>  #include <nmmintrin.h>
>> +#endif
>> +#include <rte_cpuflags.h>
>> +#include <rte_branch_prediction.h>
>>  
>>  /* Lookup tables for software implementation of CRC32C */
>>  static uint32_t crc32c_tables[8][256] = {{
>> @@ -363,8 +367,41 @@ crc32c_2words(uint64_t data, uint32_t init_val)
>>  	return crc;
>>  }
>>  
>> +enum crc32_alg_t {
>> +	CRC32_SW = 0,
>> +	CRC32_SSE42
>> +};
>> +
>> +static enum crc32_alg_t crc32_alg;
>> +
>> +/**
>> + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
>> + * calculation.
>> + *
>> + * @param flag
>> + *   unsigned integer flag
>> + *   - (CRC32_SW) Don't use SSE4.2 intrinsics
>> + *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
>> + */
>> +static inline void
>> +rte_hash_crc_set_alg(enum crc32_alg_t alg)
>> +{
>> +	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
>> +	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
>> +	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
>> +}
>> +
>> +/* Best available algorithm is detected via CPUID instruction */
>> +static inline void __attribute__((constructor))
>> +rte_hash_crc_try_sse42(void)
>> +{
>> +	rte_hash_crc_set_alg(CRC32_SSE42);
>> +}
>> +
>>  /**
>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
>> + * Fall back to software crc32 implementation in case SSE4.2 is
>> + * not supported
>>   *
>>   * @param data
>>   *   Data to perform hash on.
>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
>>  static inline uint32_t
>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
>>  {
>> -	return _mm_crc32_u32(init_val, data);
>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
>> +	if (likely(crc32_alg == CRC32_SSE42))
>> +		return _mm_crc32_u32(init_val, data);
>> +#endif
> you don't really need these ifdefs here anymore given that you have a
> constructor to do the algorithm selection.  In fact you need to remove them, in
> the event you build on a system that doesn't support SSE42, but run on a system
> that does.

Originally, I thought so as well. I wrote the code without these ifdefs,
but it didn't compile on my machine which doesn't support SSE4.2. Error
was triggered by nmmintrin.h which has a check for respective GCC
extension. So I think these ifdefs are indeed required.
  
Neil Horman Nov. 18, 2014, 4 p.m. UTC | #3
On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> 
> 18.11.2014 20:41, Neil Horman пишет:
> > On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> >> Initially, SSE4.2 support is detected via CPUID instruction.
> >>
> >> Added rte_hash_crc_set_alg() function to detect and set CRC32
> >> implementation if necessary. SSE4.2 is allowed by default. If it's
> >> not available, fall back to sw implementation.
> >>
> >> Best available algorithm is detected upon application startup
> >> through the constructor function rte_hash_crc_try_sse442().
> >>
> >> Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
> >> ---
> >>  lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
> >>  1 file changed, 51 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
> >> index 15f687a..332ed99 100644
> >> --- a/lib/librte_hash/rte_hash_crc.h
> >> +++ b/lib/librte_hash/rte_hash_crc.h
> >> @@ -45,7 +45,11 @@ extern "C" {
> >>  #endif
> >>  
> >>  #include <stdint.h>
> >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> >>  #include <nmmintrin.h>
> >> +#endif
> >> +#include <rte_cpuflags.h>
> >> +#include <rte_branch_prediction.h>
> >>  
> >>  /* Lookup tables for software implementation of CRC32C */
> >>  static uint32_t crc32c_tables[8][256] = {{
> >> @@ -363,8 +367,41 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> >>  	return crc;
> >>  }
> >>  
> >> +enum crc32_alg_t {
> >> +	CRC32_SW = 0,
> >> +	CRC32_SSE42
> >> +};
> >> +
> >> +static enum crc32_alg_t crc32_alg;
> >> +
> >> +/**
> >> + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
> >> + * calculation.
> >> + *
> >> + * @param flag
> >> + *   unsigned integer flag
> >> + *   - (CRC32_SW) Don't use SSE4.2 intrinsics
> >> + *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
> >> + */
> >> +static inline void
> >> +rte_hash_crc_set_alg(enum crc32_alg_t alg)
> >> +{
> >> +	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
> >> +	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
> >> +	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
> >> +}
> >> +
> >> +/* Best available algorithm is detected via CPUID instruction */
> >> +static inline void __attribute__((constructor))
> >> +rte_hash_crc_try_sse42(void)
> >> +{
> >> +	rte_hash_crc_set_alg(CRC32_SSE42);
> >> +}
> >> +
> >>  /**
> >>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> >> + * Fall back to software crc32 implementation in case SSE4.2 is
> >> + * not supported
> >>   *
> >>   * @param data
> >>   *   Data to perform hash on.
> >> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> >>  static inline uint32_t
> >>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> >>  {
> >> -	return _mm_crc32_u32(init_val, data);
> >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> >> +	if (likely(crc32_alg == CRC32_SSE42))
> >> +		return _mm_crc32_u32(init_val, data);
> >> +#endif
> > you don't really need these ifdefs here anymore given that you have a
> > constructor to do the algorithm selection.  In fact you need to remove them, in
> > the event you build on a system that doesn't support SSE42, but run on a system
> > that does.
> 
> Originally, I thought so as well. I wrote the code without these ifdefs,
> but it didn't compile on my machine which doesn't support SSE4.2. Error
> was triggered by nmmintrin.h which has a check for respective GCC
> extension. So I think these ifdefs are indeed required.
> 
You need to edit the makefile so that the compiler gets passed the option
-msse42.  That way it will know to emit sse42 instructions. It will also allow
you to remove the ifdef from the include file
Neil

> -- 
> Sincerely,
> 
> Yerden Zhumabekov
> State Technical Service
> Astana, KZ
> 
> 
>
  
Bruce Richardson Nov. 18, 2014, 4:04 p.m. UTC | #4
On Tue, Nov 18, 2014 at 11:00:05AM -0500, Neil Horman wrote:
> On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > 
> > 18.11.2014 20:41, Neil Horman пишет:
> > > On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > >> Initially, SSE4.2 support is detected via CPUID instruction.
> > >>
> > >> Added rte_hash_crc_set_alg() function to detect and set CRC32
> > >> implementation if necessary. SSE4.2 is allowed by default. If it's
> > >> not available, fall back to sw implementation.
> > >>
> > >> Best available algorithm is detected upon application startup
> > >> through the constructor function rte_hash_crc_try_sse442().
> > >>
> > >> Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
> > >> ---
> > >>  lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
> > >>  1 file changed, 51 insertions(+), 2 deletions(-)
> > >>
> > >> diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
> > >> index 15f687a..332ed99 100644
> > >> --- a/lib/librte_hash/rte_hash_crc.h
> > >> +++ b/lib/librte_hash/rte_hash_crc.h
> > >> @@ -45,7 +45,11 @@ extern "C" {
> > >>  #endif
> > >>  
> > >>  #include <stdint.h>
> > >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > >>  #include <nmmintrin.h>
> > >> +#endif
> > >> +#include <rte_cpuflags.h>
> > >> +#include <rte_branch_prediction.h>
> > >>  
> > >>  /* Lookup tables for software implementation of CRC32C */
> > >>  static uint32_t crc32c_tables[8][256] = {{
> > >> @@ -363,8 +367,41 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > >>  	return crc;
> > >>  }
> > >>  
> > >> +enum crc32_alg_t {
> > >> +	CRC32_SW = 0,
> > >> +	CRC32_SSE42
> > >> +};
> > >> +
> > >> +static enum crc32_alg_t crc32_alg;
> > >> +
> > >> +/**
> > >> + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
> > >> + * calculation.
> > >> + *
> > >> + * @param flag
> > >> + *   unsigned integer flag
> > >> + *   - (CRC32_SW) Don't use SSE4.2 intrinsics
> > >> + *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
> > >> + */
> > >> +static inline void
> > >> +rte_hash_crc_set_alg(enum crc32_alg_t alg)
> > >> +{
> > >> +	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
> > >> +	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
> > >> +	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
> > >> +}
> > >> +
> > >> +/* Best available algorithm is detected via CPUID instruction */
> > >> +static inline void __attribute__((constructor))
> > >> +rte_hash_crc_try_sse42(void)
> > >> +{
> > >> +	rte_hash_crc_set_alg(CRC32_SSE42);
> > >> +}
> > >> +
> > >>  /**
> > >>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > >> + * Fall back to software crc32 implementation in case SSE4.2 is
> > >> + * not supported
> > >>   *
> > >>   * @param data
> > >>   *   Data to perform hash on.
> > >> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > >>  static inline uint32_t
> > >>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > >>  {
> > >> -	return _mm_crc32_u32(init_val, data);
> > >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > >> +	if (likely(crc32_alg == CRC32_SSE42))
> > >> +		return _mm_crc32_u32(init_val, data);
> > >> +#endif
> > > you don't really need these ifdefs here anymore given that you have a
> > > constructor to do the algorithm selection.  In fact you need to remove them, in
> > > the event you build on a system that doesn't support SSE42, but run on a system
> > > that does.
> > 
> > Originally, I thought so as well. I wrote the code without these ifdefs,
> > but it didn't compile on my machine which doesn't support SSE4.2. Error
> > was triggered by nmmintrin.h which has a check for respective GCC
> > extension. So I think these ifdefs are indeed required.
> > 
> You need to edit the makefile so that the compiler gets passed the option
> -msse42.  That way it will know to emit sse42 instructions. It will also allow
> you to remove the ifdef from the include file
> Neil
> 

Question: does that then limit the compiler to emitting sse42 instructions? If,
for instance, the rest of DPDK is being compiled for a target supporting AVX2,
does that flag then prevent the compiler from auto-vectorising using SSE?

/Bruce
  
Bruce Richardson Nov. 18, 2014, 4:08 p.m. UTC | #5
On Tue, Nov 18, 2014 at 04:04:26PM +0000, Bruce Richardson wrote:
> On Tue, Nov 18, 2014 at 11:00:05AM -0500, Neil Horman wrote:
> > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > 
> > > 18.11.2014 20:41, Neil Horman пишет:
> > > > On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > >> Initially, SSE4.2 support is detected via CPUID instruction.
> > > >>
> > > >> Added rte_hash_crc_set_alg() function to detect and set CRC32
> > > >> implementation if necessary. SSE4.2 is allowed by default. If it's
> > > >> not available, fall back to sw implementation.
> > > >>
> > > >> Best available algorithm is detected upon application startup
> > > >> through the constructor function rte_hash_crc_try_sse442().
> > > >>
> > > >> Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
> > > >> ---
> > > >>  lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
> > > >>  1 file changed, 51 insertions(+), 2 deletions(-)
> > > >>
> > > >> diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
> > > >> index 15f687a..332ed99 100644
> > > >> --- a/lib/librte_hash/rte_hash_crc.h
> > > >> +++ b/lib/librte_hash/rte_hash_crc.h
> > > >> @@ -45,7 +45,11 @@ extern "C" {
> > > >>  #endif
> > > >>  
> > > >>  #include <stdint.h>
> > > >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > >>  #include <nmmintrin.h>
> > > >> +#endif
> > > >> +#include <rte_cpuflags.h>
> > > >> +#include <rte_branch_prediction.h>
> > > >>  
> > > >>  /* Lookup tables for software implementation of CRC32C */
> > > >>  static uint32_t crc32c_tables[8][256] = {{
> > > >> @@ -363,8 +367,41 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > >>  	return crc;
> > > >>  }
> > > >>  
> > > >> +enum crc32_alg_t {
> > > >> +	CRC32_SW = 0,
> > > >> +	CRC32_SSE42
> > > >> +};
> > > >> +
> > > >> +static enum crc32_alg_t crc32_alg;
> > > >> +
> > > >> +/**
> > > >> + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
> > > >> + * calculation.
> > > >> + *
> > > >> + * @param flag
> > > >> + *   unsigned integer flag
> > > >> + *   - (CRC32_SW) Don't use SSE4.2 intrinsics
> > > >> + *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
> > > >> + */
> > > >> +static inline void
> > > >> +rte_hash_crc_set_alg(enum crc32_alg_t alg)
> > > >> +{
> > > >> +	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
> > > >> +	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
> > > >> +	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
> > > >> +}
> > > >> +
> > > >> +/* Best available algorithm is detected via CPUID instruction */
> > > >> +static inline void __attribute__((constructor))
> > > >> +rte_hash_crc_try_sse42(void)
> > > >> +{
> > > >> +	rte_hash_crc_set_alg(CRC32_SSE42);
> > > >> +}
> > > >> +
> > > >>  /**
> > > >>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > >> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > >> + * not supported
> > > >>   *
> > > >>   * @param data
> > > >>   *   Data to perform hash on.
> > > >> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > >>  static inline uint32_t
> > > >>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > >>  {
> > > >> -	return _mm_crc32_u32(init_val, data);
> > > >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > >> +	if (likely(crc32_alg == CRC32_SSE42))
> > > >> +		return _mm_crc32_u32(init_val, data);
> > > >> +#endif
> > > > you don't really need these ifdefs here anymore given that you have a
> > > > constructor to do the algorithm selection.  In fact you need to remove them, in
> > > > the event you build on a system that doesn't support SSE42, but run on a system
> > > > that does.
> > > 
> > > Originally, I thought so as well. I wrote the code without these ifdefs,
> > > but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > was triggered by nmmintrin.h which has a check for respective GCC
> > > extension. So I think these ifdefs are indeed required.
> > > 
> > You need to edit the makefile so that the compiler gets passed the option
> > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > you to remove the ifdef from the include file
> > Neil
> > 
> 
Email V2, with fix for the last word:

Question: does that then limit the compiler to emitting sse42 instructions? If,
for instance, the rest of DPDK is being compiled for a target supporting AVX2,
does that flag then prevent the compiler from auto-vectorising using AVX?

/Bruce
  
Neil Horman Nov. 18, 2014, 4:38 p.m. UTC | #6
On Tue, Nov 18, 2014 at 04:04:26PM +0000, Bruce Richardson wrote:
> On Tue, Nov 18, 2014 at 11:00:05AM -0500, Neil Horman wrote:
> > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > 
> > > 18.11.2014 20:41, Neil Horman пишет:
> > > > On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > >> Initially, SSE4.2 support is detected via CPUID instruction.
> > > >>
> > > >> Added rte_hash_crc_set_alg() function to detect and set CRC32
> > > >> implementation if necessary. SSE4.2 is allowed by default. If it's
> > > >> not available, fall back to sw implementation.
> > > >>
> > > >> Best available algorithm is detected upon application startup
> > > >> through the constructor function rte_hash_crc_try_sse442().
> > > >>
> > > >> Signed-off-by: Yerden Zhumabekov <e_zhumabekov@sts.kz>
> > > >> ---
> > > >>  lib/librte_hash/rte_hash_crc.h |   53 ++++++++++++++++++++++++++++++++++++++--
> > > >>  1 file changed, 51 insertions(+), 2 deletions(-)
> > > >>
> > > >> diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
> > > >> index 15f687a..332ed99 100644
> > > >> --- a/lib/librte_hash/rte_hash_crc.h
> > > >> +++ b/lib/librte_hash/rte_hash_crc.h
> > > >> @@ -45,7 +45,11 @@ extern "C" {
> > > >>  #endif
> > > >>  
> > > >>  #include <stdint.h>
> > > >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > >>  #include <nmmintrin.h>
> > > >> +#endif
> > > >> +#include <rte_cpuflags.h>
> > > >> +#include <rte_branch_prediction.h>
> > > >>  
> > > >>  /* Lookup tables for software implementation of CRC32C */
> > > >>  static uint32_t crc32c_tables[8][256] = {{
> > > >> @@ -363,8 +367,41 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > >>  	return crc;
> > > >>  }
> > > >>  
> > > >> +enum crc32_alg_t {
> > > >> +	CRC32_SW = 0,
> > > >> +	CRC32_SSE42
> > > >> +};
> > > >> +
> > > >> +static enum crc32_alg_t crc32_alg;
> > > >> +
> > > >> +/**
> > > >> + * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
> > > >> + * calculation.
> > > >> + *
> > > >> + * @param flag
> > > >> + *   unsigned integer flag
> > > >> + *   - (CRC32_SW) Don't use SSE4.2 intrinsics
> > > >> + *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
> > > >> + */
> > > >> +static inline void
> > > >> +rte_hash_crc_set_alg(enum crc32_alg_t alg)
> > > >> +{
> > > >> +	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
> > > >> +	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
> > > >> +	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
> > > >> +}
> > > >> +
> > > >> +/* Best available algorithm is detected via CPUID instruction */
> > > >> +static inline void __attribute__((constructor))
> > > >> +rte_hash_crc_try_sse42(void)
> > > >> +{
> > > >> +	rte_hash_crc_set_alg(CRC32_SSE42);
> > > >> +}
> > > >> +
> > > >>  /**
> > > >>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > >> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > >> + * not supported
> > > >>   *
> > > >>   * @param data
> > > >>   *   Data to perform hash on.
> > > >> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > >>  static inline uint32_t
> > > >>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > >>  {
> > > >> -	return _mm_crc32_u32(init_val, data);
> > > >> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > >> +	if (likely(crc32_alg == CRC32_SSE42))
> > > >> +		return _mm_crc32_u32(init_val, data);
> > > >> +#endif
> > > > you don't really need these ifdefs here anymore given that you have a
> > > > constructor to do the algorithm selection.  In fact you need to remove them, in
> > > > the event you build on a system that doesn't support SSE42, but run on a system
> > > > that does.
> > > 
> > > Originally, I thought so as well. I wrote the code without these ifdefs,
> > > but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > was triggered by nmmintrin.h which has a check for respective GCC
> > > extension. So I think these ifdefs are indeed required.
> > > 
> > You need to edit the makefile so that the compiler gets passed the option
> > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > you to remove the ifdef from the include file
> > Neil
> > 
> 
> Question: does that then limit the compiler to emitting sse42 instructions? If,
> for instance, the rest of DPDK is being compiled for a target supporting AVX2,
> does that flag then prevent the compiler from auto-vectorising using SSE?
> 
It should be a last-option-wins model I think (that is to say if the command
line specifies -msse42 and -march=core-avx2, then you will get both instructions,
since core-avx2 supports sse42.  

Neil

> /Bruce
> 
>
  
Yerden Zhumabekov Nov. 18, 2014, 5:13 p.m. UTC | #7
18.11.2014 22:00, Neil Horman пишет:
> On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
>> 18.11.2014 20:41, Neil Horman пишет:
>>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
>>>>  /**
>>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
>>>> + * Fall back to software crc32 implementation in case SSE4.2 is
>>>> + * not supported
>>>>   *
>>>>   * @param data
>>>>   *   Data to perform hash on.
>>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
>>>>  static inline uint32_t
>>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
>>>>  {
>>>> -	return _mm_crc32_u32(init_val, data);
>>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
>>>> +	if (likely(crc32_alg == CRC32_SSE42))
>>>> +		return _mm_crc32_u32(init_val, data);
>>>> +#endif
>>> you don't really need these ifdefs here anymore given that you have a
>>> constructor to do the algorithm selection.  In fact you need to remove them, in
>>> the event you build on a system that doesn't support SSE42, but run on a system
>>> that does.
>> Originally, I thought so as well. I wrote the code without these ifdefs,
>> but it didn't compile on my machine which doesn't support SSE4.2. Error
>> was triggered by nmmintrin.h which has a check for respective GCC
>> extension. So I think these ifdefs are indeed required.
>>
> You need to edit the makefile so that the compiler gets passed the option
> -msse42.  That way it will know to emit sse42 instructions. It will also allow
> you to remove the ifdef from the include file

In this case, I guess there are two options:
1) modify all makefiles which use librte_hash
2) move all function bodies from rte_hash_crc.h to separate module,
leaving prototype definitions there only.

Everybody's up for the second option? :)
  
Wang, Shawn Nov. 18, 2014, 5:29 p.m. UTC | #8
I have a general question about using CPUID to detect supported instruction set.
What if we are compiling the software with some old hardware which does not support SSE4.2, but run it on new hardware which does support SSE4.2. Is there still a static way to force the compiler to turn on the SSE4.2 support? 
I guess for SSE4.2, most of the CPU has support for it now. But for AVX2, this might not be the case.
  
Neil Horman Nov. 18, 2014, 5:46 p.m. UTC | #9
On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> 
> 18.11.2014 22:00, Neil Horman пишет:
> > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> >> 18.11.2014 20:41, Neil Horman пишет:
> >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> >>>>  /**
> >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> >>>> + * not supported
> >>>>   *
> >>>>   * @param data
> >>>>   *   Data to perform hash on.
> >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> >>>>  static inline uint32_t
> >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> >>>>  {
> >>>> -	return _mm_crc32_u32(init_val, data);
> >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> >>>> +		return _mm_crc32_u32(init_val, data);
> >>>> +#endif
> >>> you don't really need these ifdefs here anymore given that you have a
> >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> >>> the event you build on a system that doesn't support SSE42, but run on a system
> >>> that does.
> >> Originally, I thought so as well. I wrote the code without these ifdefs,
> >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> >> was triggered by nmmintrin.h which has a check for respective GCC
> >> extension. So I think these ifdefs are indeed required.
> >>
> > You need to edit the makefile so that the compiler gets passed the option
> > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > you to remove the ifdef from the include file
> 
> In this case, I guess there are two options:
> 1) modify all makefiles which use librte_hash
> 2) move all function bodies from rte_hash_crc.h to separate module,
> leaving prototype definitions there only.
> 
> Everybody's up for the second option? :)
> 
Crud, you're right, I didn't think about the header inclusion issue.  Is it
worth adding the jump to enable the dynamic hash selection?
Neil

> -- 
> Sincerely,
> 
> Yerden Zhumabekov
> State Technical Service
> Astana, KZ
> 
> 
>
  
Bruce Richardson Nov. 18, 2014, 5:52 p.m. UTC | #10
On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
> On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> > 
> > 18.11.2014 22:00, Neil Horman пишет:
> > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > >> 18.11.2014 20:41, Neil Horman пишет:
> > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > >>>>  /**
> > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> > >>>> + * not supported
> > >>>>   *
> > >>>>   * @param data
> > >>>>   *   Data to perform hash on.
> > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > >>>>  static inline uint32_t
> > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > >>>>  {
> > >>>> -	return _mm_crc32_u32(init_val, data);
> > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> > >>>> +		return _mm_crc32_u32(init_val, data);
> > >>>> +#endif
> > >>> you don't really need these ifdefs here anymore given that you have a
> > >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> > >>> the event you build on a system that doesn't support SSE42, but run on a system
> > >>> that does.
> > >> Originally, I thought so as well. I wrote the code without these ifdefs,
> > >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> > >> was triggered by nmmintrin.h which has a check for respective GCC
> > >> extension. So I think these ifdefs are indeed required.
> > >>
> > > You need to edit the makefile so that the compiler gets passed the option
> > > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > > you to remove the ifdef from the include file
> > 
> > In this case, I guess there are two options:
> > 1) modify all makefiles which use librte_hash
> > 2) move all function bodies from rte_hash_crc.h to separate module,
> > leaving prototype definitions there only.
> > 
> > Everybody's up for the second option? :)
> > 
> Crud, you're right, I didn't think about the header inclusion issue.  Is it
> worth adding the jump to enable the dynamic hash selection?
> Neil

Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
For builds where we have hardware support confirmed at compile time, just use
the function from the header file.
Does that make sense?

/Bruce
> 
> > -- 
> > Sincerely,
> > 
> > Yerden Zhumabekov
> > State Technical Service
> > Astana, KZ
> > 
> > 
> >
  
Yerden Zhumabekov Nov. 18, 2014, 5:58 p.m. UTC | #11
18.11.2014 23:46, Neil Horman пишет:
> On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
>> 18.11.2014 22:00, Neil Horman пишет:
>>>
>>> You need to edit the makefile so that the compiler gets passed the option
>>> -msse42.  That way it will know to emit sse42 instructions. It will also allow
>>> you to remove the ifdef from the include file
>> In this case, I guess there are two options:
>> 1) modify all makefiles which use librte_hash
>> 2) move all function bodies from rte_hash_crc.h to separate module,
>> leaving prototype definitions there only.
>>
>> Everybody's up for the second option? :)
>>
> Crud, you're right, I didn't think about the header inclusion issue.  Is it
> worth adding the jump to enable the dynamic hash selection?

If I understood you correctly - I've already added a function to
dynamically change the CRC32 implementation in the runtime,
rte_hash_crc_set_alg(). I can rework patches once again, if everybody's
fine with the separate module.
  
Neil Horman Nov. 18, 2014, 9:36 p.m. UTC | #12
On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:
> On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
> > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> > > 
> > > 18.11.2014 22:00, Neil Horman пишет:
> > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > >> 18.11.2014 20:41, Neil Horman пишет:
> > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > >>>>  /**
> > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > >>>> + * not supported
> > > >>>>   *
> > > >>>>   * @param data
> > > >>>>   *   Data to perform hash on.
> > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > >>>>  static inline uint32_t
> > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > >>>>  {
> > > >>>> -	return _mm_crc32_u32(init_val, data);
> > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> > > >>>> +		return _mm_crc32_u32(init_val, data);
> > > >>>> +#endif
> > > >>> you don't really need these ifdefs here anymore given that you have a
> > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> > > >>> the event you build on a system that doesn't support SSE42, but run on a system
> > > >>> that does.
> > > >> Originally, I thought so as well. I wrote the code without these ifdefs,
> > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > >> was triggered by nmmintrin.h which has a check for respective GCC
> > > >> extension. So I think these ifdefs are indeed required.
> > > >>
> > > > You need to edit the makefile so that the compiler gets passed the option
> > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > > > you to remove the ifdef from the include file
> > > 
> > > In this case, I guess there are two options:
> > > 1) modify all makefiles which use librte_hash
> > > 2) move all function bodies from rte_hash_crc.h to separate module,
> > > leaving prototype definitions there only.
> > > 
> > > Everybody's up for the second option? :)
> > > 
> > Crud, you're right, I didn't think about the header inclusion issue.  Is it
> > worth adding the jump to enable the dynamic hash selection?
> > Neil
> 
> Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
> For builds where we have hardware support confirmed at compile time, just use
> the function from the header file.
> Does that make sense?
> 
I'm not certain of that, as I don't think anything can be 'confirmed' at compile
time.  I.e. just because you have sse42 at compile time doesn't guarantee you
have it at run time with a DSO.  If you have these as macros, you need to enable
sse42 whereever you include the file so that the intrinsic works properly.

an alternate option would be to not use the intrinsic, and craft some explicit
__asm__ statement that executes the right sse42 instructions.  That way the asm
is directly emitted, without requiring the -msse42 flag at all, and it will just
work in all the files that call it.

Neil

> /Bruce
> > 
> > > -- 
> > > Sincerely,
> > > 
> > > Yerden Zhumabekov
> > > State Technical Service
> > > Astana, KZ
> > > 
> > > 
> > > 
>
  
Yerden Zhumabekov Nov. 19, 2014, 3:51 a.m. UTC | #13
19.11.2014 3:36, Neil Horman пишет:
> On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:
>> On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
>>> On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
>>>> Everybody's up for the second option? :)
>>>>
>>> Crud, you're right, I didn't think about the header inclusion issue.  Is it
>>> worth adding the jump to enable the dynamic hash selection?
>>> Neil
>> Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
>> For builds where we have hardware support confirmed at compile time, just use
>> the function from the header file.
>> Does that make sense?
>>
> I'm not certain of that, as I don't think anything can be 'confirmed' at compile
> time.  I.e. just because you have sse42 at compile time doesn't guarantee you
> have it at run time with a DSO.  If you have these as macros, you need to enable
> sse42 whereever you include the file so that the intrinsic works properly.
>
> an alternate option would be to not use the intrinsic, and craft some explicit
> __asm__ statement that executes the right sse42 instructions.  That way the asm
> is directly emitted, without requiring the -msse42 flag at all, and it will just
> work in all the files that call it.

Thanks for the discussion. To summarize it with my suggestions for 'v5':
1) replace intrinsics with asm code and give up including nmmintrin.h;
2) detect arch (EM64T flag) on runtime because crc32 for 64-bit operand
doesn't work on 32-bit x86;
3) separate function prototypes (leaving them in header) and bodies, add
to SRCS in Makefile.
  
Yerden Zhumabekov Nov. 19, 2014, 4:07 a.m. UTC | #14
18.11.2014 23:29, Wang, Shawn пишет:
> I have a general question about using CPUID to detect supported instruction set.
> What if we are compiling the software with some old hardware which does not support SSE4.2, but run it on new hardware which does support SSE4.2. Is there still a static way to force the compiler to turn on the SSE4.2 support? 
> I guess for SSE4.2, most of the CPU has support for it now. But for AVX2, this might not be the case.
According to gcc 4.7 changes (https://gcc.gnu.org/gcc-4.7/changes.html)
they've added support for AVX2 instructions since that version.
Use -mavx2 or -march=core-avx2. The latter seems to be supported by ICC
as well, according to Google :)
  
Bruce Richardson Nov. 19, 2014, 10:16 a.m. UTC | #15
On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:
> On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:
> > On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
> > > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> > > > 
> > > > 18.11.2014 22:00, Neil Horman пишет:
> > > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > > >> 18.11.2014 20:41, Neil Horman пишет:
> > > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > > >>>>  /**
> > > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > > >>>> + * not supported
> > > > >>>>   *
> > > > >>>>   * @param data
> > > > >>>>   *   Data to perform hash on.
> > > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > > >>>>  static inline uint32_t
> > > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > > >>>>  {
> > > > >>>> -	return _mm_crc32_u32(init_val, data);
> > > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> > > > >>>> +		return _mm_crc32_u32(init_val, data);
> > > > >>>> +#endif
> > > > >>> you don't really need these ifdefs here anymore given that you have a
> > > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> > > > >>> the event you build on a system that doesn't support SSE42, but run on a system
> > > > >>> that does.
> > > > >> Originally, I thought so as well. I wrote the code without these ifdefs,
> > > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > > >> was triggered by nmmintrin.h which has a check for respective GCC
> > > > >> extension. So I think these ifdefs are indeed required.
> > > > >>
> > > > > You need to edit the makefile so that the compiler gets passed the option
> > > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > > > > you to remove the ifdef from the include file
> > > > 
> > > > In this case, I guess there are two options:
> > > > 1) modify all makefiles which use librte_hash
> > > > 2) move all function bodies from rte_hash_crc.h to separate module,
> > > > leaving prototype definitions there only.
> > > > 
> > > > Everybody's up for the second option? :)
> > > > 
> > > Crud, you're right, I didn't think about the header inclusion issue.  Is it
> > > worth adding the jump to enable the dynamic hash selection?
> > > Neil
> > 
> > Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
> > For builds where we have hardware support confirmed at compile time, just use
> > the function from the header file.
> > Does that make sense?
> > 
> I'm not certain of that, as I don't think anything can be 'confirmed' at compile
> time.  I.e. just because you have sse42 at compile time doesn't guarantee you
> have it at run time with a DSO.  If you have these as macros, you need to enable
> sse42 whereever you include the file so that the intrinsic works properly.

Well, if you compile with sse42 at compile time, the compiler is free to insert
sse4 instructions at any place it feels like, irrespective of whether or not you
use SSE4 intrinsics, so I would never expect such a DSO to work on a system
without SSE42 support.

> 
> an alternate option would be to not use the intrinsic, and craft some explicit
> __asm__ statement that executes the right sse42 instructions.  That way the asm
> is directly emitted, without requiring the -msse42 flag at all, and it will just
> work in all the files that call it.
> 

I really don't like that approach. I think using intrinsics is much more 
maintainable.

/Bruce
  
Neil Horman Nov. 19, 2014, 11:34 a.m. UTC | #16
On Wed, Nov 19, 2014 at 10:16:14AM +0000, Bruce Richardson wrote:
> On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:
> > On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:
> > > On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
> > > > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> > > > > 
> > > > > 18.11.2014 22:00, Neil Horman пишет:
> > > > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > > > >> 18.11.2014 20:41, Neil Horman пишет:
> > > > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > > > >>>>  /**
> > > > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > > > >>>> + * not supported
> > > > > >>>>   *
> > > > > >>>>   * @param data
> > > > > >>>>   *   Data to perform hash on.
> > > > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > > > >>>>  static inline uint32_t
> > > > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > > > >>>>  {
> > > > > >>>> -	return _mm_crc32_u32(init_val, data);
> > > > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> > > > > >>>> +		return _mm_crc32_u32(init_val, data);
> > > > > >>>> +#endif
> > > > > >>> you don't really need these ifdefs here anymore given that you have a
> > > > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> > > > > >>> the event you build on a system that doesn't support SSE42, but run on a system
> > > > > >>> that does.
> > > > > >> Originally, I thought so as well. I wrote the code without these ifdefs,
> > > > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > > > >> was triggered by nmmintrin.h which has a check for respective GCC
> > > > > >> extension. So I think these ifdefs are indeed required.
> > > > > >>
> > > > > > You need to edit the makefile so that the compiler gets passed the option
> > > > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > > > > > you to remove the ifdef from the include file
> > > > > 
> > > > > In this case, I guess there are two options:
> > > > > 1) modify all makefiles which use librte_hash
> > > > > 2) move all function bodies from rte_hash_crc.h to separate module,
> > > > > leaving prototype definitions there only.
> > > > > 
> > > > > Everybody's up for the second option? :)
> > > > > 
> > > > Crud, you're right, I didn't think about the header inclusion issue.  Is it
> > > > worth adding the jump to enable the dynamic hash selection?
> > > > Neil
> > > 
> > > Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
> > > For builds where we have hardware support confirmed at compile time, just use
> > > the function from the header file.
> > > Does that make sense?
> > > 
> > I'm not certain of that, as I don't think anything can be 'confirmed' at compile
> > time.  I.e. just because you have sse42 at compile time doesn't guarantee you
> > have it at run time with a DSO.  If you have these as macros, you need to enable
> > sse42 whereever you include the file so that the intrinsic works properly.
> 
> Well, if you compile with sse42 at compile time, the compiler is free to insert
> sse4 instructions at any place it feels like, irrespective of whether or not you
> use SSE4 intrinsics, so I would never expect such a DSO to work on a system
> without SSE42 support.
> 
> > 
> > an alternate option would be to not use the intrinsic, and craft some explicit
> > __asm__ statement that executes the right sse42 instructions.  That way the asm
> > is directly emitted, without requiring the -msse42 flag at all, and it will just
> > work in all the files that call it.
> > 
> 
> I really don't like that approach. I think using intrinsics is much more 
> maintainable.
> 
I grant you that using an intrinsic is easier to read, but if the code doesn't
compile when using the intrinsic unless you have sse42 turned on, I'm not sure
what choice we have.  and inline asm isn't that hard to maintain.  We're talking
about three lines of code:
asm(
 "mov    %[1],%eax
 mov    %[2],%edx
 crc32l %edx,%eax": 
 [edx] "r" (crc) /*output*/
 :
 [1] "r" (crc), /* input */
 [2] "r" (val)
 :
 [eax] "r" /* clobber */
)

I don't have the syntax quite right, but its pretty easy to read the intent.
Its not like we dont have precidence for this, the atomic interface and several
pmds do this frequently.

Neil


> /Bruce
> 
>
  
Yerden Zhumabekov Nov. 19, 2014, 11:35 a.m. UTC | #17
19.11.2014 16:16, Bruce Richardson пишет:
> On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:
>> an alternate option would be to not use the intrinsic, and craft some explicit
>> __asm__ statement that executes the right sse42 instructions.  That way the asm
>> is directly emitted, without requiring the -msse42 flag at all, and it will just
>> work in all the files that call it.
>>
> I really don't like that approach. I think using intrinsics is much more 
> maintainable.
>

static inline uint32_t
crc32_sse42_u32(uint32_t data, uint32_t init_val)
{
/*··__asm__ volatile(
············"crc32l %[data], %[init_val];"
············: [init_val] "+r" (init_val)
············: [data] "rm" (data));
····return init_val;*/

But wait, will __builtin_ia32_crc32si and __builtin_ia32_crc32di
functions do the trick? ICC has them?
What about prototyping functions and extracting their bodies to separate
module? Does it break anything?
  
Bruce Richardson Nov. 19, 2014, 11:38 a.m. UTC | #18
On Wed, Nov 19, 2014 at 06:34:08AM -0500, Neil Horman wrote:
> On Wed, Nov 19, 2014 at 10:16:14AM +0000, Bruce Richardson wrote:
> > On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:
> > > On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:
> > > > On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
> > > > > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> > > > > > 
> > > > > > 18.11.2014 22:00, Neil Horman пишет:
> > > > > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > > > > >> 18.11.2014 20:41, Neil Horman пишет:
> > > > > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > > > > >>>>  /**
> > > > > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > > > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > > > > >>>> + * not supported
> > > > > > >>>>   *
> > > > > > >>>>   * @param data
> > > > > > >>>>   *   Data to perform hash on.
> > > > > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > > > > >>>>  static inline uint32_t
> > > > > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > > > > >>>>  {
> > > > > > >>>> -	return _mm_crc32_u32(init_val, data);
> > > > > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > > > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> > > > > > >>>> +		return _mm_crc32_u32(init_val, data);
> > > > > > >>>> +#endif
> > > > > > >>> you don't really need these ifdefs here anymore given that you have a
> > > > > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> > > > > > >>> the event you build on a system that doesn't support SSE42, but run on a system
> > > > > > >>> that does.
> > > > > > >> Originally, I thought so as well. I wrote the code without these ifdefs,
> > > > > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > > > > >> was triggered by nmmintrin.h which has a check for respective GCC
> > > > > > >> extension. So I think these ifdefs are indeed required.
> > > > > > >>
> > > > > > > You need to edit the makefile so that the compiler gets passed the option
> > > > > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > > > > > > you to remove the ifdef from the include file
> > > > > > 
> > > > > > In this case, I guess there are two options:
> > > > > > 1) modify all makefiles which use librte_hash
> > > > > > 2) move all function bodies from rte_hash_crc.h to separate module,
> > > > > > leaving prototype definitions there only.
> > > > > > 
> > > > > > Everybody's up for the second option? :)
> > > > > > 
> > > > > Crud, you're right, I didn't think about the header inclusion issue.  Is it
> > > > > worth adding the jump to enable the dynamic hash selection?
> > > > > Neil
> > > > 
> > > > Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
> > > > For builds where we have hardware support confirmed at compile time, just use
> > > > the function from the header file.
> > > > Does that make sense?
> > > > 
> > > I'm not certain of that, as I don't think anything can be 'confirmed' at compile
> > > time.  I.e. just because you have sse42 at compile time doesn't guarantee you
> > > have it at run time with a DSO.  If you have these as macros, you need to enable
> > > sse42 whereever you include the file so that the intrinsic works properly.
> > 
> > Well, if you compile with sse42 at compile time, the compiler is free to insert
> > sse4 instructions at any place it feels like, irrespective of whether or not you
> > use SSE4 intrinsics, so I would never expect such a DSO to work on a system
> > without SSE42 support.
> > 
> > > 
> > > an alternate option would be to not use the intrinsic, and craft some explicit
> > > __asm__ statement that executes the right sse42 instructions.  That way the asm
> > > is directly emitted, without requiring the -msse42 flag at all, and it will just
> > > work in all the files that call it.
> > > 
> > 
> > I really don't like that approach. I think using intrinsics is much more 
> > maintainable.
> > 
> I grant you that using an intrinsic is easier to read, but if the code doesn't
> compile when using the intrinsic unless you have sse42 turned on, I'm not sure
> what choice we have.  and inline asm isn't that hard to maintain.  We're talking
> about three lines of code:
> asm(
>  "mov    %[1],%eax
>  mov    %[2],%edx
>  crc32l %edx,%eax": 
>  [edx] "r" (crc) /*output*/
>  :
>  [1] "r" (crc), /* input */
>  [2] "r" (val)
>  :
>  [eax] "r" /* clobber */
> )
> 
> I don't have the syntax quite right, but its pretty easy to read the intent.
> Its not like we dont have precidence for this, the atomic interface and several
> pmds do this frequently.
> 
> Neil

Fair point. If everyone else is happy enough with it, I'm ok too.

/Bruce
  
Ananyev, Konstantin Nov. 19, 2014, 11:50 a.m. UTC | #19
> -----Original Message-----

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson

> Sent: Wednesday, November 19, 2014 11:38 AM

> To: Neil Horman

> Cc: dev@dpdk.org

> Subject: Re: [dpdk-dev] [PATCH v4 3/5] hash: add fallback to software CRC32 implementation

> 

> On Wed, Nov 19, 2014 at 06:34:08AM -0500, Neil Horman wrote:

> > On Wed, Nov 19, 2014 at 10:16:14AM +0000, Bruce Richardson wrote:

> > > On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:

> > > > On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:

> > > > > On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:

> > > > > > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:

> > > > > > >

> > > > > > > 18.11.2014 22:00, Neil Horman пишет:

> > > > > > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:

> > > > > > > >> 18.11.2014 20:41, Neil Horman пишет:

> > > > > > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:

> > > > > > > >>>>  /**

> > > > > > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.

> > > > > > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is

> > > > > > > >>>> + * not supported

> > > > > > > >>>>   *

> > > > > > > >>>>   * @param data

> > > > > > > >>>>   *   Data to perform hash on.

> > > > > > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)

> > > > > > > >>>>  static inline uint32_t

> > > > > > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)

> > > > > > > >>>>  {

> > > > > > > >>>> -	return _mm_crc32_u32(init_val, data);

> > > > > > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2

> > > > > > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))

> > > > > > > >>>> +		return _mm_crc32_u32(init_val, data);

> > > > > > > >>>> +#endif

> > > > > > > >>> you don't really need these ifdefs here anymore given that you have a

> > > > > > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in

> > > > > > > >>> the event you build on a system that doesn't support SSE42, but run on a system

> > > > > > > >>> that does.

> > > > > > > >> Originally, I thought so as well. I wrote the code without these ifdefs,

> > > > > > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error

> > > > > > > >> was triggered by nmmintrin.h which has a check for respective GCC

> > > > > > > >> extension. So I think these ifdefs are indeed required.

> > > > > > > >>

> > > > > > > > You need to edit the makefile so that the compiler gets passed the option

> > > > > > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow

> > > > > > > > you to remove the ifdef from the include file

> > > > > > >

> > > > > > > In this case, I guess there are two options:

> > > > > > > 1) modify all makefiles which use librte_hash

> > > > > > > 2) move all function bodies from rte_hash_crc.h to separate module,

> > > > > > > leaving prototype definitions there only.

> > > > > > >

> > > > > > > Everybody's up for the second option? :)

> > > > > > >

> > > > > > Crud, you're right, I didn't think about the header inclusion issue.  Is it

> > > > > > worth adding the jump to enable the dynamic hash selection?

> > > > > > Neil

> > > > >

> > > > > Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.

> > > > > For builds where we have hardware support confirmed at compile time, just use

> > > > > the function from the header file.

> > > > > Does that make sense?

> > > > >

> > > > I'm not certain of that, as I don't think anything can be 'confirmed' at compile

> > > > time.  I.e. just because you have sse42 at compile time doesn't guarantee you

> > > > have it at run time with a DSO.  If you have these as macros, you need to enable

> > > > sse42 whereever you include the file so that the intrinsic works properly.

> > >

> > > Well, if you compile with sse42 at compile time, the compiler is free to insert

> > > sse4 instructions at any place it feels like, irrespective of whether or not you

> > > use SSE4 intrinsics, so I would never expect such a DSO to work on a system

> > > without SSE42 support.

> > >

> > > >

> > > > an alternate option would be to not use the intrinsic, and craft some explicit

> > > > __asm__ statement that executes the right sse42 instructions.  That way the asm

> > > > is directly emitted, without requiring the -msse42 flag at all, and it will just

> > > > work in all the files that call it.

> > > >

> > >

> > > I really don't like that approach. I think using intrinsics is much more

> > > maintainable.

> > >

> > I grant you that using an intrinsic is easier to read, but if the code doesn't

> > compile when using the intrinsic unless you have sse42 turned on, I'm not sure

> > what choice we have.  and inline asm isn't that hard to maintain.  We're talking

> > about three lines of code:

> > asm(

> >  "mov    %[1],%eax

> >  mov    %[2],%edx

> >  crc32l %edx,%eax":

> >  [edx] "r" (crc) /*output*/

> >  :

> >  [1] "r" (crc), /* input */

> >  [2] "r" (val)

> >  :

> >  [eax] "r" /* clobber */

> > )

> >

> > I don't have the syntax quite right, but its pretty easy to read the intent.

> > Its not like we dont have precidence for this, the atomic interface and several

> > pmds do this frequently.

> >

> > Neil

> 

> Fair point. If everyone else is happy enough with it, I'm ok too.


As I remember with gcc & icc it is possible to specify tht you'd like to compile that particular function
for different target.
From https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html:
"target
The target attribute is used to specify that a function is to be compiled with different target options than specified on the command line. This can be used for instance to have functions compiled with a different ISA (instruction set architecture) than the default. You can also use the ‘#pragma GCC target’ pragma to set more than one function to be compiled with specific target options. See Function Specific Option Pragmas, for details about the ‘#pragma GCC target’ pragma.
For instance on a 386, you could compile one function with target("sse4.1,arch=core2") and another with target("sse4a,arch=amdfam10"). This is equivalent to compiling the first function with -msse4.1 and -march=core2 options, and the second function with -msse4a and -march=amdfam10 options. It is up to the user to make sure that a function is only invoked on a machine that supports the particular ISA it is compiled for (for example by using cpuid on 386 to determine what feature bits and architecture family are used).

          int core2_func (void) __attribute__ ((__target__ ("arch=core2")));
          int sse3_func (void) __attribute__ ((__target__ ("sse3")));
You can either use multiple strings to specify multiple options, or separate the options with a comma (‘,’).

The target attribute is presently implemented for i386/x86_64, PowerPC, and Nios II targets only. The options supported are specific to each target.

On the 386, the following options are allowed:
...
 ‘sse4.2’
‘no-sse4.2’"

Wouldn't that suit your purposes?
Probably you can even keep your function inline with that approach.

Konstantin


> 

> /Bruce
  
Yerden Zhumabekov Nov. 19, 2014, 11:59 a.m. UTC | #20
19.11.2014 17:50, Ananyev, Konstantin пишет:
>
> As I remember with gcc & icc it is possible to specify tht you'd like to compile that particular function
> for different target.
> From https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html:
> "target
> The target attribute is used to specify that a function is to be compiled with different target options than specified on the command line. This can be used for instance to have functions compiled with a different ISA (instruction set architecture) than the default. You can also use the ‘#pragma GCC target’ pragma to set more than one function to be compiled with specific target options. See Function Specific Option Pragmas, for details about the ‘#pragma GCC target’ pragma.
> For instance on a 386, you could compile one function with target("sse4.1,arch=core2") and another with target("sse4a,arch=amdfam10"). This is equivalent to compiling the first function with -msse4.1 and -march=core2 options, and the second function with -msse4a and -march=amdfam10 options. It is up to the user to make sure that a function is only invoked on a machine that supports the particular ISA it is compiled for (for example by using cpuid on 386 to determine what feature bits and architecture family are used).
>
>           int core2_func (void) __attribute__ ((__target__ ("arch=core2")));
>           int sse3_func (void) __attribute__ ((__target__ ("sse3")));
> You can either use multiple strings to specify multiple options, or separate the options with a comma (‘,’).
>
> The target attribute is presently implemented for i386/x86_64, PowerPC, and Nios II targets only. The options supported are specific to each target.
>
> On the 386, the following options are allowed:
> ...
>  ‘sse4.2’
> ‘no-sse4.2’"
>
> Wouldn't that suit your purposes?
> Probably you can even keep your function inline with that approach.
Very nice. Thank you. I will test it.
  
Neil Horman Nov. 19, 2014, 3:05 p.m. UTC | #21
On Wed, Nov 19, 2014 at 11:50:40AM +0000, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Wednesday, November 19, 2014 11:38 AM
> > To: Neil Horman
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v4 3/5] hash: add fallback to software CRC32 implementation
> > 
> > On Wed, Nov 19, 2014 at 06:34:08AM -0500, Neil Horman wrote:
> > > On Wed, Nov 19, 2014 at 10:16:14AM +0000, Bruce Richardson wrote:
> > > > On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:
> > > > > On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:
> > > > > > On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:
> > > > > > > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:
> > > > > > > >
> > > > > > > > 18.11.2014 22:00, Neil Horman пишет:
> > > > > > > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:
> > > > > > > > >> 18.11.2014 20:41, Neil Horman пишет:
> > > > > > > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:
> > > > > > > > >>>>  /**
> > > > > > > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.
> > > > > > > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is
> > > > > > > > >>>> + * not supported
> > > > > > > > >>>>   *
> > > > > > > > >>>>   * @param data
> > > > > > > > >>>>   *   Data to perform hash on.
> > > > > > > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)
> > > > > > > > >>>>  static inline uint32_t
> > > > > > > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
> > > > > > > > >>>>  {
> > > > > > > > >>>> -	return _mm_crc32_u32(init_val, data);
> > > > > > > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
> > > > > > > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))
> > > > > > > > >>>> +		return _mm_crc32_u32(init_val, data);
> > > > > > > > >>>> +#endif
> > > > > > > > >>> you don't really need these ifdefs here anymore given that you have a
> > > > > > > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in
> > > > > > > > >>> the event you build on a system that doesn't support SSE42, but run on a system
> > > > > > > > >>> that does.
> > > > > > > > >> Originally, I thought so as well. I wrote the code without these ifdefs,
> > > > > > > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error
> > > > > > > > >> was triggered by nmmintrin.h which has a check for respective GCC
> > > > > > > > >> extension. So I think these ifdefs are indeed required.
> > > > > > > > >>
> > > > > > > > > You need to edit the makefile so that the compiler gets passed the option
> > > > > > > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow
> > > > > > > > > you to remove the ifdef from the include file
> > > > > > > >
> > > > > > > > In this case, I guess there are two options:
> > > > > > > > 1) modify all makefiles which use librte_hash
> > > > > > > > 2) move all function bodies from rte_hash_crc.h to separate module,
> > > > > > > > leaving prototype definitions there only.
> > > > > > > >
> > > > > > > > Everybody's up for the second option? :)
> > > > > > > >
> > > > > > > Crud, you're right, I didn't think about the header inclusion issue.  Is it
> > > > > > > worth adding the jump to enable the dynamic hash selection?
> > > > > > > Neil
> > > > > >
> > > > > > Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.
> > > > > > For builds where we have hardware support confirmed at compile time, just use
> > > > > > the function from the header file.
> > > > > > Does that make sense?
> > > > > >
> > > > > I'm not certain of that, as I don't think anything can be 'confirmed' at compile
> > > > > time.  I.e. just because you have sse42 at compile time doesn't guarantee you
> > > > > have it at run time with a DSO.  If you have these as macros, you need to enable
> > > > > sse42 whereever you include the file so that the intrinsic works properly.
> > > >
> > > > Well, if you compile with sse42 at compile time, the compiler is free to insert
> > > > sse4 instructions at any place it feels like, irrespective of whether or not you
> > > > use SSE4 intrinsics, so I would never expect such a DSO to work on a system
> > > > without SSE42 support.
> > > >
> > > > >
> > > > > an alternate option would be to not use the intrinsic, and craft some explicit
> > > > > __asm__ statement that executes the right sse42 instructions.  That way the asm
> > > > > is directly emitted, without requiring the -msse42 flag at all, and it will just
> > > > > work in all the files that call it.
> > > > >
> > > >
> > > > I really don't like that approach. I think using intrinsics is much more
> > > > maintainable.
> > > >
> > > I grant you that using an intrinsic is easier to read, but if the code doesn't
> > > compile when using the intrinsic unless you have sse42 turned on, I'm not sure
> > > what choice we have.  and inline asm isn't that hard to maintain.  We're talking
> > > about three lines of code:
> > > asm(
> > >  "mov    %[1],%eax
> > >  mov    %[2],%edx
> > >  crc32l %edx,%eax":
> > >  [edx] "r" (crc) /*output*/
> > >  :
> > >  [1] "r" (crc), /* input */
> > >  [2] "r" (val)
> > >  :
> > >  [eax] "r" /* clobber */
> > > )
> > >
> > > I don't have the syntax quite right, but its pretty easy to read the intent.
> > > Its not like we dont have precidence for this, the atomic interface and several
> > > pmds do this frequently.
> > >
> > > Neil
> > 
> > Fair point. If everyone else is happy enough with it, I'm ok too.
> 
> As I remember with gcc & icc it is possible to specify tht you'd like to compile that particular function
> for different target.
> From https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html:
> "target
> The target attribute is used to specify that a function is to be compiled with different target options than specified on the command line. This can be used for instance to have functions compiled with a different ISA (instruction set architecture) than the default. You can also use the ‘#pragma GCC target’ pragma to set more than one function to be compiled with specific target options. See Function Specific Option Pragmas, for details about the ‘#pragma GCC target’ pragma.
> For instance on a 386, you could compile one function with target("sse4.1,arch=core2") and another with target("sse4a,arch=amdfam10"). This is equivalent to compiling the first function with -msse4.1 and -march=core2 options, and the second function with -msse4a and -march=amdfam10 options. It is up to the user to make sure that a function is only invoked on a machine that supports the particular ISA it is compiled for (for example by using cpuid on 386 to determine what feature bits and architecture family are used).
> 
>           int core2_func (void) __attribute__ ((__target__ ("arch=core2")));
>           int sse3_func (void) __attribute__ ((__target__ ("sse3")));
> You can either use multiple strings to specify multiple options, or separate the options with a comma (‘,’).
> 
> The target attribute is presently implemented for i386/x86_64, PowerPC, and Nios II targets only. The options supported are specific to each target.
> 
> On the 386, the following options are allowed:
> ...
>  ‘sse4.2’
> ‘no-sse4.2’"
> 
> Wouldn't that suit your purposes?
> Probably you can even keep your function inline with that approach.
> 
That would definately work, and be a great solution in this case.  However, its
limited to only the most recent version of gcc.  If thats an acceptible
constraint on the DPDK, then its ok, but distributions are only starting to
include that version now.  Not sure of the icc status of that attribute.

Neil

> Konstantin
> 
> 
> > 
> > /Bruce
  
Neil Horman Nov. 19, 2014, 3:07 p.m. UTC | #22
On Wed, Nov 19, 2014 at 05:35:51PM +0600, Yerden Zhumabekov wrote:
> 
> 19.11.2014 16:16, Bruce Richardson пишет:
> > On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:
> >> an alternate option would be to not use the intrinsic, and craft some explicit
> >> __asm__ statement that executes the right sse42 instructions.  That way the asm
> >> is directly emitted, without requiring the -msse42 flag at all, and it will just
> >> work in all the files that call it.
> >>
> > I really don't like that approach. I think using intrinsics is much more 
> > maintainable.
> >
> 
> static inline uint32_t
> crc32_sse42_u32(uint32_t data, uint32_t init_val)
> {
> /*··__asm__ volatile(
> ············"crc32l %[data], %[init_val];"
> ············: [init_val] "+r" (init_val)
> ············: [data] "rm" (data));
> ····return init_val;*/
> 
> But wait, will __builtin_ia32_crc32si and __builtin_ia32_crc32di
> functions do the trick? ICC has them?
If builtins work on both icc and gcc, yes, that would be a solution as it
creates non sse instructions when the target cpu doesn't support it.

> What about prototyping functions and extracting their bodies to separate
> module? Does it break anything?
> 
That would be a variant on the asm inline idea, but yes, I think that would work
too
Neil


> -- 
> Sincerely,
> 
> Yerden Zhumabekov
> State Technical Service
> Astana, KZ
> 
>
  
Ananyev, Konstantin Nov. 19, 2014, 4:51 p.m. UTC | #23
> -----Original Message-----

> From: Neil Horman [mailto:nhorman@tuxdriver.com]

> Sent: Wednesday, November 19, 2014 3:06 PM

> To: Ananyev, Konstantin

> Cc: Richardson, Bruce; dev@dpdk.org

> Subject: Re: [dpdk-dev] [PATCH v4 3/5] hash: add fallback to software CRC32 implementation

> 

> On Wed, Nov 19, 2014 at 11:50:40AM +0000, Ananyev, Konstantin wrote:

> >

> >

> > > -----Original Message-----

> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson

> > > Sent: Wednesday, November 19, 2014 11:38 AM

> > > To: Neil Horman

> > > Cc: dev@dpdk.org

> > > Subject: Re: [dpdk-dev] [PATCH v4 3/5] hash: add fallback to software CRC32 implementation

> > >

> > > On Wed, Nov 19, 2014 at 06:34:08AM -0500, Neil Horman wrote:

> > > > On Wed, Nov 19, 2014 at 10:16:14AM +0000, Bruce Richardson wrote:

> > > > > On Tue, Nov 18, 2014 at 04:36:24PM -0500, Neil Horman wrote:

> > > > > > On Tue, Nov 18, 2014 at 05:52:27PM +0000, Bruce Richardson wrote:

> > > > > > > On Tue, Nov 18, 2014 at 12:46:19PM -0500, Neil Horman wrote:

> > > > > > > > On Tue, Nov 18, 2014 at 11:13:17PM +0600, Yerden Zhumabekov wrote:

> > > > > > > > >

> > > > > > > > > 18.11.2014 22:00, Neil Horman пишет:

> > > > > > > > > > On Tue, Nov 18, 2014 at 09:06:35PM +0600, Yerden Zhumabekov wrote:

> > > > > > > > > >> 18.11.2014 20:41, Neil Horman пишет:

> > > > > > > > > >>> On Tue, Nov 18, 2014 at 08:03:40PM +0600, Yerden Zhumabekov wrote:

> > > > > > > > > >>>>  /**

> > > > > > > > > >>>>   * Use single crc32 instruction to perform a hash on a 4 byte value.

> > > > > > > > > >>>> + * Fall back to software crc32 implementation in case SSE4.2 is

> > > > > > > > > >>>> + * not supported

> > > > > > > > > >>>>   *

> > > > > > > > > >>>>   * @param data

> > > > > > > > > >>>>   *   Data to perform hash on.

> > > > > > > > > >>>> @@ -376,11 +413,18 @@ crc32c_2words(uint64_t data, uint32_t init_val)

> > > > > > > > > >>>>  static inline uint32_t

> > > > > > > > > >>>>  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)

> > > > > > > > > >>>>  {

> > > > > > > > > >>>> -	return _mm_crc32_u32(init_val, data);

> > > > > > > > > >>>> +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2

> > > > > > > > > >>>> +	if (likely(crc32_alg == CRC32_SSE42))

> > > > > > > > > >>>> +		return _mm_crc32_u32(init_val, data);

> > > > > > > > > >>>> +#endif

> > > > > > > > > >>> you don't really need these ifdefs here anymore given that you have a

> > > > > > > > > >>> constructor to do the algorithm selection.  In fact you need to remove them, in

> > > > > > > > > >>> the event you build on a system that doesn't support SSE42, but run on a system

> > > > > > > > > >>> that does.

> > > > > > > > > >> Originally, I thought so as well. I wrote the code without these ifdefs,

> > > > > > > > > >> but it didn't compile on my machine which doesn't support SSE4.2. Error

> > > > > > > > > >> was triggered by nmmintrin.h which has a check for respective GCC

> > > > > > > > > >> extension. So I think these ifdefs are indeed required.

> > > > > > > > > >>

> > > > > > > > > > You need to edit the makefile so that the compiler gets passed the option

> > > > > > > > > > -msse42.  That way it will know to emit sse42 instructions. It will also allow

> > > > > > > > > > you to remove the ifdef from the include file

> > > > > > > > >

> > > > > > > > > In this case, I guess there are two options:

> > > > > > > > > 1) modify all makefiles which use librte_hash

> > > > > > > > > 2) move all function bodies from rte_hash_crc.h to separate module,

> > > > > > > > > leaving prototype definitions there only.

> > > > > > > > >

> > > > > > > > > Everybody's up for the second option? :)

> > > > > > > > >

> > > > > > > > Crud, you're right, I didn't think about the header inclusion issue.  Is it

> > > > > > > > worth adding the jump to enable the dynamic hash selection?

> > > > > > > > Neil

> > > > > > >

> > > > > > > Maybe for cases where SSE4.2 is not currently available, i.e. for generic builds.

> > > > > > > For builds where we have hardware support confirmed at compile time, just use

> > > > > > > the function from the header file.

> > > > > > > Does that make sense?

> > > > > > >

> > > > > > I'm not certain of that, as I don't think anything can be 'confirmed' at compile

> > > > > > time.  I.e. just because you have sse42 at compile time doesn't guarantee you

> > > > > > have it at run time with a DSO.  If you have these as macros, you need to enable

> > > > > > sse42 whereever you include the file so that the intrinsic works properly.

> > > > >

> > > > > Well, if you compile with sse42 at compile time, the compiler is free to insert

> > > > > sse4 instructions at any place it feels like, irrespective of whether or not you

> > > > > use SSE4 intrinsics, so I would never expect such a DSO to work on a system

> > > > > without SSE42 support.

> > > > >

> > > > > >

> > > > > > an alternate option would be to not use the intrinsic, and craft some explicit

> > > > > > __asm__ statement that executes the right sse42 instructions.  That way the asm

> > > > > > is directly emitted, without requiring the -msse42 flag at all, and it will just

> > > > > > work in all the files that call it.

> > > > > >

> > > > >

> > > > > I really don't like that approach. I think using intrinsics is much more

> > > > > maintainable.

> > > > >

> > > > I grant you that using an intrinsic is easier to read, but if the code doesn't

> > > > compile when using the intrinsic unless you have sse42 turned on, I'm not sure

> > > > what choice we have.  and inline asm isn't that hard to maintain.  We're talking

> > > > about three lines of code:

> > > > asm(

> > > >  "mov    %[1],%eax

> > > >  mov    %[2],%edx

> > > >  crc32l %edx,%eax":

> > > >  [edx] "r" (crc) /*output*/

> > > >  :

> > > >  [1] "r" (crc), /* input */

> > > >  [2] "r" (val)

> > > >  :

> > > >  [eax] "r" /* clobber */

> > > > )

> > > >

> > > > I don't have the syntax quite right, but its pretty easy to read the intent.

> > > > Its not like we dont have precidence for this, the atomic interface and several

> > > > pmds do this frequently.

> > > >

> > > > Neil

> > >

> > > Fair point. If everyone else is happy enough with it, I'm ok too.

> >

> > As I remember with gcc & icc it is possible to specify tht you'd like to compile that particular function

> > for different target.

> > From https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html:

> > "target

> > The target attribute is used to specify that a function is to be compiled with different target options than specified on the command

> line. This can be used for instance to have functions compiled with a different ISA (instruction set architecture) than the default. You

> can also use the ‘#pragma GCC target’ pragma to set more than one function to be compiled with specific target options. See

> Function Specific Option Pragmas, for details about the ‘#pragma GCC target’ pragma.

> > For instance on a 386, you could compile one function with target("sse4.1,arch=core2") and another with

> target("sse4a,arch=amdfam10"). This is equivalent to compiling the first function with -msse4.1 and -march=core2 options, and the

> second function with -msse4a and -march=amdfam10 options. It is up to the user to make sure that a function is only invoked on a

> machine that supports the particular ISA it is compiled for (for example by using cpuid on 386 to determine what feature bits and

> architecture family are used).

> >

> >           int core2_func (void) __attribute__ ((__target__ ("arch=core2")));

> >           int sse3_func (void) __attribute__ ((__target__ ("sse3")));

> > You can either use multiple strings to specify multiple options, or separate the options with a comma (‘,’).

> >

> > The target attribute is presently implemented for i386/x86_64, PowerPC, and Nios II targets only. The options supported are specific

> to each target.

> >

> > On the 386, the following options are allowed:

> > ...

> >  ‘sse4.2’

> > ‘no-sse4.2’"

> >

> > Wouldn't that suit your purposes?

> > Probably you can even keep your function inline with that approach.

> >

> That would definately work, and be a great solution in this case.  However, its

> limited to only the most recent version of gcc.  If thats an acceptible

> constraint on the DPDK, then its ok, but distributions are only starting to

> include that version now.  Not sure of the icc status of that attribute.


Yes, as I can see that feature was introduced in gcc 4.4, and we have to support backward to gcc 4.3...
Though I suppose for gcc 4.3 , we can just always switch to the scalar version, can't we?
Something like that:

$ cat ./tatrg1.c
#include <stdint.h>

uint64_t
ffx1_gen(uint64_t x)
{
        /* should contain scalar CRC implementation. */
        return (x * x);
}

#pragma GCC target ("sse4.2")

#if defined __SSE4_2__

#include <smmintrin.h>

uint64_t
ffx1_sse42(uint64_t x)
{
        return _mm_crc32_u64(x, x);
}

#else

uint64_t
ffx1_sse42(uint64_t x)
{
        /* should contain scalar CRC implementation. */
        return (x * x);
}

#endif

Konstantin

> 

> Neil

> 

> > Konstantin

> >

> >

> > >

> > > /Bruce
  
Yerden Zhumabekov Nov. 20, 2014, 3:04 a.m. UTC | #24
19.11.2014 21:07, Neil Horman пишет:
> On Wed, Nov 19, 2014 at 05:35:51PM +0600, Yerden Zhumabekov wrote:
>> static inline uint32_t
>> crc32_sse42_u32(uint32_t data, uint32_t init_val)
>> {
>> /*··__asm__ volatile(
>> ············"crc32l %[data], %[init_val];"
>> ············: [init_val] "+r" (init_val)
>> ············: [data] "rm" (data));
>> ····return init_val;*/
>>
>> But wait, will __builtin_ia32_crc32si and __builtin_ia32_crc32di
>> functions do the trick? ICC has them?
> If builtins work on both icc and gcc, yes, that would be a solution as it
> creates non sse instructions when the target cpu doesn't support it.

Can anyone acknowledge?

>
>> What about prototyping functions and extracting their bodies to separate
>> module? Does it break anything?
>>
> That would be a variant on the asm inline idea, but yes, I think that would work
> too

No luck. Performance degrades up to 30-50 percent if extracting
functions to separate module.
  

Patch

diff --git a/lib/librte_hash/rte_hash_crc.h b/lib/librte_hash/rte_hash_crc.h
index 15f687a..332ed99 100644
--- a/lib/librte_hash/rte_hash_crc.h
+++ b/lib/librte_hash/rte_hash_crc.h
@@ -45,7 +45,11 @@  extern "C" {
 #endif
 
 #include <stdint.h>
+#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
 #include <nmmintrin.h>
+#endif
+#include <rte_cpuflags.h>
+#include <rte_branch_prediction.h>
 
 /* Lookup tables for software implementation of CRC32C */
 static uint32_t crc32c_tables[8][256] = {{
@@ -363,8 +367,41 @@  crc32c_2words(uint64_t data, uint32_t init_val)
 	return crc;
 }
 
+enum crc32_alg_t {
+	CRC32_SW = 0,
+	CRC32_SSE42
+};
+
+static enum crc32_alg_t crc32_alg;
+
+/**
+ * Allow or disallow use of SSE4.2 instrinsics for CRC32 hash
+ * calculation.
+ *
+ * @param flag
+ *   unsigned integer flag
+ *   - (CRC32_SW) Don't use SSE4.2 intrinsics
+ *   - (CRC32_SSE42) Use SSE4.2 intrinsics if available, set by default
+ */
+static inline void
+rte_hash_crc_set_alg(enum crc32_alg_t alg)
+{
+	int sse42_supp = rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_2);
+	enum crc32_alg_t alg_supp = sse42_supp ? CRC32_SSE42 : CRC32_SW;
+	crc32_alg = (alg == CRC32_SSE42) ? alg_supp : CRC32_SW;
+}
+
+/* Best available algorithm is detected via CPUID instruction */
+static inline void __attribute__((constructor))
+rte_hash_crc_try_sse42(void)
+{
+	rte_hash_crc_set_alg(CRC32_SSE42);
+}
+
 /**
  * Use single crc32 instruction to perform a hash on a 4 byte value.
+ * Fall back to software crc32 implementation in case SSE4.2 is
+ * not supported
  *
  * @param data
  *   Data to perform hash on.
@@ -376,11 +413,18 @@  crc32c_2words(uint64_t data, uint32_t init_val)
 static inline uint32_t
 rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
 {
-	return _mm_crc32_u32(init_val, data);
+#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
+	if (likely(crc32_alg == CRC32_SSE42))
+		return _mm_crc32_u32(init_val, data);
+#endif
+
+	return crc32c_1word(data, init_val);
 }
 
 /**
  * Use single crc32 instruction to perform a hash on a 8 byte value.
+ * Fall back to software crc32 implementation in case SSE4.2 is
+ * not supported
  *
  * @param data
  *   Data to perform hash on.
@@ -392,7 +436,12 @@  rte_hash_crc_4byte(uint32_t data, uint32_t init_val)
 static inline uint32_t
 rte_hash_crc_8byte(uint64_t data, uint32_t init_val)
 {
-	return _mm_crc32_u64(init_val, data);
+#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
+	if (likely(crc32_alg == CRC32_SSE42))
+		return _mm_crc32_u64(init_val, data);
+#endif
+
+	return crc32c_2words(data, init_val);
 }
 
 /**