[dpdk-dev] hash: update jhash function with the latest available

Message ID 1429190819-27402-1-git-send-email-pablo.de.lara.guarch@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

De Lara Guarch, Pablo April 16, 2015, 1:26 p.m. UTC
  Jenkins hash function was developed originally in 1996,
and was integrated in first versions of DPDK.
The function has been improved in 2006,
achieving up to 60% better performance, compared to the original one.

Check out: http://burtleburtle.net/bob/c/lookup3.c

This patch integrates that code in the rte_jhash library,
adding also a new function rte_jhash_word2,
that returns two different hash values, for a single key.

Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
---
 lib/librte_hash/rte_jhash.h |  407 ++++++++++++++++++++++++++++++++++++-------
 1 files changed, 347 insertions(+), 60 deletions(-)
  

Comments

Bruce Richardson April 16, 2015, 2:01 p.m. UTC | #1
On Thu, Apr 16, 2015 at 02:26:59PM +0100, Pablo de Lara wrote:
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 60% better performance, compared to the original one.
> 
> Check out: http://burtleburtle.net/bob/c/lookup3.c
> 
> This patch integrates that code in the rte_jhash library,
> adding also a new function rte_jhash_word2,
> that returns two different hash values, for a single key.
> 

Should the addition of the new functionality not be a separate patch from the
update to the existing code?
Also, do the new functions return the exact same values as the previous versions,
just faster?

> Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> ---
>  lib/librte_hash/rte_jhash.h |  407 ++++++++++++++++++++++++++++++++++++-------
>  1 files changed, 347 insertions(+), 60 deletions(-)
> 
> diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> index a4bf5a1..3de006d 100644
> --- a/lib/librte_hash/rte_jhash.h
> +++ b/lib/librte_hash/rte_jhash.h
> @@ -1,7 +1,7 @@
>  /*-
>   *   BSD LICENSE
>   *
> - *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
>   *   All rights reserved.
>   *
>   *   Redistribution and use in source and binary forms, with or without
> @@ -45,38 +45,51 @@ extern "C" {
>  #endif
>  
>  #include <stdint.h>
> +#include <rte_byteorder.h>
>  
>  /* jhash.h: Jenkins hash support.
>   *
> - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
>   *
>   * http://burtleburtle.net/bob/hash/
>   *
>   * These are the credits from Bob's sources:
>   *
> - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> - * hash(), hash2(), hash3, and mix() are externally useful functions.
> - * Routines to test the hash are included if SELF_TEST is defined.
> - * You can use this free for any purpose.  It has no warranty.
> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> + *
> + * These are functions for producing 32-bit hashes for hash table lookup.
> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> + * are externally useful functions.  Routines to test the hash are included
> + * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
> + * the public domain.  It has no warranty.
>   *
>   * $FreeBSD$
>   */
>  
> +#define rot(x, k) (((x)<<(k)) | ((x)>>(32-(k))))
> +
>  /** @internal Internal function. NOTE: Arguments are modified. */
>  #define __rte_jhash_mix(a, b, c) do { \
> -	a -= b; a -= c; a ^= (c>>13); \
> -	b -= c; b -= a; b ^= (a<<8); \
> -	c -= a; c -= b; c ^= (b>>13); \
> -	a -= b; a -= c; a ^= (c>>12); \
> -	b -= c; b -= a; b ^= (a<<16); \
> -	c -= a; c -= b; c ^= (b>>5); \
> -	a -= b; a -= c; a ^= (c>>3); \
> -	b -= c; b -= a; b ^= (a<<10); \
> -	c -= a; c -= b; c ^= (b>>15); \
> +	a -= c; a ^= rot(c, 4); c += b; \
> +	b -= a; b ^= rot(a, 6); a += c; \
> +	c -= b; c ^= rot(b, 8); b += a; \
> +	a -= c; a ^= rot(c, 16); c += b; \
> +	b -= a; b ^= rot(a, 19); a += c; \
> +	c -= b; c ^= rot(b, 4); b += a; \
> +} while (0)
> +
> +#define __rte_jhash_final(a, b, c) do { \
> +	c ^= b; c -= rot(b, 14); \
> +	a ^= c; a -= rot(c, 11); \
> +	b ^= a; b -= rot(a, 25); \
> +	c ^= b; c -= rot(b, 16); \
> +	a ^= c; a -= rot(c, 4);  \
> +	b ^= a; b -= rot(a, 14); \
> +	c ^= b; c -= rot(b, 24); \
>  } while (0)
>  
>  /** The golden ratio: an arbitrary value. */
> -#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
> +#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
>  
>  /**
>   * The most generic version, hashes an arbitrary sequence
> @@ -95,42 +108,256 @@ extern "C" {
>  static inline uint32_t
>  rte_jhash(const void *key, uint32_t length, uint32_t initval)
>  {
> -	uint32_t a, b, c, len;
> -	const uint8_t *k = (const uint8_t *)key;
> -	const uint32_t *k32 = (const uint32_t *)key;
> +	uint32_t a, b, c;
> +	union {
> +		const void *ptr;
> +		size_t i;
> +	} u;
>  
> -	len = length;
> -	a = b = RTE_JHASH_GOLDEN_RATIO;
> -	c = initval;
> +	/* Set up the internal state */
> +	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
>  
> -	while (len >= 12) {
> -		a += k32[0];
> -		b += k32[1];
> -		c += k32[2];
> +	u.ptr = key;
>  
> -		__rte_jhash_mix(a,b,c);
> +	if ((u.i & 0x3) == 0) {
> +		const uint32_t *k = (const uint32_t *)key;
>  
> -		k += (3 * sizeof(uint32_t)), k32 += 3;
> -		len -= (3 * sizeof(uint32_t));
> -	}
> +		while (length > 12) {
> +			a += k[0];
> +			b += k[1];
> +			c += k[2];
>  
> -	c += length;
> -	switch (len) {
> -		case 11: c += ((uint32_t)k[10] << 24);
> -		case 10: c += ((uint32_t)k[9] << 16);
> -		case 9 : c += ((uint32_t)k[8] << 8);
> -		case 8 : b += ((uint32_t)k[7] << 24);
> -		case 7 : b += ((uint32_t)k[6] << 16);
> -		case 6 : b += ((uint32_t)k[5] << 8);
> -		case 5 : b += k[4];
> -		case 4 : a += ((uint32_t)k[3] << 24);
> -		case 3 : a += ((uint32_t)k[2] << 16);
> -		case 2 : a += ((uint32_t)k[1] << 8);
> -		case 1 : a += k[0];
> -		default: break;
> -	};
> +			__rte_jhash_mix(a, b, c);
> +
> +			k += 3;
> +			length -= 12;
> +		}
> +
> +		switch (length) {
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +		case 12:
> +			c += k[2]; b += k[1]; a += k[0]; break;
> +		case 11:
> +			c += k[2]&0xffffff; b += k[1]; a += k[0]; break;
> +		case 10:
> +			c += k[2]&0xffff; b += k[1]; a += k[0]; break;
> +		case 9:
> +			c += k[2]&0xff; b += k[1]; a += k[0]; break;
> +		case 8:
> +			b += k[1]; a += k[0]; break;
> +		case 7:
> +			b += k[1]&0xffffff; a += k[0]; break;
> +		case 6:
> +			b += k[1]&0xffff; a += k[0]; break;
> +		case 5:
> +			b += k[1]&0xff; a += k[0]; break;
> +		case 4:
> +			a += k[0]; break;
> +		case 3:
> +			a += k[0]&0xffffff; break;
> +		case 2:
> +			a += k[0]&0xffff; break;
> +		case 1:
> +			a += k[0]&0xff; break;
> +#else
> +		case 12:
> +			c += k[2]; b += k[1]; a += k[0]; break;
> +		case 11:
> +			c += k[2]&0xffffff00; b += k[1]; a += k[0]; break;
> +		case 10:
> +			c += k[2]&0xffff0000; b += k[1]; a += k[0]; break;
> +		case 9:
> +			c += k[2]&0xff000000; b += k[1]; a += k[0]; break;
> +		case 8:
> +			b += k[1]; a += k[0]; break;
> +		case 7:
> +			b += k[1]&0xffffff00; a += k[0]; break;
> +		case 6:
> +			b += k[1]&0xffff0000; a += k[0]; break;
> +		case 5:
> +			b += k[1]&0xff000000; a += k[0]; break;
> +		case 4:
> +			a += k[0]; break;
> +		case 3:
> +			a += k[0]&0xffffff00; break;
> +		case 2:
> +			a += k[0]&0xffff0000; break;
> +		case 1:
> +			a += k[0]&0xff000000; break;
> +#endif

Only the constants seem different in this block. Can we get rid of the
#ifdefs using rte_XX_to_cpu() calls instead?

> +		/* zero length strings require no mixing */
> +		case 0:
> +			return c;
> +		};
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +	} else if ((u.i & 0x1) == 0) {
> +		/* read 16-bit chunks */
> +		const uint16_t *k = (const uint16_t *)key;
> +		const uint8_t  *k8;
> +
> +		/* all but last block: aligned reads and different mixing */
> +		while (length > 12) {
> +			a += k[0] + (((uint32_t)k[1])<<16);
> +			b += k[2] + (((uint32_t)k[3])<<16);
> +			c += k[4] + (((uint32_t)k[5])<<16);
> +
> +			__rte_jhash_mix(a, b, c);
> +
> +			k += 6;
> +			length -= 12;
> +		}
> +
> +		/* handle the last (probably partial) block */
> +		k8 = (const uint8_t *)k;
> +		switch (length) {
> +		case 12:
> +			c += k[4]+(((uint32_t)k[5])<<16);
> +			b += k[2]+(((uint32_t)k[3])<<16);
> +			a += k[0]+(((uint32_t)k[1])<<16);
> +			break;
> +		case 11:
> +			/* fall through */
> +			c += ((uint32_t)k8[10])<<16;
> +		case 10:
> +			c += k[4];
> +			b += k[2]+(((uint32_t)k[3])<<16);
> +			a += k[0]+(((uint32_t)k[1])<<16);
> +			break;
> +		case 9:
> +			/* fall through */
> +			c += k8[8];
> +		case 8:
> +			b += k[2]+(((uint32_t)k[3])<<16);
> +			a += k[0]+(((uint32_t)k[1])<<16);
> +			break;
> +		case 7:
> +			/* fall through */
> +			b += ((uint32_t)k8[6])<<16;
> +		case 6:
> +			b += k[2];
> +			a += k[0]+(((uint32_t)k[1])<<16);
> +			break;
> +		case 5:
> +			/* fall through */
> +			b += k8[4];
> +		case 4:
> +			a += k[0]+(((uint32_t)k[1])<<16);
> +			break;
> +		case 3:
> +			/* fall through */
> +			a += ((uint32_t)k8[2])<<16;
> +		case 2:
> +			a += k[0];
> +			break;
> +		case 1:
> +			a += k8[0];
> +			break;
> +		case 0:
> +			/* zero length requires no mixing */
> +			return c;
> +		}
> +#endif

No else block for this ifdef?

> +	} else {
> +		const uint8_t *k = (const uint8_t *)key;
> +
> +		/* all but the last block: affect some 32 bits of (a, b, c) */
> +		while (length > 12) {
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +			a += k[0];
> +			a += ((uint32_t)k[1])<<8;
> +			a += ((uint32_t)k[2])<<16;
> +			a += ((uint32_t)k[3])<<24;
> +			b += k[4];
> +			b += ((uint32_t)k[5])<<8;
> +			b += ((uint32_t)k[6])<<16;
> +			b += ((uint32_t)k[7])<<24;
> +			c += k[8];
> +			c += ((uint32_t)k[9])<<8;
> +			c += ((uint32_t)k[10])<<16;
> +			c += ((uint32_t)k[11])<<24;
> +#else
> +			a += ((uint32_t)k[0])<<24;
> +			a += ((uint32_t)k[1])<<16;
> +			a += ((uint32_t)k[2])<<8;
> +			a += ((uint32_t)k[3]);
> +			b += ((uint32_t)k[4])<<24;
> +			b += ((uint32_t)k[5])<<16;
> +			b += ((uint32_t)k[6])<<8;
> +			b += ((uint32_t)k[7]);
> +			c += ((uint32_t)k[8])<<32;
> +			c += ((uint32_t)k[9])<<16;
> +			c += ((uint32_t)k[10])<<8;
> +			c += ((uint32_t)k[11]);
> +#endif

Maybe find a better way to shorten/remove this #ifdef also. E.g. shorter
ifdef defining macros for the different shift amounts, 0, 8, 16, 24.

> +
> +			__rte_jhash_mix(a, b, c);
>  
> -	__rte_jhash_mix(a,b,c);
> +			k += 12;
> +			length -= 12;
> +		}
> +
> +		/* last block: affect all 32 bits of (c) */
> +		/* all the case statements fall through */
> +		switch (length) {
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +		case 12:
> +			c += ((uint32_t)k[11])<<24;
> +		case 11:
> +			c += ((uint32_t)k[10])<<16;
> +		case 10:
> +			c += ((uint32_t)k[9])<<8;
> +		case 9:
> +			c += k[8];
> +		case 8:
> +			b += ((uint32_t)k[7])<<24;
> +		case 7:
> +			b += ((uint32_t)k[6])<<16;
> +		case 6:
> +			b += ((uint32_t)k[5])<<8;
> +		case 5:
> +			b += k[4];
> +		case 4:
> +			a += ((uint32_t)k[3])<<24;
> +		case 3:
> +			a += ((uint32_t)k[2])<<16;
> +		case 2:
> +			a += ((uint32_t)k[1])<<8;
> +		case 1:
> +			a += k[0];
> +		break;
> +#else
> +		case 12:
> +			c += k[11];
> +		case 11:
> +			c += ((uint32_t)k[10])<<8;
> +		case 10:
> +			c += ((uint32_t)k[9])<<16;
> +		case 9:
> +			c += ((uint32_t)k[8])<<24;
> +		case 8:
> +			b += k[7];
> +		case 7:
> +			b += ((uint32_t)k[6])<<8;
> +		case 6:
> +			b += ((uint32_t)k[5])<<16;
> +		case 5:
> +			b += ((uint32_t)k[4])<<24;
> +		case 4:
> +			a += k[3];
> +		case 3:
> +			a += ((uint32_t)k[2])<<8;
> +		case 2:
> +			a += ((uint32_t)k[1])<<16;
> +		case 1:
> +			a += ((uint32_t)k[0])<<24;
> +		break;
> +#endif
> +		case 0:
> +			return c;
> +		}
> +	}
> +
> +	__rte_jhash_final(a, b, c);
>  
>  	return c;
>  }
> @@ -151,33 +378,93 @@ rte_jhash(const void *key, uint32_t length, uint32_t initval)
>  static inline uint32_t
>  rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
>  {
> -	uint32_t a, b, c, len;
> +	uint32_t a, b, c;
>  
> -	a = b = RTE_JHASH_GOLDEN_RATIO;
> -	c = initval;
> -	len = length;
> +	/* Set up the internal state */
> +	a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length)<<2) + initval;
>  
> -	while (len >= 3) {
> +	/* Handle most of the key */
> +	while (length > 3) {
>  		a += k[0];
>  		b += k[1];
>  		c += k[2];
> +
>  		__rte_jhash_mix(a, b, c);
> -		k += 3; len -= 3;
> -	}
>  
> -	c += length * 4;
> +		k += 3;
> +		length -= 3;
> +	}
>  
> -	switch (len) {
> -		case 2 : b += k[1];
> -		case 1 : a += k[0];
> -		default: break;
> +	/* Handle the last 3 uint32_t's */
> +	switch (length) {
> +	case 3:
> +		c += k[2];
> +	case 2:
> +		b += k[1];
> +	case 1:
> +		a += k[0];
> +		__rte_jhash_final(a, b, c);
> +	/* case 0: nothing left to add */
> +	case 0:
> +		break;
>  	};
>  
> -	__rte_jhash_mix(a,b,c);
> -
>  	return c;
>  }
>  
> +/**
> + * Same as rte_jhash2, but take two seeds and return two uint32_ts.
> + * pc and pb must be non-null, and *pc and *pb must both be initialized
> + * with seeds. If you pass in (*pb)=0, the output (*pc) will be
> + * the same as the return value from rte_jhash.
> + *
> + * @param k
> + *   Key to calculate hash of.
> + * @param length
> + *   Length of key in units of 4 bytes.
> + * @param pc
> + *   IN: seed OUT: primary hash value.
> + * @param pc
> + *   IN: second seed OUT: secondary hash value.
> + */
> +static inline void
> +rte_jhash_word2(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb)
> +{
> +	uint32_t a, b, c;
> +
> +	/* Set up the internal state */
> +	a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length)<<2) + *pc;
> +	c += *pb;
> +
> +	/* Handle most of the key */
> +	while (length > 3) {
> +		a += k[0];
> +		b += k[1];
> +		c += k[2];
> +
> +		__rte_jhash_mix(a, b, c);
> +
> +		k += 3;
> +		length -= 3;
> +	}
> +
> +	/* Handle the last 3 uint32_t's */
> +	switch (length) {
> +	case 3:
> +		c += k[2];
> +	case 2:
> +		b += k[1];
> +	case 1:
> +		a += k[0];
> +		__rte_jhash_final(a, b, c);
> +	/* case 0: nothing left to add */
> +	case 0:
> +		break;
> +	};
> +
> +	*pc = c;
> +	*pb = b;
> +}
>  
>  /**
>   * A special ultra-optimized versions that knows it is hashing exactly
> -- 
> 1.7.4.1
>
  
De Lara Guarch, Pablo April 17, 2015, 4:03 p.m. UTC | #2
> -----Original Message-----
> From: Richardson, Bruce
> Sent: Thursday, April 16, 2015 3:01 PM
> To: De Lara Guarch, Pablo
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] hash: update jhash function with the latest
> available
> 
> On Thu, Apr 16, 2015 at 02:26:59PM +0100, Pablo de Lara wrote:
> > Jenkins hash function was developed originally in 1996,
> > and was integrated in first versions of DPDK.
> > The function has been improved in 2006,
> > achieving up to 60% better performance, compared to the original one.
> >
> > Check out: http://burtleburtle.net/bob/c/lookup3.c
> >
> > This patch integrates that code in the rte_jhash library,
> > adding also a new function rte_jhash_word2,
> > that returns two different hash values, for a single key.
> >
> 
> Should the addition of the new functionality not be a separate patch from
> the
> update to the existing code?

True, actually, I miss one extra function (2 in total).

> Also, do the new functions return the exact same values as the previous
> versions,
> just faster?

The new functions return different values from the previous version
AND faster (some cases, MUCH faster)

[...]

> > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > +		case 12:
> > +			c += k[2]; b += k[1]; a += k[0]; break;
> > +		case 11:
> > +			c += k[2]&0xffffff; b += k[1]; a += k[0]; break;
> > +		case 10:
> > +			c += k[2]&0xffff; b += k[1]; a += k[0]; break;
> > +		case 9:
> > +			c += k[2]&0xff; b += k[1]; a += k[0]; break;
> > +		case 8:
> > +			b += k[1]; a += k[0]; break;
> > +		case 7:
> > +			b += k[1]&0xffffff; a += k[0]; break;
> > +		case 6:
> > +			b += k[1]&0xffff; a += k[0]; break;
> > +		case 5:
> > +			b += k[1]&0xff; a += k[0]; break;
> > +		case 4:
> > +			a += k[0]; break;
> > +		case 3:
> > +			a += k[0]&0xffffff; break;
> > +		case 2:
> > +			a += k[0]&0xffff; break;
> > +		case 1:
> > +			a += k[0]&0xff; break;
> > +#else
> > +		case 12:
> > +			c += k[2]; b += k[1]; a += k[0]; break;
> > +		case 11:
> > +			c += k[2]&0xffffff00; b += k[1]; a += k[0]; break;
> > +		case 10:
> > +			c += k[2]&0xffff0000; b += k[1]; a += k[0]; break;
> > +		case 9:
> > +			c += k[2]&0xff000000; b += k[1]; a += k[0]; break;
> > +		case 8:
> > +			b += k[1]; a += k[0]; break;
> > +		case 7:
> > +			b += k[1]&0xffffff00; a += k[0]; break;
> > +		case 6:
> > +			b += k[1]&0xffff0000; a += k[0]; break;
> > +		case 5:
> > +			b += k[1]&0xff000000; a += k[0]; break;
> > +		case 4:
> > +			a += k[0]; break;
> > +		case 3:
> > +			a += k[0]&0xffffff00; break;
> > +		case 2:
> > +			a += k[0]&0xffff0000; break;
> > +		case 1:
> > +			a += k[0]&0xff000000; break;
> > +#endif
> 
> Only the constants seem different in this block. Can we get rid of the
> #ifdefs using rte_XX_to_cpu() calls instead?

Will add that in next version.

> 
> > +		/* zero length strings require no mixing */
> > +		case 0:
> > +			return c;
> > +		};
> > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > +	} else if ((u.i & 0x1) == 0) {
> > +		/* read 16-bit chunks */
> > +		const uint16_t *k = (const uint16_t *)key;
> > +		const uint8_t  *k8;
> > +
> > +		/* all but last block: aligned reads and different mixing */
> > +		while (length > 12) {
> > +			a += k[0] + (((uint32_t)k[1])<<16);
> > +			b += k[2] + (((uint32_t)k[3])<<16);
> > +			c += k[4] + (((uint32_t)k[5])<<16);
> > +
> > +			__rte_jhash_mix(a, b, c);
> > +
> > +			k += 6;
> > +			length -= 12;
> > +		}
> > +
> > +		/* handle the last (probably partial) block */
> > +		k8 = (const uint8_t *)k;
> > +		switch (length) {
> > +		case 12:
> > +			c += k[4]+(((uint32_t)k[5])<<16);
> > +			b += k[2]+(((uint32_t)k[3])<<16);
> > +			a += k[0]+(((uint32_t)k[1])<<16);
> > +			break;
> > +		case 11:
> > +			/* fall through */
> > +			c += ((uint32_t)k8[10])<<16;
> > +		case 10:
> > +			c += k[4];
> > +			b += k[2]+(((uint32_t)k[3])<<16);
> > +			a += k[0]+(((uint32_t)k[1])<<16);
> > +			break;
> > +		case 9:
> > +			/* fall through */
> > +			c += k8[8];
> > +		case 8:
> > +			b += k[2]+(((uint32_t)k[3])<<16);
> > +			a += k[0]+(((uint32_t)k[1])<<16);
> > +			break;
> > +		case 7:
> > +			/* fall through */
> > +			b += ((uint32_t)k8[6])<<16;
> > +		case 6:
> > +			b += k[2];
> > +			a += k[0]+(((uint32_t)k[1])<<16);
> > +			break;
> > +		case 5:
> > +			/* fall through */
> > +			b += k8[4];
> > +		case 4:
> > +			a += k[0]+(((uint32_t)k[1])<<16);
> > +			break;
> > +		case 3:
> > +			/* fall through */
> > +			a += ((uint32_t)k8[2])<<16;
> > +		case 2:
> > +			a += k[0];
> > +			break;
> > +		case 1:
> > +			a += k8[0];
> > +			break;
> > +		case 0:
> > +			/* zero length requires no mixing */
> > +			return c;
> > +		}
> > +#endif
> 
> No else block for this ifdef?

According to the code, for big endian, it only covers 4-byte alignment and rest of cases.

> 
> > +	} else {
> > +		const uint8_t *k = (const uint8_t *)key;
> > +
> > +		/* all but the last block: affect some 32 bits of (a, b, c) */
> > +		while (length > 12) {
> > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > +			a += k[0];
> > +			a += ((uint32_t)k[1])<<8;
> > +			a += ((uint32_t)k[2])<<16;
> > +			a += ((uint32_t)k[3])<<24;
> > +			b += k[4];
> > +			b += ((uint32_t)k[5])<<8;
> > +			b += ((uint32_t)k[6])<<16;
> > +			b += ((uint32_t)k[7])<<24;
> > +			c += k[8];
> > +			c += ((uint32_t)k[9])<<8;
> > +			c += ((uint32_t)k[10])<<16;
> > +			c += ((uint32_t)k[11])<<24;
> > +#else
> > +			a += ((uint32_t)k[0])<<24;
> > +			a += ((uint32_t)k[1])<<16;
> > +			a += ((uint32_t)k[2])<<8;
> > +			a += ((uint32_t)k[3]);
> > +			b += ((uint32_t)k[4])<<24;
> > +			b += ((uint32_t)k[5])<<16;
> > +			b += ((uint32_t)k[6])<<8;
> > +			b += ((uint32_t)k[7]);
> > +			c += ((uint32_t)k[8])<<32;
> > +			c += ((uint32_t)k[9])<<16;
> > +			c += ((uint32_t)k[10])<<8;
> > +			c += ((uint32_t)k[11]);
> > +#endif
> 
> Maybe find a better way to shorten/remove this #ifdef also. E.g. shorter
> ifdef defining macros for the different shift amounts, 0, 8, 16, 24.

Agree. Will change that in v2.

[...]

Thanks for the comments. I will send a v2 soon.
  
De Lara Guarch, Pablo April 24, 2015, 11:23 a.m. UTC | #3
Jenkins hash function was developed originally in 1996,
and was integrated in first versions of DPDK.
The function has been improved in 2006,
achieving up to 60% better performance, compared to the original one.

This patchset updates the current jhash in DPDK,
including two new functions that generate two hashes from a single key.

It also separates the existing hash function performance tests to
another file, to make it quicker to run.

changes in v2:

- Split single commit in three commits, one that updates the existing functions
  and another that adds two new functions and use one of those functions 
  as a base to be called by the other ones.
- Remove some unnecessary ifdefs in the code.
- Add new macros to help on the reutilization of constants
- Separate hash function performance tests to another file
  and improve cycle measurements.
- Rename existing function rte_jhash2 to rte_jhash_32b
  (something more meaninful) and mark rte_jhash2 as
  deprecated

Pablo de Lara (6):
  test/hash: move hash function perf tests to separate file
  test/hash: improve accuracy on cycle measurements
  hash: update jhash function with the latest available
  hash: add two new functions to jhash library
  hash: remove duplicated code
  hash: rename rte_jhash2 to rte_jhash_32b

 app/test/Makefile               |    1 +
 app/test/test_func_reentrancy.c |    2 +-
 app/test/test_hash.c            |    4 +-
 app/test/test_hash_func_perf.c  |  145 ++++++++++++++++++
 app/test/test_hash_perf.c       |   71 +---------
 lib/librte_hash/rte_jhash.h     |  313 +++++++++++++++++++++++++++++----------
 6 files changed, 387 insertions(+), 149 deletions(-)
 create mode 100644 app/test/test_hash_func_perf.c
  
De Lara Guarch, Pablo May 5, 2015, 2:43 p.m. UTC | #4
Jenkins hash function was developed originally in 1996,
and was integrated in first versions of DPDK.
The function has been improved in 2006,
achieving up to 60% better performance, compared to the original one.

This patchset updates the current jhash in DPDK,
including two new functions that generate two hashes from a single key.

It also separates the existing hash function performance tests to
another file, to make it quicker to run.

changes in v3:

- Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
  functions

changes in v2:

- Split single commit in three commits, one that updates the existing functions
  and another that adds two new functions and use one of those functions
  as a base to be called by the other ones.
- Remove some unnecessary ifdefs in the code.
- Add new macros to help on the reutilization of constants
- Separate hash function performance tests to another file
  and improve cycle measurements.
- Rename existing function rte_jhash2 to rte_jhash_32b
  (something more meaninful) and mark rte_jhash2 as
  deprecated

De Lara Guarch, Pablo (1):
  hash: rename rte_jhash2 to rte_jhash_32b

Pablo de Lara (5):
  test/hash: move hash function perf tests to separate file
  test/hash: improve accuracy on cycle measurements
  hash: update jhash function with the latest available
  hash: add two new functions to jhash library
  hash: remove duplicated code

 app/test/Makefile               |    1 +
 app/test/test_func_reentrancy.c |    2 +-
 app/test/test_hash.c            |    4 +-
 app/test/test_hash_func_perf.c  |  145 +++++++++++++++++
 app/test/test_hash_perf.c       |   71 +--------
 lib/librte_hash/rte_jhash.h     |  336 +++++++++++++++++++++++++++++----------
 6 files changed, 400 insertions(+), 159 deletions(-)
 create mode 100644 app/test/test_hash_func_perf.c
  
De Lara Guarch, Pablo May 12, 2015, 11:02 a.m. UTC | #5
Jenkins hash function was developed originally in 1996,
and was integrated in first versions of DPDK.
The function has been improved in 2006,
achieving up to 60% better performance, compared to the original one.

This patchset updates the current jhash in DPDK,
including two new functions that generate two hashes from a single key.

It also separates the existing hash function performance tests to
another file, to make it quicker to run.

changes in v4:
- Simplify key alignment checks
- Include missing x86 arch check

changes in v3:

- Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
  functions

changes in v2:

- Split single commit in three commits, one that updates the existing functions
  and another that adds two new functions and use one of those functions
  as a base to be called by the other ones.
- Remove some unnecessary ifdefs in the code.
- Add new macros to help on the reutilization of constants
- Separate hash function performance tests to another file
  and improve cycle measurements.
- Rename existing function rte_jhash2 to rte_jhash_32b
  (something more meaninful) and mark rte_jhash2 as
  deprecated

Pablo de Lara (6):
  test/hash: move hash function perf tests to separate file
  test/hash: improve accuracy on cycle measurements
  hash: update jhash function with the latest available
  hash: add two new functions to jhash library
  hash: remove duplicated code
  hash: rename rte_jhash2 to rte_jhash_32b

 app/test/Makefile               |    1 +
 app/test/test_func_reentrancy.c |    2 +-
 app/test/test_hash.c            |    4 +-
 app/test/test_hash_func_perf.c  |  145 +++++++++++++++++
 app/test/test_hash_perf.c       |   71 +--------
 lib/librte_hash/rte_jhash.h     |  338 +++++++++++++++++++++++++++++----------
 6 files changed, 402 insertions(+), 159 deletions(-)
 create mode 100644 app/test/test_hash_func_perf.c
  
Neil Horman May 12, 2015, 3:33 p.m. UTC | #6
On Tue, May 12, 2015 at 12:02:32PM +0100, Pablo de Lara wrote:
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 60% better performance, compared to the original one.
> 
> This patchset updates the current jhash in DPDK,
> including two new functions that generate two hashes from a single key.
> 
> It also separates the existing hash function performance tests to
> another file, to make it quicker to run.
> 
> changes in v4:
> - Simplify key alignment checks
> - Include missing x86 arch check
> 
> changes in v3:
> 
> - Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
>   functions
> 
> changes in v2:
> 
> - Split single commit in three commits, one that updates the existing functions
>   and another that adds two new functions and use one of those functions
>   as a base to be called by the other ones.
> - Remove some unnecessary ifdefs in the code.
> - Add new macros to help on the reutilization of constants
> - Separate hash function performance tests to another file
>   and improve cycle measurements.
> - Rename existing function rte_jhash2 to rte_jhash_32b
>   (something more meaninful) and mark rte_jhash2 as
>   deprecated
> 
> Pablo de Lara (6):
>   test/hash: move hash function perf tests to separate file
>   test/hash: improve accuracy on cycle measurements
>   hash: update jhash function with the latest available
>   hash: add two new functions to jhash library
>   hash: remove duplicated code
>   hash: rename rte_jhash2 to rte_jhash_32b
> 
>  app/test/Makefile               |    1 +
>  app/test/test_func_reentrancy.c |    2 +-
>  app/test/test_hash.c            |    4 +-
>  app/test/test_hash_func_perf.c  |  145 +++++++++++++++++
>  app/test/test_hash_perf.c       |   71 +--------
>  lib/librte_hash/rte_jhash.h     |  338 +++++++++++++++++++++++++++++----------
>  6 files changed, 402 insertions(+), 159 deletions(-)
>  create mode 100644 app/test/test_hash_func_perf.c
> 
> -- 
> 1.7.4.1
> 
> 
did you run this through the ABI checker?  I see you're removing several symbols
that will likely need to go through the ABI deprecation process.

Neil
  
De Lara Guarch, Pablo May 13, 2015, 1:52 p.m. UTC | #7
Hi Neil,

> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Tuesday, May 12, 2015 4:33 PM
> To: De Lara Guarch, Pablo
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v4 0/6] update jhash function
> 
> On Tue, May 12, 2015 at 12:02:32PM +0100, Pablo de Lara wrote:
> > Jenkins hash function was developed originally in 1996,
> > and was integrated in first versions of DPDK.
> > The function has been improved in 2006,
> > achieving up to 60% better performance, compared to the original one.
> >
> > This patchset updates the current jhash in DPDK,
> > including two new functions that generate two hashes from a single key.
> >
> > It also separates the existing hash function performance tests to
> > another file, to make it quicker to run.
> >
> > changes in v4:
> > - Simplify key alignment checks
> > - Include missing x86 arch check
> >
> > changes in v3:
> >
> > - Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
> >   functions
> >
> > changes in v2:
> >
> > - Split single commit in three commits, one that updates the existing
> functions
> >   and another that adds two new functions and use one of those functions
> >   as a base to be called by the other ones.
> > - Remove some unnecessary ifdefs in the code.
> > - Add new macros to help on the reutilization of constants
> > - Separate hash function performance tests to another file
> >   and improve cycle measurements.
> > - Rename existing function rte_jhash2 to rte_jhash_32b
> >   (something more meaninful) and mark rte_jhash2 as
> >   deprecated
> >
> > Pablo de Lara (6):
> >   test/hash: move hash function perf tests to separate file
> >   test/hash: improve accuracy on cycle measurements
> >   hash: update jhash function with the latest available
> >   hash: add two new functions to jhash library
> >   hash: remove duplicated code
> >   hash: rename rte_jhash2 to rte_jhash_32b
> >
> >  app/test/Makefile               |    1 +
> >  app/test/test_func_reentrancy.c |    2 +-
> >  app/test/test_hash.c            |    4 +-
> >  app/test/test_hash_func_perf.c  |  145 +++++++++++++++++
> >  app/test/test_hash_perf.c       |   71 +--------
> >  lib/librte_hash/rte_jhash.h     |  338 +++++++++++++++++++++++++++++-
> ---------
> >  6 files changed, 402 insertions(+), 159 deletions(-)
> >  create mode 100644 app/test/test_hash_func_perf.c
> >
> > --
> > 1.7.4.1
> >
> >
> did you run this through the ABI checker?  I see you're removing several
> symbols
> that will likely need to go through the ABI deprecation process.
> 
> Neil

I had not run it, but I just did. I see no problems on librte_hash
(but I see some on rte_ethdev.h, due to another commit).

Anyway, I renamed two functions to be more meaningful, but those functions are "static inline", 
so I am not sure exactly what the deprecation process is for those.
What I did was leaving the original function that calls the same function as the new renamed one,
but adds a line warning that the functions is deprecated.

Is that OK or should I do it differently?

Thanks!
Pablo
  
Neil Horman May 13, 2015, 2:20 p.m. UTC | #8
On Wed, May 13, 2015 at 01:52:33PM +0000, De Lara Guarch, Pablo wrote:
> Hi Neil,
> 
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Tuesday, May 12, 2015 4:33 PM
> > To: De Lara Guarch, Pablo
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v4 0/6] update jhash function
> > 
> > On Tue, May 12, 2015 at 12:02:32PM +0100, Pablo de Lara wrote:
> > > Jenkins hash function was developed originally in 1996,
> > > and was integrated in first versions of DPDK.
> > > The function has been improved in 2006,
> > > achieving up to 60% better performance, compared to the original one.
> > >
> > > This patchset updates the current jhash in DPDK,
> > > including two new functions that generate two hashes from a single key.
> > >
> > > It also separates the existing hash function performance tests to
> > > another file, to make it quicker to run.
> > >
> > > changes in v4:
> > > - Simplify key alignment checks
> > > - Include missing x86 arch check
> > >
> > > changes in v3:
> > >
> > > - Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
> > >   functions
> > >
> > > changes in v2:
> > >
> > > - Split single commit in three commits, one that updates the existing
> > functions
> > >   and another that adds two new functions and use one of those functions
> > >   as a base to be called by the other ones.
> > > - Remove some unnecessary ifdefs in the code.
> > > - Add new macros to help on the reutilization of constants
> > > - Separate hash function performance tests to another file
> > >   and improve cycle measurements.
> > > - Rename existing function rte_jhash2 to rte_jhash_32b
> > >   (something more meaninful) and mark rte_jhash2 as
> > >   deprecated
> > >
> > > Pablo de Lara (6):
> > >   test/hash: move hash function perf tests to separate file
> > >   test/hash: improve accuracy on cycle measurements
> > >   hash: update jhash function with the latest available
> > >   hash: add two new functions to jhash library
> > >   hash: remove duplicated code
> > >   hash: rename rte_jhash2 to rte_jhash_32b
> > >
> > >  app/test/Makefile               |    1 +
> > >  app/test/test_func_reentrancy.c |    2 +-
> > >  app/test/test_hash.c            |    4 +-
> > >  app/test/test_hash_func_perf.c  |  145 +++++++++++++++++
> > >  app/test/test_hash_perf.c       |   71 +--------
> > >  lib/librte_hash/rte_jhash.h     |  338 +++++++++++++++++++++++++++++-
> > ---------
> > >  6 files changed, 402 insertions(+), 159 deletions(-)
> > >  create mode 100644 app/test/test_hash_func_perf.c
> > >
> > > --
> > > 1.7.4.1
> > >
> > >
> > did you run this through the ABI checker?  I see you're removing several
> > symbols
> > that will likely need to go through the ABI deprecation process.
> > 
> > Neil
> 
> I had not run it, but I just did. I see no problems on librte_hash
> (but I see some on rte_ethdev.h, due to another commit).
> 
> Anyway, I renamed two functions to be more meaningful, but those functions are "static inline", 
> so I am not sure exactly what the deprecation process is for those.
> What I did was leaving the original function that calls the same function as the new renamed one,
> but adds a line warning that the functions is deprecated.
> 
> Is that OK or should I do it differently?
> 
As long as their all static inline and binaries that are already compiled can
continue to access the data structures they reference at the member offsets
encoded to them at compile time, you should be ok.

Thanks!
Neil

> Thanks!
> Pablo
>
  
Bruce Richardson May 18, 2015, 4:14 p.m. UTC | #9
On Tue, May 12, 2015 at 12:02:32PM +0100, Pablo de Lara wrote:
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 60% better performance, compared to the original one.
> 
> This patchset updates the current jhash in DPDK,
> including two new functions that generate two hashes from a single key.
> 
> It also separates the existing hash function performance tests to
> another file, to make it quicker to run.
> 
> changes in v4:
> - Simplify key alignment checks
> - Include missing x86 arch check
> 
> changes in v3:
> 
> - Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
>   functions
> 
> changes in v2:
> 
> - Split single commit in three commits, one that updates the existing functions
>   and another that adds two new functions and use one of those functions
>   as a base to be called by the other ones.
> - Remove some unnecessary ifdefs in the code.
> - Add new macros to help on the reutilization of constants
> - Separate hash function performance tests to another file
>   and improve cycle measurements.
> - Rename existing function rte_jhash2 to rte_jhash_32b
>   (something more meaninful) and mark rte_jhash2 as
>   deprecated
> 

Hi Pablo,

Patchset looks good to me, and unit tests all pass across the set. Some general
comments or suggestions though - particularly about testing.

1. The set of lengths used when testing the functions looks strange and rather
arbitrary. Perhaps we could have a set of key lengths which are documented. E.g.

	lengths[] = {
		4, 8, 16, 48, 64, /* standard key sizes */
		9,                /* IPv4 SRC + DST + protocol, unpadded */
		13,               /* IPv4 5-tuple, unpadded */
		37,               /* IPv6 5-tuple, unpadded */
		40,               /* IPv6 5-tuple, padded to 8-byte boundary */
	}

2. When testing multiple algorithms, it might be nice to change the order of the
loops so that we test all algorithms with the same key lengths first, and then
change length, rather than running the same algorithm with multiple lengths and
then changing algorithm. The output would be clearer and easier to see which
algorithm performs best for a given key-length.

3. For sanity checking across the patches making changes to the jhash functions,
I think it would be nice to have an initial sanity test with a set of known
keys and hash results in it. That way we can verify that the actual calculation
result never changes as the functions are modified. This would also be a big
help for future work changing the code. [As far as I can see, we don't ever check
in the algorithm checks that we are ever getting the right answer :-)]

All the above suggestions could perhaps go in a patch (or 2/3 patches) after the
first two, which splits out the algorithm tests, and before the actual changes
to the jhash implementation.

Regards,
/Bruce
  
De Lara Guarch, Pablo May 22, 2015, 10:16 a.m. UTC | #10
Jenkins hash function was developed originally in 1996,
and was integrated in first versions of DPDK.
The function has been improved in 2006,
achieving up to 60% better performance, compared to the original one.

This patchset updates the current jhash in DPDK,
including two new functions that generate two hashes from a single key.

It also separates the existing hash function performance tests to
another file, to make it quicker to run, and add new unit tests.

changes in v5:
- Add functional tests (mainly to test that all functions 
  return the expected hash values)
- Modify range of key sizes to test
- Change order of output for perf tests, so it is clearer
  to compare different hash functions for same key size/initial value
- Add new initial value to test in the hash functions
- Fix some errors caught by checkpatch
 
changes in v4:
- Simplify key alignment checks
- Include missing x86 arch check

changes in v3:

- Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
  functions

changes in v2:

- Split single commit in three commits, one that updates the existing functions
  and another that adds two new functions and use one of those functions
  as a base to be called by the other ones.
- Remove some unnecessary ifdefs in the code.
- Add new macros to help on the reutilization of constants
- Separate hash function performance tests to another file
  and improve cycle measurements.
- Rename existing function rte_jhash2 to rte_jhash_32b
  (something more meaninful) and mark rte_jhash2 as
  deprecated

De Lara Guarch, Pablo (3):
  test/hash: move hash function perf tests to separate file
  test/hash: improve accuracy on cycle measurements
  hash: add two new functions to jhash library

Pablo de Lara (7):
  test/hash: update key size range and initial values for testing
  test/hash: change order of loops in hash function tests
  test/hash: add new functional tests for hash functions
  hash: update jhash function with the latest available
  hash: remove duplicated code
  hash: rename rte_jhash2 to rte_jhash_32b
  test/hash: verify rte_jhash_1word/2words/3words

 app/test/Makefile               |    1 +
 app/test/test_func_reentrancy.c |    2 +-
 app/test/test_hash.c            |    4 +-
 app/test/test_hash_functions.c  |  322 +++++++++++++++++++++++++++++++++++++
 app/test/test_hash_perf.c       |   71 +--------
 lib/librte_hash/rte_jhash.h     |  338 +++++++++++++++++++++++++++++----------
 6 files changed, 579 insertions(+), 159 deletions(-)
 create mode 100644 app/test/test_hash_functions.c
  
De Lara Guarch, Pablo June 10, 2015, 3:25 p.m. UTC | #11
Jenkins hash function was developed originally in 1996,
and was integrated in first versions of DPDK.
The function has been improved in 2006,
achieving up to 35% better performance, compared to the original one.

This patchset updates the current jhash in DPDK,
including two new functions that generate two hashes from a single key.

It also separates the existing hash function performance tests to
another file, to make it quicker to run, and add new unit tests.

changes in v6:
- Use RTE_DIM macro, so it saves lines of code
- Correct mistaken performance improvement
- Add deprecated attribute, instead of printing a message calling it
- Add note stating the changes in release notes

changes in v5:
- Add functional tests (mainly to test that all functions 
  return the expected hash values)
- Modify range of key sizes to test
- Change order of output for perf tests, so it is clearer
  to compare different hash functions for same key size/initial value
- Add new initial value to test in the hash functions
- Fix some errors caught by checkpatch
 
changes in v4:
- Simplify key alignment checks
- Include missing x86 arch check

changes in v3:

- Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
  functions

changes in v2:

- Split single commit in three commits, one that updates the existing functions
  and another that adds two new functions and use one of those functions
  as a base to be called by the other ones.
- Remove some unnecessary ifdefs in the code.
- Add new macros to help on the reutilization of constants
- Separate hash function performance tests to another file
  and improve cycle measurements.
- Rename existing function rte_jhash2 to rte_jhash_32b
  (something more meaninful) and mark rte_jhash2 as
  deprecated

De Lara Guarch, Pablo (6):
  test/hash: move hash function perf tests to separate file
  test/hash: improve accuracy on cycle measurements
  test/hash: update key size range and initial values for testing
  test/hash: add new functional tests for hash functions
  hash: add two new functions to jhash library
  test/hash: verify rte_jhash_1word/2words/3words
  test/hash: change order of loops in hash function tests
  hash: update jhash function with the latest available
  hash: remove duplicated code
  hash: rename rte_jhash2 to rte_jhash_32b

 app/test/Makefile                     |   1 +
 app/test/test_func_reentrancy.c       |   2 +-
 app/test/test_hash.c                  |   4 +-
 app/test/test_hash_functions.c        | 321 ++++++++++++++++++++++++++++++++
 app/test/test_hash_perf.c             |  71 +------
 doc/guides/rel_notes/new_features.rst |   5 +
 lib/librte_hash/rte_jhash.h           | 339 +++++++++++++++++++++++++---------
 7 files changed, 584 insertions(+), 159 deletions(-)
 create mode 100644 app/test/test_hash_functions.c
  
Bruce Richardson June 12, 2015, 10:37 a.m. UTC | #12
On Wed, Jun 10, 2015 at 04:25:17PM +0100, Pablo de Lara wrote:
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 35% better performance, compared to the original one.
> 
> This patchset updates the current jhash in DPDK,
> including two new functions that generate two hashes from a single key.
> 
> It also separates the existing hash function performance tests to
> another file, to make it quicker to run, and add new unit tests.
> 
> changes in v6:
> - Use RTE_DIM macro, so it saves lines of code
> - Correct mistaken performance improvement
> - Add deprecated attribute, instead of printing a message calling it
> - Add note stating the changes in release notes
> 
> changes in v5:
> - Add functional tests (mainly to test that all functions 
>   return the expected hash values)
> - Modify range of key sizes to test
> - Change order of output for perf tests, so it is clearer
>   to compare different hash functions for same key size/initial value
> - Add new initial value to test in the hash functions
> - Fix some errors caught by checkpatch
>  
> changes in v4:
> - Simplify key alignment checks
> - Include missing x86 arch check
> 
> changes in v3:
> 
> - Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
>   functions
> 
> changes in v2:
> 
> - Split single commit in three commits, one that updates the existing functions
>   and another that adds two new functions and use one of those functions
>   as a base to be called by the other ones.
> - Remove some unnecessary ifdefs in the code.
> - Add new macros to help on the reutilization of constants
> - Separate hash function performance tests to another file
>   and improve cycle measurements.
> - Rename existing function rte_jhash2 to rte_jhash_32b
>   (something more meaninful) and mark rte_jhash2 as
>   deprecated
>
Thanks for the all the work, and rework, Pablo.

Series Acked-by: Bruce Richardson <bruce.richardson@intel.com>
  
Thomas Monjalon June 16, 2015, 10:22 a.m. UTC | #13
2015-06-12 11:37, Bruce Richardson:
> On Wed, Jun 10, 2015 at 04:25:17PM +0100, Pablo de Lara wrote:
> > Jenkins hash function was developed originally in 1996,
> > and was integrated in first versions of DPDK.
> > The function has been improved in 2006,
> > achieving up to 35% better performance, compared to the original one.
> > 
> > This patchset updates the current jhash in DPDK,
> > including two new functions that generate two hashes from a single key.
> > 
> > It also separates the existing hash function performance tests to
> > another file, to make it quicker to run, and add new unit tests.
> > 
> > changes in v6:
> > - Use RTE_DIM macro, so it saves lines of code
> > - Correct mistaken performance improvement
> > - Add deprecated attribute, instead of printing a message calling it
> > - Add note stating the changes in release notes
> > 
> > changes in v5:
> > - Add functional tests (mainly to test that all functions 
> >   return the expected hash values)
> > - Modify range of key sizes to test
> > - Change order of output for perf tests, so it is clearer
> >   to compare different hash functions for same key size/initial value
> > - Add new initial value to test in the hash functions
> > - Fix some errors caught by checkpatch
> >  
> > changes in v4:
> > - Simplify key alignment checks
> > - Include missing x86 arch check
> > 
> > changes in v3:
> > 
> > - Update rte_jhash_1word, rte_jhash_2words and rte_jhash_3words
> >   functions
> > 
> > changes in v2:
> > 
> > - Split single commit in three commits, one that updates the existing functions
> >   and another that adds two new functions and use one of those functions
> >   as a base to be called by the other ones.
> > - Remove some unnecessary ifdefs in the code.
> > - Add new macros to help on the reutilization of constants
> > - Separate hash function performance tests to another file
> >   and improve cycle measurements.
> > - Rename existing function rte_jhash2 to rte_jhash_32b
> >   (something more meaninful) and mark rte_jhash2 as
> >   deprecated
> >
> Thanks for the all the work, and rework, Pablo.
> 
> Series Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Applied, thanks
Some doxygen typos has been fixed on the fly.
  

Patch

diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
index a4bf5a1..3de006d 100644
--- a/lib/librte_hash/rte_jhash.h
+++ b/lib/librte_hash/rte_jhash.h
@@ -1,7 +1,7 @@ 
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -45,38 +45,51 @@  extern "C" {
 #endif
 
 #include <stdint.h>
+#include <rte_byteorder.h>
 
 /* jhash.h: Jenkins hash support.
  *
- * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
  *
  * http://burtleburtle.net/bob/hash/
  *
  * These are the credits from Bob's sources:
  *
- * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
- * hash(), hash2(), hash3, and mix() are externally useful functions.
- * Routines to test the hash are included if SELF_TEST is defined.
- * You can use this free for any purpose.  It has no warranty.
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions.  Routines to test the hash are included
+ * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+ * the public domain.  It has no warranty.
  *
  * $FreeBSD$
  */
 
+#define rot(x, k) (((x)<<(k)) | ((x)>>(32-(k))))
+
 /** @internal Internal function. NOTE: Arguments are modified. */
 #define __rte_jhash_mix(a, b, c) do { \
-	a -= b; a -= c; a ^= (c>>13); \
-	b -= c; b -= a; b ^= (a<<8); \
-	c -= a; c -= b; c ^= (b>>13); \
-	a -= b; a -= c; a ^= (c>>12); \
-	b -= c; b -= a; b ^= (a<<16); \
-	c -= a; c -= b; c ^= (b>>5); \
-	a -= b; a -= c; a ^= (c>>3); \
-	b -= c; b -= a; b ^= (a<<10); \
-	c -= a; c -= b; c ^= (b>>15); \
+	a -= c; a ^= rot(c, 4); c += b; \
+	b -= a; b ^= rot(a, 6); a += c; \
+	c -= b; c ^= rot(b, 8); b += a; \
+	a -= c; a ^= rot(c, 16); c += b; \
+	b -= a; b ^= rot(a, 19); a += c; \
+	c -= b; c ^= rot(b, 4); b += a; \
+} while (0)
+
+#define __rte_jhash_final(a, b, c) do { \
+	c ^= b; c -= rot(b, 14); \
+	a ^= c; a -= rot(c, 11); \
+	b ^= a; b -= rot(a, 25); \
+	c ^= b; c -= rot(b, 16); \
+	a ^= c; a -= rot(c, 4);  \
+	b ^= a; b -= rot(a, 14); \
+	c ^= b; c -= rot(b, 24); \
 } while (0)
 
 /** The golden ratio: an arbitrary value. */
-#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
+#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
 
 /**
  * The most generic version, hashes an arbitrary sequence
@@ -95,42 +108,256 @@  extern "C" {
 static inline uint32_t
 rte_jhash(const void *key, uint32_t length, uint32_t initval)
 {
-	uint32_t a, b, c, len;
-	const uint8_t *k = (const uint8_t *)key;
-	const uint32_t *k32 = (const uint32_t *)key;
+	uint32_t a, b, c;
+	union {
+		const void *ptr;
+		size_t i;
+	} u;
 
-	len = length;
-	a = b = RTE_JHASH_GOLDEN_RATIO;
-	c = initval;
+	/* Set up the internal state */
+	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
 
-	while (len >= 12) {
-		a += k32[0];
-		b += k32[1];
-		c += k32[2];
+	u.ptr = key;
 
-		__rte_jhash_mix(a,b,c);
+	if ((u.i & 0x3) == 0) {
+		const uint32_t *k = (const uint32_t *)key;
 
-		k += (3 * sizeof(uint32_t)), k32 += 3;
-		len -= (3 * sizeof(uint32_t));
-	}
+		while (length > 12) {
+			a += k[0];
+			b += k[1];
+			c += k[2];
 
-	c += length;
-	switch (len) {
-		case 11: c += ((uint32_t)k[10] << 24);
-		case 10: c += ((uint32_t)k[9] << 16);
-		case 9 : c += ((uint32_t)k[8] << 8);
-		case 8 : b += ((uint32_t)k[7] << 24);
-		case 7 : b += ((uint32_t)k[6] << 16);
-		case 6 : b += ((uint32_t)k[5] << 8);
-		case 5 : b += k[4];
-		case 4 : a += ((uint32_t)k[3] << 24);
-		case 3 : a += ((uint32_t)k[2] << 16);
-		case 2 : a += ((uint32_t)k[1] << 8);
-		case 1 : a += k[0];
-		default: break;
-	};
+			__rte_jhash_mix(a, b, c);
+
+			k += 3;
+			length -= 12;
+		}
+
+		switch (length) {
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+		case 12:
+			c += k[2]; b += k[1]; a += k[0]; break;
+		case 11:
+			c += k[2]&0xffffff; b += k[1]; a += k[0]; break;
+		case 10:
+			c += k[2]&0xffff; b += k[1]; a += k[0]; break;
+		case 9:
+			c += k[2]&0xff; b += k[1]; a += k[0]; break;
+		case 8:
+			b += k[1]; a += k[0]; break;
+		case 7:
+			b += k[1]&0xffffff; a += k[0]; break;
+		case 6:
+			b += k[1]&0xffff; a += k[0]; break;
+		case 5:
+			b += k[1]&0xff; a += k[0]; break;
+		case 4:
+			a += k[0]; break;
+		case 3:
+			a += k[0]&0xffffff; break;
+		case 2:
+			a += k[0]&0xffff; break;
+		case 1:
+			a += k[0]&0xff; break;
+#else
+		case 12:
+			c += k[2]; b += k[1]; a += k[0]; break;
+		case 11:
+			c += k[2]&0xffffff00; b += k[1]; a += k[0]; break;
+		case 10:
+			c += k[2]&0xffff0000; b += k[1]; a += k[0]; break;
+		case 9:
+			c += k[2]&0xff000000; b += k[1]; a += k[0]; break;
+		case 8:
+			b += k[1]; a += k[0]; break;
+		case 7:
+			b += k[1]&0xffffff00; a += k[0]; break;
+		case 6:
+			b += k[1]&0xffff0000; a += k[0]; break;
+		case 5:
+			b += k[1]&0xff000000; a += k[0]; break;
+		case 4:
+			a += k[0]; break;
+		case 3:
+			a += k[0]&0xffffff00; break;
+		case 2:
+			a += k[0]&0xffff0000; break;
+		case 1:
+			a += k[0]&0xff000000; break;
+#endif
+		/* zero length strings require no mixing */
+		case 0:
+			return c;
+		};
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+	} else if ((u.i & 0x1) == 0) {
+		/* read 16-bit chunks */
+		const uint16_t *k = (const uint16_t *)key;
+		const uint8_t  *k8;
+
+		/* all but last block: aligned reads and different mixing */
+		while (length > 12) {
+			a += k[0] + (((uint32_t)k[1])<<16);
+			b += k[2] + (((uint32_t)k[3])<<16);
+			c += k[4] + (((uint32_t)k[5])<<16);
+
+			__rte_jhash_mix(a, b, c);
+
+			k += 6;
+			length -= 12;
+		}
+
+		/* handle the last (probably partial) block */
+		k8 = (const uint8_t *)k;
+		switch (length) {
+		case 12:
+			c += k[4]+(((uint32_t)k[5])<<16);
+			b += k[2]+(((uint32_t)k[3])<<16);
+			a += k[0]+(((uint32_t)k[1])<<16);
+			break;
+		case 11:
+			/* fall through */
+			c += ((uint32_t)k8[10])<<16;
+		case 10:
+			c += k[4];
+			b += k[2]+(((uint32_t)k[3])<<16);
+			a += k[0]+(((uint32_t)k[1])<<16);
+			break;
+		case 9:
+			/* fall through */
+			c += k8[8];
+		case 8:
+			b += k[2]+(((uint32_t)k[3])<<16);
+			a += k[0]+(((uint32_t)k[1])<<16);
+			break;
+		case 7:
+			/* fall through */
+			b += ((uint32_t)k8[6])<<16;
+		case 6:
+			b += k[2];
+			a += k[0]+(((uint32_t)k[1])<<16);
+			break;
+		case 5:
+			/* fall through */
+			b += k8[4];
+		case 4:
+			a += k[0]+(((uint32_t)k[1])<<16);
+			break;
+		case 3:
+			/* fall through */
+			a += ((uint32_t)k8[2])<<16;
+		case 2:
+			a += k[0];
+			break;
+		case 1:
+			a += k8[0];
+			break;
+		case 0:
+			/* zero length requires no mixing */
+			return c;
+		}
+#endif
+	} else {
+		const uint8_t *k = (const uint8_t *)key;
+
+		/* all but the last block: affect some 32 bits of (a, b, c) */
+		while (length > 12) {
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+			a += k[0];
+			a += ((uint32_t)k[1])<<8;
+			a += ((uint32_t)k[2])<<16;
+			a += ((uint32_t)k[3])<<24;
+			b += k[4];
+			b += ((uint32_t)k[5])<<8;
+			b += ((uint32_t)k[6])<<16;
+			b += ((uint32_t)k[7])<<24;
+			c += k[8];
+			c += ((uint32_t)k[9])<<8;
+			c += ((uint32_t)k[10])<<16;
+			c += ((uint32_t)k[11])<<24;
+#else
+			a += ((uint32_t)k[0])<<24;
+			a += ((uint32_t)k[1])<<16;
+			a += ((uint32_t)k[2])<<8;
+			a += ((uint32_t)k[3]);
+			b += ((uint32_t)k[4])<<24;
+			b += ((uint32_t)k[5])<<16;
+			b += ((uint32_t)k[6])<<8;
+			b += ((uint32_t)k[7]);
+			c += ((uint32_t)k[8])<<32;
+			c += ((uint32_t)k[9])<<16;
+			c += ((uint32_t)k[10])<<8;
+			c += ((uint32_t)k[11]);
+#endif
+
+			__rte_jhash_mix(a, b, c);
 
-	__rte_jhash_mix(a,b,c);
+			k += 12;
+			length -= 12;
+		}
+
+		/* last block: affect all 32 bits of (c) */
+		/* all the case statements fall through */
+		switch (length) {
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+		case 12:
+			c += ((uint32_t)k[11])<<24;
+		case 11:
+			c += ((uint32_t)k[10])<<16;
+		case 10:
+			c += ((uint32_t)k[9])<<8;
+		case 9:
+			c += k[8];
+		case 8:
+			b += ((uint32_t)k[7])<<24;
+		case 7:
+			b += ((uint32_t)k[6])<<16;
+		case 6:
+			b += ((uint32_t)k[5])<<8;
+		case 5:
+			b += k[4];
+		case 4:
+			a += ((uint32_t)k[3])<<24;
+		case 3:
+			a += ((uint32_t)k[2])<<16;
+		case 2:
+			a += ((uint32_t)k[1])<<8;
+		case 1:
+			a += k[0];
+		break;
+#else
+		case 12:
+			c += k[11];
+		case 11:
+			c += ((uint32_t)k[10])<<8;
+		case 10:
+			c += ((uint32_t)k[9])<<16;
+		case 9:
+			c += ((uint32_t)k[8])<<24;
+		case 8:
+			b += k[7];
+		case 7:
+			b += ((uint32_t)k[6])<<8;
+		case 6:
+			b += ((uint32_t)k[5])<<16;
+		case 5:
+			b += ((uint32_t)k[4])<<24;
+		case 4:
+			a += k[3];
+		case 3:
+			a += ((uint32_t)k[2])<<8;
+		case 2:
+			a += ((uint32_t)k[1])<<16;
+		case 1:
+			a += ((uint32_t)k[0])<<24;
+		break;
+#endif
+		case 0:
+			return c;
+		}
+	}
+
+	__rte_jhash_final(a, b, c);
 
 	return c;
 }
@@ -151,33 +378,93 @@  rte_jhash(const void *key, uint32_t length, uint32_t initval)
 static inline uint32_t
 rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
 {
-	uint32_t a, b, c, len;
+	uint32_t a, b, c;
 
-	a = b = RTE_JHASH_GOLDEN_RATIO;
-	c = initval;
-	len = length;
+	/* Set up the internal state */
+	a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length)<<2) + initval;
 
-	while (len >= 3) {
+	/* Handle most of the key */
+	while (length > 3) {
 		a += k[0];
 		b += k[1];
 		c += k[2];
+
 		__rte_jhash_mix(a, b, c);
-		k += 3; len -= 3;
-	}
 
-	c += length * 4;
+		k += 3;
+		length -= 3;
+	}
 
-	switch (len) {
-		case 2 : b += k[1];
-		case 1 : a += k[0];
-		default: break;
+	/* Handle the last 3 uint32_t's */
+	switch (length) {
+	case 3:
+		c += k[2];
+	case 2:
+		b += k[1];
+	case 1:
+		a += k[0];
+		__rte_jhash_final(a, b, c);
+	/* case 0: nothing left to add */
+	case 0:
+		break;
 	};
 
-	__rte_jhash_mix(a,b,c);
-
 	return c;
 }
 
+/**
+ * Same as rte_jhash2, but take two seeds and return two uint32_ts.
+ * pc and pb must be non-null, and *pc and *pb must both be initialized
+ * with seeds. If you pass in (*pb)=0, the output (*pc) will be
+ * the same as the return value from rte_jhash.
+ *
+ * @param k
+ *   Key to calculate hash of.
+ * @param length
+ *   Length of key in units of 4 bytes.
+ * @param pc
+ *   IN: seed OUT: primary hash value.
+ * @param pc
+ *   IN: second seed OUT: secondary hash value.
+ */
+static inline void
+rte_jhash_word2(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb)
+{
+	uint32_t a, b, c;
+
+	/* Set up the internal state */
+	a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length)<<2) + *pc;
+	c += *pb;
+
+	/* Handle most of the key */
+	while (length > 3) {
+		a += k[0];
+		b += k[1];
+		c += k[2];
+
+		__rte_jhash_mix(a, b, c);
+
+		k += 3;
+		length -= 3;
+	}
+
+	/* Handle the last 3 uint32_t's */
+	switch (length) {
+	case 3:
+		c += k[2];
+	case 2:
+		b += k[1];
+	case 1:
+		a += k[0];
+		__rte_jhash_final(a, b, c);
+	/* case 0: nothing left to add */
+	case 0:
+		break;
+	};
+
+	*pc = c;
+	*pb = b;
+}
 
 /**
  * A special ultra-optimized versions that knows it is hashing exactly