[dpdk-dev,v3] Implement memcmp using Intel SIMD intrinsics.

Message ID 1431979303-1346-2-git-send-email-rkerur@gmail.com (mailing list archive)
State Changes Requested, archived

Commit Message

Ravi Kerur May 18, 2015, 8:01 p.m. UTC
This patch implements memcmp using AVX/SSE intrinsics and uses
librte_hash as the first candidate to call the resulting rte_memcmp.

Tested with GCC (4.8.2) and Clang (3.4-1); both builds show better
performance than memcmp on an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
running Ubuntu 14.04 x86_64.
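
rte_memcmp() follows the memcmp() contract (zero if equal, negative or
positive otherwise), so callers can switch over directly. A minimal
usage sketch (key_1, key_2 and key_len are placeholders):

	#include <rte_memcmp.h>

	/* compare two hash keys of key_len bytes */
	if (rte_memcmp(key_1, key_2, key_len) == 0)
		printf("keys match\n");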

Changes in v3:
Implement complete memcmp functionality.
Implement functional and performance tests and add it to
"make test" infrastructure code.

Changes in v2:
Modified code to support only up to 64 bytes, as that is the maximum
number of bytes the hash library compares.

Changes in v1:
Initial implementation of memcmp with support for up to 128 bytes.

Signed-off-by: Ravi Kerur <rkerur@gmail.com>
---
 app/test/Makefile                                  |   5 +-
 app/test/autotest_data.py                          |  19 +
 app/test/test_hash_perf.c                          |  36 +-
 app/test/test_memcmp.c                             | 229 ++++++
 app/test/test_memcmp_perf.c                        | 339 ++++++++
 .../common/include/arch/ppc_64/rte_memcmp.h        |  62 ++
 .../common/include/arch/x86/rte_memcmp.h           | 900 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_memcmp.h | 175 ++++
 lib/librte_hash/rte_hash.c                         |  59 +-
 9 files changed, 1789 insertions(+), 35 deletions(-)
 create mode 100644 app/test/test_memcmp.c
 create mode 100644 app/test/test_memcmp_perf.c
 create mode 100644 lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memcmp.h
 create mode 100644 lib/librte_eal/common/include/generic/rte_memcmp.h
  

Comments

Stephen Hemminger Oct. 14, 2015, 12:32 a.m. UTC | #1
On Mon, 18 May 2015 13:01:43 -0700
Ravi Kerur <rkerur@gmail.com> wrote:

> This patch implements memcmp using AVX/SSE intrinsics and uses
> librte_hash as the first candidate to call the resulting rte_memcmp.
> 
> Tested with GCC (4.8.2) and Clang (3.4-1); both builds show better
> performance than memcmp on an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
> running Ubuntu 14.04 x86_64.
> 
> Changes in v3:
> Implement complete memcmp functionality.
> Implement functional and performance tests and add it to
> "make test" infrastructure code.
> 
> Changes in v2:
> Modified code to support only up to 64 bytes, as that is the maximum
> number of bytes the hash library compares.
> 
> Changes in v1:
> Initial implementation of memcmp with support for up to 128 bytes.
> 
> Signed-off-by: Ravi Kerur <rkerur@gmail.com>

I think this idea is best taken to glibc, not here.
The issue is that GCC's default inline version of memcmp is bad, and
that is what needs to be fixed.

See the later discussion in this email thread with a GCC intrinsics
developer.
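
You can see the difference by disabling GCC's builtin expansion, e.g. a
minimal check (a sketch; compile with and without -fno-builtin-memcmp
and compare the generated assembly):

	/* cmp.c: gcc -O2 -S cmp.c                     -> inline expansion */
	/*        gcc -O2 -S -fno-builtin-memcmp cmp.c -> call to glibc   */
	#include <string.h>

	int
	cmp16(const void *a, const void *b)
	{
		return memcmp(a, b, 16);
	}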
  
Zhihong Wang Jan. 28, 2016, 3:08 a.m. UTC | #2
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib
> /librte_eal/common/include/arch/x86/rte_memcmp.h

[...]

> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compare bytes between two locations. The locations must not overlap.
> + *

Parameter names should be kept consistent with those in the function body.

> + * @param src_1
> + *   Pointer to the first source of the data.
> + * @param src_2
> + *   Pointer to the second source of the data.
> + * @param n
> + *   Number of bytes to compare.
> + * @return
> + *   zero if src_1 equals src_2
> + *   -ve if src_1 less than src_2
> + *   +ve if src_1 greater than src_2
> + */
> +static inline int
> +rte_memcmp(const void *src_1, const void *src,
> +		size_t n) __attribute__((always_inline));
> +
> +/**
> + * Find the first different bit for comparison.
> + */
> +static inline int
> +rte_cmpffd (uint32_t x, uint32_t y)
> +{
> +	int i;
> +	int pos = x ^ y;
> +	for (i = 0; i < 32; i++)
> +		if (pos & (1<<i))

Coding style check :-)
BTW, does the bsf instruction provide this check?

> +			return i;
> +	return -1;
> +}
> +
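
A single bsf/tzcnt would avoid the loop entirely. A minimal sketch,
assuming GCC/Clang where __builtin_ctz() compiles to bsf (or tzcnt):

	static inline int
	rte_cmpffd(uint32_t x, uint32_t y)
	{
		uint32_t pos = x ^ y;

		/* __builtin_ctz() is undefined for 0, hence the guard */
		if (pos == 0)
			return -1;
		return __builtin_ctz(pos);
	}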

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Guess this is not used.

[...]

> +/**
> + * Compare 256 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp256(const void *src_1, const void *src_2)
> +{
> +	int ret;
> +
> +	ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
> +			(const uint8_t *)src_2 + 0 * 64);

Why not just use rte_cmp128?
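
I.e., a sketch:

	ret = rte_cmp128((const uint8_t *)src_1 + 0 * 128,
			(const uint8_t *)src_2 + 0 * 128);
	if (unlikely(ret != 0))
		return ret;

	return rte_cmp128((const uint8_t *)src_1 + 1 * 128,
			(const uint8_t *)src_2 + 1 * 128);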


[...]

> +static inline int
> +rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
> +{
> +	const uint8_t *src_1 = (const uint8_t *)_src_1;
> +	const uint8_t *src_2 = (const uint8_t *)_src_2;
> +	int ret = 0;
> +
> +	if (n < 16)
> +		return rte_memcmp_regular(src_1, src_2, n);
> +
> +	if (n <= 32) {
> +		ret = rte_cmp16(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +

Too many conditions here may harm the overall performance.
It's a trade-off thing, all about balancing the overhead.
Just make sure this is tuned based on actual test numbers.
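
E.g., measure per-length ticks/op the same way the perf test in this
patch does (a sketch; ITERATIONS, key_1, key_2 and n are placeholders):

	uint64_t i, acc = 0;
	uint64_t begin = rte_rdtsc();

	/* accumulate results so the compiler cannot drop the calls */
	for (i = 0; i < ITERATIONS; i++)
		acc += rte_memcmp(key_1, key_2, n);

	printf("%u bytes: %.4f ticks/op (acc %lu)\n", (unsigned)n,
		(double)(rte_rdtsc() - begin) / ITERATIONS,
		(unsigned long)acc);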


> +	if (n <= 48) {
> +		ret = rte_cmp32(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +
> +	if (n <= 64) {
> +		ret = rte_cmp32(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp16(src_1 + 32, src_2 + 32);
> +
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +
> +	if (n <= 96) {
> +		ret = rte_cmp64(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp16(src_1 + 64, src_2 + 64);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}
> +
> +	if (n <= 128) {
> +		ret = rte_cmp64(src_1, src_2);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp32(src_1 + 64, src_2 + 64);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		ret = rte_cmp16(src_1 + 96, src_2 + 96);
> +		if (unlikely(ret != 0))
> +			return ret;
> +
> +		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
> +	}

[...]

> +/**
> + * Compare 48 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp48(const void *src_1, const void *src_2)

Not used.

> +{
> +	int ret;
> +
> +	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> +			(const uint8_t *)src_2 + 0 * 16);
> +
> +	if (unlikely(ret != 0))
> +		return ret;
> +
> +	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
> +			(const uint8_t *)src_2 + 1 * 16);
> +
> +	if (unlikely(ret != 0))
> +		return ret;
> +
> +	return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
> +			(const uint8_t *)src_2 + 2 * 16);
> +}
> +
> +/**
> + * Compare 64 bytes between two locations.
> + * Locations should not overlap.
> + */
> +static inline int
> +rte_cmp64(const void *src_1, const void *src_2)
> +{
> +	int ret;
> +
> +	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
> +			(const uint8_t *)src_2 + 0 * 16);

Why not rte_cmp32? And use rte_cmp64 for rte_cmp128, and so on.
That should make the code look clearer.
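
I.e., a sketch of the suggested layering (same behavior, built on the
SSE rte_cmp32 above):

	static inline int
	rte_cmp64(const void *src_1, const void *src_2)
	{
		int ret;

		ret = rte_cmp32((const uint8_t *)src_1 + 0 * 32,
				(const uint8_t *)src_2 + 0 * 32);
		if (unlikely(ret != 0))
			return ret;

		return rte_cmp32((const uint8_t *)src_1 + 1 * 32,
				(const uint8_t *)src_2 + 1 * 32);
	}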


It'd be great if you could format this patch into a patch set with several
little ones. :-)
Also, the kernel checkpatch is very helpful.
Good coding style and patch organization make it easy for in-depth reviews.
  
Ravi Kerur Feb. 19, 2016, 5:50 p.m. UTC | #3
On Wed, Jan 27, 2016 at 7:08 PM, Zhihong Wang <zhihong.wang@intel.com>
wrote:

> [...]
>
> > +/**
> > + * Compare 48 bytes between two locations.
> > + * Locations should not overlap.
> > + */
> > +static inline int
> > +rte_cmp48(const void *src_1, const void *src_2)
>
> Guess this is not used.
>

I had left it unused on the assumption that it might be needed when
actual performance tests are done on high-end servers.

> [...]
>
> It'd be great if you could format this patch into a patch set with several
> little ones. :-)
> Also, the kernel checkpatch is very helpful.
> Good coding style and patch organization make it easy for in-depth reviews.
>
>
The combination of scalar and vector (32/64/128) compares was chosen to
get optimal performance numbers. If there is enough interest in this I
can work on it and provide an updated patch set.

Thanks,
Ravi
  
Zhihong Wang Feb. 23, 2016, 12:22 p.m. UTC | #4
> > It'd be great if you could format this patch into a patch set with
> > several little ones. :-)
> > Also, the kernel checkpatch is very helpful.
> > Good coding style and patch organization make it easy for in-depth
> > reviews.
> >
> The combination of scalar and vector (32/64/128) compares was chosen to
> get optimal performance numbers. If there is enough interest in this I
> can work on it and provide an updated patch set.


That'll be very helpful! Looking forward to your patch :)
BTW, have you tested real-world performance with your patch?
  
Ravi Kerur Feb. 24, 2016, 4 a.m. UTC | #5
On Tue, Feb 23, 2016 at 4:22 AM, Wang, Zhihong <zhihong.wang@intel.com>
wrote:

> > > It'd be great if you could format this patch into a patch set with
> > > several little ones. :-)
> > > Also, the kernel checkpatch is very helpful.
> > > Good coding style and patch organization make it easy for in-depth
> > > reviews.
> > >
> > The combination of scalar and vector (32/64/128) compares was chosen
> > to get optimal performance numbers. If there is enough interest in
> > this I can work on it and provide an updated patch set.
>
> That'll be very helpful! Looking forward to your patch :)
> BTW, have you tested real-world performance with your patch?
>

Yes, it was tested with the hash functions in the DPDK code. I will
work on it and send an updated patch. Thanks for your inputs; I will
incorporate them in the next patch series.
  

Patch

diff --git a/app/test/Makefile b/app/test/Makefile
index 4aca77c..957e4f1 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -81,6 +81,9 @@  SRCS-y += test_logs.c
 SRCS-y += test_memcpy.c
 SRCS-y += test_memcpy_perf.c
 
+SRCS-y += test_memcmp.c
+SRCS-y += test_memcmp_perf.c
+
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_perf.c
 
@@ -150,7 +153,7 @@  CFLAGS_test_kni.o += -Wno-deprecated-declarations
 endif
 CFLAGS += -D_GNU_SOURCE
 
-# Disable VTA for memcpy test
+# Disable VTA for memcpy tests
 ifeq ($(CC), gcc)
 ifeq ($(shell test $(GCC_VERSION) -ge 44 && echo 1), 1)
 CFLAGS_test_memcpy.o += -fno-var-tracking-assignments
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 618a946..e07f087 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -187,6 +187,12 @@  parallel_test_group_list = [
 		 "Report" :	None,
 		},
 		{
+		 "Name" :	"Memcmp autotest",
+		 "Command" : 	"memcmp_autotest",
+		 "Func" :	default_autotest,
+		 "Report" :	None,
+		},
+		{
 		 "Name" :	"Memzone autotest",
 		 "Command" : 	"memzone_autotest",
 		 "Func" :	default_autotest,
@@ -399,6 +405,19 @@  non_parallel_test_group_list = [
 	]
 },
 {
+	"Prefix":	"memcmp_perf",
+	"Memory" :	all_sockets(512),
+	"Tests" :
+	[
+		{
+		 "Name" :	"Memcmp performance autotest",
+		 "Command" : 	"memcmp_perf_autotest",
+		 "Func" :	default_autotest,
+		 "Report" :	None,
+		},
+	]
+},
+{
 	"Prefix":	"hash_perf",
 	"Memory" :	all_sockets(512),
 	"Tests" :	
diff --git a/app/test/test_hash_perf.c b/app/test/test_hash_perf.c
index 6eabb21..6887629 100644
--- a/app/test/test_hash_perf.c
+++ b/app/test/test_hash_perf.c
@@ -440,7 +440,7 @@  run_single_tbl_perf_test(const struct rte_hash *h, hash_operation func,
 		uint32_t *invalid_pos_count)
 {
 	uint64_t begin, end, ticks = 0;
-	uint8_t *key = NULL;
+	uint8_t * volatile key = NULL;
 	uint32_t *bucket_occupancies = NULL;
 	uint32_t num_buckets, i, j;
 	int32_t pos;
@@ -547,30 +547,30 @@  run_tbl_perf_test(struct tbl_perf_test_params *params)
 	case ADD_UPDATE:
 		num_iterations = params->num_iterations;
 		params->num_iterations = params->entries;
-		run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-				&avg_occupancy, &invalid_pos);
-		params->num_iterations = num_iterations;
 		ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
 				params, &avg_occupancy, &invalid_pos);
+		params->num_iterations = num_iterations;
+		ticks += run_single_tbl_perf_test(handle, rte_hash_add_key,
+				params, &avg_occupancy, &invalid_pos);
 		break;
 	case DELETE:
 		num_iterations = params->num_iterations;
 		params->num_iterations = params->entries;
-		run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-				&avg_occupancy, &invalid_pos);
+		ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
+				params, &avg_occupancy, &invalid_pos);
 
 		params->num_iterations = num_iterations;
-		ticks = run_single_tbl_perf_test(handle, rte_hash_del_key,
+		ticks += run_single_tbl_perf_test(handle, rte_hash_del_key,
 				params, &avg_occupancy, &invalid_pos);
 		break;
 	case LOOKUP:
 		num_iterations = params->num_iterations;
 		params->num_iterations = params->entries;
-		run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-				&avg_occupancy, &invalid_pos);
+		ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
+				params, &avg_occupancy, &invalid_pos);
 
 		params->num_iterations = num_iterations;
-		ticks = run_single_tbl_perf_test(handle, rte_hash_lookup,
+		ticks += run_single_tbl_perf_test(handle, rte_hash_lookup,
 				params, &avg_occupancy, &invalid_pos);
 		break;
 	default: return -1;
@@ -623,10 +623,15 @@  static int run_all_tbl_perf_tests(void)
 static void run_hash_func_test(rte_hash_function f, uint32_t init_val,
 		uint32_t key_len)
 {
-	static uint8_t key[RTE_HASH_KEY_LENGTH_MAX];
+	static uint8_t * volatile key;
 	uint64_t ticks = 0, start, end;
 	unsigned i, j;
 
+	key = rte_zmalloc("func hash key",
+			  key_len * sizeof(uint8_t), 16);
+	if (key == NULL)
+		return;
+
 	for (i = 0; i < HASHTEST_ITERATIONS; i++) {
 
 		for (j = 0; j < key_len; j++)
@@ -638,8 +643,11 @@  static void run_hash_func_test(rte_hash_function f, uint32_t init_val,
 		ticks += end - start;
 	}
 
-	printf("%-12s, %-18u, %-13u, %.02f\n", get_hash_name(f), (unsigned) key_len,
-			(unsigned) init_val, (double)ticks / HASHTEST_ITERATIONS);
+	rte_free(key);
+
+	printf("%-12s, %-18u, %-13u, %.02f\n",
+		get_hash_name(f), (unsigned) key_len, (unsigned) init_val,
+		(double)ticks / HASHTEST_ITERATIONS);
 }
 
 /*
@@ -687,7 +695,7 @@  fbk_hash_perf_test(void)
 		.socket_id = rte_socket_id(),
 	};
 	struct rte_fbk_hash_table *handle = NULL;
-	uint32_t *keys = NULL;
+	uint32_t * volatile keys = NULL;
 	unsigned indexes[TEST_SIZE];
 	uint64_t lookup_time = 0;
 	unsigned added = 0;
diff --git a/app/test/test_memcmp.c b/app/test/test_memcmp.c
new file mode 100644
index 0000000..7d9c85f
--- /dev/null
+++ b/app/test/test_memcmp.c
@@ -0,0 +1,229 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/queue.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_memcmp.h>
+
+#include "test.h"
+
+/*******************************************************************************
+ * Memcmp functional test configuration section.
+ *
+ * The array below controls which lengths are tested. Every entry in
+ * the array is exercised by each of the functional tests (equal,
+ * less than, greater than).
+ */
+static size_t memcmp_sizes[] = {
+	1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
+	256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
+	2048, 3072, 4096, 5120, 6144, 7168, 8192, 16384
+};
+
+/******************************************************************************/
+
+#define RTE_MEMCMP_LENGTH_MAX 16384
+
+/*
+ * Test a memcmp equal function.
+ */
+static int run_memcmp_eq_func_test(uint32_t len)
+{
+	uint32_t i, rc = 0;
+	uint8_t * volatile key = NULL;
+
+	key = rte_zmalloc("memcmp key", len * sizeof(uint8_t), 16);
+	if (key == NULL)
+		return -1;
+
+	for (i = 0; i < len; i++)
+		key[i] = (uint8_t) rte_rand();
+
+	rc = rte_memcmp(key, key, len);
+	rte_free(key);
+
+	return rc;
+}
+
+/*
+ * Test memcmp equal functions.
+ */
+static int run_memcmp_eq_func_tests(void)
+{
+	unsigned i;
+
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		if (run_memcmp_eq_func_test(memcmp_sizes[i])) {
+			printf("Comparing equal %zd bytes failed\n", memcmp_sizes[i]);
+			return 1;
+		}
+	}
+	printf("RTE memcmp for equality successful\n");
+	return 0;
+}
+
+/*
+ * Test a memcmp less than function.
+ */
+static int run_memcmp_lt_func_test(uint32_t len)
+{
+	uint32_t i, rc;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL)
+		return -1;
+
+	for (i = 0; i < len; i++)
+		key_1[i] = i;
+
+	for (i = 0; i < len; i++)
+		key_2[i] = 2;
+
+	rc = rte_memcmp(key_1, key_2, len);
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return rc;
+}
+
+/*
+ * Test memcmp less than functions.
+ */
+static int run_memcmp_lt_func_tests(void)
+{
+	unsigned i;
+
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		if (!(run_memcmp_lt_func_test(memcmp_sizes[i]) < 0)) {
+			printf("Comparing less than for %zd bytes failed\n", memcmp_sizes[i]);
+			return 1;
+		}
+	}
+	printf("RTE memcmp for less than successful\n");
+	return 0;
+}
+
+/*
+ * Test a memcmp greater than function.
+ */
+static int run_memcmp_gt_func_test(uint32_t len)
+{
+	uint32_t i, rc;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL)
+		return -1;
+
+	for (i = 0; i < len; i++)
+		key_1[i] = 2;
+
+	for (i = 0; i < len; i++)
+		key_2[i] = i;
+
+	rc = rte_memcmp(key_1, key_2, len);
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return rc;
+}
+
+/*
+ * Test memcmp greater than functions.
+ */
+static int run_memcmp_gt_func_tests(void)
+{
+	unsigned i;
+
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		if (!(run_memcmp_gt_func_test(memcmp_sizes[i]) > 0)) {
+			printf("Comparing greater than for %zd bytes failed\n", memcmp_sizes[i]);
+			return 1;
+		}
+	}
+	printf("RTE memcmp for greater than successful\n");
+	return 0;
+}
+
+/*
+ * Do all unit tests.
+ */
+static int
+test_memcmp(void)
+{
+	if (run_memcmp_eq_func_tests())
+		return -1;
+
+	if (run_memcmp_gt_func_tests())
+		return -1;
+
+	if (run_memcmp_lt_func_tests())
+		return -1;
+
+	return 0;
+}
+
+static struct test_command memcmp_cmd = {
+	.command = "memcmp_autotest",
+	.callback = test_memcmp,
+};
+REGISTER_TEST_COMMAND(memcmp_cmd);
diff --git a/app/test/test_memcmp_perf.c b/app/test/test_memcmp_perf.c
new file mode 100644
index 0000000..8b7a0c4
--- /dev/null
+++ b/app/test/test_memcmp_perf.c
@@ -0,0 +1,339 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/times.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_memory.h>
+#include <rte_memcmp.h>
+
+#include "test.h"
+
+/*******************************************************************************
+ * Memcmp function performance test configuration section. Each performance test
+ * will be performed MEMCMP_ITERATIONS times.
+ *
+ * The two arrays below control which lengths are tested. Every entry
+ * is exercised by each performance test.
+ */
+#define MEMCMP_ITERATIONS (500 * 500 * 500)
+
+static size_t memcmp_sizes[] = {
+	2, 5, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+	129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
+	449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1522, 1536, 1600,
+	2048, 2560, 3072, 3584, 4096, 4608, 5632, 6144, 6656, 7168, 7680, 8192,
+	16384
+};
+
+static size_t memcmp_lt_gt_sizes[] = {
+	16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};
+
+/******************************************************************************/
+
+static int
+run_single_memcmp_eq_perf_test(uint32_t len, int func_type, uint64_t iterations)
+{
+	double begin = 0, end = 0;
+	uint64_t i, j, rc = 0;
+	uint8_t * volatile key = NULL;
+
+	key = rte_zmalloc("memcmp key", len * sizeof(uint8_t), 16);
+	if (key == NULL)
+		return -1;
+
+	/* Prepare inputs for the current iteration */
+	for (j = 0; j < len; j++)
+		key[j] = j / 64;
+
+	begin = rte_rdtsc();
+
+	/* Perform operation, and measure time it takes */
+	for (i = 0; i < iterations; i++) {
+
+		if (func_type == 1)
+			rc += rte_memcmp(key, key, len);
+		else
+			rc += memcmp(key, key, len);
+	}
+
+	end = rte_rdtsc() - begin;
+
+	printf(" *** %10i, %10.4f ***\n", len, (double)(end/iterations));
+
+	rte_free(key);
+
+	return rc;
+}
+
+/*
+ * Run all memcmp equal performance tests.
+ */
+static int run_all_memcmp_eq_perf_tests(void)
+{
+	unsigned i;
+
+	printf(" *** RTE memcmp equal performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_eq_perf_test(memcmp_sizes[i], 1,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+
+	printf(" *** memcmp equal performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_eq_perf_test(memcmp_sizes[i], 2,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+run_single_memcmp_lt_perf_test(uint32_t len, int func_type,
+					uint64_t iterations)
+{
+	double begin = 0, end = 0;
+	uint64_t i, j;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL) {
+		rte_free(key_1);
+		return -1;
+	}
+
+	/* Prepare inputs for the current iteration */
+	for (j = 0; j < len; j++)
+		key_1[j] = 1;
+
+	for (j = 0; j < len; j++)
+		key_2[j] = 1;
+
+	key_2[len / 2] = 2;
+
+	begin = rte_rdtsc();
+
+	/* Perform operation, and measure time it takes */
+	for (i = 0; i < iterations; i++) {
+
+		if (func_type == 1) {
+			if (!(rte_memcmp(key_1, key_2, len) < 0))
+				return -1;
+		} else {
+			if (!(memcmp(key_1, key_2, len) < 0))
+				return -1;
+		}
+	}
+
+	end = rte_rdtsc() - begin;
+
+	printf(" *** %10i, %10.4f ***\n", len, (double)(end/iterations));
+
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return 0;
+}
+
+/*
+ * Run all memcmp less than performance tests.
+ */
+static int run_all_memcmp_lt_perf_tests(void)
+{
+	unsigned i;
+
+	printf(" *** RTE memcmp less than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_lt_perf_test(memcmp_lt_gt_sizes[i], 1,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+
+	printf(" *** memcmp less than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_lt_perf_test(memcmp_lt_gt_sizes[i], 2,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+run_single_memcmp_gt_perf_test(uint32_t len, int func_type,
+					uint64_t iterations)
+{
+	double begin = 0, end = 0;
+	uint64_t i, j;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL) {
+		rte_free(key_1);
+		return -1;
+	}
+
+	/* Prepare inputs for the current iteration */
+	for (j = 0; j < len; j++)
+		key_1[j] = 1;
+	key_1[len / 2] = 2;
+
+	for (j = 0; j < len; j++)
+		key_2[j] = 1;
+
+	begin = rte_rdtsc();
+
+	/* Perform operation, and measure time it takes */
+	for (i = 0; i < iterations; i++) {
+
+		if (func_type == 1) {
+			if (!(rte_memcmp(key_1, key_2, len) > 0))
+				return -1;
+		} else {
+			if (!(memcmp(key_1, key_2, len) > 0))
+				return -1;
+		}
+	}
+
+	end = rte_rdtsc() - begin;
+
+	printf(" *** %10i, %10.4f ***\n", len, (double)(end/iterations));
+
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return 0;
+}
+
+/*
+ * Run all memcmp greater than performance tests.
+ */
+static int run_all_memcmp_gt_perf_tests(void)
+{
+	unsigned i;
+
+	printf(" *** RTE memcmp greater than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_gt_perf_test(memcmp_lt_gt_sizes[i], 1,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+
+	printf(" *** memcmp greater than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_gt_perf_test(memcmp_lt_gt_sizes[i], 2,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Do all performance tests.
+ */
+static int
+test_memcmp_perf(void)
+{
+	if (run_all_memcmp_eq_perf_tests() != 0)
+		return -1;
+
+	if (run_all_memcmp_lt_perf_tests() != 0)
+		return -1;
+
+	if (run_all_memcmp_gt_perf_tests() != 0)
+		return -1;
+
+	return 0;
+}
+
+static struct test_command memcmp_perf_cmd = {
+	.command = "memcmp_perf_autotest",
+	.callback = test_memcmp_perf,
+};
+REGISTER_TEST_COMMAND(memcmp_perf_cmd);
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h b/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
new file mode 100644
index 0000000..6e54f3b
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
@@ -0,0 +1,62 @@ 
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) IBM Corporation 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IBM Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _RTE_MEMCMP_PPC_64_H_
+#define _RTE_MEMCMP_PPC_64_H_
+
+#include <stdint.h>
+#include <string.h>
+/* To include altivec.h, GCC version must be >= 4.8 */
+#include <altivec.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcmp.h"
+
+#define rte_memcmp(dst, src, n)              \
+	({ (__builtin_constant_p(n)) ?       \
+	memcmp((dst), (src), (n)) :          \
+	rte_memcmp_func((dst), (src), (n)); })
+
+static inline int
+rte_memcmp_func(void *dst, const void *src, size_t n)
+{
+	return memcmp(dst, src, n);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCMP_PPC_64_H_ */
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib/librte_eal/common/include/arch/x86/rte_memcmp.h
new file mode 100644
index 0000000..085dfb2
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcmp.h
@@ -0,0 +1,900 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCMP_X86_64_H_
+#define _RTE_MEMCMP_X86_64_H_
+
+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcmp().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <rte_vect.h>
+#include <rte_branch_prediction.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equals src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *src_1, const void *src,
+		size_t n) __attribute__((always_inline));
+
+/**
+ * Find the first different bit for comparison.
+ */
+static inline int
+rte_cmpffd (uint32_t x, uint32_t y)
+{
+	int i;
+	int pos = x ^ y;
+	for (i = 0; i < 32; i++)
+		if (pos & (1<<i))
+			return i;
+	return -1;
+}
+
+/**
+ * Find the first different byte for comparison.
+ */
+static inline int
+rte_cmpffdb (const uint8_t *x, const uint8_t *y, size_t n)
+{
+	size_t i;
+	for (i = 0; i < n; i++)
+		if (x[i] != y[i])
+			return x[i] - y[i];
+	return 0;
+}
+
+/**
+ * Compare 16 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp16(const void *src_1, const void *src_2)
+{
+	__m128i xmm0, xmm1, xmm2;
+
+	xmm0 = _mm_lddqu_si128((const __m128i *)src_1);
+	xmm1 = _mm_lddqu_si128((const __m128i *)src_2);
+	xmm2 = _mm_xor_si128(xmm0, xmm1);
+
+	if (unlikely(!_mm_testz_si128(xmm2, xmm2))) {
+
+		uint64_t mm11 = _mm_extract_epi64(xmm0, 0);
+		uint64_t mm12 = _mm_extract_epi64(xmm0, 1);
+
+		uint64_t mm21 = _mm_extract_epi64(xmm1, 0);
+		uint64_t mm22 = _mm_extract_epi64(xmm1, 1);
+
+		if (mm11 == mm21)
+			return rte_cmpffdb((const uint8_t *)&mm12,
+					(const uint8_t *)&mm22, 8);
+		else
+			return rte_cmpffdb((const uint8_t *)&mm11,
+					(const uint8_t *)&mm21, 8);
+	}
+
+	return 0;
+}
+
+/**
+ * Compare 0 to 15 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_memcmp_regular(const uint8_t *src_1u, const uint8_t *src_2u, size_t n)
+{
+	int ret = 1;
+
+	/**
+	 * Compare less than 16 bytes
+	 */
+	if (n & 0x08) {
+		ret = (*(const uint64_t *)src_1u ==
+				*(const uint64_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_8;
+
+		n -= 0x8;
+		src_1u += 0x8;
+		src_2u += 0x8;
+	}
+
+	if (n & 0x04) {
+		ret = (*(const uint32_t *)src_1u ==
+				*(const uint32_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_4;
+
+		n -= 0x4;
+		src_1u += 0x4;
+		src_2u += 0x4;
+	}
+
+	if (n & 0x02) {
+		ret = (*(const uint16_t *)src_1u ==
+				*(const uint16_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_2;
+
+		n -= 0x2;
+		src_1u += 0x2;
+		src_2u += 0x2;
+	}
+
+	if (n & 0x01) {
+		ret = (*(const uint8_t *)src_1u ==
+				*(const uint8_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_1;
+
+		n -= 0x1;
+		src_1u += 0x1;
+		src_2u += 0x1;
+	}
+
+	return !ret;
+
+exit_8:
+	return rte_cmpffdb(src_1u, src_2u, 8);
+exit_4:
+	return rte_cmpffdb(src_1u, src_2u, 4);
+exit_2:
+	return rte_cmpffdb(src_1u, src_2u, 2);
+exit_1:
+	return rte_cmpffdb(src_1u, src_2u, 1);
+}
+
+/**
+ * AVX2 implementation below
+ */
+#ifdef RTE_MACHINE_CPUFLAG_AVX2
+
+/**
+ * Compare 32 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2)
+{
+	const __m128i* src1 = (const __m128i*)src_1;
+	const __m128i* src2 = (const __m128i*)src_2;
+	const uint8_t *s1, *s2;
+
+	__m128i mm11 = _mm_lddqu_si128(src1);
+	__m128i mm12 = _mm_lddqu_si128(src1 + 1);
+	__m128i mm21 = _mm_lddqu_si128(src2);
+	__m128i mm22 = _mm_lddqu_si128(src2 + 1);
+
+	__m128i mm1 = _mm_xor_si128(mm11, mm21);
+	__m128i mm2 = _mm_xor_si128(mm12, mm22);
+	__m128i mm = _mm_or_si128(mm1, mm2);
+
+	if (unlikely(!_mm_testz_si128(mm, mm))) {
+
+		/*
+		 * Find out which of the two 16-byte blocks
+		 * are different.
+		 */
+		if (_mm_testz_si128(mm1, mm1)) {
+			mm11 = mm12;
+			mm21 = mm22;
+			mm1 = mm2;
+			s1 = (const uint8_t *)(src1 + 1);
+			s2 = (const uint8_t *)(src2 + 1);
+		} else {
+			s1 = (const uint8_t *)src1;
+			s2 = (const uint8_t *)src2;
+		}
+
+		/* Produce the comparison result */
+		__m128i mm_cmp = _mm_cmpgt_epi8(mm11, mm21);
+		__m128i mm_rcmp = _mm_cmpgt_epi8(mm21, mm11);
+		mm_cmp = _mm_xor_si128(mm1, mm_cmp);
+		mm_rcmp = _mm_xor_si128(mm1, mm_rcmp);
+
+		uint32_t cmp = _mm_movemask_epi8(mm_cmp);
+		uint32_t rcmp = _mm_movemask_epi8(mm_rcmp);
+
+		int cmp_b = rte_cmpffd(cmp, rcmp);
+
+		int ret = (cmp_b == -1) ? 0 : (s1[cmp_b] - s2[cmp_b]);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * Compare 48 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp32((const uint8_t *)src_1 + 0 * 32,
+			(const uint8_t *)src_2 + 0 * 32);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 32,
+			(const uint8_t *)src_2 + 1 * 32);
+	return ret;
+}
+
+/**
+ * Compare 64 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp64 (const void* src_1, const void* src_2)
+{
+	const __m256i* src1 = (const __m256i*)src_1;
+	const __m256i* src2 = (const __m256i*)src_2;
+	const uint8_t *s1, *s2;
+
+	__m256i mm11 = _mm256_lddqu_si256(src1);
+	__m256i mm12 = _mm256_lddqu_si256(src1 + 1);
+	__m256i mm21 = _mm256_lddqu_si256(src2);
+	__m256i mm22 = _mm256_lddqu_si256(src2 + 1);
+
+	__m256i mm1 = _mm256_xor_si256(mm11, mm21);
+	__m256i mm2 = _mm256_xor_si256(mm12, mm22);
+	__m256i mm = _mm256_or_si256(mm1, mm2);
+
+	if (unlikely(!_mm256_testz_si256(mm, mm))) {
+		/*
+		 * Find out which of the two 32-byte blocks
+		 * are different.
+		 */
+		if (_mm256_testz_si256(mm1, mm1)) {
+			mm11 = mm12;
+			mm21 = mm22;
+			mm1 = mm2;
+			s1 = (const uint8_t *)(src1 + 1);
+			s2 = (const uint8_t *)(src2 + 1);
+		} else {
+			s1 = (const uint8_t *)src1;
+			s2 = (const uint8_t *)src2;
+		}
+
+		/* Produce the comparison result */
+		__m256i mm_cmp = _mm256_cmpgt_epi8(mm11, mm21);
+		__m256i mm_rcmp = _mm256_cmpgt_epi8(mm21, mm11);
+		mm_cmp = _mm256_xor_si256(mm1, mm_cmp);
+		mm_rcmp = _mm256_xor_si256(mm1, mm_rcmp);
+
+		uint32_t cmp = _mm256_movemask_epi8(mm_cmp);
+		uint32_t rcmp = _mm256_movemask_epi8(mm_rcmp);
+
+		int cmp_b = rte_cmpffd(cmp, rcmp);
+
+		int ret = (cmp_b == -1) ? 0 : (s1[cmp_b] - s2[cmp_b]);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * Compare 128 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
+			(const uint8_t *)src_2 + 0 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp64((const uint8_t *)src_1 + 1 * 64,
+			(const uint8_t *)src_2 + 1 * 64);
+}
+
+/**
+ * Compare 256 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
+			(const uint8_t *)src_2 + 0 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 1 * 64,
+			(const uint8_t *)src_2 + 1 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 2 * 64,
+			(const uint8_t *)src_2 + 2 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp64((const uint8_t *)src_1 + 3 * 64,
+			(const uint8_t *)src_2 + 3 * 64);
+}
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equals src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
+{
+	const uint8_t *src_1 = (const uint8_t *)_src_1;
+	const uint8_t *src_2 = (const uint8_t *)_src_2;
+	int ret = 0;
+
+	if (n < 16)
+		return rte_memcmp_regular(src_1, src_2, n);
+
+	if (n <= 32) {
+		ret = rte_cmp16(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 48) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 64) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 32, src_2 + 32);
+
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 96) {
+		ret = rte_cmp64(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 64, src_2 + 64);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 128) {
+		ret = rte_cmp64(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp32(src_1 + 64, src_2 + 64);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 96, src_2 + 96);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+CMP_BLOCK_LESS_THAN_512:
+	if (n <= 512) {
+		if (n >= 256) {
+			ret = rte_cmp256(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 256;
+			src_2 = src_2 + 256;
+			n -= 256;
+		}
+		if (n >= 128) {
+			ret = rte_cmp128(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 128;
+			src_2 = src_2 + 128;
+			n -= 128;
+		}
+		if (n >= 64) {
+			n -= 64;
+			ret = rte_cmp64(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 64;
+			src_2 = src_2 + 64;
+		}
+		if (n > 32) {
+			ret = rte_cmp32(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			ret = rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+			return ret;
+		}
+		if (n > 0)
+			ret = rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+
+		return ret;
+	}
+
+	while (n > 512) {
+		ret = rte_cmp256(src_1 + 0 * 256, src_2 + 0 * 256);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp256(src_1 + 1 * 256, src_2 + 1 * 256);
+		if (unlikely(ret != 0))
+			return ret;
+
+		src_1 = src_1 + 512;
+		src_2 = src_2 + 512;
+		n -= 512;
+	}
+	goto CMP_BLOCK_LESS_THAN_512;
+}
+
+#else /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+/**
+ * Compare 32 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+}
+
+/**
+ * Compare 48 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+}
+
+/**
+ * Compare 64 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp64(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 3 * 16,
+			(const uint8_t *)src_2 + 3 * 16);
+}
+
+/**
+ * Compare 128 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 3 * 16,
+			(const uint8_t *)src_2 + 3 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 4 * 16,
+			(const uint8_t *)src_2 + 4 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 5 * 16,
+			(const uint8_t *)src_2 + 5 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 6 * 16,
+			(const uint8_t *)src_2 + 6 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 7 * 16,
+			(const uint8_t *)src_2 + 7 * 16);
+}
+
+/**
+ * Compare 256 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 3 * 16,
+			(const uint8_t *)src_2 + 3 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 4 * 16,
+			(const uint8_t *)src_2 + 4 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 5 * 16,
+			(const uint8_t *)src_2 + 5 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 6 * 16,
+			(const uint8_t *)src_2 + 6 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 7 * 16,
+			(const uint8_t *)src_2 + 7 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 8 * 16,
+			(const uint8_t *)src_2 + 8 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 9 * 16,
+			(const uint8_t *)src_2 + 9 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 10 * 16,
+			(const uint8_t *)src_2 + 10 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 11 * 16,
+			(const uint8_t *)src_2 + 11 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 12 * 16,
+			(const uint8_t *)src_2 + 12 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 13 * 16,
+			(const uint8_t *)src_2 + 13 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 14 * 16,
+			(const uint8_t *)src_2 + 14 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 15 * 16,
+			(const uint8_t *)src_2 + 15 * 16);
+}
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equals src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
+{
+	const uint8_t *src_1 = (const uint8_t *)_src_1;
+	const uint8_t *src_2 = (const uint8_t *)_src_2;
+	int ret = 0;
+
+	if (n < 16)
+		return rte_memcmp_regular(src_1, src_2, n);
+
+	if (n <= 32) {
+		ret = rte_cmp16(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 48) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 64) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 32, src_2 + 32);
+
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 96) {
+		ret = rte_cmp64(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 64, src_2 + 64);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
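+	/*
+	 * 96 < n <= 128: reuse the shared residual-block handling below
+	 * rather than duplicating it here.
+	 */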
+	if (n <= 128)
+		goto CMP_BLOCK_LESS_THAN_128;
+
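+	/*
+	 * 128 < n <= 512: peel off at most one 256-byte and one 128-byte
+	 * block, then finish with 64/32/16-byte compares. The labels also
+	 * serve as the tail of the large-n loop at the end of the function.
+	 */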
+	if (n <= 512) {
+		if (n >= 256) {
+			ret = rte_cmp256(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+
+			src_1 = src_1 + 256;
+			src_2 = src_2 + 256;
+			n -= 256;
+		}
+
+CMP_BLOCK_LESS_THAN_256:
+		if (n >= 128) {
+			ret = rte_cmp128(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+
+			src_1 = src_1 + 128;
+			src_2 = src_2 + 128;
+			n -= 128;
+		}
+
+CMP_BLOCK_LESS_THAN_128:
+		if (n >= 64) {
+			ret = rte_cmp64(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+
+			src_1 = src_1 + 64;
+			src_2 = src_2 + 64;
+			n -= 64;
+		}
+
+		if (n >= 32) {
+			ret = rte_cmp32(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 32;
+			src_2 = src_2 + 32;
+			n -= 32;
+		}
+		if (n > 16) {
+			ret = rte_cmp16(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			ret = rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+			return ret;
+		}
+		if (n > 0)
+			ret = rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+
+		return ret;
+	}
+
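+	/* n > 512: compare 256-byte blocks, then let the residual-block
+	 * handling above deal with whatever remains. */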
+	for (; n >= 256; n -= 256) {
+		ret = rte_cmp256(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		src_1 = src_1 + 256;
+		src_2 = src_2 + 256;
+	}
+
+	goto CMP_BLOCK_LESS_THAN_256;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCMP_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_memcmp.h b/lib/librte_eal/common/include/generic/rte_memcmp.h
new file mode 100644
index 0000000..5e68036
--- /dev/null
+++ b/lib/librte_eal/common/include/generic/rte_memcmp.h
@@ -0,0 +1,175 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCMP_H_
+#define _RTE_MEMCMP_H_
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memcmp().
+ */
+
+/**
+ * Find the first bit that differs between two words.
+ */
+static inline int
+rte_cmpffd(uint32_t x, uint32_t y);
+
+/**
+ * Find the first byte that differs between two locations.
+ */
+static inline int
+rte_cmpffdb(const uint8_t *x, const uint8_t *y, size_t n);
+
+/**
+ * Compare 16 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp16(const void *src_1, const void *src_2);
+
+/**
+ * Compare 32 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2);
+
+/**
+ * Compare 64 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp64(const void *src_1, const void *src_2);
+
+/**
+ * Compare 48 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2);
+
+/**
+ * Compare 128 bytes between two locations using
+ * optimised instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2);
+
+/**
+ * Compare 256 bytes between two locations using
+ * optimised instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2);
+
+#ifdef __DOXYGEN__
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so its address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static int
+rte_memcmp(const void *src_1, const void *src_2, size_t n);
+
+#endif /* __DOXYGEN__ */
+
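+/*
+ * Illustrative usage sketch (not part of the patch itself; flow_key is a
+ * placeholder type): rte_memcmp() follows memcmp() semantics, so a
+ * fixed-size key comparison reads:
+ *
+ *	static int
+ *	keys_equal(const struct flow_key *a, const struct flow_key *b)
+ *	{
+ *		return rte_memcmp(a, b, sizeof(*a)) == 0;
+ *	}
+ */
+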
+/*
+ * memcmp() function used by rte_memcmp macro
+ */
+static inline int
+rte_memcmp_func(const void *src_1, const void *src_2, size_t n) __attribute__((always_inline));
+
+#endif /* _RTE_MEMCMP_H_ */
diff --git a/lib/librte_hash/rte_hash.c b/lib/librte_hash/rte_hash.c
index 9245716..075da62 100644
--- a/lib/librte_hash/rte_hash.c
+++ b/lib/librte_hash/rte_hash.c
@@ -42,6 +42,7 @@ 
 #include <rte_memory.h>         /* for definition of RTE_CACHE_LINE_SIZE */
 #include <rte_log.h>
 #include <rte_memcpy.h>
+#include <rte_memcmp.h>
 #include <rte_prefetch.h>
 #include <rte_branch_prediction.h>
 #include <rte_memzone.h>
@@ -299,6 +300,7 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h,
 	uint8_t *key_bucket;
 	uint32_t bucket_index, i;
 	int32_t pos;
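+	/* The volatile indirection below presumably keeps the compiler from
+	 * replacing the rte_memcmp() call with a builtin memcmp expansion;
+	 * this is an inference, the patch itself does not state the intent. */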
+	const void * volatile key_1 = key;
 
 	/* Get the hash signature and bucket index */
 	sig |= h->sig_msb;
@@ -308,10 +310,13 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h,
 
 	/* Check if key is already present in the hash */
 	for (i = 0; i < h->bucket_entries; i++) {
-		if ((sig == sig_bucket[i]) &&
-		    likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-				  h->key_len) == 0)) {
-			return bucket_index * h->bucket_entries + i;
+		if (sig == sig_bucket[i]) {
+
+			const void * volatile key_2 =
+				get_key_from_bucket(h, key_bucket, i);
+
+			if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0))
+				return bucket_index * h->bucket_entries + i;
 		}
 	}
 
@@ -350,6 +355,8 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h,
 	uint8_t *key_bucket;
 	uint32_t bucket_index, i;
 
+	const void * volatile key_1 = key;
+
 	/* Get the hash signature and bucket index */
 	sig = sig | h->sig_msb;
 	bucket_index = sig & h->bucket_bitmask;
@@ -358,11 +365,14 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h,
 
 	/* Check if key is already present in the hash */
 	for (i = 0; i < h->bucket_entries; i++) {
-		if ((sig == sig_bucket[i]) &&
-		    likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-				  h->key_len) == 0)) {
-			sig_bucket[i] = NULL_SIGNATURE;
-			return bucket_index * h->bucket_entries + i;
+		if (sig == sig_bucket[i]) {
+			const void * volatile key_2 =
+				get_key_from_bucket(h, key_bucket, i);
+
+			if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0)) {
+				sig_bucket[i] = NULL_SIGNATURE;
+				return bucket_index * h->bucket_entries + i;
+			}
 		}
 	}
 
@@ -392,6 +402,8 @@  __rte_hash_lookup_with_hash(const struct rte_hash *h,
 	uint8_t *key_bucket;
 	uint32_t bucket_index, i;
 
+	const void * volatile key_1 = key;
+
 	/* Get the hash signature and bucket index */
 	sig |= h->sig_msb;
 	bucket_index = sig & h->bucket_bitmask;
@@ -400,10 +412,13 @@  __rte_hash_lookup_with_hash(const struct rte_hash *h,
 
 	/* Check if key is already present in the hash */
 	for (i = 0; i < h->bucket_entries; i++) {
-		if ((sig == sig_bucket[i]) &&
-		    likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-				  h->key_len) == 0)) {
-			return bucket_index * h->bucket_entries + i;
+		if (sig == sig_bucket[i]) {
+
+			const void * volatile key_2 =
+				get_key_from_bucket(h, key_bucket, i);
+
+			if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0))
+				return bucket_index * h->bucket_entries + i;
 		}
 	}
 
@@ -456,13 +471,17 @@  rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
 		positions[i] = -ENOENT;
 
 		for (j = 0; j < h->bucket_entries; j++) {
-			if ((sigs[i] == sig_bucket[j]) &&
-			    likely(memcmp(keys[i],
-					  get_key_from_bucket(h, key_bucket, j),
-					  h->key_len) == 0)) {
-				positions[i] = bucket_index *
-					h->bucket_entries + j;
-				break;
+			if (sigs[i] == sig_bucket[j]) {
+
+				const void * volatile key_1 = keys[i];
+				const void * volatile key_2 =
+					get_key_from_bucket(h, key_bucket, j);
+				if (likely(rte_memcmp(key_1, key_2,
+							h->key_len) == 0)) {
+					positions[i] = bucket_index *
+							h->bucket_entries + j;
+					break;
+				}
 			}
 		}
 	}