mbuf: replace C memcpy() calls with optimized rte_memcpy()
Commit Message
Since rte_memcpy() is more optimized, it should be used instead of memcpy()
Signed-off-by: Sarosh Arif <sarosh.arif@emumba.com>
---
lib/librte_mbuf/rte_mbuf.c | 2 +-
lib/librte_mbuf/rte_mbuf.h | 3 ++-
lib/librte_mbuf/rte_mbuf_dyn.c | 8 +++++---
3 files changed, 8 insertions(+), 5 deletions(-)
Comments
On Thu, 23 Jul 2020 12:02:40 +0500
Sarosh Arif <sarosh.arif@emumba.com> wrote:
> Since rte_memcpy() is more optimized, it should be used instead of memcpy()
>
> Signed-off-by: Sarosh Arif <sarosh.arif@emumba.com>
Did you really measure this?
For fixed-size structures, the compiler can inline memcpy() into a small set of instructions.
Hello,
The following things made me think that rte_memcpy() is more optimized
than memcpy():
1. The DPDK documentation recommends using rte_memcpy() instead of memcpy():
https://doc.dpdk.org/guides/prog_guide/writing_efficient_code.html
2. Here some benchmarks are available:
https://software.intel.com/content/www/us/en/develop/articles/performance-optimization-of-memcpy-in-dpdk.html
3. rte_memcpy() has __attribute__((always_inline)) associated with it,
so the compiler also tries to inline it.
Using rte_memcpy() everywhere ensures consistency in the code base.
Here are the results of the performance measurements using "perf":
rte_memcpy()

Performance counter stats:

       1.573864      task-clock (msec)         #    0.898 CPUs utilized
              0      context-switches          #    0.000 K/sec
              0      cpu-migrations            #    0.000 K/sec
            342      page-faults               #    0.217 M/sec
      5,483,016      cycles                    #    3.484 GHz
      5,554,017      instructions              #    1.01  insn per cycle
      1,114,593      branches                  #  708.189 M/sec
         33,796      branch-misses             #    3.03% of all branches
      1,369,247      L1-dcache-loads           #  869.991 M/sec
  <not counted>      L1-dcache-load-misses                                  (0.00%)
  <not counted>      LLC-loads                                              (0.00%)
  <not counted>      LLC-load-misses                                        (0.00%)

    0.001753373 seconds time elapsed

memcpy()

Performance counter stats:

       1.631135      task-clock (msec)         #    0.902 CPUs utilized
              0      context-switches          #    0.000 K/sec
              0      cpu-migrations            #    0.000 K/sec
            342      page-faults               #    0.210 M/sec
      5,676,549      cycles                    #    3.480 GHz               (73.99%)
      5,739,593      instructions              #    1.01  insn per cycle
      1,141,121      branches                  #  699.587 M/sec
         34,553      branch-misses             #    3.03% of all branches
      1,417,494      L1-dcache-loads           #  869.023 M/sec
         67,312      L1-dcache-load-misses     #    4.75% of all L1-dcache hits  (26.01%)
  <not counted>      LLC-loads                                              (0.00%)
  <not counted>      LLC-load-misses                                        (0.00%)

    0.001808500 seconds time elapsed
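The exact benchmark source was not posted in this thread; a minimal harness along
these lines, run twice under perf stat with one line swapped between rte_memcpy()
and memcpy(), is the kind of test that produces numbers like those above (the file
name, struct name, iteration count, and build flags here are all assumptions):

/* bench.c -- a sketch, not the actual benchmark from this thread.
 * Copies a struct shaped like rte_mbuf_dynfield in a tight loop.
 * Assumed build: gcc -O3 bench.c -o bench $(pkg-config --cflags --libs libdpdk)
 * Assumed run:   perf stat -d ./bench
 */
#include <stddef.h>
#include <rte_memcpy.h>

struct params {
	char name[64];
	size_t size;
	size_t align;
	unsigned int flags;
};

int main(void)
{
	static struct params src, dst;

	for (long i = 0; i < 1000000; i++) {
		/* swap in plain memcpy() here for the libc run */
		rte_memcpy(&dst, &src, sizeof(dst));
		__asm__ volatile("" : "+m"(dst)); /* keep the copy from being optimized out */
	}
	return (int)dst.name[0];
}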
On Thu, Jul 23, 2020 at 8:47 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> On Thu, 23 Jul 2020 12:02:40 +0500
> Sarosh Arif <sarosh.arif@emumba.com> wrote:
>
> > Since rte_memcpy() is more optimized, it should be used instead of memcpy()
> >
> > Signed-off-by: Sarosh Arif <sarosh.arif@emumba.com>
>
> Did you really measure this?
> For fixed-size structures, the compiler can inline memcpy() into a small set of instructions.
Hi Sarosh,
On Tue, Jul 28, 2020 at 06:30:46PM +0500, Sarosh Arif wrote:
> Hello,
> The following things made me think that rte_memcpy() is more optimized
> than memcpy():
> 1. The DPDK documentation recommends using rte_memcpy() instead of memcpy():
> https://doc.dpdk.org/guides/prog_guide/writing_efficient_code.html
> 2. Here some benchmarks are available:
> https://software.intel.com/content/www/us/en/develop/articles/performance-optimization-of-memcpy-in-dpdk.html
> 3. rte_memcpy() has __attribute__((always_inline)) associated with it,
> so the compiler also tries to inline it.
>
> Using rte_memcpy() everywhere ensures consistency in the code base.
> Here are the results of the performance measurements using "perf":
>
> [...]
>
Can you give more details about your use case? I mean, what code
are you running for this benchmark?
I tend to agree with Stephen: a memcpy() with a constant (small) size
is directly replaced by the compiler with optimal code for the
architecture. rte_memcpy() uses vector instructions and is probably
better than libc's memcpy() for larger copies.
Thanks,
Olivier
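Stephen's point is easy to see with a toy example: when the size is a
compile-time constant, the call below disappears entirely (a sketch; the
struct and function names are made up):

#include <string.h>
#include <stdint.h>

struct small {
	uint64_t a;
	uint64_t b;
};

/* With a constant 16-byte size, GCC and Clang at -O2 expand this into
 * two 8-byte moves (or one 16-byte vector move) -- no memcpy() call is
 * emitted, so there is no function-call overhead for rte_memcpy() to save. */
void
copy_small(struct small *dst, const struct small *src)
{
	memcpy(dst, src, sizeof(*dst));
}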
On Thu, 23 Jul 2020 12:02:40 +0500
Sarosh Arif <sarosh.arif@emumba.com> wrote:
> Since rte_memcpy() is more optimized, it should be used instead of memcpy()
>
> Signed-off-by: Sarosh Arif <sarosh.arif@emumba.com>
The part in rte_pktmbuf_pool_init() is not performance critical.
The layout of rte_mbuf_dynfield is suboptimal:
struct rte_mbuf_dynfield {
	char               name[64];       /*     0    64 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	size_t             size;           /*    64     8 */
	size_t             align;          /*    72     8 */
	unsigned int       flags;          /*    80     4 */

	/* size: 88, cachelines: 2, members: 4 */
	/* padding: 4 */
	/* last cacheline: 24 bytes */
};
1. It should have been sized so that overall it was 64 bytes.
2. Using 8 bytes each for size and align is wasteful.
3. Holding 4 bytes for future flags is also wasteful. YAGNI.
If you look at the assembly output on x86, the copy of params becomes a
sequence of vmovups instructions with GCC.
For 20.11 maybe:
diff --git a/lib/librte_mbuf/rte_mbuf_dyn.h b/lib/librte_mbuf/rte_mbuf_dyn.h
index 8407230ecfdc..eb1d01f97f40 100644
--- a/lib/librte_mbuf/rte_mbuf_dyn.h
+++ b/lib/librte_mbuf/rte_mbuf_dyn.h
@@ -70,16 +70,16 @@
/**
* Maximum length of the dynamic field or flag string.
*/
-#define RTE_MBUF_DYN_NAMESIZE 64
+#define RTE_MBUF_DYN_NAMESIZE 60
/**
* Structure describing the parameters of a mbuf dynamic field.
*/
struct rte_mbuf_dynfield {
char name[RTE_MBUF_DYN_NAMESIZE]; /**< Name of the field. */
- size_t size; /**< The number of bytes to reserve. */
- size_t align; /**< The alignment constraint (power of 2). */
- unsigned int flags; /**< Reserved for future use, must be 0. */
+ uint8_t size; /**< The number of bytes to reserve. */
+ uint8_t align; /**< The alignment constraint (power of 2). */
+ uint16_t flags; /**< Reserved for future use, must be 0. */
};
/**
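If something like the repacked layout above goes in for 20.11, a compile-time
check would keep it pinned to one cache line (an illustrative sketch, not part
of the diff):

#include <stdint.h>

#define RTE_MBUF_DYN_NAMESIZE 60

struct rte_mbuf_dynfield {
	char name[RTE_MBUF_DYN_NAMESIZE];	/* 60 bytes */
	uint8_t size;				/*  1 byte  */
	uint8_t align;				/*  1 byte  */
	uint16_t flags;				/*  2 bytes */
};

/* 60 + 1 + 1 + 2 = 64: exactly one cache line, no padding */
_Static_assert(sizeof(struct rte_mbuf_dynfield) == 64,
	"rte_mbuf_dynfield should occupy exactly one cache line");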
Or make the dynamic field structure dynamically sized, to avoid wasting space?
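One possible reading of that, as a hypothetical sketch (this structure is not
proposed anywhere in the thread):

#include <stdint.h>

struct rte_mbuf_dynfield_var {
	uint8_t size;
	uint8_t align;
	uint16_t flags;
	char name[];	/* flexible array member, sized to the actual name */
};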
Patch

diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -66,7 +66,7 @@ rte_pktmbuf_pool_init(struct rte_mempool *mp, void *opaque_arg)
~RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) == 0);
mbp_priv = rte_mempool_get_priv(mp);
- memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv));
+ rte_memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv));
}
/*
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -42,6 +42,7 @@
#include <rte_byteorder.h>
#include <rte_mbuf_ptype.h>
#include <rte_mbuf_core.h>
+#include <rte_memcpy.h>
#ifdef __cplusplus
extern "C" {
@@ -1109,7 +1110,7 @@ rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
static inline void
rte_mbuf_dynfield_copy(struct rte_mbuf *mdst, const struct rte_mbuf *msrc)
{
- memcpy(&mdst->dynfield1, msrc->dynfield1, sizeof(mdst->dynfield1));
+ rte_memcpy(&mdst->dynfield1, msrc->dynfield1, sizeof(mdst->dynfield1));
}
/* internal */
diff --git a/lib/librte_mbuf/rte_mbuf_dyn.c b/lib/librte_mbuf/rte_mbuf_dyn.c
--- a/lib/librte_mbuf/rte_mbuf_dyn.c
+++ b/lib/librte_mbuf/rte_mbuf_dyn.c
@@ -15,6 +15,7 @@
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <rte_mbuf_dyn.h>
+#include <rte_memcpy.h>
#define RTE_MBUF_DYN_MZNAME "rte_mbuf_dyn"
@@ -200,7 +201,7 @@ rte_mbuf_dynfield_lookup(const char *name, struct rte_mbuf_dynfield *params)
}
if (params != NULL)
- memcpy(params, &mbuf_dynfield->params, sizeof(*params));
+ rte_memcpy(params, &mbuf_dynfield->params, sizeof(*params));
return mbuf_dynfield->offset;
}
@@ -303,7 +304,8 @@ __rte_mbuf_dynfield_register_offset(const struct rte_mbuf_dynfield *params,
rte_free(te);
return -1;
}
- memcpy(&mbuf_dynfield->params, params, sizeof(mbuf_dynfield->params));
+ rte_memcpy(&mbuf_dynfield->params, params,
+ sizeof(mbuf_dynfield->params));
mbuf_dynfield->offset = offset;
te->data = mbuf_dynfield;
@@ -399,7 +401,7 @@ rte_mbuf_dynflag_lookup(const char *name,
}
if (params != NULL)
- memcpy(params, &mbuf_dynflag->params, sizeof(*params));
+ rte_memcpy(params, &mbuf_dynflag->params, sizeof(*params));
return mbuf_dynflag->bitnum;
}