[v4,01/10] build: add an option to enable LTO build

Message ID 20191022115412.8837-2-aostruszka@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series Add an option to use LTO for DPDK build |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Andrzej Ostruszka [C] Oct. 22, 2019, 11:54 a.m. UTC
  This patch adds an option to enable link time optimization.  In addition
to LTO option itself (-flto) fat-lto-objects are being used.  This is
because during the build pmdinfogen scans the generated ELF objects to
find this_pmd_name* symbol in symbol table.  Without fat-lto-objects gcc
produces ELF only with extra symbols for internal use during linking.

Signed-off-by: Andrzej Ostruszka <aostruszka@marvell.com>
---
 .travis.yml                                  |  7 ++++
 config/common_base                           |  5 +++
 config/meson.build                           | 13 +++++++
 doc/guides/prog_guide/lto.rst                | 36 ++++++++++++++++++++
 doc/guides/rel_notes/release_19_11.rst       |  8 +++++
 lib/librte_distributor/rte_distributor.c     | 18 +++++-----
 lib/librte_distributor/rte_distributor_v20.c | 18 +++++-----
 lib/librte_lpm/rte_lpm.c                     | 28 +++++++--------
 lib/librte_lpm/rte_lpm6.c                    | 16 ++++-----
 lib/librte_timer/rte_timer.c                 | 20 +++++------
 mk/toolchain/gcc/rte.toolchain-compat.mk     |  4 +++
 mk/toolchain/gcc/rte.vars.mk                 | 12 +++++++
 mk/toolchain/icc/rte.vars.mk                 |  8 +++++
 13 files changed, 143 insertions(+), 50 deletions(-)
 create mode 100644 doc/guides/prog_guide/lto.rst
  

Comments

Bruce Richardson Oct. 22, 2019, 12:45 p.m. UTC | #1
On Tue, Oct 22, 2019 at 01:54:03PM +0200, Andrzej Ostruszka wrote:
> This patch adds an option to enable link time optimization.  In addition
> to LTO option itself (-flto) fat-lto-objects are being used.  This is
> because during the build pmdinfogen scans the generated ELF objects to
> find this_pmd_name* symbol in symbol table.  Without fat-lto-objects gcc
> produces ELF only with extra symbols for internal use during linking.
> 
> Signed-off-by: Andrzej Ostruszka <aostruszka@marvell.com>
> ---
>  .travis.yml                                  |  7 ++++
>  config/common_base                           |  5 +++
>  config/meson.build                           | 13 +++++++
>  doc/guides/prog_guide/lto.rst                | 36 ++++++++++++++++++++
>  doc/guides/rel_notes/release_19_11.rst       |  8 +++++
>  lib/librte_distributor/rte_distributor.c     | 18 +++++-----
>  lib/librte_distributor/rte_distributor_v20.c | 18 +++++-----
>  lib/librte_lpm/rte_lpm.c                     | 28 +++++++--------
>  lib/librte_lpm/rte_lpm6.c                    | 16 ++++-----
>  lib/librte_timer/rte_timer.c                 | 20 +++++------
>  mk/toolchain/gcc/rte.toolchain-compat.mk     |  4 +++
>  mk/toolchain/gcc/rte.vars.mk                 | 12 +++++++
>  mk/toolchain/icc/rte.vars.mk                 |  8 +++++
>  13 files changed, 143 insertions(+), 50 deletions(-)
>  create mode 100644 doc/guides/prog_guide/lto.rst
> 
> diff --git a/.travis.yml b/.travis.yml
> index 781f9f666..70d221852 100644
> --- a/.travis.yml
> +++ b/.travis.yml
> @@ -31,6 +31,7 @@ env:
>    - DEF_LIB="static" OPTS="-Denable_kmods=false"
>    - DEF_LIB="shared" OPTS="-Denable_kmods=false"
>    - DEF_LIB="shared" RUN_TESTS=1 BUILD_DOCS=1
> +  - DEF_LIB="shared" OPTS="-Db_lto=true"
>  
>  matrix:
>    include:
> @@ -100,6 +101,12 @@ matrix:
>        apt:
>          packages:
>            - *extra_packages
> +  - env: DEF_LIB="shared" OPTS="-Db_lto=true" EXTRA_PACKAGES=1
> +    compiler: gcc
> +    addons:
> +      apt:
> +        packages:
> +          - *extra_packages
>  
>  
>  script: ./.ci/${TRAVIS_OS_NAME}-build.sh
> diff --git a/config/common_base b/config/common_base
> index 8ef75c203..73a55fdec 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -49,6 +49,11 @@ CONFIG_RTE_FORCE_INTRINSICS=n
>  #
>  CONFIG_RTE_ARCH_STRICT_ALIGN=n
>  
> +#
> +# Enable link time optimization
> +#
> +CONFIG_RTE_ENABLE_LTO=n
> +
>  #
>  # Compile to share library
>  #
> diff --git a/config/meson.build b/config/meson.build
> index 2bafea530..1a5093118 100644
> --- a/config/meson.build
> +++ b/config/meson.build
> @@ -196,3 +196,16 @@ add_project_arguments('-D_GNU_SOURCE', language: 'c')
>  if is_freebsd
>  	add_project_arguments('-D__BSD_VISIBLE', language: 'c')
>  endif
> +
> +if get_option('b_lto')
> +	if cc.has_argument('-ffat-lto-objects')
> +		add_project_arguments('-ffat-lto-objects', language: 'c')
> +	else
> +		error('compiler does not support fat LTO objects - please turn LTO off')
> +	endif
> +	# workaround for gcc bug 81440
> +	if cc.get_id() == 'gcc' and cc.version().version_compare('<8.0')
> +		add_project_arguments('-Wno-lto-type-mismatch', language: 'c')
> +		add_project_link_arguments('-Wno-lto-type-mismatch', language: 'c')
> +	endif
> +endif

This LGTM

> diff --git a/doc/guides/prog_guide/lto.rst b/doc/guides/prog_guide/lto.rst
> new file mode 100644
> index 000000000..b6daabc86
> --- /dev/null
<snip>
> diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
> index ea40a11c0..ad4fad83c 100644
> --- a/mk/toolchain/gcc/rte.toolchain-compat.mk
> +++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
> @@ -88,6 +88,10 @@ else
>  		MACHINE_CFLAGS := $(filter-out -march% -mtune% -msse%,$(MACHINE_CFLAGS))
>  	endif
>  
> +	ifeq ($(shell test $(GCC_VERSION) -lt 45 && echo 1), 1)
> +		CONFIG_RTE_ENABLE_LTO=n
> +	endif
> +

I don't think we support GCC versions that far back any more, but this is
fairly harmless, so probably ok to keep.

Acked-by: Bruce Richardson <bruce.richarson@intel.com>
  
Thomas Monjalon Oct. 27, 2019, 11:47 a.m. UTC | #2
Adding John and Marko for doc review.

22/10/2019 13:54, Andrzej Ostruszka:
> --- /dev/null
> +++ b/doc/guides/prog_guide/lto.rst

This new file is not included in the prog guide index.

> @@ -0,0 +1,36 @@

You are missing the mandatory SPDX and Copyright header.

> +Link Time Optimization
> +======================
> +
> +The DPDK framework supports compilation with link time optimization

Instead of "The DPDK framework", I prefer just "DPDK".

> +turned on.  This depends obviously on the capabilities of the compiler

Not sure what you are talking about.
If it is about fat objects, it is not obvious.

> +to do "whole program" optimization at link time and is available only
> +for compilers that support that feature (gcc and icc).  To be more

Please start a new sentence at the beginning of a line in the RsT file.
It is easier to review and update.

> +specific compiler have to support creation of ELF objects containing

A comma is missing after "specific".

> +both normal code and internal representation (fat-lto-objects).  This is
> +required since during build some code is generated by parsing produced
> +ELF objects (pmdinfogen).
> +
> +The amount of performance gain that one can get from LTO depends on the
> +compiler and the code that is being compiled.  However LTO is also
> +useful for additional code analysis done by the compiler.  In particular
> +due to interprocedural analysis compiler can produce additional warnings
> +about variables that might be used uninitialized.  Some of these
> +warnings might be "false positives" though and you might need to
> +explicitly initialize variable in order to silence the compiler.

Any word about build speed degradation?

> +
> +Link time optimization can be enabled for whole DPDK framework by
> +setting:
> +
> +.. code-block:: console
> +    CONFIG_ENABLE_LTO=y
> +
> +in config file for the case of make based build and by:
> +
> +.. code-block:: console
> +    meson build -Db_lto=true
> +    ninja -C build

No need to add the ninja step here.

> +for the case of meson based build.

Better to describe the case (make or meson) before the code-block.

> +Please note that turning LTO on causes considerable extension of
> +compilation time.

"compilation time" is not accurate.
When referring to compilation + linking, it is better to use the words
"build time".


> --- a/doc/guides/rel_notes/release_19_11.rst
> +++ b/doc/guides/rel_notes/release_19_11.rst
> @@ -56,6 +56,14 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =========================================================
>  
> +**Added build support for Link Time Optimization.**
> +
> + LTO is an optimization technique used by the compiler to perform whole
> + program analysis and optimization at link time.  In order to do that
> + compilers store their internal representation of the source code that
> + the linker uses at the final stage of compilation process.
> +
> + See :doc:`../prog_guide/lto` for more information:

Please rebase and add a blank line when adding a paragraph.

>  
>  Removed Items
>  -------------

> --- a/lib/librte_distributor/rte_distributor.c
> +++ b/lib/librte_distributor/rte_distributor.c
> -void
> +void __vsym

I replied in v2 about this change. Better to not talk about it here.


I wanted to merge this series in 19.11-rc1 but I think it is better
to move its merge in 19.11-rc2.
Thanks for the work.
  
Andrzej Ostruszka Oct. 28, 2019, 10:47 a.m. UTC | #3
Thank you Thomas for the comments.  Please see below for replies.

On 10/27/19 12:47 PM, Thomas Monjalon wrote:
> Adding John and Marko for doc review.
> 
> 22/10/2019 13:54, Andrzej Ostruszka:
>> --- /dev/null
>> +++ b/doc/guides/prog_guide/lto.rst
> 
> This new file is not included in the prog guide index.

Will add it (between "writing efficient code" and "profile app").

>> @@ -0,0 +1,36 @@
> 
> You are missing the mandatory SPDX and Copyright header.

Will add.

>> +Link Time Optimization
>> +======================
>> +
>> +The DPDK framework supports compilation with link time optimization
> 
> Instead of "The DPDK framework", I prefer just "DPDK".

OK

>> +turned on.  This depends obviously on the capabilities of the compiler
> 
> Not sure what you are talking about.
> If it is about fat objects, it is not obvious.
> 
>> +to do "whole program" optimization at link time and is available only

That was not a reference to some formally defined "capabilities" just an
"obvious" statement that in order to benefit from LTO one needs a
compiler "capable of" performing it.  I will rephrase this paragraph as:

-8<-------------
The DPDK supports compilation with link time optimization turned on.
This depends obviously on the ability of the compiler to do "whole
program" optimization at link time and is available only for compilers
that support that feature (gcc and icc).
To be more specific, compiler (in addition to performing LTO) have to
support creation of ELF objects containing both normal code and internal
representation (fat-lto-objects).  This is required since during build
some code is generated by parsing produced ELF objects (pmdinfogen).
-8<-------------

Will it suffice?  If not, then please suggest wording to be used.

>> +for compilers that support that feature (gcc and icc).  To be more
> 
> Please start a new sentence at the beginning of a line in the RsT file.
> It is easier to review and update.

OK

>> +specific compiler have to support creation of ELF objects containing
> 
> A comma is missing after "specific".

OK

>> +both normal code and internal representation (fat-lto-objects).  This is
>> +required since during build some code is generated by parsing produced
>> +ELF objects (pmdinfogen).
>> +
>> +The amount of performance gain that one can get from LTO depends on the
>> +compiler and the code that is being compiled.  However LTO is also
>> +useful for additional code analysis done by the compiler.  In particular
>> +due to interprocedural analysis compiler can produce additional warnings
>> +about variables that might be used uninitialized.  Some of these
>> +warnings might be "false positives" though and you might need to
>> +explicitly initialize variable in order to silence the compiler.
> 
> Any word about build speed degradation?

It is mentioned at the end of this file - I will move it here.

>> +Link time optimization can be enabled for whole DPDK framework by
>> +setting:
>> +
>> +.. code-block:: console
>> +    CONFIG_ENABLE_LTO=y
>> +
>> +in config file for the case of make based build and by:
>> +
>> +.. code-block:: console
>> +    meson build -Db_lto=true
>> +    ninja -C build
> 
> No need to add the ninja step here.

OK

>> +for the case of meson based build.
> 
> Better to describe the case (make or meson) before the code-block.

OK

>> +Please note that turning LTO on causes considerable extension of
>> +compilation time.
> 
> "compilation time" is not accurate.
> When referring to compilation + linking, it is better to use the words
> "build time".

OK

>> --- a/doc/guides/rel_notes/release_19_11.rst
>> +++ b/doc/guides/rel_notes/release_19_11.rst
>> @@ -56,6 +56,14 @@ New Features
>>       Also, make sure to start the actual text at the margin.
>>       =========================================================
>>  
>> +**Added build support for Link Time Optimization.**
>> +
>> + LTO is an optimization technique used by the compiler to perform whole
>> + program analysis and optimization at link time.  In order to do that
>> + compilers store their internal representation of the source code that
>> + the linker uses at the final stage of compilation process.
>> +
>> + See :doc:`../prog_guide/lto` for more information:
> 
> Please rebase and add a blank line when adding a paragraph.

OK

[...]
> I wanted to merge this series in 19.11-rc1 but I think it is better
> to move its merge in 19.11-rc2.

OK.  I'll send the next version today.

Thank you again for the comments.

Regards
Andrzej
  
Thomas Monjalon Oct. 28, 2019, 11:03 a.m. UTC | #4
28/10/2019 11:47, Andrzej Ostruszka:
> On 10/27/19 12:47 PM, Thomas Monjalon wrote:
> >> +turned on.  This depends obviously on the capabilities of the compiler
> > 
> > Not sure what you are talking about.
> > If it is about fat objects, it is not obvious.
> > 
> >> +to do "whole program" optimization at link time and is available only
> 
> That was not a reference to some formally defined "capabilities" just an
> "obvious" statement that in order to benefit from LTO one needs a
> compiler "capable of" performing it.  I will rephrase this paragraph as:
> 
> -8<-------------
> The DPDK supports compilation with link time optimization turned on.
> This depends obviously on the ability of the compiler to do "whole
> program" optimization at link time and is available only for compilers
> that support that feature (gcc and icc).
> To be more specific, compiler (in addition to performing LTO) have to
> support creation of ELF objects containing both normal code and internal
> representation (fat-lto-objects).  This is required since during build
> some code is generated by parsing produced ELF objects (pmdinfogen).
> -8<-------------
> 
> Will it suffice?  If not, then please suggest wording to be used.

There is a confusion here:
"compilers that support that feature (gcc and icc)"
In my understanding, clang supports LTO but is not mentioned
because of the lack of fat object support.
I think you should mention gcc and icc only after explaining
the second constraint (fat objects).

> >> +for compilers that support that feature (gcc and icc).  To be more
> > 
> > Please start a new sentence at the beginning of a line in the RsT file.
> > It is easier to review and update.
> 
> OK

Please consider such comment to apply to the whole file :)


[...]
> OK.  I'll send the next version today.
> 
> Thank you again for the comments.

Thank you
  

Patch

diff --git a/.travis.yml b/.travis.yml
index 781f9f666..70d221852 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,6 +31,7 @@  env:
   - DEF_LIB="static" OPTS="-Denable_kmods=false"
   - DEF_LIB="shared" OPTS="-Denable_kmods=false"
   - DEF_LIB="shared" RUN_TESTS=1 BUILD_DOCS=1
+  - DEF_LIB="shared" OPTS="-Db_lto=true"
 
 matrix:
   include:
@@ -100,6 +101,12 @@  matrix:
       apt:
         packages:
           - *extra_packages
+  - env: DEF_LIB="shared" OPTS="-Db_lto=true" EXTRA_PACKAGES=1
+    compiler: gcc
+    addons:
+      apt:
+        packages:
+          - *extra_packages
 
 
 script: ./.ci/${TRAVIS_OS_NAME}-build.sh
diff --git a/config/common_base b/config/common_base
index 8ef75c203..73a55fdec 100644
--- a/config/common_base
+++ b/config/common_base
@@ -49,6 +49,11 @@  CONFIG_RTE_FORCE_INTRINSICS=n
 #
 CONFIG_RTE_ARCH_STRICT_ALIGN=n
 
+#
+# Enable link time optimization
+#
+CONFIG_RTE_ENABLE_LTO=n
+
 #
 # Compile to share library
 #
diff --git a/config/meson.build b/config/meson.build
index 2bafea530..1a5093118 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -196,3 +196,16 @@  add_project_arguments('-D_GNU_SOURCE', language: 'c')
 if is_freebsd
 	add_project_arguments('-D__BSD_VISIBLE', language: 'c')
 endif
+
+if get_option('b_lto')
+	if cc.has_argument('-ffat-lto-objects')
+		add_project_arguments('-ffat-lto-objects', language: 'c')
+	else
+		error('compiler does not support fat LTO objects - please turn LTO off')
+	endif
+	# workaround for gcc bug 81440
+	if cc.get_id() == 'gcc' and cc.version().version_compare('<8.0')
+		add_project_arguments('-Wno-lto-type-mismatch', language: 'c')
+		add_project_link_arguments('-Wno-lto-type-mismatch', language: 'c')
+	endif
+endif
diff --git a/doc/guides/prog_guide/lto.rst b/doc/guides/prog_guide/lto.rst
new file mode 100644
index 000000000..b6daabc86
--- /dev/null
+++ b/doc/guides/prog_guide/lto.rst
@@ -0,0 +1,36 @@ 
+Link Time Optimization
+======================
+
+The DPDK framework supports compilation with link time optimization
+turned on.  This depends obviously on the capabilities of the compiler
+to do "whole program" optimization at link time and is available only
+for compilers that support that feature (gcc and icc).  To be more
+specific compiler have to support creation of ELF objects containing
+both normal code and internal representation (fat-lto-objects).  This is
+required since during build some code is generated by parsing produced
+ELF objects (pmdinfogen).
+
+The amount of performance gain that one can get from LTO depends on the
+compiler and the code that is being compiled.  However LTO is also
+useful for additional code analysis done by the compiler.  In particular
+due to interprocedural analysis compiler can produce additional warnings
+about variables that might be used uninitialized.  Some of these
+warnings might be "false positives" though and you might need to
+explicitly initialize variable in order to silence the compiler.
+
+Link time optimization can be enabled for whole DPDK framework by
+setting:
+
+.. code-block:: console
+    CONFIG_ENABLE_LTO=y
+
+in config file for the case of make based build and by:
+
+.. code-block:: console
+    meson build -Db_lto=true
+    ninja -C build
+
+for the case of meson based build.
+
+Please note that turning LTO on causes considerable extension of
+compilation time.
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 8490d897c..97b4f4083 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -56,6 +56,14 @@  New Features
      Also, make sure to start the actual text at the margin.
      =========================================================
 
+**Added build support for Link Time Optimization.**
+
+ LTO is an optimization technique used by the compiler to perform whole
+ program analysis and optimization at link time.  In order to do that
+ compilers store their internal representation of the source code that
+ the linker uses at the final stage of compilation process.
+
+ See :doc:`../prog_guide/lto` for more information:
 
 Removed Items
 -------------
diff --git a/lib/librte_distributor/rte_distributor.c b/lib/librte_distributor/rte_distributor.c
index 21eb1fb0a..848250f4a 100644
--- a/lib/librte_distributor/rte_distributor.c
+++ b/lib/librte_distributor/rte_distributor.c
@@ -32,7 +32,7 @@  EAL_REGISTER_TAILQ(rte_dist_burst_tailq)
 
 /**** Burst Packet APIs called by workers ****/
 
-void
+void __vsym
 rte_distributor_request_pkt_v1705(struct rte_distributor *d,
 		unsigned int worker_id, struct rte_mbuf **oldpkt,
 		unsigned int count)
@@ -84,7 +84,7 @@  MAP_STATIC_SYMBOL(void rte_distributor_request_pkt(struct rte_distributor *d,
 		unsigned int count),
 		rte_distributor_request_pkt_v1705);
 
-int
+int __vsym
 rte_distributor_poll_pkt_v1705(struct rte_distributor *d,
 		unsigned int worker_id, struct rte_mbuf **pkts)
 {
@@ -124,7 +124,7 @@  MAP_STATIC_SYMBOL(int rte_distributor_poll_pkt(struct rte_distributor *d,
 		unsigned int worker_id, struct rte_mbuf **pkts),
 		rte_distributor_poll_pkt_v1705);
 
-int
+int __vsym
 rte_distributor_get_pkt_v1705(struct rte_distributor *d,
 		unsigned int worker_id, struct rte_mbuf **pkts,
 		struct rte_mbuf **oldpkt, unsigned int return_count)
@@ -159,7 +159,7 @@  MAP_STATIC_SYMBOL(int rte_distributor_get_pkt(struct rte_distributor *d,
 		struct rte_mbuf **oldpkt, unsigned int return_count),
 		rte_distributor_get_pkt_v1705);
 
-int
+int __vsym
 rte_distributor_return_pkt_v1705(struct rte_distributor *d,
 		unsigned int worker_id, struct rte_mbuf **oldpkt, int num)
 {
@@ -335,7 +335,7 @@  release(struct rte_distributor *d, unsigned int wkr)
 
 
 /* process a set of packets to distribute them to workers */
-int
+int __vsym
 rte_distributor_process_v1705(struct rte_distributor *d,
 		struct rte_mbuf **mbufs, unsigned int num_mbufs)
 {
@@ -476,7 +476,7 @@  MAP_STATIC_SYMBOL(int rte_distributor_process(struct rte_distributor *d,
 		rte_distributor_process_v1705);
 
 /* return to the caller, packets returned from workers */
-int
+int __vsym
 rte_distributor_returned_pkts_v1705(struct rte_distributor *d,
 		struct rte_mbuf **mbufs, unsigned int max_mbufs)
 {
@@ -526,7 +526,7 @@  total_outstanding(const struct rte_distributor *d)
  * Flush the distributor, so that there are no outstanding packets in flight or
  * queued up.
  */
-int
+int __vsym
 rte_distributor_flush_v1705(struct rte_distributor *d)
 {
 	unsigned int flushed;
@@ -561,7 +561,7 @@  MAP_STATIC_SYMBOL(int rte_distributor_flush(struct rte_distributor *d),
 		rte_distributor_flush_v1705);
 
 /* clears the internal returns array in the distributor */
-void
+void __vsym
 rte_distributor_clear_returns_v1705(struct rte_distributor *d)
 {
 	unsigned int wkr;
@@ -581,7 +581,7 @@  MAP_STATIC_SYMBOL(void rte_distributor_clear_returns(struct rte_distributor *d),
 		rte_distributor_clear_returns_v1705);
 
 /* creates a distributor instance */
-struct rte_distributor *
+struct rte_distributor * __vsym
 rte_distributor_create_v1705(const char *name,
 		unsigned int socket_id,
 		unsigned int num_workers,
diff --git a/lib/librte_distributor/rte_distributor_v20.c b/lib/librte_distributor/rte_distributor_v20.c
index cdc0969a8..31c766421 100644
--- a/lib/librte_distributor/rte_distributor_v20.c
+++ b/lib/librte_distributor/rte_distributor_v20.c
@@ -27,7 +27,7 @@  EAL_REGISTER_TAILQ(rte_distributor_tailq)
 
 /**** APIs called by workers ****/
 
-void
+void __vsym
 rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d,
 		unsigned worker_id, struct rte_mbuf *oldpkt)
 {
@@ -40,7 +40,7 @@  rte_distributor_request_pkt_v20(struct rte_distributor_v20 *d,
 }
 VERSION_SYMBOL(rte_distributor_request_pkt, _v20, 2.0);
 
-struct rte_mbuf *
+struct rte_mbuf * __vsym
 rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d,
 		unsigned worker_id)
 {
@@ -54,7 +54,7 @@  rte_distributor_poll_pkt_v20(struct rte_distributor_v20 *d,
 }
 VERSION_SYMBOL(rte_distributor_poll_pkt, _v20, 2.0);
 
-struct rte_mbuf *
+struct rte_mbuf * __vsym
 rte_distributor_get_pkt_v20(struct rte_distributor_v20 *d,
 		unsigned worker_id, struct rte_mbuf *oldpkt)
 {
@@ -66,7 +66,7 @@  rte_distributor_get_pkt_v20(struct rte_distributor_v20 *d,
 }
 VERSION_SYMBOL(rte_distributor_get_pkt, _v20, 2.0);
 
-int
+int __vsym
 rte_distributor_return_pkt_v20(struct rte_distributor_v20 *d,
 		unsigned worker_id, struct rte_mbuf *oldpkt)
 {
@@ -191,7 +191,7 @@  process_returns(struct rte_distributor_v20 *d)
 }
 
 /* process a set of packets to distribute them to workers */
-int
+int __vsym
 rte_distributor_process_v20(struct rte_distributor_v20 *d,
 		struct rte_mbuf **mbufs, unsigned num_mbufs)
 {
@@ -296,7 +296,7 @@  rte_distributor_process_v20(struct rte_distributor_v20 *d,
 VERSION_SYMBOL(rte_distributor_process, _v20, 2.0);
 
 /* return to the caller, packets returned from workers */
-int
+int __vsym
 rte_distributor_returned_pkts_v20(struct rte_distributor_v20 *d,
 		struct rte_mbuf **mbufs, unsigned max_mbufs)
 {
@@ -334,7 +334,7 @@  total_outstanding(const struct rte_distributor_v20 *d)
 
 /* flush the distributor, so that there are no outstanding packets in flight or
  * queued up. */
-int
+int __vsym
 rte_distributor_flush_v20(struct rte_distributor_v20 *d)
 {
 	const unsigned flushed = total_outstanding(d);
@@ -347,7 +347,7 @@  rte_distributor_flush_v20(struct rte_distributor_v20 *d)
 VERSION_SYMBOL(rte_distributor_flush, _v20, 2.0);
 
 /* clears the internal returns array in the distributor */
-void
+void __vsym
 rte_distributor_clear_returns_v20(struct rte_distributor_v20 *d)
 {
 	d->returns.start = d->returns.count = 0;
@@ -358,7 +358,7 @@  rte_distributor_clear_returns_v20(struct rte_distributor_v20 *d)
 VERSION_SYMBOL(rte_distributor_clear_returns, _v20, 2.0);
 
 /* creates a distributor instance */
-struct rte_distributor_v20 *
+struct rte_distributor_v20 * __vsym
 rte_distributor_create_v20(const char *name,
 		unsigned socket_id,
 		unsigned num_workers)
diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c
index 3a929a1b1..a2fba8d61 100644
--- a/lib/librte_lpm/rte_lpm.c
+++ b/lib/librte_lpm/rte_lpm.c
@@ -89,7 +89,7 @@  depth_to_range(uint8_t depth)
 /*
  * Find an existing lpm table and return a pointer to it.
  */
-struct rte_lpm_v20 *
+struct rte_lpm_v20 * __vsym
 rte_lpm_find_existing_v20(const char *name)
 {
 	struct rte_lpm_v20 *l = NULL;
@@ -115,7 +115,7 @@  rte_lpm_find_existing_v20(const char *name)
 }
 VERSION_SYMBOL(rte_lpm_find_existing, _v20, 2.0);
 
-struct rte_lpm *
+struct rte_lpm * __vsym
 rte_lpm_find_existing_v1604(const char *name)
 {
 	struct rte_lpm *l = NULL;
@@ -146,7 +146,7 @@  MAP_STATIC_SYMBOL(struct rte_lpm *rte_lpm_find_existing(const char *name),
 /*
  * Allocates memory for LPM object
  */
-struct rte_lpm_v20 *
+struct rte_lpm_v20 * __vsym
 rte_lpm_create_v20(const char *name, int socket_id, int max_rules,
 		__rte_unused int flags)
 {
@@ -219,7 +219,7 @@  rte_lpm_create_v20(const char *name, int socket_id, int max_rules,
 }
 VERSION_SYMBOL(rte_lpm_create, _v20, 2.0);
 
-struct rte_lpm *
+struct rte_lpm * __vsym
 rte_lpm_create_v1604(const char *name, int socket_id,
 		const struct rte_lpm_config *config)
 {
@@ -328,7 +328,7 @@  MAP_STATIC_SYMBOL(
 /*
  * Deallocates memory for given LPM table.
  */
-void
+void __vsym
 rte_lpm_free_v20(struct rte_lpm_v20 *lpm)
 {
 	struct rte_lpm_list *lpm_list;
@@ -357,7 +357,7 @@  rte_lpm_free_v20(struct rte_lpm_v20 *lpm)
 }
 VERSION_SYMBOL(rte_lpm_free, _v20, 2.0);
 
-void
+void __vsym
 rte_lpm_free_v1604(struct rte_lpm *lpm)
 {
 	struct rte_lpm_list *lpm_list;
@@ -1176,7 +1176,7 @@  add_depth_big_v1604(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth,
 /*
  * Add a route
  */
-int
+int __vsym
 rte_lpm_add_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth,
 		uint8_t next_hop)
 {
@@ -1217,7 +1217,7 @@  rte_lpm_add_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth,
 }
 VERSION_SYMBOL(rte_lpm_add, _v20, 2.0);
 
-int
+int __vsym
 rte_lpm_add_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
 		uint32_t next_hop)
 {
@@ -1263,7 +1263,7 @@  MAP_STATIC_SYMBOL(int rte_lpm_add(struct rte_lpm *lpm, uint32_t ip,
 /*
  * Look for a rule in the high-level rules table
  */
-int
+int __vsym
 rte_lpm_is_rule_present_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth,
 uint8_t *next_hop)
 {
@@ -1290,7 +1290,7 @@  uint8_t *next_hop)
 }
 VERSION_SYMBOL(rte_lpm_is_rule_present, _v20, 2.0);
 
-int
+int __vsym
 rte_lpm_is_rule_present_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
 uint32_t *next_hop)
 {
@@ -1843,7 +1843,7 @@  delete_depth_big_v1604(struct rte_lpm *lpm, uint32_t ip_masked,
 /*
  * Deletes a rule
  */
-int
+int __vsym
 rte_lpm_delete_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth)
 {
 	int32_t rule_to_delete_index, sub_rule_index;
@@ -1897,7 +1897,7 @@  rte_lpm_delete_v20(struct rte_lpm_v20 *lpm, uint32_t ip, uint8_t depth)
 }
 VERSION_SYMBOL(rte_lpm_delete, _v20, 2.0);
 
-int
+int __vsym
 rte_lpm_delete_v1604(struct rte_lpm *lpm, uint32_t ip, uint8_t depth)
 {
 	int32_t rule_to_delete_index, sub_rule_index;
@@ -1956,7 +1956,7 @@  MAP_STATIC_SYMBOL(int rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip,
 /*
  * Delete all rules from the LPM table.
  */
-void
+void __vsym
 rte_lpm_delete_all_v20(struct rte_lpm_v20 *lpm)
 {
 	/* Zero rule information. */
@@ -1973,7 +1973,7 @@  rte_lpm_delete_all_v20(struct rte_lpm_v20 *lpm)
 }
 VERSION_SYMBOL(rte_lpm_delete_all, _v20, 2.0);
 
-void
+void __vsym
 rte_lpm_delete_all_v1604(struct rte_lpm *lpm)
 {
 	/* Zero rule information. */
diff --git a/lib/librte_lpm/rte_lpm6.c b/lib/librte_lpm/rte_lpm6.c
index 9b8aeb972..49a7fea1d 100644
--- a/lib/librte_lpm/rte_lpm6.c
+++ b/lib/librte_lpm/rte_lpm6.c
@@ -811,7 +811,7 @@  add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
 /*
  * Add a route
  */
-int
+int __vsym
 rte_lpm6_add_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
 		uint8_t next_hop)
 {
@@ -861,7 +861,7 @@  simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth)
 	return 0;
 }
 
-int
+int __vsym
 rte_lpm6_add_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
 		uint32_t next_hop)
 {
@@ -954,7 +954,7 @@  lookup_step(const struct rte_lpm6 *lpm, const struct rte_lpm6_tbl_entry *tbl,
 /*
  * Looks up an IP
  */
-int
+int __vsym
 rte_lpm6_lookup_v20(const struct rte_lpm6 *lpm, uint8_t *ip, uint8_t *next_hop)
 {
 	uint32_t next_hop32 = 0;
@@ -972,7 +972,7 @@  rte_lpm6_lookup_v20(const struct rte_lpm6 *lpm, uint8_t *ip, uint8_t *next_hop)
 }
 VERSION_SYMBOL(rte_lpm6_lookup, _v20, 2.0);
 
-int
+int __vsym
 rte_lpm6_lookup_v1705(const struct rte_lpm6 *lpm, uint8_t *ip,
 		uint32_t *next_hop)
 {
@@ -1007,7 +1007,7 @@  MAP_STATIC_SYMBOL(int rte_lpm6_lookup(const struct rte_lpm6 *lpm, uint8_t *ip,
 /*
  * Looks up a group of IP addresses
  */
-int
+int __vsym
 rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm,
 		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE],
 		int16_t * next_hops, unsigned n)
@@ -1048,7 +1048,7 @@  rte_lpm6_lookup_bulk_func_v20(const struct rte_lpm6 *lpm,
 }
 VERSION_SYMBOL(rte_lpm6_lookup_bulk_func, _v20, 2.0);
 
-int
+int __vsym
 rte_lpm6_lookup_bulk_func_v1705(const struct rte_lpm6 *lpm,
 		uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE],
 		int32_t *next_hops, unsigned int n)
@@ -1098,7 +1098,7 @@  MAP_STATIC_SYMBOL(int rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm,
 /*
  * Look for a rule in the high-level rules table
  */
-int
+int __vsym
 rte_lpm6_is_rule_present_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
 		uint8_t *next_hop)
 {
@@ -1118,7 +1118,7 @@  rte_lpm6_is_rule_present_v20(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
 }
 VERSION_SYMBOL(rte_lpm6_is_rule_present, _v20, 2.0);
 
-int
+int __vsym
 rte_lpm6_is_rule_present_v1705(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
 		uint32_t *next_hop)
 {
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index bdcf05d06..e560ace06 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
@@ -131,7 +131,7 @@  rte_timer_data_dealloc(uint32_t id)
 	return 0;
 }
 
-void
+void __vsym
 rte_timer_subsystem_init_v20(void)
 {
 	unsigned lcore_id;
@@ -153,7 +153,7 @@  VERSION_SYMBOL(rte_timer_subsystem_init, _v20, 2.0);
  * secondary processes should be empty, the zeroth entry can be shared by
  * multiple processes.
  */
-int
+int __vsym
 rte_timer_subsystem_init_v1905(void)
 {
 	const struct rte_memzone *mz;
@@ -551,7 +551,7 @@  __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
 }
 
 /* Reset and start the timer associated with the timer handle tim */
-int
+int __vsym
 rte_timer_reset_v20(struct rte_timer *tim, uint64_t ticks,
 		    enum rte_timer_type type, unsigned int tim_lcore,
 		    rte_timer_cb_t fct, void *arg)
@@ -574,7 +574,7 @@  rte_timer_reset_v20(struct rte_timer *tim, uint64_t ticks,
 }
 VERSION_SYMBOL(rte_timer_reset, _v20, 2.0);
 
-int
+int __vsym
 rte_timer_reset_v1905(struct rte_timer *tim, uint64_t ticks,
 		      enum rte_timer_type type, unsigned int tim_lcore,
 		      rte_timer_cb_t fct, void *arg)
@@ -657,14 +657,14 @@  __rte_timer_stop(struct rte_timer *tim, int local_is_locked,
 }
 
 /* Stop the timer associated with the timer handle tim */
-int
+int __vsym
 rte_timer_stop_v20(struct rte_timer *tim)
 {
 	return __rte_timer_stop(tim, 0, &default_timer_data);
 }
 VERSION_SYMBOL(rte_timer_stop, _v20, 2.0);
 
-int
+int __vsym
 rte_timer_stop_v1905(struct rte_timer *tim)
 {
 	return rte_timer_alt_stop(default_data_id, tim);
@@ -817,14 +817,14 @@  __rte_timer_manage(struct rte_timer_data *timer_data)
 	priv_timer[lcore_id].running_tim = NULL;
 }
 
-void
+void __vsym
 rte_timer_manage_v20(void)
 {
 	__rte_timer_manage(&default_timer_data);
 }
 VERSION_SYMBOL(rte_timer_manage, _v20, 2.0);
 
-int
+int __vsym
 rte_timer_manage_v1905(void)
 {
 	struct rte_timer_data *timer_data;
@@ -1074,14 +1074,14 @@  __rte_timer_dump_stats(struct rte_timer_data *timer_data __rte_unused, FILE *f)
 #endif
 }
 
-void
+void __vsym
 rte_timer_dump_stats_v20(FILE *f)
 {
 	__rte_timer_dump_stats(&default_timer_data, f);
 }
 VERSION_SYMBOL(rte_timer_dump_stats, _v20, 2.0);
 
-int
+int __vsym
 rte_timer_dump_stats_v1905(FILE *f)
 {
 	return rte_timer_alt_dump_stats(default_data_id, f);
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index ea40a11c0..ad4fad83c 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -88,6 +88,10 @@  else
 		MACHINE_CFLAGS := $(filter-out -march% -mtune% -msse%,$(MACHINE_CFLAGS))
 	endif
 
+	ifeq ($(shell test $(GCC_VERSION) -lt 45 && echo 1), 1)
+		CONFIG_RTE_ENABLE_LTO=n
+	endif
+
 	# Disable thunderx PMD for gcc < 4.7
 	ifeq ($(shell test $(GCC_VERSION) -lt 47 && echo 1), 1)
 		CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD=d
diff --git a/mk/toolchain/gcc/rte.vars.mk b/mk/toolchain/gcc/rte.vars.mk
index b852fcfd7..9fc704193 100644
--- a/mk/toolchain/gcc/rte.vars.mk
+++ b/mk/toolchain/gcc/rte.vars.mk
@@ -62,6 +62,18 @@  endif
 # process cpu flags
 include $(RTE_SDK)/mk/toolchain/$(RTE_TOOLCHAIN)/rte.toolchain-compat.mk
 
+ifeq ($(CONFIG_RTE_ENABLE_LTO),y)
+# 'fat-lto' is used since pmdinfogen needs to have 'this_pmd_nameX'
+# exported in symbol table and without this option only internal
+# representation is present.
+TOOLCHAIN_CFLAGS += -flto -ffat-lto-objects
+TOOLCHAIN_LDFLAGS += -flto
+# workaround for GCC bug 81440
+ifeq ($(shell test $(GCC_VERSION) -lt 80 && echo 1), 1)
+WERROR_FLAGS += -Wno-lto-type-mismatch
+endif
+endif
+
 # workaround GCC bug with warning "missing initializer" for "= {0}"
 ifeq ($(shell test $(GCC_VERSION) -lt 47 && echo 1), 1)
 WERROR_FLAGS += -Wno-missing-field-initializers
diff --git a/mk/toolchain/icc/rte.vars.mk b/mk/toolchain/icc/rte.vars.mk
index aa1422bf1..8aa87aa1e 100644
--- a/mk/toolchain/icc/rte.vars.mk
+++ b/mk/toolchain/icc/rte.vars.mk
@@ -54,5 +54,13 @@  endif
 # process cpu flags
 include $(RTE_SDK)/mk/toolchain/$(RTE_TOOLCHAIN)/rte.toolchain-compat.mk
 
+ifeq ($(CONFIG_RTE_ENABLE_LTO),y)
+# 'fat-lto' is used since pmdinfogen needs to have 'this_pmd_nameX'
+# exported in symbol table and without this option only internal
+# representation is present.
+TOOLCHAIN_CFLAGS += -flto -ffat-lto-objects
+TOOLCHAIN_LDFLAGS += -flto
+endif
+
 export CC AS AR LD OBJCOPY OBJDUMP STRIP READELF
 export TOOLCHAIN_CFLAGS TOOLCHAIN_LDFLAGS TOOLCHAIN_ASFLAGS