[v3,6/9] eal: register non-EAL threads as lcores

Message ID 20200622132531.21857-7-david.marchand@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series Register non-EAL threads as lcore

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

David Marchand June 22, 2020, 1:25 p.m. UTC
DPDK allows calling some parts of its API from a non-EAL thread, but this
comes with limitations.
OVS (and other applications) has its own thread management but still
wants to avoid those limitations; it does so by hacking
RTE_PER_LCORE(_lcore_id) and faking EAL threads that are potentially
unknown to some DPDK components.

Introduce a new API to register non-EAL threads and associate them with a
free lcore that has a new NON_EAL role.
This role denotes lcores that do not run the DPDK mainloop and, as such,
prevents use of rte_eal_wait_lcore() and the like.
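
For illustration, a registered thread in an application-managed thread pool
is expected to look roughly like this (sketch only; the worker function is
hypothetical and error handling is omitted):

  static void *
  app_thread(void *arg)
  {
  	/* Associate this thread with a free lcore (ROLE_NON_EAL).
  	 * EAL init must have completed before calling this. */
  	rte_thread_register();

  	/* rte_lcore_id() now returns the allocated lcore id,
  	 * or LCORE_ID_ANY if no free lcore was available. */
  	do_work(arg); /* hypothetical application code */

  	/* Release the lcore before the thread exits. */
  	rte_thread_unregister();
  	return NULL;
  }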

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
Changes since v1:
- moved the lcore role code cleanup to patch 5,
- added unit test,
- updated documentation,
- changed naming from "external thread" to "registered non-EAL thread"

---
 MAINTAINERS                                   |   1 +
 app/test/Makefile                             |   1 +
 app/test/autotest_data.py                     |   6 +
 app/test/meson.build                          |   2 +
 app/test/test_lcores.c                        | 139 ++++++++++++++++++
 doc/guides/howto/debug_troubleshoot.rst       |   5 +-
 .../prog_guide/env_abstraction_layer.rst      |  22 +--
 doc/guides/prog_guide/mempool_lib.rst         |   2 +-
 lib/librte_eal/common/eal_common_lcore.c      |  44 +++++-
 lib/librte_eal/common/eal_common_thread.c     |  33 +++++
 lib/librte_eal/common/eal_private.h           |  18 +++
 lib/librte_eal/include/rte_lcore.h            |  18 ++-
 lib/librte_eal/rte_eal_version.map            |   2 +
 lib/librte_mempool/rte_mempool.h              |  11 +-
 14 files changed, 282 insertions(+), 22 deletions(-)
 create mode 100644 app/test/test_lcores.c
  

Comments

Ananyev, Konstantin June 22, 2020, 3:49 p.m. UTC | #1
Hi David,

> diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> index 86d32a3dd7..7db05428e7 100644
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -6,12 +6,13 @@
>  #include <limits.h>
>  #include <string.h>
> 
> -#include <rte_errno.h>
> -#include <rte_log.h>
> -#include <rte_eal.h>
> -#include <rte_lcore.h>
>  #include <rte_common.h>
>  #include <rte_debug.h>
> +#include <rte_eal.h>
> +#include <rte_errno.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +#include <rte_spinlock.h>
> 
>  #include "eal_private.h"
>  #include "eal_thread.h"
> @@ -220,3 +221,38 @@ rte_socket_id_by_idx(unsigned int idx)
>  	}
>  	return config->numa_nodes[idx];
>  }
> +
> +static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
> +
> +unsigned int
> +eal_lcore_non_eal_allocate(void)
> +{
> +	struct rte_config *cfg = rte_eal_get_configuration();
> +	unsigned int lcore_id;
> +
> +	rte_spinlock_lock(&lcore_lock);

I think it will break current DPDK MP modes.
The problem here - rte_config (and lcore_role[]) is in shared memory,
while the lock is local.
Simplest way probably to move lcore_lock to rte_config.

> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +		if (cfg->lcore_role[lcore_id] != ROLE_OFF)
> +			continue;
> +		cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
> +		cfg->lcore_count++;
> +		break;
> +	}
> +	if (lcore_id == RTE_MAX_LCORE)
> +		RTE_LOG(DEBUG, EAL, "No lcore available.\n");
> +	rte_spinlock_unlock(&lcore_lock);
> +	return lcore_id;
> +}
> +
> +void
> +eal_lcore_non_eal_release(unsigned int lcore_id)
> +{
> +	struct rte_config *cfg = rte_eal_get_configuration();
> +
> +	rte_spinlock_lock(&lcore_lock);
> +	if (cfg->lcore_role[lcore_id] == ROLE_NON_EAL) {
> +		cfg->lcore_role[lcore_id] = ROLE_OFF;
> +		cfg->lcore_count--;
> +	}
> +	rte_spinlock_unlock(&lcore_lock);
> +}
> diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
> index a7ae0691bf..1cbddc4b5b 100644
> --- a/lib/librte_eal/common/eal_common_thread.c
> +++ b/lib/librte_eal/common/eal_common_thread.c
> @@ -236,3 +236,36 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name,
>  	pthread_join(*thread, NULL);
>  	return -ret;
>  }
> +
> +void
> +rte_thread_register(void)
> +{
> +	unsigned int lcore_id;
> +	rte_cpuset_t cpuset;
> +
> +	/* EAL init flushes all lcores, we can't register before. */
> +	assert(internal_config.init_complete == 1);
> +	if (pthread_getaffinity_np(pthread_self(), sizeof(cpuset),
> +			&cpuset) != 0)
> +		CPU_ZERO(&cpuset);
> +	lcore_id = eal_lcore_non_eal_allocate();
> +	if (lcore_id >= RTE_MAX_LCORE)
> +		lcore_id = LCORE_ID_ANY;
> +	rte_thread_init(lcore_id, &cpuset);

So we just setting affinity to the same value, right?
Not a big deal, but might be easier to allow rte_thread_init()
to accept cpuset==NULL (and just don't change thread affinity in that case)

> +	if (lcore_id != LCORE_ID_ANY)
> +		RTE_LOG(DEBUG, EAL, "Registered non-EAL thread as lcore %u.\n",
> +			lcore_id);
> +}
> +
> +void
> +rte_thread_unregister(void)
> +{
> +	unsigned int lcore_id = rte_lcore_id();
> +
> +	if (lcore_id != LCORE_ID_ANY)
> +		eal_lcore_non_eal_release(lcore_id);
> +	rte_thread_uninit();
> +	if (lcore_id != LCORE_ID_ANY)
> +		RTE_LOG(DEBUG, EAL, "Unregistered non-EAL thread (was lcore %u).\n",
> +			lcore_id);
> +}
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index 0592fcd694..73238ff157 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -396,6 +396,24 @@ uint64_t get_tsc_freq(void);
>   */
>  uint64_t get_tsc_freq_arch(void);
> 
> +/**
> + * Allocate a free lcore to associate to a non-EAL thread.
> + *
> + * @return
> + *   - the id of a lcore with role ROLE_NON_EAL on success.
> + *   - RTE_MAX_LCORE if none was available.
> + */
> +unsigned int eal_lcore_non_eal_allocate(void);
> +
> +/**
> + * Release the lcore used by a non-EAL thread.
> + * Counterpart of eal_lcore_non_eal_allocate().
> + *
> + * @param lcore_id
> + *   The lcore with role ROLE_NON_EAL to release.
> + */
> +void eal_lcore_non_eal_release(unsigned int lcore_id);
> +
>  /**
>   * Prepare physical memory mapping
>   * i.e. hugepages on Linux and
> diff --git a/lib/librte_eal/include/rte_lcore.h b/lib/librte_eal/include/rte_lcore.h
> index 3968c40693..ea86220394 100644
> --- a/lib/librte_eal/include/rte_lcore.h
> +++ b/lib/librte_eal/include/rte_lcore.h
> @@ -31,6 +31,7 @@ enum rte_lcore_role_t {
>  	ROLE_RTE,
>  	ROLE_OFF,
>  	ROLE_SERVICE,
> +	ROLE_NON_EAL,
>  };
> 
>  /**
> @@ -67,7 +68,8 @@ rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role);
>   *   to run threads with lcore IDs 0, 1, 2 and 3 on physical core 10..
>   *
>   * @return
> - *  Logical core ID (in EAL thread) or LCORE_ID_ANY (in non-EAL thread)
> + *  Logical core ID (in EAL thread or registered non-EAL thread) or
> + *  LCORE_ID_ANY (in unregistered non-EAL thread)
>   */
>  static inline unsigned
>  rte_lcore_id(void)
> @@ -279,6 +281,20 @@ int rte_thread_setname(pthread_t id, const char *name);
>  __rte_experimental
>  int rte_thread_getname(pthread_t id, char *name, size_t len);
> 
> +/**
> + * Register current non-EAL thread as a lcore.
> + */
> +__rte_experimental
> +void
> +rte_thread_register(void);
> +
> +/**
> + * Unregister current thread and release lcore if one was associated.
> + */
> +__rte_experimental
> +void
> +rte_thread_unregister(void);
> +
>  /**
>   * Create a control thread.
>   *
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index 5831eea4b0..39c41d445d 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -396,6 +396,8 @@ EXPERIMENTAL {
> 
>  	# added in 20.08
>  	__rte_trace_mem_per_thread_free;
> +	rte_thread_register;
> +	rte_thread_unregister;
>  };
> 
>  INTERNAL {
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 652d19f9f1..9e0ee052b3 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -28,9 +28,9 @@
>   * rte_mempool_get() or rte_mempool_put() are designed to be called from an EAL
>   * thread due to the internal per-lcore cache. Due to the lack of caching,
>   * rte_mempool_get() or rte_mempool_put() performance will suffer when called
> - * by non-EAL threads. Instead, non-EAL threads should call
> - * rte_mempool_generic_get() or rte_mempool_generic_put() with a user cache
> - * created with rte_mempool_cache_create().
> + * by unregistered non-EAL threads. Instead, unregistered non-EAL threads
> + * should call rte_mempool_generic_get() or rte_mempool_generic_put() with a
> + * user cache created with rte_mempool_cache_create().
>   */
> 
>  #include <stdio.h>
> @@ -1233,7 +1233,7 @@ void rte_mempool_dump(FILE *f, struct rte_mempool *mp);
>  /**
>   * Create a user-owned mempool cache.
>   *
> - * This can be used by non-EAL threads to enable caching when they
> + * This can be used by unregistered non-EAL threads to enable caching when they
>   * interact with a mempool.
>   *
>   * @param size
> @@ -1264,7 +1264,8 @@ rte_mempool_cache_free(struct rte_mempool_cache *cache);
>   * @param lcore_id
>   *   The logical core id.
>   * @return
> - *   A pointer to the mempool cache or NULL if disabled or non-EAL thread.
> + *   A pointer to the mempool cache or NULL if disabled or unregistered non-EAL
> + *   thread.
>   */
>  static __rte_always_inline struct rte_mempool_cache *
>  rte_mempool_default_cache(struct rte_mempool *mp, unsigned lcore_id)
> --
> 2.23.0
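
As a usage note on the mempool documentation hunk above, an unregistered
non-EAL thread would use a user-owned cache roughly as follows (sketch only;
the cache size of 32 is chosen arbitrarily):

  #include <rte_mempool.h>

  static void
  use_pool_from_unregistered_thread(struct rte_mempool *mp)
  {
  	struct rte_mempool_cache *cache;
  	void *obj;

  	/* Private cache, since this thread has no per-lcore cache. */
  	cache = rte_mempool_cache_create(32, SOCKET_ID_ANY);
  	if (cache == NULL)
  		return;

  	if (rte_mempool_generic_get(mp, &obj, 1, cache) == 0) {
  		/* ... use obj ... */
  		rte_mempool_generic_put(mp, &obj, 1, cache);
  	}

  	/* Return any cached objects to the pool before freeing the cache. */
  	rte_mempool_cache_flush(cache, mp);
  	rte_mempool_cache_free(cache);
  }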
  
Ananyev, Konstantin June 22, 2020, 4:37 p.m. UTC | #2
> 
> Hi David,
> 
> > diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> > index 86d32a3dd7..7db05428e7 100644
> > --- a/lib/librte_eal/common/eal_common_lcore.c
> > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > @@ -6,12 +6,13 @@
> >  #include <limits.h>
> >  #include <string.h>
> >
> > -#include <rte_errno.h>
> > -#include <rte_log.h>
> > -#include <rte_eal.h>
> > -#include <rte_lcore.h>
> >  #include <rte_common.h>
> >  #include <rte_debug.h>
> > +#include <rte_eal.h>
> > +#include <rte_errno.h>
> > +#include <rte_lcore.h>
> > +#include <rte_log.h>
> > +#include <rte_spinlock.h>
> >
> >  #include "eal_private.h"
> >  #include "eal_thread.h"
> > @@ -220,3 +221,38 @@ rte_socket_id_by_idx(unsigned int idx)
> >  	}
> >  	return config->numa_nodes[idx];
> >  }
> > +
> > +static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
> > +
> > +unsigned int
> > +eal_lcore_non_eal_allocate(void)
> > +{
> > +	struct rte_config *cfg = rte_eal_get_configuration();
> > +	unsigned int lcore_id;
> > +
> > +	rte_spinlock_lock(&lcore_lock);
> 
> I think it will break current DPDK MP modes.
> The problem here - rte_config (and lcore_role[]) is in shared memory,
> while the lock is local.
> Simplest way probably to move lcore_lock to rte_config.

Actually sorry, I was wrong - rte_config is local.
So having lcore_lock local seems ok here.
Though then, I think another issue arises:
in the MP case, two processes might get the same lcore_id via this function.
And, as I remember, the rte_mempool cache is by default located in shared
memory, so two threads might end up racing for the same mempool cache slot.
The same probably goes for other shared data that uses lcore_id as an index.

> 
> > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > +		if (cfg->lcore_role[lcore_id] != ROLE_OFF)
> > +			continue;
> > +		cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
> > +		cfg->lcore_count++;
> > +		break;
> > +	}
> > +	if (lcore_id == RTE_MAX_LCORE)
> > +		RTE_LOG(DEBUG, EAL, "No lcore available.\n");
> > +	rte_spinlock_unlock(&lcore_lock);
> > +	return lcore_id;
> > +}
> > +
> > +void
> > +eal_lcore_non_eal_release(unsigned int lcore_id)
> > +{
> > +	struct rte_config *cfg = rte_eal_get_configuration();
> > +
> > +	rte_spinlock_lock(&lcore_lock);
> > +	if (cfg->lcore_role[lcore_id] == ROLE_NON_EAL) {
> > +		cfg->lcore_role[lcore_id] = ROLE_OFF;
> > +		cfg->lcore_count--;
> > +	}
> > +	rte_spinlock_unlock(&lcore_lock);
> > +}
  
David Marchand June 23, 2020, 7:49 a.m. UTC | #3
Hello Konstantin,

On Mon, Jun 22, 2020 at 5:49 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> > index 86d32a3dd7..7db05428e7 100644
> > --- a/lib/librte_eal/common/eal_common_lcore.c
> > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > @@ -220,3 +221,38 @@ rte_socket_id_by_idx(unsigned int idx)
> >       }
> >       return config->numa_nodes[idx];
> >  }
> > +
> > +static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
> > +
> > +unsigned int
> > +eal_lcore_non_eal_allocate(void)
> > +{
> > +     struct rte_config *cfg = rte_eal_get_configuration();
> > +     unsigned int lcore_id;
> > +
> > +     rte_spinlock_lock(&lcore_lock);
>
> I think it will break current DPDK MP modes.
> The problem here - rte_config (and lcore_role[]) is in shared memory,
> while the lock is local.
> Simplest way probably to move lcore_lock to rte_config.

Even before this series, MP has no protection on lcore placing between
primary and secondary processes.
Personally, I have no use for DPDK MP and marking MP as not supporting
this new feature is tempting for a first phase.
If this is a strong requirement, I can look at it in a second phase.
What do you think?


>
> > +     for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > +             if (cfg->lcore_role[lcore_id] != ROLE_OFF)
> > +                     continue;
> > +             cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
> > +             cfg->lcore_count++;
> > +             break;
> > +     }
> > +     if (lcore_id == RTE_MAX_LCORE)
> > +             RTE_LOG(DEBUG, EAL, "No lcore available.\n");
> > +     rte_spinlock_unlock(&lcore_lock);
> > +     return lcore_id;
> > +}
> > +
> > +void
> > +eal_lcore_non_eal_release(unsigned int lcore_id)
> > +{
> > +     struct rte_config *cfg = rte_eal_get_configuration();
> > +
> > +     rte_spinlock_lock(&lcore_lock);
> > +     if (cfg->lcore_role[lcore_id] == ROLE_NON_EAL) {
> > +             cfg->lcore_role[lcore_id] = ROLE_OFF;
> > +             cfg->lcore_count--;
> > +     }
> > +     rte_spinlock_unlock(&lcore_lock);
> > +}
> > diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
> > index a7ae0691bf..1cbddc4b5b 100644
> > --- a/lib/librte_eal/common/eal_common_thread.c
> > +++ b/lib/librte_eal/common/eal_common_thread.c
> > @@ -236,3 +236,36 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name,
> >       pthread_join(*thread, NULL);
> >       return -ret;
> >  }
> > +
> > +void
> > +rte_thread_register(void)
> > +{
> > +     unsigned int lcore_id;
> > +     rte_cpuset_t cpuset;
> > +
> > +     /* EAL init flushes all lcores, we can't register before. */
> > +     assert(internal_config.init_complete == 1);
> > +     if (pthread_getaffinity_np(pthread_self(), sizeof(cpuset),
> > +                     &cpuset) != 0)
> > +             CPU_ZERO(&cpuset);
> > +     lcore_id = eal_lcore_non_eal_allocate();
> > +     if (lcore_id >= RTE_MAX_LCORE)
> > +             lcore_id = LCORE_ID_ANY;
> > +     rte_thread_init(lcore_id, &cpuset);
>
> So we just setting affinity to the same value, right?
> Not a big deal, but might be easier to allow rte_thread_init()
> to accept cpuset==NULL (and just don't change thread affinity in that case)

rte_thread_init does not change the thread cpu affinity, it handles
per thread (TLS included) variables initialization.

So do you mean accepting cpuset == NULL and do the getaffinity in this case?
rte_thread_init is EAL private for now.
That saves us some code in this function, but we will call with a !=
NULL cpuset in all other EAL code.
  
Bruce Richardson June 23, 2020, 9:14 a.m. UTC | #4
On Tue, Jun 23, 2020 at 09:49:18AM +0200, David Marchand wrote:
> Hello Konstantin,
> 
> On Mon, Jun 22, 2020 at 5:49 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> > > index 86d32a3dd7..7db05428e7 100644
> > > --- a/lib/librte_eal/common/eal_common_lcore.c
> > > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > > @@ -220,3 +221,38 @@ rte_socket_id_by_idx(unsigned int idx)
> > >       }
> > >       return config->numa_nodes[idx];
> > >  }
> > > +
> > > +static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
> > > +
> > > +unsigned int
> > > +eal_lcore_non_eal_allocate(void)
> > > +{
> > > +     struct rte_config *cfg = rte_eal_get_configuration();
> > > +     unsigned int lcore_id;
> > > +
> > > +     rte_spinlock_lock(&lcore_lock);
> >
> > I think it will break current DPDK MP modes.
> > The problem here - rte_config (and lcore_role[]) is in shared memory,
> > while the lock is local.
> > Simplest way probably to move lcore_lock to rte_config.
> 
> Even before this series, MP has no protection on lcore placing between
> primary and secondary processes.
> Personally, I have no use for DPDK MP and marking MP as not supporting
> this new feature is tempting for a first phase.
> If this is a strong requirement, I can look at it in a second phase.
> What do you think?
> 
I think that is reasonable for a new feature. I suspect those wanting to
dynamically manage their own threads probably do not care about
multi-process mode. 

However, this limitation probably needs to be clearly called out in the
docs.

/Bruce
  
David Marchand June 23, 2020, 12:49 p.m. UTC | #5
On Tue, Jun 23, 2020 at 11:14 AM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Tue, Jun 23, 2020 at 09:49:18AM +0200, David Marchand wrote:
> > Hello Konstantin,
> >
> > On Mon, Jun 22, 2020 at 5:49 PM Ananyev, Konstantin
> > <konstantin.ananyev@intel.com> wrote:
> > > > diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> > > > index 86d32a3dd7..7db05428e7 100644
> > > > --- a/lib/librte_eal/common/eal_common_lcore.c
> > > > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > > > @@ -220,3 +221,38 @@ rte_socket_id_by_idx(unsigned int idx)
> > > >       }
> > > >       return config->numa_nodes[idx];
> > > >  }
> > > > +
> > > > +static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
> > > > +
> > > > +unsigned int
> > > > +eal_lcore_non_eal_allocate(void)
> > > > +{
> > > > +     struct rte_config *cfg = rte_eal_get_configuration();
> > > > +     unsigned int lcore_id;
> > > > +
> > > > +     rte_spinlock_lock(&lcore_lock);
> > >
> > > I think it will break current DPDK MP modes.
> > > The problem here - rte_config (and lcore_role[]) is in shared memory,
> > > while the lock is local.
> > > Simplest way probably to move lcore_lock to rte_config.
> >
> > Even before this series, MP has no protection on lcore placing between
> > primary and secondary processes.
> > Personally, I have no use for DPDK MP and marking MP as not supporting
> > this new feature is tempting for a first phase.
> > If this is a strong requirement, I can look at it in a second phase.
> > What do you think?
> >
> I think that is reasonable for a new feature. I suspect those wanting to
> dynamically manage their own threads probably do not care about
> multi-process mode.
>
> However, this limitation probably needs to be clearly called out in the
> docs.

Again, *disclaimer* I am not a user of the MP feature.
But I suppose users of such a feature are relying on DPDK init and
threads management, and I would not expect them to use this new API.


I will add a note in rte_thread_register() doxygen.
But I wonder if adding some check when a secondary attaches would make sense...
Like how we have a version check, I could "taint" the dpdk primary
process: a variable in shared memory could do the trick.
  
Ananyev, Konstantin June 23, 2020, 1:15 p.m. UTC | #6
Hi David,

> > > diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> > > index 86d32a3dd7..7db05428e7 100644
> > > --- a/lib/librte_eal/common/eal_common_lcore.c
> > > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > > @@ -220,3 +221,38 @@ rte_socket_id_by_idx(unsigned int idx)
> > >       }
> > >       return config->numa_nodes[idx];
> > >  }
> > > +
> > > +static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
> > > +
> > > +unsigned int
> > > +eal_lcore_non_eal_allocate(void)
> > > +{
> > > +     struct rte_config *cfg = rte_eal_get_configuration();
> > > +     unsigned int lcore_id;
> > > +
> > > +     rte_spinlock_lock(&lcore_lock);
> >
> > I think it will break current DPDK MP modes.
> > The problem here - rte_config (and lcore_role[]) is in shared memory,
> > while the lock is local.
> > Simplest way probably to move lcore_lock to rte_config.
> 
> Even before this series, MP has no protection on lcore placing between
> primary and secondary processes.

Agree, it is not a new problem, it has been there for a while.
Though making lcore assignment dynamic will make it more noticeable and harder to avoid.
With static only lcore distribution it is much easier to control things.
 
> Personally, I have no use for DPDK MP and marking MP as not supporting
> this new feature is tempting for a first phase.
> If this is a strong requirement, I can look at it in a second phase.
> What do you think?

In theory it is possible to mark this new API as not supported for MP.
At least for now. Though I think it is sort of temporal solution.
AFAIK, MP is used by customers, so sooner or later someone will hit that problem.
Let say, we do have pdump app/library in our mainline.
As I can see - it will be affected when users will start using this new dynamic lcore API
inside their apps.    

> 
> >
> > > +     for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > > +             if (cfg->lcore_role[lcore_id] != ROLE_OFF)
> > > +                     continue;
> > > +             cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
> > > +             cfg->lcore_count++;
> > > +             break;
> > > +     }
> > > +     if (lcore_id == RTE_MAX_LCORE)
> > > +             RTE_LOG(DEBUG, EAL, "No lcore available.\n");
> > > +     rte_spinlock_unlock(&lcore_lock);
> > > +     return lcore_id;
> > > +}
> > > +
> > > +void
> > > +eal_lcore_non_eal_release(unsigned int lcore_id)
> > > +{
> > > +     struct rte_config *cfg = rte_eal_get_configuration();
> > > +
> > > +     rte_spinlock_lock(&lcore_lock);
> > > +     if (cfg->lcore_role[lcore_id] == ROLE_NON_EAL) {
> > > +             cfg->lcore_role[lcore_id] = ROLE_OFF;
> > > +             cfg->lcore_count--;
> > > +     }
> > > +     rte_spinlock_unlock(&lcore_lock);
> > > +}
> > > diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
> > > index a7ae0691bf..1cbddc4b5b 100644
> > > --- a/lib/librte_eal/common/eal_common_thread.c
> > > +++ b/lib/librte_eal/common/eal_common_thread.c
> > > @@ -236,3 +236,36 @@ rte_ctrl_thread_create(pthread_t *thread, const char *name,
> > >       pthread_join(*thread, NULL);
> > >       return -ret;
> > >  }
> > > +
> > > +void
> > > +rte_thread_register(void)
> > > +{
> > > +     unsigned int lcore_id;
> > > +     rte_cpuset_t cpuset;
> > > +
> > > +     /* EAL init flushes all lcores, we can't register before. */
> > > +     assert(internal_config.init_complete == 1);
> > > +     if (pthread_getaffinity_np(pthread_self(), sizeof(cpuset),
> > > +                     &cpuset) != 0)
> > > +             CPU_ZERO(&cpuset);
> > > +     lcore_id = eal_lcore_non_eal_allocate();
> > > +     if (lcore_id >= RTE_MAX_LCORE)
> > > +             lcore_id = LCORE_ID_ANY;
> > > +     rte_thread_init(lcore_id, &cpuset);
> >
> > So we just setting affinity to the same value, right?
> > Not a big deal, but might be easier to allow rte_thread_init()
> > to accept cpuset==NULL (and just don't change thread affinity in that case)
> 
> rte_thread_init does not change the thread cpu affinity, it handles
> per thread (TLS included) variables initialization.

Right, didn't read the code properly.
Please scratch that comment.

> 
> So do you mean accepting cpuset == NULL and do the getaffinity in this case?
> rte_thread_init is EAL private for now.
> That saves us some code in this function, but we will call with a !=
> NULL cpuset in all other EAL code.
  
Andrew Rybchenko June 23, 2020, 5:02 p.m. UTC | #7
On 6/22/20 4:25 PM, David Marchand wrote:
> DPDK allows calling some parts of its API from a non-EAL thread, but this
> comes with limitations.
> OVS (and other applications) has its own thread management but still
> wants to avoid those limitations; it does so by hacking
> RTE_PER_LCORE(_lcore_id) and faking EAL threads that are potentially
> unknown to some DPDK components.
> 
> Introduce a new API to register non-EAL threads and associate them with a
> free lcore that has a new NON_EAL role.
> This role denotes lcores that do not run the DPDK mainloop and, as such,
> prevents use of rte_eal_wait_lcore() and the like.
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
  
David Marchand June 24, 2020, 9:23 a.m. UTC | #8
On Tue, Jun 23, 2020 at 3:16 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > Even before this series, MP has no protection on lcore placing between
> > primary and secondary processes.
>
> Agree, it is not a new problem, it has been there for a while.
> Though making lcore assignment dynamic will make it more noticeable and harder to avoid.
> With static only lcore distribution it is much easier to control things.
>
> > Personally, I have no use for DPDK MP and marking MP as not supporting
> > this new feature is tempting for a first phase.
> > If this is a strong requirement, I can look at it in a second phase.
> > What do you think?
>
> In theory it is possible to mark this new API as not supported for MP.
> At least for now. Though I think it is sort of temporal solution.
> AFAIK, MP is used by customers, so sooner or later someone will hit that problem.

I understand this argument.
But then we don't see those customers giving feedback.


> Let say, we do have pdump app/library in our mainline.
> As I can see - it will be affected when users will start using this new dynamic lcore API
> inside their apps.

Supporting lcore allocation in MP requires exchanges between
primary/secondary processes like what we have for memory allocations.
It will be quite a beast to get to work fine, while not even knowing
if people actually want to use both.

For v4, I added a check to exclude MP and the new API.
I am still willing to help if people do care about using both features together.
  
Bruce Richardson June 24, 2020, 9:56 a.m. UTC | #9
On Wed, Jun 24, 2020 at 11:23:55AM +0200, David Marchand wrote:
> On Tue, Jun 23, 2020 at 3:16 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > Even before this series, MP has no protection on lcore placing between
> > > primary and secondary processes.
> >
> > Agree, it is not a new problem, it has been there for a while.
> > Though making lcore assignment dynamic will make it more noticeable and harder to avoid.
> > With static only lcore distribution it is much easier to control things.
> >
> > > Personally, I have no use for DPDK MP and marking MP as not supporting
> > > this new feature is tempting for a first phase.
> > > If this is a strong requirement, I can look at it in a second phase.
> > > What do you think?
> >
> > In theory it is possible to mark this new API as not supported for MP.
> > At least for now. Though I think it is sort of temporal solution.
> > AFAIK, MP is used by customers, so sooner or later someone will hit that problem.
> 
> I understand this argument.
> But then we don't see those customers giving feedback.
> 
> 
> > Let say, we do have pdump app/library in our mainline.
> > As I can see - it will be affected when users will start using this new dynamic lcore API
> > inside their apps.
> 
> Supporting lcore allocation in MP requires exchanges between
> primary/secondary processes like what we have for memory allocations.
> It will be quite a beast to get to work fine, while not even knowing
> if people actually want to use both.
> 
> For v4, I added a check to exclude MP and the new API.
> I am still willing to help if people do care about using both features together.

I wonder how much we could simplify DPDK generally if we had to enable a
specific runtime flag to enable multi-process support and it was off by
default. This would break proc_info I think, but maybe we could provide
telemetry callbacks to provide the same data, but beyond that it would just
allow us to know whether a DPDK app is actually using MP, or just running
as a single process.

/Bruce
  
Thomas Monjalon June 24, 2020, 10:08 a.m. UTC | #10
24/06/2020 11:56, Bruce Richardson:
> On Wed, Jun 24, 2020 at 11:23:55AM +0200, David Marchand wrote:
> > On Tue, Jun 23, 2020 at 3:16 PM Ananyev, Konstantin
> > <konstantin.ananyev@intel.com> wrote:
> > > > Even before this series, MP has no protection on lcore placing between
> > > > primary and secondary processes.
> > >
> > > Agree, it is not a new problem, it has been there for a while.
> > > Though making lcore assignment dynamic will make it more noticeable and harder to avoid.
> > > With static only lcore distribution it is much easier to control things.
> > >
> > > > Personally, I have no use for DPDK MP and marking MP as not supporting
> > > > this new feature is tempting for a first phase.
> > > > If this is a strong requirement, I can look at it in a second phase.
> > > > What do you think?
> > >
> > > In theory it is possible to mark this new API as not supported for MP.
> > > At least for now. Though I think it is sort of temporal solution.
> > > AFAIK, MP is used by customers, so sooner or later someone will hit that problem.
> > 
> > I understand this argument.
> > But then we don't see those customers giving feedback.
> > 
> > 
> > > Let say, we do have pdump app/library in our mainline.
> > > As I can see - it will be affected when users will start using this new dynamic lcore API
> > > inside their apps.
> > 
> > Supporting lcore allocation in MP requires exchanges between
> > primary/secondary processes like what we have for memory allocations.
> > It will be quite a beast to get to work fine, while not even knowing
> > if people actually want to use both.
> > 
> > For v4, I added a check to exclude MP and the new API.
> > I am still willing to help if people do care about using both features together.
> 
> I wonder how much we could simplify DPDK generally if we had to enable a
> specific runtime flag to enable multi-process support and it was off by
> default. This would break proc_info I think, but maybe we could provide
> telemetry callbacks to provide the same data, but beyond that it would just
> allow us to know whether a DPDK app is actually using MP, or just running
> as a single process.

Same thought here.
I like the idea of a "mode flag" when multi-process is in use.
Should it be an user explicit flag or an automatic one?
  
Ananyev, Konstantin June 24, 2020, 10:39 a.m. UTC | #11
> 
> On Tue, Jun 23, 2020 at 3:16 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > Even before this series, MP has no protection on lcore placing between
> > > primary and secondary processes.
> >
> > Agree, it is not a new problem, it has been there for a while.
> > Though making lcore assignment dynamic will make it more noticeable and harder to avoid.
> > With static only lcore distribution it is much easier to control things.
> >
> > > Personally, I have no use for DPDK MP and marking MP as not supporting
> > > this new feature is tempting for a first phase.
> > > If this is a strong requirement, I can look at it in a second phase.
> > > What do you think?
> >
> > In theory it is possible to mark this new API as not supported for MP.
> > At least for now. Though I think it is sort of temporal solution.
> > AFAIK, MP is used by customers, so sooner or later someone will hit that problem.
> 
> I understand this argument.
> But then we don't see those customers giving feedback.
> 
> 
> > Let say, we do have pdump app/library in our mainline.
> > As I can see - it will be affected when users will start using this new dynamic lcore API
> > inside their apps.
> 
> Supporting lcore allocation in MP requires exchanges between
> primary/secondary processes like what we have for memory allocations.
> It will be quite a beast to get to work fine, while not even knowing
> if people actually want to use both.

I don't think we need to re-implement RPC as we did for memory subsystem.
One relatively simple approach - move lcore_role[] and related lock into
shared memory (separate memzone or so).
I think it should help a lot and will solve majority of the problems.
One limitation - init/fini callbacks can be static only.
As the drawback, it will introduce change in current behaviour:
secondary process with lcore-mask that intersects with master lcore-mask
will fail to start.
Second approach - make lcore_id local process entity:
prohibit indexing by lcore_id in shared data structures.
Let say for mempool - make cache local (per process).
While that approach is probably more elegant and consistent,
it would require more work and will cause ABI (maybe API also) breakage.
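
Just to illustrate the shape of that first approach, a very rough sketch
(all names below are hypothetical):

  #include <rte_eal.h>
  #include <rte_lcore.h>
  #include <rte_memzone.h>
  #include <rte_spinlock.h>

  struct shared_lcore_state {
  	rte_spinlock_t lock;
  	enum rte_lcore_role_t role[RTE_MAX_LCORE];
  };

  static struct shared_lcore_state *
  shared_lcore_state_get(void)
  {
  	const struct rte_memzone *mz;

  	/* Primary reserves the memzone, secondaries look it up. */
  	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
  		mz = rte_memzone_reserve("lcore_state",
  			sizeof(struct shared_lcore_state),
  			SOCKET_ID_ANY, 0);
  	else
  		mz = rte_memzone_lookup("lcore_state");
  	return mz == NULL ? NULL : mz->addr;
  }

eal_lcore_non_eal_allocate()/release() would then take the lock and update
the role array in this shared structure instead of the process-local
lcore_lock and cfg->lcore_role[].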
 
> For v4, I added a check to exclude MP and the new API.

Do you mean - make this new dynamic-lcore API return an error if callied
from secondary process?

> I am still willing to help if people do care about using both features together.
  
Ananyev, Konstantin June 24, 2020, 10:45 a.m. UTC | #12
> 
> 24/06/2020 11:56, Bruce Richardson:
> > On Wed, Jun 24, 2020 at 11:23:55AM +0200, David Marchand wrote:
> > > On Tue, Jun 23, 2020 at 3:16 PM Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com> wrote:
> > > > > Even before this series, MP has no protection on lcore placing between
> > > > > primary and secondary processes.
> > > >
> > > > Agree, it is not a new problem, it has been there for a while.
> > > > Though making lcore assignment dynamic will make it more noticeable and harder to avoid.
> > > > With static only lcore distribution it is much easier to control things.
> > > >
> > > > > Personally, I have no use for DPDK MP and marking MP as not supporting
> > > > > this new feature is tempting for a first phase.
> > > > > If this is a strong requirement, I can look at it in a second phase.
> > > > > What do you think?
> > > >
> > > > In theory it is possible to mark this new API as not supported for MP.
> > > > At least for now. Though I think it is sort of temporal solution.
> > > > AFAIK, MP is used by customers, so sooner or later someone will hit that problem.
> > >
> > > I understand this argument.
> > > But then we don't see those customers giving feedback.
> > >
> > >
> > > > Let say, we do have pdump app/library in our mainline.
> > > > As I can see - it will be affected when users will start using this new dynamic lcore API
> > > > inside their apps.
> > >
> > > Supporting lcore allocation in MP requires exchanges between
> > > primary/secondary processes like what we have for memory allocations.
> > > It will be quite a beast to get to work fine, while not even knowing
> > > if people actually want to use both.
> > >
> > > For v4, I added a check to exclude MP and the new API.
> > > I am still willing to help if people do care about using both features together.
> >
> > I wonder how much we could simplify DPDK generally if we had to enable a
> > specific runtime flag to enable multi-process support and it was off by
> > default. This would break proc_info I think, but maybe we could provide
> > telemetry callbacks to provide the same data, but beyond that it would just
> > allow us to know whether a DPDK app is actually using MP, or just running
> > as a single process.
> 
> Same thought here.
> I like the idea of a "mode flag" when multi-process is in use.
> Should it be an user explicit flag or an automatic one?

It is probably possible, but that would mean we will need to start splitting
core parts of DPDK code into two paths: standalone/MP, right?
I wonder how much effort it would require in terms of code rework, testing,
maintenance, etc. At first glance it seems quite substantial.
  
David Marchand June 24, 2020, 10:48 a.m. UTC | #13
On Wed, Jun 24, 2020 at 12:40 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > Supporting lcore allocation in MP requires exchanges between
> > primary/secondary processes like what we have for memory allocations.
> > It will be quite a beast to get to work fine, while not even knowing
> > if people actually want to use both.
>
> I don't think we need to re-implement RPC as we did for memory subsystem.
> One relatively simple approach - move lcore_role[] and related lock into
> shared memory (separate memzone or so).
> I think it should help a lot and will solve majority of the problems.
> One limitation - init/fini callbacks can be static only.
> As the drawback, it will introduce change in current behaviour:
> secondary process with lcore-mask that intersects with master lcore-mask
> will fail to start.
> Second approach - make lcore_id local process entity:
> prohibit indexing by lcore_id in shared data structures.
> Let say for mempool - make cache local (per process).
> While that approach is probably more elegant and consistent,
> it would require more work and will cause ABI (maybe API also) breakage.

In all scenarii, this is quite some work.


>
> > For v4, I added a check to exclude MP and the new API.
>
> Do you mean - make this new dynamic-lcore API return an error if callied
> from secondary process?
>

Yes, and prohibiting from attaching a secondary process if dynamic
lcore API has been used in primary.
I intend to squash in patch 6:
https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
  
Ananyev, Konstantin June 24, 2020, 11:59 a.m. UTC | #14
> 
> On Wed, Jun 24, 2020 at 12:40 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > Supporting lcore allocation in MP requires exchanges between
> > > primary/secondary processes like what we have for memory allocations.
> > > It will be quite a beast to get to work fine, while not even knowing
> > > if people actually want to use both.
> >
> > I don't think we need to re-implement RPC as we did for memory subsystem.
> > One relatively simple approach - move lcore_role[] and related lock into
> > shared memory (separate memzone or so).
> > I think it should help a lot and will solve majority of the problems.
> > One limitation - init/fini callbacks can be static only.
> > As the drawback, it will introduce change in current behaviour:
> > secondary process with lcore-mask that intersects with master lcore-mask
> > will fail to start.
> > Second approach - make lcore_id local process entity:
> > prohibit indexing by lcore_id in shared data structures.
> > Let say for mempool - make cache local (per process).
> > While that approach is probably more elegant and consistent,
> > it would require more work and will cause ABI (maybe API also) breakage.
> 
> In all scenarii, this is quite some work.
> 
> 
> >
> > > For v4, I added a check to exclude MP and the new API.
> >
> > Do you mean - make this new dynamic-lcore API return an error if callied
> > from secondary process?
> >
> 
> Yes, and prohibiting from attaching a secondary process if dynamic
> lcore API has been used in primary.
> I intend to squash in patch 6:
> https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee

But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
If we really  want to go ahead with such workaround -
probably better to introduce explicit EAL flag ( --single-process or so).
As Thomas and  Bruce suggested, if I understood them properly.
  
David Marchand June 26, 2020, 2:43 p.m. UTC | #15
On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > from secondary process?
> > >
> >
> > Yes, and prohibiting from attaching a secondary process if dynamic
> > lcore API has been used in primary.
> > I intend to squash in patch 6:
> > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
>
> But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.

If the developer tries to use both features, he gets an ERROR log in
the two init path.
So whatever the order at runtime, we inform the developer (who did not
read/understand the rte_thread_register() documentation) that what he
is doing is unsupported.


> If we really  want to go ahead with such workaround -
> probably better to introduce explicit EAL flag ( --single-process or so).
> As Thomas and  Bruce suggested, if I understood them properly.

A EAL flag is a stable API from the start, as there is nothing
describing how we can remove one.
So a new EAL flag for an experimental API/feature seems contradictory.

Going with a new features status API... I think it is beyond this series.

Thomas seems to suggest an automatic resolution when features conflict
happens.. ?

I'll send the v4, let's discuss it there if you want.
Thanks.
  
Thomas Monjalon June 30, 2020, 10:35 a.m. UTC | #16
26/06/2020 16:43, David Marchand:
> On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > > from secondary process?
> > >
> > > Yes, and prohibiting from attaching a secondary process if dynamic
> > > lcore API has been used in primary.
> > > I intend to squash in patch 6:
> > > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
> >
> > But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
> 
> If the developer tries to use both features, he gets an ERROR log in
> the two init path.
> So whatever the order at runtime, we inform the developer (who did not
> read/understand the rte_thread_register() documentation) that what he
> is doing is unsupported.

I agree.
Before this patch, pinning a thread on a random core can
trigger some issues.
After this patch, register an external thread will
take care of logging errors in case of inconsistencies.
So the user will know he is doing something not supported
by the app.

It is an nice improvement.

> > If we really  want to go ahead with such workaround -

It is not a workaround.
It is fixing some old issues and making clear what is really impossible.

> > probably better to introduce explicit EAL flag ( --single-process or so).
> > As Thomas and  Bruce suggested, if I understood them properly.

No I was thinking to maintain the tri-state information:
	- secondary is possible
	- secondary is attached
	- secondary is forbidden

Asking the user to use an option to forbid attaching a secondary process
is the same as telling him it is forbidden.
The error log is enough in my opinion.

> A EAL flag is a stable API from the start, as there is nothing
> describing how we can remove one.
> So a new EAL flag for an experimental API/feature seems contradictory.
> 
> Going with a new features status API... I think it is beyond this series.
> 
> Thomas seems to suggest an automatic resolution when features conflict
> happens.. ?

I suggest allowing the maximum and raise an error when usage conflicts.
It seems this is what you did in v4.

> I'll send the v4, let's discuss it there if you want.
  
Ananyev, Konstantin June 30, 2020, 12:07 p.m. UTC | #17
> 
> 26/06/2020 16:43, David Marchand:
> > On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
> > <konstantin.ananyev@intel.com> wrote:
> > > > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > > > from secondary process?
> > > >
> > > > Yes, and prohibiting from attaching a secondary process if dynamic
> > > > lcore API has been used in primary.
> > > > I intend to squash in patch 6:
> > > > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
> > >
> > > But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
> >
> > If the developer tries to use both features, he gets an ERROR log in
> > the two init path.
> > So whatever the order at runtime, we inform the developer (who did not
> > read/understand the rte_thread_register() documentation) that what he
> > is doing is unsupported.
> 
> I agree.
> Before this patch, pinning a thread on a random core can
> trigger some issues.
> After this patch, register an external thread will
> take care of logging errors in case of inconsistencies.
> So the user will know he is doing something not supported
> by the app.

I understand that, and return a meaningful error is definitely
better the silent crash or memory corruption.
The problem with that approach, as I said before, MP group
behaviour becomes non-deterministic. 

> 
> It is an nice improvement.
> 
> > > If we really  want to go ahead with such workaround -
> 
> It is not a workaround.
> It is fixing some old issues and making clear what is really impossible.

The root cause of the problem is in our MP model design decisions:
from one side we treat lcore_id as process local data, from other side
in some shared data-structures we use lcore_id as an index.
I think to fix it properly we need either: 
make lcore_id data shared or stop using lcore_id as an index for shared data. 
So from my perspective this approach is just one of possible workarounds.
BTW, there is nothing wrong to have a workaround for the problem
we are not ready to fix right now.
 
> > > probably better to introduce explicit EAL flag ( --single-process or so).
> > > As Thomas and  Bruce suggested, if I understood them properly.
> 
> No I was thinking to maintain the tri-state information:
> 	- secondary is possible
> 	- secondary is attached
> 	- secondary is forbidden

Ok, then I misunderstood you.
 
> Asking the user to use an option to forbid attaching a secondary process
> is the same as telling him it is forbidden.

I don't think it is the same.
On a live and complex system user can't always predict will the primary proc 
use dynamic lcore and if it will at what particular moment.
Same for secondary process launching - user might never start it,
might start it straight after the primary one,
or might be after several hours. 

> The error log is enough in my opinion.

I think it is better than nothing, but probably not the best one.
Apart from possible non-consistent behaviour, it is quite restrictive:
dynamic lcore_id wouldn't be available on any DPDK MP deployment.
Which is a pity - I think it is a cool and useful feature.
 
What do you guys think about different approach:
introduce new optional EAL parameter to restrict lcore_id
values available for the process.

#let say to start primary proc that can use lcore_id=[0-99] only:
dpdk_primary --lcore-allow=0-99 ... --file-prefix=xz1

#to start secondary one for it with allowed lcore_id=[100-109]:
dpdk_secondary --lcore-allow=100-109 ... --file-prefix=xz1 --proc-type=secondary  
 
It is still a workaround, but that way we don't need to
add any new limitations for dynamic lcores and secondary process usage. 
Now it is up to user to decide would multiple-process use the same shared data
and if so - split lcore_id space properly among them
(same as he has to do now with static lcores).

> > A EAL flag is a stable API from the start, as there is nothing
> > describing how we can remove one.
> > So a new EAL flag for an experimental API/feature seems contradictory.
> >
> > Going with a new features status API... I think it is beyond this series.
> >
> > Thomas seems to suggest an automatic resolution when features conflict
> > happens.. ?
> 
> I suggest allowing the maximum and raise an error when usage conflicts.
> It seems this is what you did in v4.
> 
> > I'll send the v4, let's discuss it there if you want.
>
  
Olivier Matz June 30, 2020, 12:44 p.m. UTC | #18
On Tue, Jun 30, 2020 at 12:07:32PM +0000, Ananyev, Konstantin wrote:
> > 
> > 26/06/2020 16:43, David Marchand:
> > > On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com> wrote:
> > > > > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > > > > from secondary process?
> > > > >
> > > > > Yes, and prohibiting from attaching a secondary process if dynamic
> > > > > lcore API has been used in primary.
> > > > > I intend to squash in patch 6:
> > > > > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
> > > >
> > > > But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
> > >
> > > If the developer tries to use both features, he gets an ERROR log in
> > > the two init path.
> > > So whatever the order at runtime, we inform the developer (who did not
> > > read/understand the rte_thread_register() documentation) that what he
> > > is doing is unsupported.
> > 
> > I agree.
> > Before this patch, pinning a thread on a random core can
> > trigger some issues.
> > After this patch, register an external thread will
> > take care of logging errors in case of inconsistencies.
> > So the user will know he is doing something not supported
> > by the app.
> 
> I understand that, and return a meaningful error is definitely
> better the silent crash or memory corruption.
> The problem with that approach, as I said before, MP group
> behaviour becomes non-deterministic. 
> 
> > 
> > It is an nice improvement.
> > 
> > > > If we really  want to go ahead with such workaround -
> > 
> > It is not a workaround.
> > It is fixing some old issues and making clear what is really impossible.
> 
> The root cause of the problem is in our MP model design decisions:
> from one side we treat lcore_id as process local data, from other side
> in some shared data-structures we use lcore_id as an index.
> I think to fix it properly we need either: 
> make lcore_id data shared or stop using lcore_id as an index for shared data. 
> So from my perspective this approach is just one of possible workarounds.
> BTW, there is nothing wrong to have a workaround for the problem
> we are not ready to fix right now.
>  
> > > > probably better to introduce explicit EAL flag ( --single-process or so).
> > > > As Thomas and  Bruce suggested, if I understood them properly.
> > 
> > No I was thinking to maintain the tri-state information:
> > 	- secondary is possible
> > 	- secondary is attached
> > 	- secondary is forbidden
> 
> Ok, then I misunderstood you.
>  
> > Asking the user to use an option to forbid attaching a secondary process
> > is the same as telling him it is forbidden.
> 
> I don't think it is the same.
> On a live and complex system user can't always predict will the primary proc 
> use dynamic lcore and if it will at what particular moment.
> Same for secondary process launching - user might never start it,
> might start it straight after the primary one,
> or might be after several hours. 
> 
> > The error log is enough in my opinion.
> 
> I think it is better than nothing, but probably not the best one.
> Apart from possible non-consistent behaviour, it is quite restrictive:
> dynamic lcore_id wouldn't be available on any DPDK MP deployment.
> Which is a pity - I think it is a cool and useful feature.
>  
> What do you guys think about different approach:
> introduce new optional EAL parameter to restrict lcore_id
> values available for the process.
> 
> #let say to start primary proc that can use lcore_id=[0-99] only:
> dpdk_primary --lcore-allow=0-99 ... --file-prefix=xz1
> 
> #to start secondary one for it with allowed lcore_id=[100-109]:
> dpdk_secondary --lcore-allow=100-109 ... --file-prefix=xz1 --proc-type=secondary  
>  
> It is still a workaround, but that way we don't need to
> add any new limitations for dynamic lcores and secondary process usage. 
> Now it is up to user to decide would multiple-process use the same shared data
> and if so - split lcore_id space properly among them
> (same as he has to do now with static lcores).

A simpler variant of your approach could be to add
"--proc-type=standalone" to explicitly disable MP and enable dynamic thread
registration.



> > > A EAL flag is a stable API from the start, as there is nothing
> > > describing how we can remove one.
> > > So a new EAL flag for an experimental API/feature seems contradictory.
> > >
> > > Going with a new features status API... I think it is beyond this series.
> > >
> > > Thomas seems to suggest an automatic resolution when features conflict
> > > happens.. ?
> > 
> > I suggest allowing the maximum and raise an error when usage conflicts.
> > It seems this is what you did in v4.
> > 
> > > I'll send the v4, let's discuss it there if you want.
> > 
>
  
Thomas Monjalon June 30, 2020, 2:35 p.m. UTC | #19
30/06/2020 14:07, Ananyev, Konstantin:
> > 26/06/2020 16:43, David Marchand:
> > > On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com> wrote:
> > > > > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > > > > from secondary process?
> > > > >
> > > > > Yes, and prohibiting from attaching a secondary process if dynamic
> > > > > lcore API has been used in primary.
> > > > > I intend to squash in patch 6:
> > > > > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
> > > >
> > > > But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
> > >
> > > If the developer tries to use both features, he gets an ERROR log in
> > > the two init path.
> > > So whatever the order at runtime, we inform the developer (who did not
> > > read/understand the rte_thread_register() documentation) that what he
> > > is doing is unsupported.
> > 
> > I agree.
> > Before this patch, pinning a thread on a random core can
> > trigger some issues.
> > After this patch, register an external thread will
> > take care of logging errors in case of inconsistencies.
> > So the user will know he is doing something not supported
> > by the app.
> 
> I understand that, and return a meaningful error is definitely
> better the silent crash or memory corruption.
> The problem with that approach, as I said before, MP group
> behaviour becomes non-deterministic.

It was already non-deterministic before these patches.

> > It is an nice improvement.
> > 
> > > > If we really  want to go ahead with such workaround -
> > 
> > It is not a workaround.
> > It is fixing some old issues and making clear what is really impossible.
> 
> The root cause of the problem is in our MP model design decisions:
> from one side we treat lcore_id as process local data, from other side
> in some shared data-structures we use lcore_id as an index.
> I think to fix it properly we need either: 
> make lcore_id data shared or stop using lcore_id as an index for shared data. 
> So from my perspective this approach is just one of possible workarounds.
> BTW, there is nothing wrong to have a workaround for the problem
> we are not ready to fix right now.

I think you are trying to fix multi-process handling.
This patch is not about multi-process, it only highlights incompatibilities.

> > > > probably better to introduce explicit EAL flag ( --single-process or so).
> > > > As Thomas and  Bruce suggested, if I understood them properly.
> > 
> > No I was thinking to maintain the tri-state information:
> > 	- secondary is possible
> > 	- secondary is attached
> > 	- secondary is forbidden
> 
> Ok, then I misunderstood you.
>  
> > Asking the user to use an option to forbid attaching a secondary process
> > is the same as telling him it is forbidden.
> 
> I don't think it is the same.
> On a live and complex system user can't always predict will the primary proc 
> use dynamic lcore and if it will at what particular moment.
> Same for secondary process launching - user might never start it,
> might start it straight after the primary one,
> or might be after several hours. 

I don't see the difference.
An app which registers external threads is not compatible
with multi-process. It needs to be clear.
If the user tries to do it anyway, there can be some error, OK.

> > The error log is enough in my opinion.
> 
> I think it is better than nothing, but probably not the best one.
> Apart from possible non-consistent behaviour, it is quite restrictive:
> dynamic lcore_id wouldn't be available on any DPDK MP deployment.
> Which is a pity - I think it is a cool and useful feature.

So you are asking to extend the feature.
Honestly, I'm not a fan of multi-process,
so I would not push any feature for it.

If we don't add any new option now, and restrict MP handling
to error messages, it would not prevent us from extending it
in the future, right?


> What do you guys think about different approach:
> introduce new optional EAL parameter to restrict lcore_id
> values available for the process.
> 
> #let say to start primary proc that can use lcore_id=[0-99] only:
> dpdk_primary --lcore-allow=0-99 ... --file-prefix=xz1
> 
> #to start secondary one for it with allowed lcore_id=[100-109]:
> dpdk_secondary --lcore-allow=100-109 ... --file-prefix=xz1 --proc-type=secondary  
>  
> It is still a workaround, but that way we don't need to
> add any new limitations for dynamic lcores and secondary process usage. 
> Now it is up to user to decide would multiple-process use the same shared data
> and if so - split lcore_id space properly among them
> (same as he has to do now with static lcores).

Isn't it pushing too much onto the user?


> > > A EAL flag is a stable API from the start, as there is nothing
> > > describing how we can remove one.
> > > So a new EAL flag for an experimental API/feature seems contradictory.
> > >
> > > Going with a new features status API... I think it is beyond this series.
> > >
> > > Thomas seems to suggest an automatic resolution when features conflict
> > > happens.. ?
> > 
> > I suggest allowing the maximum and raise an error when usage conflicts.
> > It seems this is what you did in v4.
> > 
> > > I'll send the v4, let's discuss it there if you want.
  
Thomas Monjalon June 30, 2020, 2:37 p.m. UTC | #20
30/06/2020 14:44, Olivier Matz:
> On Tue, Jun 30, 2020 at 12:07:32PM +0000, Ananyev, Konstantin wrote:
> > I think it is better than nothing, but probably not the best one.
> > Apart from possible non-consistent behaviour, it is quite restrictive:
> > dynamic lcore_id wouldn't be available on any DPDK MP deployment.
> > Which is a pity - I think it is a cool and useful feature.
> >  
> > What do you guys think about different approach:
> > introduce new optional EAL parameter to restrict lcore_id
> > values available for the process.
> > 
> > #let say to start primary proc that can use lcore_id=[0-99] only:
> > dpdk_primary --lcore-allow=0-99 ... --file-prefix=xz1
> > 
> > #to start secondary one for it with allowed lcore_id=[100-109]:
> > dpdk_secondary --lcore-allow=100-109 ... --file-prefix=xz1 --proc-type=secondary  
> >  
> > It is still a workaround, but that way we don't need to
> > add any new limitations for dynamic lcores and secondary process usage. 
> > Now it is up to user to decide would multiple-process use the same shared data
> > and if so - split lcore_id space properly among them
> > (same as he has to do now with static lcores).
> 
> A variant (more simple) of your approach could be to add
> "--proc-type=standalone" to explicitly disable MP and enable dynamic thread
> registration.

I don't think adding a restriction from user input is adding a feature.
Konstantin wants to support multi-process with non-EAL threads,
which is the opposite of your proposal, Olivier :-)
  
Ananyev, Konstantin June 30, 2020, 6:57 p.m. UTC | #21
> 30/06/2020 14:07, Ananyev, Konstantin:
> > > 26/06/2020 16:43, David Marchand:
> > > > On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com> wrote:
> > > > > > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > > > > > from secondary process?
> > > > > >
> > > > > > Yes, and prohibiting from attaching a secondary process if dynamic
> > > > > > lcore API has been used in primary.
> > > > > > I intend to squash in patch 6:
> > > > > > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
> > > > >
> > > > > But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
> > > >
> > > > If the developer tries to use both features, he gets an ERROR log in
> > > > the two init path.
> > > > So whatever the order at runtime, we inform the developer (who did not
> > > > read/understand the rte_thread_register() documentation) that what he
> > > > is doing is unsupported.
> > >
> > > I agree.
> > > Before this patch, pinning a thread on a random core can
> > > trigger some issues.
> > > After this patch, register an external thread will
> > > take care of logging errors in case of inconsistencies.
> > > So the user will know he is doing something not supported
> > > by the app.
> >
> > I understand that, and return a meaningful error is definitely
> > better the silent crash or memory corruption.
> > The problem with that approach, as I said before, MP group
> > behaviour becomes non-deterministic.
> 
> It was already non-deterministic before these patches.
> > > It is an nice improvement.
> > >
> > > > > If we really  want to go ahead with such workaround -
> > >
> > > It is not a workaround.
> > > It is fixing some old issues and making clear what is really impossible.
> >
> > The root cause of the problem is in our MP model design decisions:
> > from one side we treat lcore_id as process local data, from other side
> > in some shared data-structures we use lcore_id as an index.
> > I think to fix it properly we need either:
> > make lcore_id data shared or stop using lcore_id as an index for shared data.
> > So from my perspective this approach is just one of possible workarounds.
> > BTW, there is nothing wrong to have a workaround for the problem
> > we are not ready to fix right now.
> 
> I think you are trying to fix multi-process handling.
> This patch is not about multi-process, it only highlight incompatibilities.

Yes, the problem has been there for a while.
David's patch just made it more visible.
We are discussing different workarounds for the problem.

> > > > > probably better to introduce explicit EAL flag ( --single-process or so).
> > > > > As Thomas and  Bruce suggested, if I understood them properly.
> > >
> > > No I was thinking to maintain the tri-state information:
> > > 	- secondary is possible
> > > 	- secondary is attached
> > > 	- secondary is forbidden
> >
> > Ok, then I misunderstood you.
> >
> > > Asking the user to use an option to forbid attaching a secondary process
> > > is the same as telling him it is forbidden.
> >
> > I don't think it is the same.
> > On a live and complex system user can't always predict will the primary proc
> > use dynamic lcore and if it will at what particular moment.
> > Same for secondary process launching - user might never start it,
> > might start it straight after the primary one,
> > or might be after several hours.
> 
> I don't see the difference.
> An app which register external threads is not compatible
> with multi-process. It needs to be clear.
> If the user tries to do it anyway, there can be some error, OK.

Copying from the other mail thread:
Imagine the situation: there is a primary proc (supposed to run forever)
that does rte_thread_register/rte_thread_unregister during its lifetime.
Plus, from time to time, the user runs some secondary process to collect
stats/debug the primary one (proc-info or so).
Now the behaviour of such a system will be non-deterministic:
in some runs the primary proc will do rte_thread_register() first,
and then the secondary proc will never be able to attach.
In other cases the secondary will win the race, and then for the primary
eal_lcore_non_eal_allocate() will always fail.
Which means different behaviour between runs, varying performance, etc.

> > > The error log is enough in my opinion.
> >
> > I think it is better than nothing, but probably not the best one.
> > Apart from possible non-consistent behaviour, it is quite restrictive:
> > dynamic lcore_id wouldn't be available on any DPDK MP deployment.
> > Which is a pity - I think it is a cool and useful feature.
> 
> So you are asking to extend the feature.

I am asking for a solution that would guarantee deterministic behaviour to the user.
If dynamic lcores and MP support need to be mutually exclusive,
then there should be a clean way for the user to *always* enable
one and disable the other.
"--proc-type=standalone" will at least guarantee such consistent behaviour between runs:
the secondary proc will always fail to attach and eal_lcore_non_eal_allocate() will always succeed
(as long as there are free lcore_ids, of course).
Though I think it would be even better not to make them mutually exclusive,
but instead let the user split the lcore_id space accordingly.
Let me list the options currently under discussion:

a) New EAL parameter '--lcore-allow=...'
	Explicit EAL parameter to enable dyn-lcores=Y
	Consistent behaviour between runs=Y
	Dyn-lcores/MP-support are mutually exclusive=N

b) Extend the '--proc-type' EAL parameter with a new 'standalone' type
	Explicit EAL parameter to enable dyn-lcores=Y
	Consistent behaviour between runs=Y
	Dyn-lcores/MP-support are mutually exclusive=Y

c) Dynamically allow/forbid dyn-lcores/MP support at runtime
	Explicit EAL parameter to enable dyn-lcores=N
	Consistent behaviour between runs=N
	Dyn-lcores/MP-support are mutually exclusive=Y

My preference list (from top to bottom): a, b, c.

> Honestly, I'm not a fan of multi-process,
> so I would not push any feature for it.

Me too, but as we can't drop it, we probably have no
choice but to live with it. 

> 
> If we don't add any new option now, and restrict MP handling
> to error messages, it would not prevent from extending
> in future, right?

It shouldn't, I think.
Though what is the urgency to push this feature without having an
agreement first?
 
> 
> > What do you guys think about different approach:
> > introduce new optional EAL parameter to restrict lcore_id
> > values available for the process.
> >
> > #let say to start primary proc that can use lcore_id=[0-99] only:
> > dpdk_primary --lcore-allow=0-99 ... --file-prefix=xz1
> >
> > #to start secondary one for it with allowed lcore_id=[100-109]:
> > dpdk_secondary --lcore-allow=100-109 ... --file-prefix=xz1 --proc-type=secondary
> >
> > It is still a workaround, but that way we don't need to
> > add any new limitations for dynamic lcores and secondary process usage.
> > Now it is up to user to decide would multiple-process use the same shared data
> > and if so - split lcore_id space properly among them
> > (same as he has to do now with static lcores).
> 
> Isn't it pushing too much to the user?

The user already has to do a similar split with static lcores right now (roughly as sketched below).
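
For reference, a rough sketch of that existing manual split with static lcores: the primary and secondary are started with non-overlapping core lists so their lcore ids do not collide in shared per-lcore data (process names and the xz1 prefix reused from the example quoted above; core numbers are purely illustrative):

dpdk_primary -l 0-3 --file-prefix=xz1
dpdk_secondary -l 4-5 --file-prefix=xz1 --proc-type=secondary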
 
> 
> > > > A EAL flag is a stable API from the start, as there is nothing
> > > > describing how we can remove one.
> > > > So a new EAL flag for an experimental API/feature seems contradictory.
> > > >
> > > > Going with a new features status API... I think it is beyond this series.
> > > >
> > > > Thomas seems to suggest an automatic resolution when features conflict
> > > > happens.. ?
> > >
> > > I suggest allowing the maximum and raise an error when usage conflicts.
> > > It seems this is what you did in v4.
> > >
> > > > I'll send the v4, let's discuss it there if you want.
> 
>
  
Ananyev, Konstantin June 30, 2020, 7:02 p.m. UTC | #22
> 
> On Tue, Jun 30, 2020 at 12:07:32PM +0000, Ananyev, Konstantin wrote:
> > >
> > > 26/06/2020 16:43, David Marchand:
> > > > On Wed, Jun 24, 2020 at 1:59 PM Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com> wrote:
> > > > > > > Do you mean - make this new dynamic-lcore API return an error if callied
> > > > > > > from secondary process?
> > > > > >
> > > > > > Yes, and prohibiting from attaching a secondary process if dynamic
> > > > > > lcore API has been used in primary.
> > > > > > I intend to squash in patch 6:
> > > > > > https://github.com/david-marchand/dpdk/commit/e5861ee734bfe2e4dc23d9b919b0db2a32a58aee
> > > > >
> > > > > But secondary process can attach before lcore_register, so we'll have some sort of inconsistency in behaviour.
> > > >
> > > > If the developer tries to use both features, he gets an ERROR log in
> > > > the two init path.
> > > > So whatever the order at runtime, we inform the developer (who did not
> > > > read/understand the rte_thread_register() documentation) that what he
> > > > is doing is unsupported.
> > >
> > > I agree.
> > > Before this patch, pinning a thread on a random core can
> > > trigger some issues.
> > > After this patch, register an external thread will
> > > take care of logging errors in case of inconsistencies.
> > > So the user will know he is doing something not supported
> > > by the app.
> >
> > I understand that, and return a meaningful error is definitely
> > better the silent crash or memory corruption.
> > The problem with that approach, as I said before, MP group
> > behaviour becomes non-deterministic.
> >
> > >
> > > It is an nice improvement.
> > >
> > > > > If we really  want to go ahead with such workaround -
> > >
> > > It is not a workaround.
> > > It is fixing some old issues and making clear what is really impossible.
> >
> > The root cause of the problem is in our MP model design decisions:
> > from one side we treat lcore_id as process local data, from other side
> > in some shared data-structures we use lcore_id as an index.
> > I think to fix it properly we need either:
> > make lcore_id data shared or stop using lcore_id as an index for shared data.
> > So from my perspective this approach is just one of possible workarounds.
> > BTW, there is nothing wrong to have a workaround for the problem
> > we are not ready to fix right now.
> >
> > > > > probably better to introduce explicit EAL flag ( --single-process or so).
> > > > > As Thomas and  Bruce suggested, if I understood them properly.
> > >
> > > No I was thinking to maintain the tri-state information:
> > > 	- secondary is possible
> > > 	- secondary is attached
> > > 	- secondary is forbidden
> >
> > Ok, then I misunderstood you.
> >
> > > Asking the user to use an option to forbid attaching a secondary process
> > > is the same as telling him it is forbidden.
> >
> > I don't think it is the same.
> > On a live and complex system user can't always predict will the primary proc
> > use dynamic lcore and if it will at what particular moment.
> > Same for secondary process launching - user might never start it,
> > might start it straight after the primary one,
> > or might be after several hours.
> >
> > > The error log is enough in my opinion.
> >
> > I think it is better than nothing, but probably not the best one.
> > Apart from possible non-consistent behaviour, it is quite restrictive:
> > dynamic lcore_id wouldn't be available on any DPDK MP deployment.
> > Which is a pity - I think it is a cool and useful feature.
> >
> > What do you guys think about different approach:
> > introduce new optional EAL parameter to restrict lcore_id
> > values available for the process.
> >
> > #let say to start primary proc that can use lcore_id=[0-99] only:
> > dpdk_primary --lcore-allow=0-99 ... --file-prefix=xz1
> >
> > #to start secondary one for it with allowed lcore_id=[100-109]:
> > dpdk_secondary --lcore-allow=100-109 ... --file-prefix=xz1 --proc-type=secondary
> >
> > It is still a workaround, but that way we don't need to
> > add any new limitations for dynamic lcores and secondary process usage.
> > Now it is up to user to decide would multiple-process use the same shared data
> > and if so - split lcore_id space properly among them
> > (same as he has to do now with static lcores).
> 
> A variant (more simple) of your approach could be to add
> "--proc-type=standalone" to explicitly disable MP and enable dynamic thread
> registration.
> 

For me it is a bit too restrictive, but yes, it is a possible option,
and from my perspective a better one than disabling secondary proc support on the fly.
I tried to summarize the different options under discussion in another mail of this thread.
Please have a look.

> 
> > > > A EAL flag is a stable API from the start, as there is nothing
> > > > describing how we can remove one.
> > > > So a new EAL flag for an experimental API/feature seems contradictory.
> > > >
> > > > Going with a new features status API... I think it is beyond this series.
> > > >
> > > > Thomas seems to suggest an automatic resolution when features conflict
> > > > happens.. ?
> > >
> > > I suggest allowing the maximum and raise an error when usage conflicts.
> > > It seems this is what you did in v4.
> > >
> > > > I'll send the v4, let's discuss it there if you want.
> > >
> >
  
David Marchand July 1, 2020, 7:48 a.m. UTC | #23
On Tue, Jun 30, 2020 at 8:57 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> Imagine the situation - there is a primary proc (supposed to run forever)
> that does  rte_thread_register/rte_thread_unregister during its lifetime.
> Plus from time to time user runs some secondary process to collect stats/debug
> the primary one (proc-info or so).
> Now behaviour of such system will be non-deterministic:
> In some runs primary proc will do rte_thread_register() first,
> and then secondary proc will be never able to attach.
> In other cases - secondary will win the race, and then for primary
> eal_lcore_non_eal_allocate() will always fail.
> Which means different behaviour between runs, varying performance, etc.

If the final users finally hit the situation you describe, it means
that the multiprocess had been in use so far and was known to be in
use (*hopefully*).
So is it not a problem of design/non-regression testing when
integrating the new API in the first place?


> > If we don't add any new option now, and restrict MP handling
> > to error messages, it would not prevent from extending
> > in future, right?
>
> It shouldn't I think.
> Though what is the urgency to push this feature without having an
> agreement first?

I waited to see others' opinions (and pinged some OVS-DPDK people).
I'd like an agreement too.
  
Ananyev, Konstantin July 1, 2020, 11:58 a.m. UTC | #24
> 
> On Tue, Jun 30, 2020 at 8:57 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > Imagine the situation - there is a primary proc (supposed to run forever)
> > that does  rte_thread_register/rte_thread_unregister during its lifetime.
> > Plus from time to time user runs some secondary process to collect stats/debug
> > the primary one (proc-info or so).
> > Now behaviour of such system will be non-deterministic:
> > In some runs primary proc will do rte_thread_register() first,
> > and then secondary proc will be never able to attach.
> > In other cases - secondary will win the race, and then for primary
> > eal_lcore_non_eal_allocate() will always fail.
> > Which means different behaviour between runs, varying performance, etc.
> 
> If the final users finally hit the situation you describe, it means
> that the multiprocess had been in use so far and was known to be in
> use (*hopefully*).

Yes. 

> So is it not a problem of design/non-regression testing when
> integrating the new API in the first place?

Not sure I understand you here...
If you are saying that for SP benchmarking/testing the current approach
is sufficient, then yes, it is.
Or are you saying it would be hard to create a test case to
reproduce such a problematic scenario?

> 
> > > If we don't add any new option now, and restrict MP handling
> > > to error messages, it would not prevent from extending
> > > in future, right?
> >
> > It shouldn't I think.
> > Though what is the urgency to push this feature without having an
> > agreement first?
> 
> I waited to see others' opinions (and pinged some OVS-DPDK people).
> I'd like an agreement too.
> 
> 
> --
> David Marchand
  
David Marchand July 2, 2020, 1:06 p.m. UTC | #25
On Wed, Jul 1, 2020 at 1:58 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> > If the final users finally hit the situation you describe, it means
> > that the multiprocess had been in use so far and was known to be in
> > use (*hopefully*).
>
> Yes.
>
> > So is it not a problem of design/non-regression testing when
> > integrating the new API in the first place?
>
> Not sure I understand you here...
> If you saying that for SP benchmarking/testing current approach
> is sufficient, then - yes it is.
> Or are you saying it would be hard to create a test-case to
> reproduce such problematic scenario?

I am saying that getting to a problematic scenario that only the final
users hit would be a failure in the development, documentation and
validation of the application.

When the developer integrates this new API, the developer will read
the API description.

- If the limitation on mp is understood and accepted, the application
documentation will be updated to reflect this.
Users can then know mp is not available.
If the users still try to use it, it can be a support issue.
The users will then report to support people who should be aware this
is not supported (the documentation says so).

- If the application needs mp support because of X, Y reasons, the
developer integrating the new API, should complain upstream that the
new API requires mp support.
If the developer does not complain but still uses the API.. well too
bad (or it falls through to the following point).

- The application needs mp support, the developer did not catch the
warning in the API (the kids are home, hard to concentrate)...
The new API will be used for datapath processing threads, so non
regression perf tests will be run.
On the other hand, the application uses mp for X, Y reasons, so there
will be associated test cases.
I can't tell for sure, but I find it hard to believe a validation team
would never do tests that combine both.
  
Thomas Monjalon July 3, 2020, 3:15 p.m. UTC | #26
02/07/2020 15:06, David Marchand:
> On Wed, Jul 1, 2020 at 1:58 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > > If the final users finally hit the situation you describe, it means
> > > that the multiprocess had been in use so far and was known to be in
> > > use (*hopefully*).
> >
> > Yes.
> >
> > > So is it not a problem of design/non-regression testing when
> > > integrating the new API in the first place?
> >
> > Not sure I understand you here...
> > If you saying that for SP benchmarking/testing current approach
> > is sufficient, then - yes it is.
> > Or are you saying it would be hard to create a test-case to
> > reproduce such problematic scenario?
> 
> I am saying that getting to a problematic scenario that only the final
> users get, would be a failure in the development, documentation and
> validation of the application.
> 
> When the developer integrates this new API, the developer will read
> the API description.
> 
> - If the limitation on mp is understood and accepted, the application
> documentation will be updated to reflect this.
> Users can then know mp is not available.
> If the users still try to use it, it can be a support issue.
> The users will then report to support people who should be aware this
> is not supported (the documentation says so).
> 
> - If the application needs mp support because of X, Y reasons, the
> developer integrating the new API, should complain upstream that the
> new API requires mp support.
> If the developer does not complain but still uses the API.. well too
> bad (or it falls through to the following point).
> 
> - The application needs mp support, the developer did not catch the
> warning in the API (the kids are home, hard to concentrate)...
> The new API will be used for datapath processing threads, so non
> regression perf tests will be run.
> On the other hand, the application uses mp for X, Y reasons, so there
> will be associated test cases.
> I can't tell for sure, but I find it hard to believe a validation team
> would never do tests that combine both.

Please let's conclude.

The proposed API for thread registration does not support multi-process.
This limitation is documented and there are some runtime checks.
If this API is used in an application design, it should be clear
that attaching a secondary process is forbidden.
If a user tries to attach a secondary process before the application
registers a thread, then future thread registration will fail.
If a user tries to attach a secondary process after a thread registration,
then the secondary process will fail.
It means that depending on when the user *wrongly* attaches a secondary
process (despite the documented limitation of the application),
the failure will happen in a different context.
I think it is OK.
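
To make the behaviour summarized above concrete, here is a minimal sketch from the application side, assuming the void rte_thread_register()/rte_thread_unregister() API from this series: after the call, rte_lcore_id() staying at LCORE_ID_ANY is the only sign that no lcore id could be assigned (all ids already in use, or the MP checks discussed in this thread refused it). The thread and function names are illustrative.

#include <pthread.h>
#include <stdio.h>

#include <rte_eal.h>
#include <rte_lcore.h>

static void *
app_thread(void *arg)
{
	(void)arg;
	/* Ask EAL to associate a free lcore id with this non-EAL thread. */
	rte_thread_register();
	if (rte_lcore_id() == LCORE_ID_ANY)
		printf("running as an unregistered non-EAL thread\n");
	else
		printf("registered non-EAL thread, lcore id %u\n", rte_lcore_id());

	/* ... application work using per-lcore DPDK facilities ... */

	/* Give the lcore id back when the thread is done with DPDK. */
	rte_thread_unregister();
	return NULL;
}

int
main(int argc, char **argv)
{
	pthread_t tid;

	if (rte_eal_init(argc, argv) < 0)
		return 1;
	/* The application spawns and manages this thread itself. */
	if (pthread_create(&tid, NULL, app_thread, NULL) != 0)
		return 1;
	pthread_join(tid, NULL);
	rte_eal_cleanup();
	return 0;
}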

The alternative is to introduce a new EAL flag or a new API to make sure
the failure will happen when attaching a secondary process,
even if no thread is registered yet.
I think such an addition would be weird from a user or API perspective.

Please note that accepting the thread registration API does not prevent
us from adding an EAL flag or a new API later.
That's why I vote for merging this series.

Acked-by: Thomas Monjalon <thomas@monjalon.net>
  
Ananyev, Konstantin July 3, 2020, 4:40 p.m. UTC | #27
> > > If the final users finally hit the situation you describe, it means
> > > that the multiprocess had been in use so far and was known to be in
> > > use (*hopefully*).
> >
> > Yes.
> >
> > > So is it not a problem of design/non-regression testing when
> > > integrating the new API in the first place?
> >
> > Not sure I understand you here...
> > If you saying that for SP benchmarking/testing current approach
> > is sufficient, then - yes it is.
> > Or are you saying it would be hard to create a test-case to
> > reproduce such problematic scenario?
> 
> I am saying that getting to a problematic scenario that only the final
> users get, would be a failure in the development, documentation and
> validation of the application.
> 
> When the developer integrates this new API, the developer will read
> the API description.
> 
> - If the limitation on mp is understood and accepted, the application
> documentation will be updated to reflect this.
> Users can then know mp is not available.
> If the users still try to use it, it can be a support issue.
> The users will then report to support people who should be aware this
> is not supported (the documentation says so).
> 
> - If the application needs mp support because of X, Y reasons, the
> developer integrating the new API, should complain upstream that the
> new API requires mp support.
> If the developer does not complain but still uses the API.. well too
> bad (or it falls through to the following point).
> 
> - The application needs mp support, the developer did not catch the
> warning in the API (the kids are home, hard to concentrate)...
> The new API will be used for datapath processing threads, so non
> regression perf tests will be run.
> On the other hand, the application uses mp for X, Y reasons, so there
> will be associated test cases.
> I can't tell for sure, but I find it hard to believe a validation team
> would never do tests that combine both.

If there is one team (or organization) doing everything:
development, deployment and support - then yes, things
are more or less straightforward.
Though it is not always the case - one app (or lib) can be used in
dozens of different deployments by various organizations, and
the development team might not always be aware of all
possible deployment and usage scenarios.
Let's say, right now dpdk app/proc-info can be used with any
primary dpdk proc, even if it was designed as a standalone app.
With this patch, it will not be the case anymore.
It might be ok by itself, but what looks like a problem to me is that
there is no clear and easy way for the user to determine
when he can use proc-info on a given dpdk proc
without causing problems for this app, and when he can't.
Actually, sort of an opposite question -
what are the advantages of the current approach (forbidding MP support on the fly)
over explicit start-up parameters (either --proc-type=single or --lcore-allow=...)?
Why do you think it is a better one?
  
David Marchand July 4, 2020, 3 p.m. UTC | #28
On Fri, Jul 3, 2020 at 6:40 PM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
> what are the advantages of current approach (forbid MP support on the fly)
> over explicit start-up parameters (either --proc-type=single or --lcore-allow=...)?
> Why do you think it is a better one?

I don't want to perpetuate the "please carefully set your command line" habit.
This feature is added through a C API, with documentation, and flagged
as experimental; it should be enough.

How about moving the MP disabling out of rte_thread_register() into a separate API?
Then a developer must call rte_mp_disable() before attempting
rte_thread_register().
That would be equivalent to --proc-type=single.
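
A sketch of the ordering this proposal gives; rte_mp_disable() is only the API name proposed in this exchange, its exact prototype is an assumption here, and the helper below is illustrative:

#include <rte_eal.h>
#include <rte_lcore.h>

static int
app_eal_setup(int argc, char **argv)
{
	if (rte_eal_init(argc, argv) < 0)
		return -1;
	/* Explicitly give up multi-process support right after init, in the
	 * spirit of --proc-type=single: no secondary process can attach from
	 * this point on, so later rte_thread_register() calls cannot race
	 * with one. */
	rte_mp_disable();
	return 0;
}

/* Application-managed threads then call rte_thread_register() before using
 * per-lcore DPDK resources, and rte_thread_unregister() when done. */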

Why not convert lcore-allow into an API?
This would force us to put something in the init so that external
users see how the application has been started and adjust the
secondary command line.
On the other hand, rte_mp_disable() is easy to do with my current v4, and
I am running out of time for rc1.


We can still revisit this experimental API.
  
Ananyev, Konstantin July 4, 2020, 9:24 p.m. UTC | #29
> 
> On Fri, Jul 3, 2020 at 6:40 PM Ananyev, Konstantin
> <konstantin.ananyev@intel.com> wrote:
> > what are the advantages of current approach (forbid MP support on the fly)
> > over explicit start-up parameters (either --proc-type=single or --lcore-allow=...)?
> > Why do you think it is a better one?
> 
> I don't want to perpetuate the "please carefully set your command line" habit.
> This feature is added through a C API, with documentation and flagged
> as experimental, it should be enough.
> 
> How about moving the mp disable in rte_thread_register() as a separate API?
> Then a developer must call rte_mp_disable() before attempting
> rte_thread_register().
> That would be equivalent to --proc-type=single.

It wouldn't be exactly the same thing, but yes,
I agree the user can call it as the first thing after rte_eal_init(),
and it should help prevent inconsistent behaviour.
I think it is a good compromise.

> 
> Why not convert lcore-allow into an API?
> This would force us to put something in the init so that external
> users see how the application has been started and adjust the
> secondary commandline.

Not sure I understand you here...
If we make lcore-allow dynamic, it is basically the same as moving
lcore_role[] (or some similar structure) into shared memory.
I am ok with that, but I thought you stated that it would require
quite a lot of work. 

> On the other hand, rte_mp_disable() is easy to do with my current v4 +
> I am running out of time for rc1.

Yes, as I said above, such an approach seems good enough to me
(at least for now).

Konstantin
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 816696caf2..fe9e74ffbc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -183,6 +183,7 @@  F: app/test/test_debug.c
 F: app/test/test_eal*
 F: app/test/test_errno.c
 F: app/test/test_interrupts.c
+F: app/test/test_lcores.c
 F: app/test/test_logs.c
 F: app/test/test_memcpy*
 F: app/test/test_per_lcore.c
diff --git a/app/test/Makefile b/app/test/Makefile
index 7b96a03a64..4a8dea2425 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -97,6 +97,7 @@  SRCS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += test_flow_classify.c
 endif
 
 SRCS-y += test_rwlock.c
+SRCS-y += test_lcores.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack.c
 SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack_perf.c
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index fc3fcc159e..600b130966 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -62,6 +62,12 @@ 
         "Func":    rwlock_autotest,
         "Report":  None,
     },
+    {
+        "Name":    "Lcores autotest",
+        "Command": "lcores_autotest",
+        "Func":    default_autotest,
+        "Report":  None,
+    },
     {
         "Name":    "Logs autotest",
         "Command": "logs_autotest",
diff --git a/app/test/meson.build b/app/test/meson.build
index 5233ead46e..a57477b7cc 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -67,6 +67,7 @@  test_sources = files('commands.c',
 	'test_ipsec_perf.c',
 	'test_kni.c',
 	'test_kvargs.c',
+	'test_lcores.c',
 	'test_logs.c',
 	'test_lpm.c',
 	'test_lpm6.c',
@@ -206,6 +207,7 @@  fast_tests = [
         ['hash_autotest', true],
         ['interrupt_autotest', true],
         ['ipfrag_autotest', false],
+        ['lcores_autotest', true],
         ['logs_autotest', true],
         ['lpm_autotest', true],
         ['lpm6_autotest', true],
diff --git a/app/test/test_lcores.c b/app/test/test_lcores.c
new file mode 100644
index 0000000000..864bcbade7
--- /dev/null
+++ b/app/test/test_lcores.c
@@ -0,0 +1,139 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Red Hat, Inc.
+ */
+
+#include <pthread.h>
+#include <string.h>
+
+#include <rte_lcore.h>
+
+#include "test.h"
+
+struct thread_context {
+	enum { INIT, ERROR, DONE } state;
+	bool lcore_id_any;
+	pthread_t id;
+	unsigned int *registered_count;
+};
+static void *thread_loop(void *arg)
+{
+	struct thread_context *t = arg;
+	unsigned int lcore_id;
+
+	lcore_id = rte_lcore_id();
+	if (lcore_id != LCORE_ID_ANY) {
+		printf("Incorrect lcore id for new thread %u\n", lcore_id);
+		t->state = ERROR;
+	}
+	rte_thread_register();
+	lcore_id = rte_lcore_id();
+	if ((t->lcore_id_any && lcore_id != LCORE_ID_ANY) ||
+			(!t->lcore_id_any && lcore_id == LCORE_ID_ANY)) {
+		printf("Could not register new thread, got %u while %sexpecting %u\n",
+			lcore_id, t->lcore_id_any ? "" : "not ", LCORE_ID_ANY);
+		t->state = ERROR;
+	}
+	/* Report register happened to the control thread. */
+	__atomic_add_fetch(t->registered_count, 1, __ATOMIC_RELEASE);
+
+	/* Wait for release from the control thread. */
+	while (__atomic_load_n(t->registered_count, __ATOMIC_ACQUIRE) != 0)
+		;
+	rte_thread_unregister();
+	lcore_id = rte_lcore_id();
+	if (lcore_id != LCORE_ID_ANY) {
+		printf("Could not unregister new thread, %u still assigned\n",
+			lcore_id);
+		t->state = ERROR;
+	}
+
+	if (t->state != ERROR)
+		t->state = DONE;
+
+	return NULL;
+}
+
+static int
+test_non_eal_lcores(unsigned int eal_threads_count)
+{
+	struct thread_context thread_contexts[RTE_MAX_LCORE];
+	unsigned int non_eal_threads_count;
+	unsigned int registered_count;
+	struct thread_context *t;
+	unsigned int i;
+	int ret;
+
+	non_eal_threads_count = 0;
+	registered_count = 0;
+
+	/* Try to create as many threads as possible. */
+	for (i = 0; i < RTE_MAX_LCORE - eal_threads_count; i++) {
+		t = &thread_contexts[i];
+		t->state = INIT;
+		t->registered_count = &registered_count;
+		t->lcore_id_any = false;
+		if (pthread_create(&t->id, NULL, thread_loop, t) != 0)
+			break;
+		non_eal_threads_count++;
+	}
+	printf("non-EAL threads count: %u\n", non_eal_threads_count);
+	/* Wait all non-EAL threads to register. */
+	while (__atomic_load_n(&registered_count, __ATOMIC_ACQUIRE) !=
+			non_eal_threads_count)
+		;
+
+	/* We managed to create the max number of threads, let's try to create
+	 * one more. This will allow one more check.
+	 */
+	if (eal_threads_count + non_eal_threads_count < RTE_MAX_LCORE)
+		goto skip_lcore_any;
+	t = &thread_contexts[non_eal_threads_count];
+	t->state = INIT;
+	t->registered_count = &registered_count;
+	t->lcore_id_any = true;
+	if (pthread_create(&t->id, NULL, thread_loop, t) == 0) {
+		non_eal_threads_count++;
+		printf("non-EAL threads count: %u\n", non_eal_threads_count);
+		while (__atomic_load_n(&registered_count, __ATOMIC_ACQUIRE) !=
+				non_eal_threads_count)
+			;
+	}
+
+skip_lcore_any:
+	/* Release all threads, and check their states. */
+	__atomic_store_n(&registered_count, 0, __ATOMIC_RELEASE);
+	ret = 0;
+	for (i = 0; i < non_eal_threads_count; i++) {
+		t = &thread_contexts[i];
+		pthread_join(t->id, NULL);
+		if (t->state != DONE)
+			ret = -1;
+	}
+
+	return ret;
+}
+
+static int
+test_lcores(void)
+{
+	unsigned int eal_threads_count = 0;
+	unsigned int i;
+
+	for (i = 0; i < RTE_MAX_LCORE; i++) {
+		if (!rte_lcore_has_role(i, ROLE_OFF))
+			eal_threads_count++;
+	}
+	if (eal_threads_count == 0) {
+		printf("Something is broken, no EAL thread detected.\n");
+		return TEST_FAILED;
+	}
+	printf("EAL threads count: %u, RTE_MAX_LCORE=%u\n", eal_threads_count,
+		RTE_MAX_LCORE);
+
+	if (test_non_eal_lcores(eal_threads_count) < 0)
+		return TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+REGISTER_TEST_COMMAND(lcores_autotest, test_lcores);
diff --git a/doc/guides/howto/debug_troubleshoot.rst b/doc/guides/howto/debug_troubleshoot.rst
index cef016b2fe..5a46f5fba3 100644
--- a/doc/guides/howto/debug_troubleshoot.rst
+++ b/doc/guides/howto/debug_troubleshoot.rst
@@ -307,8 +307,9 @@  Custom worker function :numref:`dtg_distributor_worker`.
 
 #. Configuration issue isolation
 
-   * Identify core role using ``rte_eal_lcore_role`` to identify RTE, OFF and
-     SERVICE. Check performance functions are mapped to run on the cores.
+   * Identify core role using ``rte_eal_lcore_role`` to identify RTE, OFF,
+     SERVICE and NON_EAL. Check performance functions are mapped to run on the
+     cores.
 
    * For high-performance execution logic ensure running it on correct NUMA
      and non-master core.
diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 48a2fec066..f64ae953d1 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -564,9 +564,13 @@  It's also compatible with the pattern of corelist('-l') option.
 non-EAL pthread support
 ~~~~~~~~~~~~~~~~~~~~~~~
 
-It is possible to use the DPDK execution context with any user pthread (aka. Non-EAL pthreads).
-In a non-EAL pthread, the *_lcore_id* is always LCORE_ID_ANY which identifies that it is not an EAL thread with a valid, unique, *_lcore_id*.
-Some libraries will use an alternative unique ID (e.g. TID), some will not be impacted at all, and some will work but with limitations (e.g. timer and mempool libraries).
+It is possible to use the DPDK execution context with any user pthread (aka. non-EAL pthreads).
+There are two kinds of non-EAL pthreads:
+
+- a registered non-EAL pthread with a valid *_lcore_id* that was successfully assigned by calling ``rte_thread_register()``,
+- a non registered non-EAL pthread with a LCORE_ID_ANY,
+
+For non registered non-EAL pthread (with a LCORE_ID_ANY *_lcore_id*), some libraries will use an alternative unique ID (e.g. TID), some will not be impacted at all, and some will work but with limitations (e.g. timer and mempool libraries).
 
 All these impacts are mentioned in :ref:`known_issue_label` section.
 
@@ -613,9 +617,9 @@  Known Issues
 + rte_mempool
 
   The rte_mempool uses a per-lcore cache inside the mempool.
-  For non-EAL pthreads, ``rte_lcore_id()`` will not return a valid number.
-  So for now, when rte_mempool is used with non-EAL pthreads, the put/get operations will bypass the default mempool cache and there is a performance penalty because of this bypass.
-  Only user-owned external caches can be used in a non-EAL context in conjunction with ``rte_mempool_generic_put()`` and ``rte_mempool_generic_get()`` that accept an explicit cache parameter.
+  For unregistered non-EAL pthreads, ``rte_lcore_id()`` will not return a valid number.
+  So for now, when rte_mempool is used with unregistered non-EAL pthreads, the put/get operations will bypass the default mempool cache and there is a performance penalty because of this bypass.
+  Only user-owned external caches can be used in an unregistered non-EAL context in conjunction with ``rte_mempool_generic_put()`` and ``rte_mempool_generic_get()`` that accept an explicit cache parameter.
 
 + rte_ring
 
@@ -660,15 +664,15 @@  Known Issues
 
 + rte_timer
 
-  Running  ``rte_timer_manage()`` on a non-EAL pthread is not allowed. However, resetting/stopping the timer from a non-EAL pthread is allowed.
+  Running  ``rte_timer_manage()`` on an unregistered non-EAL pthread is not allowed. However, resetting/stopping the timer from a non-EAL pthread is allowed.
 
 + rte_log
 
-  In non-EAL pthreads, there is no per thread loglevel and logtype, global loglevels are used.
+  In unregistered non-EAL pthreads, there is no per thread loglevel and logtype, global loglevels are used.
 
 + misc
 
-  The debug statistics of rte_ring, rte_mempool and rte_timer are not supported in a non-EAL pthread.
+  The debug statistics of rte_ring, rte_mempool and rte_timer are not supported in an unregistered non-EAL pthread.
 
 cgroup control
 ~~~~~~~~~~~~~~
diff --git a/doc/guides/prog_guide/mempool_lib.rst b/doc/guides/prog_guide/mempool_lib.rst
index f8b430d656..e3e1f940be 100644
--- a/doc/guides/prog_guide/mempool_lib.rst
+++ b/doc/guides/prog_guide/mempool_lib.rst
@@ -103,7 +103,7 @@  The maximum size of the cache is static and is defined at compilation time (CONF
 Alternatively to the internal default per-lcore local cache, an application can create and manage external caches through the ``rte_mempool_cache_create()``, ``rte_mempool_cache_free()`` and ``rte_mempool_cache_flush()`` calls.
 These user-owned caches can be explicitly passed to ``rte_mempool_generic_put()`` and ``rte_mempool_generic_get()``.
 The ``rte_mempool_default_cache()`` call returns the default internal cache if any.
-In contrast to the default caches, user-owned caches can be used by non-EAL threads too.
+In contrast to the default caches, user-owned caches can be used by unregistered non-EAL threads too.
 
 Mempool Handlers
 ------------------------
diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index 86d32a3dd7..7db05428e7 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -6,12 +6,13 @@ 
 #include <limits.h>
 #include <string.h>
 
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
 #include <rte_common.h>
 #include <rte_debug.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_spinlock.h>
 
 #include "eal_private.h"
 #include "eal_thread.h"
@@ -220,3 +221,38 @@  rte_socket_id_by_idx(unsigned int idx)
 	}
 	return config->numa_nodes[idx];
 }
+
+static rte_spinlock_t lcore_lock = RTE_SPINLOCK_INITIALIZER;
+
+unsigned int
+eal_lcore_non_eal_allocate(void)
+{
+	struct rte_config *cfg = rte_eal_get_configuration();
+	unsigned int lcore_id;
+
+	rte_spinlock_lock(&lcore_lock);
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		if (cfg->lcore_role[lcore_id] != ROLE_OFF)
+			continue;
+		cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
+		cfg->lcore_count++;
+		break;
+	}
+	if (lcore_id == RTE_MAX_LCORE)
+		RTE_LOG(DEBUG, EAL, "No lcore available.\n");
+	rte_spinlock_unlock(&lcore_lock);
+	return lcore_id;
+}
+
+void
+eal_lcore_non_eal_release(unsigned int lcore_id)
+{
+	struct rte_config *cfg = rte_eal_get_configuration();
+
+	rte_spinlock_lock(&lcore_lock);
+	if (cfg->lcore_role[lcore_id] == ROLE_NON_EAL) {
+		cfg->lcore_role[lcore_id] = ROLE_OFF;
+		cfg->lcore_count--;
+	}
+	rte_spinlock_unlock(&lcore_lock);
+}
diff --git a/lib/librte_eal/common/eal_common_thread.c b/lib/librte_eal/common/eal_common_thread.c
index a7ae0691bf..1cbddc4b5b 100644
--- a/lib/librte_eal/common/eal_common_thread.c
+++ b/lib/librte_eal/common/eal_common_thread.c
@@ -236,3 +236,36 @@  rte_ctrl_thread_create(pthread_t *thread, const char *name,
 	pthread_join(*thread, NULL);
 	return -ret;
 }
+
+void
+rte_thread_register(void)
+{
+	unsigned int lcore_id;
+	rte_cpuset_t cpuset;
+
+	/* EAL init flushes all lcores, we can't register before. */
+	assert(internal_config.init_complete == 1);
+	if (pthread_getaffinity_np(pthread_self(), sizeof(cpuset),
+			&cpuset) != 0)
+		CPU_ZERO(&cpuset);
+	lcore_id = eal_lcore_non_eal_allocate();
+	if (lcore_id >= RTE_MAX_LCORE)
+		lcore_id = LCORE_ID_ANY;
+	rte_thread_init(lcore_id, &cpuset);
+	if (lcore_id != LCORE_ID_ANY)
+		RTE_LOG(DEBUG, EAL, "Registered non-EAL thread as lcore %u.\n",
+			lcore_id);
+}
+
+void
+rte_thread_unregister(void)
+{
+	unsigned int lcore_id = rte_lcore_id();
+
+	if (lcore_id != LCORE_ID_ANY)
+		eal_lcore_non_eal_release(lcore_id);
+	rte_thread_uninit();
+	if (lcore_id != LCORE_ID_ANY)
+		RTE_LOG(DEBUG, EAL, "Unregistered non-EAL thread (was lcore %u).\n",
+			lcore_id);
+}
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 0592fcd694..73238ff157 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -396,6 +396,24 @@  uint64_t get_tsc_freq(void);
  */
 uint64_t get_tsc_freq_arch(void);
 
+/**
+ * Allocate a free lcore to associate to a non-EAL thread.
+ *
+ * @return
+ *   - the id of a lcore with role ROLE_NON_EAL on success.
+ *   - RTE_MAX_LCORE if none was available.
+ */
+unsigned int eal_lcore_non_eal_allocate(void);
+
+/**
+ * Release the lcore used by a non-EAL thread.
+ * Counterpart of eal_lcore_non_eal_allocate().
+ *
+ * @param lcore_id
+ *   The lcore with role ROLE_NON_EAL to release.
+ */
+void eal_lcore_non_eal_release(unsigned int lcore_id);
+
 /**
  * Prepare physical memory mapping
  * i.e. hugepages on Linux and
diff --git a/lib/librte_eal/include/rte_lcore.h b/lib/librte_eal/include/rte_lcore.h
index 3968c40693..ea86220394 100644
--- a/lib/librte_eal/include/rte_lcore.h
+++ b/lib/librte_eal/include/rte_lcore.h
@@ -31,6 +31,7 @@  enum rte_lcore_role_t {
 	ROLE_RTE,
 	ROLE_OFF,
 	ROLE_SERVICE,
+	ROLE_NON_EAL,
 };
 
 /**
@@ -67,7 +68,8 @@  rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role);
  *   to run threads with lcore IDs 0, 1, 2 and 3 on physical core 10..
  *
  * @return
- *  Logical core ID (in EAL thread) or LCORE_ID_ANY (in non-EAL thread)
+ *  Logical core ID (in EAL thread or registered non-EAL thread) or
+ *  LCORE_ID_ANY (in unregistered non-EAL thread)
  */
 static inline unsigned
 rte_lcore_id(void)
@@ -279,6 +281,20 @@  int rte_thread_setname(pthread_t id, const char *name);
 __rte_experimental
 int rte_thread_getname(pthread_t id, char *name, size_t len);
 
+/**
+ * Register current non-EAL thread as a lcore.
+ */
+__rte_experimental
+void
+rte_thread_register(void);
+
+/**
+ * Unregister current thread and release lcore if one was associated.
+ */
+__rte_experimental
+void
+rte_thread_unregister(void);
+
 /**
  * Create a control thread.
  *
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 5831eea4b0..39c41d445d 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -396,6 +396,8 @@  EXPERIMENTAL {
 
 	# added in 20.08
 	__rte_trace_mem_per_thread_free;
+	rte_thread_register;
+	rte_thread_unregister;
 };
 
 INTERNAL {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 652d19f9f1..9e0ee052b3 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -28,9 +28,9 @@ 
  * rte_mempool_get() or rte_mempool_put() are designed to be called from an EAL
  * thread due to the internal per-lcore cache. Due to the lack of caching,
  * rte_mempool_get() or rte_mempool_put() performance will suffer when called
- * by non-EAL threads. Instead, non-EAL threads should call
- * rte_mempool_generic_get() or rte_mempool_generic_put() with a user cache
- * created with rte_mempool_cache_create().
+ * by unregistered non-EAL threads. Instead, unregistered non-EAL threads
+ * should call rte_mempool_generic_get() or rte_mempool_generic_put() with a
+ * user cache created with rte_mempool_cache_create().
  */
 
 #include <stdio.h>
@@ -1233,7 +1233,7 @@  void rte_mempool_dump(FILE *f, struct rte_mempool *mp);
 /**
  * Create a user-owned mempool cache.
  *
- * This can be used by non-EAL threads to enable caching when they
+ * This can be used by unregistered non-EAL threads to enable caching when they
  * interact with a mempool.
  *
  * @param size
@@ -1264,7 +1264,8 @@  rte_mempool_cache_free(struct rte_mempool_cache *cache);
  * @param lcore_id
  *   The logical core id.
  * @return
- *   A pointer to the mempool cache or NULL if disabled or non-EAL thread.
+ *   A pointer to the mempool cache or NULL if disabled or unregistered non-EAL
+ *   thread.
  */
 static __rte_always_inline struct rte_mempool_cache *
 rte_mempool_default_cache(struct rte_mempool *mp, unsigned lcore_id)