[v3,2/7] eal/interrupts: implement get set APIs

Message ID 20211018193707.123559-3-hkalra@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series make rte_intr_handle internal |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Harman Kalra Oct. 18, 2021, 7:37 p.m. UTC
  Prototype/Implement get set APIs for interrupt handle fields.
Users won't be able to access any of the interrupt handle fields
directly; instead they should use these get/set APIs to access and
manipulate them.

Internal interrupt header i.e. rte_eal_interrupt.h is rearranged,
as APIs defined are moved to rte_interrupts.h and epoll specific
definitions are moved to a new header rte_epoll.h.
Later in the series rte_eal_interrupt.h will be removed.

Signed-off-by: Harman Kalra <hkalra@marvell.com>
Acked-by: Ray Kinsella <mdr@ashroe.eu>
---
 MAINTAINERS                            |   1 +
 lib/eal/common/eal_common_interrupts.c | 410 ++++++++++++++++
 lib/eal/common/meson.build             |   1 +
 lib/eal/include/meson.build            |   1 +
 lib/eal/include/rte_eal_interrupts.h   | 209 +--------
 lib/eal/include/rte_epoll.h            | 118 +++++
 lib/eal/include/rte_interrupts.h       | 622 ++++++++++++++++++++++++-
 lib/eal/version.map                    |  47 +-
 8 files changed, 1195 insertions(+), 214 deletions(-)
 create mode 100644 lib/eal/common/eal_common_interrupts.c
 create mode 100644 lib/eal/include/rte_epoll.h
  

Comments

Dmitry Kozlyuk Oct. 18, 2021, 10:07 p.m. UTC | #1
2021-10-19 01:07 (UTC+0530), Harman Kalra:
[...]
> +struct rte_intr_handle *rte_intr_instance_alloc(void)
> +{
> +	struct rte_intr_handle *intr_handle;
> +	bool mem_allocator;

This name is not very descriptive; what would "mem_allocator is false" mean?
How about "is_rte_memory"?

> +
> +	/* Detect if DPDK malloc APIs are ready to be used. */
> +	mem_allocator = rte_malloc_is_ready();
> +	if (mem_allocator)
> +		intr_handle = rte_zmalloc(NULL, sizeof(struct rte_intr_handle),
> +					  0);
> +	else
> +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));
> +	if (!intr_handle) {
> +		RTE_LOG(ERR, EAL, "Fail to allocate intr_handle\n");
> +		rte_errno = ENOMEM;
> +		return NULL;
> +	}
> +
> +	intr_handle->nb_intr = RTE_MAX_RXTX_INTR_VEC_ID;
> +	intr_handle->mem_allocator = mem_allocator;
> +
> +	return intr_handle;
> +}
> +
> +int rte_intr_instance_copy(struct rte_intr_handle *intr_handle,
> +			   const struct rte_intr_handle *src)
> +{
> +	uint16_t nb_intr;
> +
> +	CHECK_VALID_INTR_HANDLE(intr_handle);
> +
> +	if (src == NULL) {
> +		RTE_LOG(ERR, EAL, "Source interrupt instance unallocated\n");
> +		rte_errno = EINVAL;
> +		goto fail;
> +	}
> +
> +	intr_handle->fd = src->fd;
> +	intr_handle->vfio_dev_fd = src->vfio_dev_fd;
> +	intr_handle->type = src->type;
> +	intr_handle->max_intr = src->max_intr;
> +	intr_handle->nb_efd = src->nb_efd;
> +	intr_handle->efd_counter_size = src->efd_counter_size;
> +
> +	nb_intr = RTE_MIN(src->nb_intr, intr_handle->nb_intr);

Truncating copy is error-prone.
It should be either a reallocation (in the future) or an error (now).

> +	memcpy(intr_handle->efds, src->efds, nb_intr);
> +	memcpy(intr_handle->elist, src->elist, nb_intr);
> +
> +	return 0;
> +fail:
> +	return -rte_errno;
> +}
> +
> +void rte_intr_instance_free(struct rte_intr_handle *intr_handle)
> +{
> +	if (intr_handle->mem_allocator)

This function should accept NULL and be a no-op in such case.

> +		rte_free(intr_handle);
> +	else
> +		free(intr_handle);
> +}

[...]
> +void *rte_intr_instance_windows_handle_get(struct rte_intr_handle *intr_handle)
> +{
> +	CHECK_VALID_INTR_HANDLE(intr_handle);
> +
> +	return intr_handle->windows_handle;
> +fail:
> +	return NULL;
> +}
> +
> +int rte_intr_instance_windows_handle_set(struct rte_intr_handle *intr_handle,
> +					 void *windows_handle)
> +{
> +	CHECK_VALID_INTR_HANDLE(intr_handle);
> +
> +	if (!windows_handle) {
> +		RTE_LOG(ERR, EAL, "Windows handle should not be NULL\n");
> +		rte_errno = EINVAL;
> +		goto fail;
> +	}

Thanks for adding this API, but please remove the check.
It is possible that the API user will pass NULL to reset the state
(also NULL is not the only invalid value for a Windows handle).
There is no check for Unix FD, neither should be here.

> +
> +	intr_handle->windows_handle = windows_handle;
> +
> +	return 0;
> +fail:
> +	return -rte_errno;
> +}

[...]
> @@ -79,191 +53,20 @@ struct rte_intr_handle {
>  			};
>  			int fd;	/**< interrupt event file descriptor */
>  		};
> -		void *handle; /**< device driver handle (Windows) */
> +		void *windows_handle; /**< device driver handle (Windows) */

I guess Windows can be dropped from the comment since it's now in the name.

>  	};
> +	bool mem_allocator;
>  	enum rte_intr_handle_type type;  /**< handle type */
>  	uint32_t max_intr;             /**< max interrupt requested */
>  	uint32_t nb_efd;               /**< number of available efd(event fd) */
>  	uint8_t efd_counter_size;      /**< size of efd counter, used for vdev */
> +	uint16_t nb_intr;
> +		/**< Max vector count, default RTE_MAX_RXTX_INTR_VEC_ID */
>  	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
>  	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
> -				       /**< intr vector epoll event */
> +						/**< intr vector epoll event */
> +	uint16_t vec_list_size;
>  	int *intr_vec;                 /**< intr vector number array */
>  };
>  

[...]
> diff --git a/lib/eal/include/rte_interrupts.h b/lib/eal/include/rte_interrupts.h
> index cc3bf45d8c..98edf774af 100644
> --- a/lib/eal/include/rte_interrupts.h
> +++ b/lib/eal/include/rte_interrupts.h
[...]
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * It allocates memory for interrupt instance. API takes flag as an argument

Not anymore. Please update the description.

> + * which define from where memory should be allocated i.e. using DPDK memory
> + * management library APIs or normal heap allocation.
> + * Default memory allocation for event fds and event list array is done which
> + * can be realloced later as per the requirement.
> + *
> + * This function should be called from application or driver, before calling any
> + * of the interrupt APIs.
> + *
> + * @param flags
> + *  Memory allocation from DPDK allocator or normal allocation
> + *
> + * @return
> + *  - On success, address of first interrupt handle.
> + *  - On failure, NULL.
> + */
> +__rte_experimental
> +struct rte_intr_handle *
> +rte_intr_instance_alloc(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * This API is used to free the memory allocated for event fds. event lists
> + * and interrupt handle array.

It's simpler and more future-proof to just say "interrupt handle resources"
instead of enumerating them.

> + *
> + * @param intr_handle
> + *  Base address of interrupt handle array.

It's not an array anymore.

[...]
> +/**
> + * @internal
> + * This API is used to set the event list array index with the given elist

"Event list array" sound like an array of lists,
while it is really an array of scalar elements.
"Event data array"? TBH, I don't know how it's usually named in Unices.

> + * instance.
> + *
> + * @param intr_handle
> + *  pointer to the interrupt handle.
> + * @param index
> + *  elist array index to be set
> + * @param elist
> + *  event list instance of struct rte_epoll_event
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +__rte_internal
> +int
> +rte_intr_elist_index_set(struct rte_intr_handle *intr_handle, int index,
> +			 struct rte_epoll_event elist);
> +
> +/**
> + * @internal
> + * Returns the address of elist instance of event list array at a given index.
> + *
> + * @param intr_handle
> + *  pointer to the interrupt handle.
> + * @param index
> + *  elist array index to be returned
> + *
> + * @return
> + *  - On success, elist
> + *  - On failure, a negative value.
> + */
> +__rte_internal
> +struct rte_epoll_event *
> +rte_intr_elist_index_get(struct rte_intr_handle *intr_handle, int index);
> +
> +/**
> + * @internal
> + * Allocates the memory of interrupt vector list array, with size defining the
> + * no of elements required in the array.

Typo: "no" -> "number".

[...]
> +
> +/**
> + * @internal
> + * This API returns the windows handle of the given interrupt instance.

Typo: "windows" -> "Windows" here and below.

> + *
> + * @param intr_handle
> + *  pointer to the interrupt handle.
> + *
> + * @return
> + *  - On success, windows handle.
> + *  - On failure, NULL.
> + */
> +__rte_internal
> +void *
> +rte_intr_instance_windows_handle_get(struct rte_intr_handle *intr_handle);
> +
> +/**
> + * @internal
> + * This API set the windows handle for the given interrupt instance.
> + *
> + * @param intr_handle
> + *  pointer to the interrupt handle.
> + * @param windows_handle
> + *  windows handle to be set.
> + *
> + * @return
> + *  - On success, zero
> + *  - On failure, a negative value.
> + */
> +__rte_internal
> +int
> +rte_intr_instance_windows_handle_set(struct rte_intr_handle *intr_handle,
> +				     void *windows_handle);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/eal/version.map b/lib/eal/version.map
> index 38f7de83e1..0ef77c3b40 100644
> --- a/lib/eal/version.map
> +++ b/lib/eal/version.map
> @@ -109,18 +109,10 @@ DPDK_22 {
>  	rte_hexdump;
>  	rte_hypervisor_get;
>  	rte_hypervisor_get_name; # WINDOWS_NO_EXPORT
> -	rte_intr_allow_others;
>  	rte_intr_callback_register;
>  	rte_intr_callback_unregister;
> -	rte_intr_cap_multiple;
> -	rte_intr_disable;
> -	rte_intr_dp_is_en;
> -	rte_intr_efd_disable;
> -	rte_intr_efd_enable;
>  	rte_intr_enable;
> -	rte_intr_free_epoll_fd;
> -	rte_intr_rx_ctl;
> -	rte_intr_tls_epfd;
> +	rte_intr_disable;
>  	rte_keepalive_create; # WINDOWS_NO_EXPORT
>  	rte_keepalive_dispatch_pings; # WINDOWS_NO_EXPORT
>  	rte_keepalive_mark_alive; # WINDOWS_NO_EXPORT
> @@ -420,6 +412,14 @@ EXPERIMENTAL {
>  
>  	# added in 21.08
>  	rte_power_monitor_multi; # WINDOWS_NO_EXPORT
> +
> +	# added in 21.11
> +	rte_intr_fd_set; # WINDOWS_NO_EXPORT
> +	rte_intr_fd_get; # WINDOWS_NO_EXPORT

OK, these are not feasible on Windows.

> +	rte_intr_type_set; # WINDOWS_NO_EXPORT
> +	rte_intr_type_get; # WINDOWS_NO_EXPORT
> +	rte_intr_instance_alloc; # WINDOWS_NO_EXPORT
> +	rte_intr_instance_free; # WINDOWS_NO_EXPORT

No, these *are* needed on Windows.

>  };
>  
>  INTERNAL {
> @@ -430,4 +430,33 @@ INTERNAL {
>  	rte_mem_map;
>  	rte_mem_page_size;
>  	rte_mem_unmap;
> +	rte_intr_cap_multiple;
> +	rte_intr_dp_is_en;
> +	rte_intr_efd_disable;
> +	rte_intr_efd_enable;
> +	rte_intr_free_epoll_fd;
> +	rte_intr_rx_ctl;
> +	rte_intr_allow_others;
> +	rte_intr_tls_epfd;
> +	rte_intr_dev_fd_set; # WINDOWS_NO_EXPORT
> +	rte_intr_dev_fd_get; # WINDOWS_NO_EXPORT

OK.

> +	rte_intr_instance_copy; # WINDOWS_NO_EXPORT
> +	rte_intr_event_list_update; # WINDOWS_NO_EXPORT
> +	rte_intr_max_intr_set; # WINDOWS_NO_EXPORT
> +	rte_intr_max_intr_get; # WINDOWS_NO_EXPORT

These are needed on Windows.

> +	rte_intr_nb_efd_set; # WINDOWS_NO_EXPORT
> +	rte_intr_nb_efd_get; # WINDOWS_NO_EXPORT
> +	rte_intr_nb_intr_get; # WINDOWS_NO_EXPORT
> +	rte_intr_efds_index_set; # WINDOWS_NO_EXPORT
> +	rte_intr_efds_index_get; # WINDOWS_NO_EXPORT

OK.

> +	rte_intr_elist_index_set; # WINDOWS_NO_EXPORT
> +	rte_intr_elist_index_get; # WINDOWS_NO_EXPORT

These are needed on Windows.

> +	rte_intr_efd_counter_size_set; # WINDOWS_NO_EXPORT
> +	rte_intr_efd_counter_size_get; # WINDOWS_NO_EXPORT

OK.

> +	rte_intr_vec_list_alloc; # WINDOWS_NO_EXPORT
> +	rte_intr_vec_list_index_set; # WINDOWS_NO_EXPORT
> +	rte_intr_vec_list_index_get; # WINDOWS_NO_EXPORT
> +	rte_intr_vec_list_free; # WINDOWS_NO_EXPORT

These are needed on Windows.

> +	rte_intr_instance_windows_handle_get;
> +	rte_intr_instance_windows_handle_set;
>  };
  
Stephen Hemminger Oct. 18, 2021, 10:56 p.m. UTC | #2
On Tue, 19 Oct 2021 01:07:02 +0530
Harman Kalra <hkalra@marvell.com> wrote:

> +	/* Detect if DPDK malloc APIs are ready to be used. */
> +	mem_allocator = rte_malloc_is_ready();
> +	if (mem_allocator)
> +		intr_handle = rte_zmalloc(NULL, sizeof(struct rte_intr_handle),
> +					  0);
> +	else
> +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));

This is problematic way to do this.
The reason to use rte_malloc vs malloc should be determined by usage.

If the pointer will be shared between primary/secondary process then
it has to be in hugepages (ie rte_malloc). If it is not shared then
then use regular malloc.

But what you have done is created a method which will be
a latent bug for anyone using primary/secondary process.

Either:
    intr_handle is not allowed to be used in secondary.
      Then always use malloc().
Or.
    intr_handle can be used by both primary and secondary.
    Then always use rte_malloc().
    Any code path that allocates intr_handle before pool is
    ready is broken.
  
Harman Kalra Oct. 19, 2021, 8:32 a.m. UTC | #3
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Tuesday, October 19, 2021 4:27 AM
> To: Harman Kalra <hkalra@marvell.com>
> Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Ray Kinsella
> <mdr@ashroe.eu>; david.marchand@redhat.com;
> dmitry.kozliuk@gmail.com
> Subject: [EXT] Re: [dpdk-dev] [PATCH v3 2/7] eal/interrupts: implement get
> set APIs
> 
> External Email
> 
> ----------------------------------------------------------------------
> On Tue, 19 Oct 2021 01:07:02 +0530
> Harman Kalra <hkalra@marvell.com> wrote:
> 
> > +	/* Detect if DPDK malloc APIs are ready to be used. */
> > +	mem_allocator = rte_malloc_is_ready();
> > +	if (mem_allocator)
> > +		intr_handle = rte_zmalloc(NULL, sizeof(struct
> rte_intr_handle),
> > +					  0);
> > +	else
> > +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));
> 
> This is problematic way to do this.
> The reason to use rte_malloc vs malloc should be determined by usage.
> 
> If the pointer will be shared between primary/secondary process then it has
> to be in hugepages (ie rte_malloc). If it is not shared then then use regular
> malloc.
> 
> But what you have done is created a method which will be a latent bug for
> anyone using primary/secondary process.
> 
> Either:
>     intr_handle is not allowed to be used in secondary.
>       Then always use malloc().
> Or.
>     intr_handle can be used by both primary and secondary.
>     Then always use rte_malloc().
>     Any code path that allocates intr_handle before pool is
>     ready is broken.

Hi Stephen,

Till V2, I implemented this API in a way where the user of the API can choose
if they want the intr handle to be allocated using malloc or rte_malloc by passing
a flag arg to the rte_intr_instance_alloc API. The user of the API will best know if
the intr handle is to be shared with a secondary process or not.

But after some discussions and suggestions from the community we decided
to drop that flag argument and auto detect on whether rte_malloc APIs are
ready to be used and thereafter make all further allocations via rte_malloc.
Currently alarm subsystem (or any driver doing allocation in constructor) gets
interrupt instance allocated using glibc malloc that too because rte_malloc*
is not ready by rte_eal_alarm_init(), while all further consumers gets instance
allocated via rte_malloc.
 I think this should not cause any issue in the primary/secondary model as all interrupt
instance pointers will be shared. In fact, to avoid any surprises of primary/secondary
not working, we thought of making all allocations via rte_malloc. 

David, Thomas, Dmitry, please add if I missed anything.

Can we please conclude on this series APIs as API freeze deadline (rc1) is very near.

Thanks
Harman
  
Harman Kalra Oct. 19, 2021, 8:50 a.m. UTC | #4
Hi Dmitry,

Thanks for reviewing. Please find my responses inline.

> -----Original Message-----
> From: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> Sent: Tuesday, October 19, 2021 3:38 AM
> To: Harman Kalra <hkalra@marvell.com>
> Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Ray Kinsella
> <mdr@ashroe.eu>; david.marchand@redhat.com
> Subject: [EXT] Re: [PATCH v3 2/7] eal/interrupts: implement get set APIs
> 
> External Email
> 
> ----------------------------------------------------------------------
> 2021-10-19 01:07 (UTC+0530), Harman Kalra:
> [...]
> > +struct rte_intr_handle *rte_intr_instance_alloc(void) {
> > +	struct rte_intr_handle *intr_handle;
> > +	bool mem_allocator;
> 
> This name is not very descriptive; what would "mem_allocator is false"
> mean?
> How about "is_rte_memory"?

Sure, will make it "is_rte_memory"

> 
> > +
> > +	/* Detect if DPDK malloc APIs are ready to be used. */
> > +	mem_allocator = rte_malloc_is_ready();
> > +	if (mem_allocator)
> > +		intr_handle = rte_zmalloc(NULL, sizeof(struct
> rte_intr_handle),
> > +					  0);
> > +	else
> > +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));
> > +	if (!intr_handle) {
> > +		RTE_LOG(ERR, EAL, "Fail to allocate intr_handle\n");
> > +		rte_errno = ENOMEM;
> > +		return NULL;
> > +	}
> > +
> > +	intr_handle->nb_intr = RTE_MAX_RXTX_INTR_VEC_ID;
> > +	intr_handle->mem_allocator = mem_allocator;
> > +
> > +	return intr_handle;
> > +}
> > +
> > +int rte_intr_instance_copy(struct rte_intr_handle *intr_handle,
> > +			   const struct rte_intr_handle *src) {
> > +	uint16_t nb_intr;
> > +
> > +	CHECK_VALID_INTR_HANDLE(intr_handle);
> > +
> > +	if (src == NULL) {
> > +		RTE_LOG(ERR, EAL, "Source interrupt instance
> unallocated\n");
> > +		rte_errno = EINVAL;
> > +		goto fail;
> > +	}
> > +
> > +	intr_handle->fd = src->fd;
> > +	intr_handle->vfio_dev_fd = src->vfio_dev_fd;
> > +	intr_handle->type = src->type;
> > +	intr_handle->max_intr = src->max_intr;
> > +	intr_handle->nb_efd = src->nb_efd;
> > +	intr_handle->efd_counter_size = src->efd_counter_size;
> > +
> > +	nb_intr = RTE_MIN(src->nb_intr, intr_handle->nb_intr);
> 
> Truncating copy is error-prone.
> It should be either a reallocation (in the future) or an error (now).

Actually in patch 6, I have made a lot of changes to this API wrt nb_intr,
where the efds/elist arrays are reallocated based on src->nb_intr and
intr_handle->nb_intr is made equal to src->nb_intr. I think those changes
can be moved from patch 6 to patch 2.

> 
> > +	memcpy(intr_handle->efds, src->efds, nb_intr);
> > +	memcpy(intr_handle->elist, src->elist, nb_intr);
> > +
> > +	return 0;
> > +fail:
> > +	return -rte_errno;
> > +}
> > +
> > +void rte_intr_instance_free(struct rte_intr_handle *intr_handle) {
> > +	if (intr_handle->mem_allocator)
> 
> This function should accept NULL and be a no-op in such case.

Ack.

> 
> > +		rte_free(intr_handle);
> > +	else
> > +		free(intr_handle);
> > +}
> 
> [...]
> > +void *rte_intr_instance_windows_handle_get(struct rte_intr_handle
> > +*intr_handle) {
> > +	CHECK_VALID_INTR_HANDLE(intr_handle);
> > +
> > +	return intr_handle->windows_handle;
> > +fail:
> > +	return NULL;
> > +}
> > +
> > +int rte_intr_instance_windows_handle_set(struct rte_intr_handle
> *intr_handle,
> > +					 void *windows_handle)
> > +{
> > +	CHECK_VALID_INTR_HANDLE(intr_handle);
> > +
> > +	if (!windows_handle) {
> > +		RTE_LOG(ERR, EAL, "Windows handle should not be
> NULL\n");
> > +		rte_errno = EINVAL;
> > +		goto fail;
> > +	}
> 
> Thanks for adding this API, but please remove the check.
> It is possible that the API user will pass NULL to reset the state (also NULL is
> not the only invalid value for a Windows handle).
> There is no check for Unix FD, neither should be here.

Sure, will remove the check.

> 
> > +
> > +	intr_handle->windows_handle = windows_handle;
> > +
> > +	return 0;
> > +fail:
> > +	return -rte_errno;
> > +}
> 
> [...]
> > @@ -79,191 +53,20 @@ struct rte_intr_handle {
> >  			};
> >  			int fd;	/**< interrupt event file descriptor */
> >  		};
> > -		void *handle; /**< device driver handle (Windows) */
> > +		void *windows_handle; /**< device driver handle (Windows)
> */
> 
> I guess Windows can be dropped from the comment since it's now in the
> name.

Ack.

> 
> >  	};
> > +	bool mem_allocator;
> >  	enum rte_intr_handle_type type;  /**< handle type */
> >  	uint32_t max_intr;             /**< max interrupt requested */
> >  	uint32_t nb_efd;               /**< number of available efd(event fd) */
> >  	uint8_t efd_counter_size;      /**< size of efd counter, used for vdev
> */
> > +	uint16_t nb_intr;
> > +		/**< Max vector count, default
> RTE_MAX_RXTX_INTR_VEC_ID */
> >  	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds
> mapping */
> >  	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
> > -				       /**< intr vector epoll event */
> > +						/**< intr vector epoll event */
> > +	uint16_t vec_list_size;
> >  	int *intr_vec;                 /**< intr vector number array */
> >  };
> >
> 
> [...]
> > diff --git a/lib/eal/include/rte_interrupts.h
> > b/lib/eal/include/rte_interrupts.h
> > index cc3bf45d8c..98edf774af 100644
> > --- a/lib/eal/include/rte_interrupts.h
> > +++ b/lib/eal/include/rte_interrupts.h
> [...]
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * It allocates memory for interrupt instance. API takes flag as an
> > +argument
> 
> Not anymore. Please update the description.

Ack.

> 
> > + * which define from where memory should be allocated i.e. using DPDK
> > +memory
> > + * management library APIs or normal heap allocation.
> > + * Default memory allocation for event fds and event list array is
> > +done which
> > + * can be realloced later as per the requirement.
> > + *
> > + * This function should be called from application or driver, before
> > +calling any
> > + * of the interrupt APIs.
> > + *
> > + * @param flags
> > + *  Memory allocation from DPDK allocator or normal allocation
> > + *
> > + * @return
> > + *  - On success, address of first interrupt handle.
> > + *  - On failure, NULL.
> > + */
> > +__rte_experimental
> > +struct rte_intr_handle *
> > +rte_intr_instance_alloc(void);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * This API is used to free the memory allocated for event fds. event
> > +lists
> > + * and interrupt handle array.
> 
> It's simpler and more future-proof to just say "interrupt handle resources"
> instead of enumerating them.

Sure, will reword it.

> 
> > + *
> > + * @param intr_handle
> > + *  Base address of interrupt handle array.
> 
> It's not an array anymore.

Ack.

> 
> [...]
> > +/**
> > + * @internal
> > + * This API is used to set the event list array index with the given
> > +elist
> 
> "Event list array" sound like an array of lists, while it is really an array of
> scalar elements.
> "Event data array"? TBH, I don't know how it's usually named in Unices.
> 
> > + * instance.
> > + *
> > + * @param intr_handle
> > + *  pointer to the interrupt handle.
> > + * @param index
> > + *  elist array index to be set
> > + * @param elist
> > + *  event list instance of struct rte_epoll_event
> > + *
> > + * @return
> > + *  - On success, zero.
> > + *  - On failure, a negative value.
> > + */
> > +__rte_internal
> > +int
> > +rte_intr_elist_index_set(struct rte_intr_handle *intr_handle, int index,
> > +			 struct rte_epoll_event elist);
> > +
> > +/**
> > + * @internal
> > + * Returns the address of elist instance of event list array at a given index.
> > + *
> > + * @param intr_handle
> > + *  pointer to the interrupt handle.
> > + * @param index
> > + *  elist array index to be returned
> > + *
> > + * @return
> > + *  - On success, elist
> > + *  - On failure, a negative value.
> > + */
> > +__rte_internal
> > +struct rte_epoll_event *
> > +rte_intr_elist_index_get(struct rte_intr_handle *intr_handle, int
> > +index);
> > +
> > +/**
> > + * @internal
> > + * Allocates the memory of interrupt vector list array, with size
> > +defining the
> > + * no of elements required in the array.
> 
> Typo: "no" -> "number".

Ack.

> 
> [...]
> > +
> > +/**
> > + * @internal
> > + * This API returns the windows handle of the given interrupt instance.
> 
> Typo: "windows" -> "Windows" here and below.
> 
> > + *
> > + * @param intr_handle
> > + *  pointer to the interrupt handle.
> > + *
> > + * @return
> > + *  - On success, windows handle.
> > + *  - On failure, NULL.
> > + */
> > +__rte_internal
> > +void *
> > +rte_intr_instance_windows_handle_get(struct rte_intr_handle
> > +*intr_handle);
> > +
> > +/**
> > + * @internal
> > + * This API set the windows handle for the given interrupt instance.
> > + *
> > + * @param intr_handle
> > + *  pointer to the interrupt handle.
> > + * @param windows_handle
> > + *  windows handle to be set.
> > + *
> > + * @return
> > + *  - On success, zero
> > + *  - On failure, a negative value.
> > + */
> > +__rte_internal
> > +int
> > +rte_intr_instance_windows_handle_set(struct rte_intr_handle
> *intr_handle,
> > +				     void *windows_handle);
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/eal/version.map b/lib/eal/version.map index
> > 38f7de83e1..0ef77c3b40 100644
> > --- a/lib/eal/version.map
> > +++ b/lib/eal/version.map
> > @@ -109,18 +109,10 @@ DPDK_22 {
> >  	rte_hexdump;
> >  	rte_hypervisor_get;
> >  	rte_hypervisor_get_name; # WINDOWS_NO_EXPORT
> > -	rte_intr_allow_others;
> >  	rte_intr_callback_register;
> >  	rte_intr_callback_unregister;
> > -	rte_intr_cap_multiple;
> > -	rte_intr_disable;
> > -	rte_intr_dp_is_en;
> > -	rte_intr_efd_disable;
> > -	rte_intr_efd_enable;
> >  	rte_intr_enable;
> > -	rte_intr_free_epoll_fd;
> > -	rte_intr_rx_ctl;
> > -	rte_intr_tls_epfd;
> > +	rte_intr_disable;
> >  	rte_keepalive_create; # WINDOWS_NO_EXPORT
> >  	rte_keepalive_dispatch_pings; # WINDOWS_NO_EXPORT
> >  	rte_keepalive_mark_alive; # WINDOWS_NO_EXPORT @@ -420,6
> +412,14 @@
> > EXPERIMENTAL {
> >
> >  	# added in 21.08
> >  	rte_power_monitor_multi; # WINDOWS_NO_EXPORT
> > +
> > +	# added in 21.11
> > +	rte_intr_fd_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_fd_get; # WINDOWS_NO_EXPORT
> 
> OK, these are not feasible on Windows.

Ack.

> 
> > +	rte_intr_type_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_type_get; # WINDOWS_NO_EXPORT
> > +	rte_intr_instance_alloc; # WINDOWS_NO_EXPORT
> > +	rte_intr_instance_free; # WINDOWS_NO_EXPORT
> 
> No, these *are* needed on Windows.

Ack.

> 
> >  };
> >
> >  INTERNAL {
> > @@ -430,4 +430,33 @@ INTERNAL {
> >  	rte_mem_map;
> >  	rte_mem_page_size;
> >  	rte_mem_unmap;
> > +	rte_intr_cap_multiple;
> > +	rte_intr_dp_is_en;
> > +	rte_intr_efd_disable;
> > +	rte_intr_efd_enable;
> > +	rte_intr_free_epoll_fd;
> > +	rte_intr_rx_ctl;
> > +	rte_intr_allow_others;
> > +	rte_intr_tls_epfd;
> > +	rte_intr_dev_fd_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_dev_fd_get; # WINDOWS_NO_EXPORT
> 
> OK.
> 
> > +	rte_intr_instance_copy; # WINDOWS_NO_EXPORT
> > +	rte_intr_event_list_update; # WINDOWS_NO_EXPORT
> > +	rte_intr_max_intr_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_max_intr_get; # WINDOWS_NO_EXPORT
> 
> These are needed on Windows.

Ack.

> 
> > +	rte_intr_nb_efd_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_nb_efd_get; # WINDOWS_NO_EXPORT
> > +	rte_intr_nb_intr_get; # WINDOWS_NO_EXPORT
> > +	rte_intr_efds_index_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_efds_index_get; # WINDOWS_NO_EXPORT
> 
> OK.
> 
> > +	rte_intr_elist_index_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_elist_index_get; # WINDOWS_NO_EXPORT
> 
> These are needed on Windows.

Ack.

> 
> > +	rte_intr_efd_counter_size_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_efd_counter_size_get; # WINDOWS_NO_EXPORT
> 
> OK.
> 
> > +	rte_intr_vec_list_alloc; # WINDOWS_NO_EXPORT
> > +	rte_intr_vec_list_index_set; # WINDOWS_NO_EXPORT
> > +	rte_intr_vec_list_index_get; # WINDOWS_NO_EXPORT
> > +	rte_intr_vec_list_free; # WINDOWS_NO_EXPORT
> 
> These are needed on Windows.

Ack.

> 
> > +	rte_intr_instance_windows_handle_get;
> > +	rte_intr_instance_windows_handle_set;
> >  };
  
Thomas Monjalon Oct. 19, 2021, 3:58 p.m. UTC | #5
19/10/2021 10:32, Harman Kalra:
> From: Stephen Hemminger <stephen@networkplumber.org>
> > On Tue, 19 Oct 2021 01:07:02 +0530
> > Harman Kalra <hkalra@marvell.com> wrote:
> > > +	/* Detect if DPDK malloc APIs are ready to be used. */
> > > +	mem_allocator = rte_malloc_is_ready();
> > > +	if (mem_allocator)
> > > +		intr_handle = rte_zmalloc(NULL, sizeof(struct
> > rte_intr_handle),
> > > +					  0);
> > > +	else
> > > +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));
> > 
> > This is problematic way to do this.
> > The reason to use rte_malloc vs malloc should be determined by usage.
> > 
> > If the pointer will be shared between primary/secondary process then it has
> > to be in hugepages (ie rte_malloc). If it is not shared then then use regular
> > malloc.
> > 
> > But what you have done is created a method which will be a latent bug for
> > anyone using primary/secondary process.
> > 
> > Either:
> >     intr_handle is not allowed to be used in secondary.
> >       Then always use malloc().
> > Or.
> >     intr_handle can be used by both primary and secondary.
> >     Then always use rte_malloc().
> >     Any code path that allocates intr_handle before pool is
> >     ready is broken.
> 
> Hi Stephan,
> 
> Till V2, I implemented this API in a way where user of the API can choose
> If he wants intr handle to be allocated using malloc or rte_malloc by passing
> a flag arg to the rte_intr_instanc_alloc API. User of the API will best know if
> the intr handle is to be shared with secondary or not.

Yes the caller should know, but it makes usage more difficult.
Using rte_malloc always is simpler.
 
> But after some discussions and suggestions from the community we decided
> to drop that flag argument and auto detect on whether rte_malloc APIs are
> ready to be used and thereafter make all further allocations via rte_malloc.
> Currently alarm subsystem (or any driver doing allocation in constructor) gets
> interrupt instance allocated using glibc malloc that too because rte_malloc*
> is not ready by rte_eal_alarm_init(), while all further consumers gets instance
> allocated via rte_malloc.

Yes the general case is to allocate after rte_malloc is ready.
Anyway a constructor should not allocate complicate things.

>  I think this should not cause any issue in primary/secondary model as all interrupt
> instance pointer will be shared. Infact to avoid any surprises of primary/secondary
> not working we thought of making all allocations via rte_malloc.

Yes

> David, Thomas, Dmitry, please add if I missed anything.

I understand Stephen's concern but I think this choice is a good compromise.
Ideally we should avoid doing real stuff in constructors.

> Can we please conclude on this series APIs as API freeze deadline (rc1) is very near.

I vote for keeping this design.
  
Harman Kalra Oct. 19, 2021, 6:44 p.m. UTC | #6
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Harman Kalra
> Sent: Tuesday, October 19, 2021 2:21 PM
> To: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Ray Kinsella
> <mdr@ashroe.eu>; david.marchand@redhat.com
> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v3 2/7] eal/interrupts: implement
> get set APIs
> 

[...]

> > > +
> > > +	nb_intr = RTE_MIN(src->nb_intr, intr_handle->nb_intr);
> >
> > Truncating copy is error-prone.
> > It should be either a reallocation (in the future) or an error (now).
> 
> Actually in patch 6, I have made lot of changes to this API wrt nb_intr, where
> efds/elist arrays are reallocated based on src->nb_intr and make
> intr_handle->nb_intr equal to src->nb_intr. I think those changes can be
> moved from patch 6 to patch 2.

Hi Dmitry,

I have addressed all your comments in V4, kindly review.
Regarding this particular comment, I have not made any changes, as I already
explained in my previous reply that patch 6 takes care of reallocation based on
nb_intr. I thought those changes could be moved from patch 6 to patch 2, but it's
not possible because, till patch 5, the efds and elist arrays are kept as static
arrays to avoid build breakage, while in patch 6 I made these arrays pointers and
dynamically allocated memory for them.


Thanks
Harman

[...]
> 
> >
> > > +	memcpy(intr_handle->efds, src->efds, nb_intr);
  
Dmitry Kozlyuk Oct. 20, 2021, 3:30 p.m. UTC | #7
2021-10-19 08:32 (UTC+0000), Harman Kalra:
> > -----Original Message-----
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Sent: Tuesday, October 19, 2021 4:27 AM
> > To: Harman Kalra <hkalra@marvell.com>
> > Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Ray Kinsella
> > <mdr@ashroe.eu>; david.marchand@redhat.com;
> > dmitry.kozliuk@gmail.com
> > Subject: [EXT] Re: [dpdk-dev] [PATCH v3 2/7] eal/interrupts: implement get
> > set APIs
> > 
> > External Email
> > 
> > ----------------------------------------------------------------------
> > On Tue, 19 Oct 2021 01:07:02 +0530
> > Harman Kalra <hkalra@marvell.com> wrote:
> >   
> > > +	/* Detect if DPDK malloc APIs are ready to be used. */
> > > +	mem_allocator = rte_malloc_is_ready();
> > > +	if (mem_allocator)
> > > +		intr_handle = rte_zmalloc(NULL, sizeof(struct  
> > rte_intr_handle),  
> > > +					  0);
> > > +	else
> > > +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));  
> > 
> > This is problematic way to do this.
> > The reason to use rte_malloc vs malloc should be determined by usage.
> > 
> > If the pointer will be shared between primary/secondary process then it has
> > to be in hugepages (ie rte_malloc). If it is not shared then then use regular
> > malloc.
> > 
> > But what you have done is created a method which will be a latent bug for
> > anyone using primary/secondary process.
> > 
> > Either:
> >     intr_handle is not allowed to be used in secondary.
> >       Then always use malloc().
> > Or.
> >     intr_handle can be used by both primary and secondary.
> >     Then always use rte_malloc().
> >     Any code path that allocates intr_handle before pool is
> >     ready is broken.  
> 
> Hi Stephan,
> 
> Till V2, I implemented this API in a way where user of the API can choose
> If he wants intr handle to be allocated using malloc or rte_malloc by passing
> a flag arg to the rte_intr_instanc_alloc API. User of the API will best know if
> the intr handle is to be shared with secondary or not.
> 
> But after some discussions and suggestions from the community we decided
> to drop that flag argument and auto detect on whether rte_malloc APIs are
> ready to be used and thereafter make all further allocations via rte_malloc.
> Currently alarm subsystem (or any driver doing allocation in constructor) gets
> interrupt instance allocated using glibc malloc that too because rte_malloc*
> is not ready by rte_eal_alarm_init(), while all further consumers gets instance
> allocated via rte_malloc.

Just as a comment, bus scanning is the real issue, not the alarms.
Alarms could be initialized after the memory management
(but it's irrelevant because their handle is not accessed from the outside).
However, MM needs to know bus IOVA requirements to initialize,
which is usually determined by at least bus device requirements.

>  I think this should not cause any issue in primary/secondary model as all interrupt
> instance pointer will be shared.

What do you mean? Aren't we discussing the issue
that those allocated early are not shared?

> Infact to avoid any surprises of primary/secondary
> not working we thought of making all allocations via rte_malloc. 

I don't see why anyone would not make them shared.
In order to only use rte_malloc(), we need:
1. In bus drivers, move handle allocation from scan to probe stage.
2. In EAL, move alarm initialization to after the MM.
It all can be done later with v3 design---but there are out-of-tree drivers.
We need to force them to make step 1 at some point.
I see two options:
a) Right now have an external API that only works with rte_malloc()
   and internal API with autodetection. Fix DPDK and drop internal API.
b) Have external API with autodetection. Fix DPDK.
   At the next ABI breakage drop autodetection and libc-malloc.

> David, Thomas, Dmitry, please add if I missed anything.
> 
> Can we please conclude on this series APIs as API freeze deadline (rc1) is very near.

I support v3 design with no options and autodetection,
because that's the interface we want in the end.
Implementation can be improved later.
  
Harman Kalra Oct. 21, 2021, 9:16 a.m. UTC | #8
> -----Original Message-----
> From: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> Sent: Wednesday, October 20, 2021 9:01 PM
> To: Harman Kalra <hkalra@marvell.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>; Thomas
> Monjalon <thomas@monjalon.net>; david.marchand@redhat.com;
> dev@dpdk.org; Ray Kinsella <mdr@ashroe.eu>
> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v3 2/7] eal/interrupts: implement
> get set APIs
> 
> > >
> > > > +	/* Detect if DPDK malloc APIs are ready to be used. */
> > > > +	mem_allocator = rte_malloc_is_ready();
> > > > +	if (mem_allocator)
> > > > +		intr_handle = rte_zmalloc(NULL, sizeof(struct
> > > rte_intr_handle),
> > > > +					  0);
> > > > +	else
> > > > +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));
> > >
> > > This is problematic way to do this.
> > > The reason to use rte_malloc vs malloc should be determined by usage.
> > >
> > > If the pointer will be shared between primary/secondary process then
> > > it has to be in hugepages (ie rte_malloc). If it is not shared then
> > > then use regular malloc.
> > >
> > > But what you have done is created a method which will be a latent
> > > bug for anyone using primary/secondary process.
> > >
> > > Either:
> > >     intr_handle is not allowed to be used in secondary.
> > >       Then always use malloc().
> > > Or.
> > >     intr_handle can be used by both primary and secondary.
> > >     Then always use rte_malloc().
> > >     Any code path that allocates intr_handle before pool is
> > >     ready is broken.
> >
> > Hi Stephan,
> >
> > Till V2, I implemented this API in a way where user of the API can
> > choose If he wants intr handle to be allocated using malloc or
> > rte_malloc by passing a flag arg to the rte_intr_instanc_alloc API.
> > User of the API will best know if the intr handle is to be shared with
> secondary or not.
> >
> > But after some discussions and suggestions from the community we
> > decided to drop that flag argument and auto detect on whether
> > rte_malloc APIs are ready to be used and thereafter make all further
> allocations via rte_malloc.
> > Currently alarm subsystem (or any driver doing allocation in
> > constructor) gets interrupt instance allocated using glibc malloc that
> > too because rte_malloc* is not ready by rte_eal_alarm_init(), while
> > all further consumers gets instance allocated via rte_malloc.
> 
> Just as a comment, bus scanning is the real issue, not the alarms.
> Alarms could be initialized after the memory management (but it's irrelevant
> because their handle is not accessed from the outside).
> However, MM needs to know bus IOVA requirements to initialize, which is
> usually determined by at least bus device requirements.
> 
> >  I think this should not cause any issue in primary/secondary model as
> > all interrupt instance pointer will be shared.
> 
> What do you mean? Aren't we discussing the issue that those allocated early
> are not shared?
> 
> > Infact to avoid any surprises of primary/secondary not working we
> > thought of making all allocations via rte_malloc.
> 
> I don't see why anyone would not make them shared.
> In order to only use rte_malloc(), we need:
> 1. In bus drivers, move handle allocation from scan to probe stage.
> 2. In EAL, move alarm initialization to after the MM.
> It all can be done later with v3 design---but there are out-of-tree drivers.
> We need to force them to make step 1 at some point.
> I see two options:
> a) Right now have an external API that only works with rte_malloc()
>    and internal API with autodetection. Fix DPDK and drop internal API.
> b) Have external API with autodetection. Fix DPDK.
>    At the next ABI breakage drop autodetection and libc-malloc.
> 
> > David, Thomas, Dmitry, please add if I missed anything.
> >
> > Can we please conclude on this series APIs as API freeze deadline (rc1) is
> very near.
> 
> I support v3 design with no options and autodetection, because that's the
> interface we want in the end.
> Implementation can be improved later.

Hi All,

I came across 2 issues introduced with auto detection mechanism.
1. In case of primary secondary model.  Primary application is started which makes lots of allocations via
rte_malloc*
    
    Secondary side:
    a. Secondary starts, in its "rte_eal_init()" it makes some allocation via rte_*, and in one of the allocation
request for heap expand is made as current memseg got exhausted. (malloc_heap_alloc_on_heap_id ()->
   alloc_more_mem_on_socket()->try_expand_heap())
   b. A request to primary for heap expand is sent. Please note secondary holds the spinlock while making
the request. (malloc_heap_alloc_on_heap_id ()->rte_spinlock_lock(&(heap->lock));)

   Primary side:
   a. Primary receives the request, install a new hugepage and setups up the heap (handle_alloc_request())
   b. To inform all the secondaries about the new memseg, primary sends a sync notice where it sets up an 
alarm (rte_mp_request_async ()->mp_request_async()).
   c. Inside alarm setup API, we register an interrupt callback.
   d. Inside rte_intr_callback_register(), a new interrupt instance allocation is requested for "src->intr_handle"
   e. Since memory management is detected as up, inside "rte_intr_instance_alloc()", call to "rte_zmalloc" for
allocating memory and further inside "malloc_heap_alloc_on_heap_id()", primary will experience a deadlock
while taking up the spinlock because this spinlock is already hold by secondary.


2. "eal_flags_file_prefix_autotest" is failing because the spawned process by this tests are expected to cleanup
their hugepage traces from respective directories (eg /dev/hugepage). 
a. Inside eal_cleanup, rte_free()->malloc_heap_free(), where element to be freed is added to the free list and
checked if nearby elements can be joined together and form a big free chunk (malloc_elem_free()).
b. If this free chunk is big enough than the hugepage size, respective hugepage can be uninstalled after making
sure no allocation from this hugepage exists. (malloc_heap_free()->malloc_heap_free_pages()->eal_memalloc_free_seg())

But because of interrupt allocations made for pci intr handles (used for VFIO) and other driver specific interrupt
handles are not cleaned up in "rte_eal_cleanup()", these hugepage files are not removed and test fails.

There could be more such issues, I think we should firstly fix the DPDK.
1. Memory management should be made independent and should be the first thing to come up in rte_eal_init()
2. rte_eal_cleanup() should be exactly opposite to rte_eal_init(), just like bus_probe, we should have bus_remove
to clean up all the memory allocations.

Regarding this IRQ series, I would like to fall back to our original design i.e. rte_intr_instance_alloc() should take
an argument whether its memory should be allocated using glibc malloc or rte_malloc*. Decision for allocation
(malloc or rte_malloc) can be made on fact that in the existing code is the interrupt handle is shared?
Eg.  a. In case of alarm intr_handle was global entry and not confined to any structure, so this can be allocated from
normal malloc.
b. PCI device, had static entry for intr_handle inside "struct rte_pci_device" and memory for struct rte_pci_device is
via normal malloc, so it intr_handle can also be malloc'ed
c. Some driver with intr_handle inside its priv structure, and this priv structure gets allocated via rte_malloc, so
Intr_handle can also be rte_malloc.

Later once DPDK is fixed up, this argument can be removed and all allocations can be via rte_malloc family without
any auto detection.


David, Dmitry, Thomas, Stephan, please share your views....

Thanks
Harman
  
Dmitry Kozlyuk Oct. 21, 2021, 12:33 p.m. UTC | #9
2021-10-21 09:16 (UTC+0000), Harman Kalra:
> > -----Original Message-----
> > From: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> > Sent: Wednesday, October 20, 2021 9:01 PM
> > To: Harman Kalra <hkalra@marvell.com>
> > Cc: Stephen Hemminger <stephen@networkplumber.org>; Thomas
> > Monjalon <thomas@monjalon.net>; david.marchand@redhat.com;
> > dev@dpdk.org; Ray Kinsella <mdr@ashroe.eu>
> > Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v3 2/7] eal/interrupts: implement
> > get set APIs
> >   
> > > >  
> > > > > +	/* Detect if DPDK malloc APIs are ready to be used. */
> > > > > +	mem_allocator = rte_malloc_is_ready();
> > > > > +	if (mem_allocator)
> > > > > +		intr_handle = rte_zmalloc(NULL, sizeof(struct  
> > > > rte_intr_handle),  
> > > > > +					  0);
> > > > > +	else
> > > > > +		intr_handle = calloc(1, sizeof(struct rte_intr_handle));  
> > > >
> > > > This is problematic way to do this.
> > > > The reason to use rte_malloc vs malloc should be determined by usage.
> > > >
> > > > If the pointer will be shared between primary/secondary process then
> > > > it has to be in hugepages (ie rte_malloc). If it is not shared then
> > > > then use regular malloc.
> > > >
> > > > But what you have done is created a method which will be a latent
> > > > bug for anyone using primary/secondary process.
> > > >
> > > > Either:
> > > >     intr_handle is not allowed to be used in secondary.
> > > >       Then always use malloc().
> > > > Or.
> > > >     intr_handle can be used by both primary and secondary.
> > > >     Then always use rte_malloc().
> > > >     Any code path that allocates intr_handle before pool is
> > > >     ready is broken.  
> > >
> > > Hi Stephan,
> > >
> > > Till V2, I implemented this API in a way where user of the API can
> > > choose If he wants intr handle to be allocated using malloc or
> > > rte_malloc by passing a flag arg to the rte_intr_instanc_alloc API.
> > > User of the API will best know if the intr handle is to be shared with  
> > secondary or not.  
> > >
> > > But after some discussions and suggestions from the community we
> > > decided to drop that flag argument and auto detect on whether
> > > rte_malloc APIs are ready to be used and thereafter make all further  
> > allocations via rte_malloc.  
> > > Currently alarm subsystem (or any driver doing allocation in
> > > constructor) gets interrupt instance allocated using glibc malloc that
> > > too because rte_malloc* is not ready by rte_eal_alarm_init(), while
> > > all further consumers gets instance allocated via rte_malloc.  
> > 
> > Just as a comment, bus scanning is the real issue, not the alarms.
> > Alarms could be initialized after the memory management (but it's irrelevant
> > because their handle is not accessed from the outside).
> > However, MM needs to know bus IOVA requirements to initialize, which is
> > usually determined by at least bus device requirements.
> >   
> > >  I think this should not cause any issue in primary/secondary model as
> > > all interrupt instance pointer will be shared.  
> > 
> > What do you mean? Aren't we discussing the issue that those allocated early
> > are not shared?
> >   
> > > Infact to avoid any surprises of primary/secondary not working we
> > > thought of making all allocations via rte_malloc.  
> > 
> > I don't see why anyone would not make them shared.
> > In order to only use rte_malloc(), we need:
> > 1. In bus drivers, move handle allocation from scan to probe stage.
> > 2. In EAL, move alarm initialization to after the MM.
> > It all can be done later with v3 design---but there are out-of-tree drivers.
> > We need to force them to make step 1 at some point.
> > I see two options:
> > a) Right now have an external API that only works with rte_malloc()
> >    and internal API with autodetection. Fix DPDK and drop internal API.
> > b) Have external API with autodetection. Fix DPDK.
> >    At the next ABI breakage drop autodetection and libc-malloc.
> >   
> > > David, Thomas, Dmitry, please add if I missed anything.
> > >
> > > Can we please conclude on this series APIs as API freeze deadline (rc1) is  
> > very near.
> > 
> > I support v3 design with no options and autodetection, because that's the
> > interface we want in the end.
> > Implementation can be improved later.  
> 
> Hi All,
> 
> I came across 2 issues introduced with auto detection mechanism.
> 1. In case of primary secondary model.  Primary application is started which makes lots of allocations via
> rte_malloc*
>     
>     Secondary side:
>     a. Secondary starts, in its "rte_eal_init()" it makes some allocation via rte_*, and in one of the allocation
> request for heap expand is made as current memseg got exhausted. (malloc_heap_alloc_on_heap_id ()->
>    alloc_more_mem_on_socket()->try_expand_heap())
>    b. A request to primary for heap expand is sent. Please note secondary holds the spinlock while making
> the request. (malloc_heap_alloc_on_heap_id ()->rte_spinlock_lock(&(heap->lock));)
> 
>    Primary side:
>    a. Primary receives the request, install a new hugepage and setups up the heap (handle_alloc_request())
>    b. To inform all the secondaries about the new memseg, primary sends a sync notice where it sets up an 
> alarm (rte_mp_request_async ()->mp_request_async()).
>    c. Inside alarm setup API, we register an interrupt callback.
>    d. Inside rte_intr_callback_register(), a new interrupt instance allocation is requested for "src->intr_handle"
>    e. Since memory management is detected as up, inside "rte_intr_instance_alloc()", call to "rte_zmalloc" for
> allocating memory and further inside "malloc_heap_alloc_on_heap_id()", primary will experience a deadlock
> while taking up the spinlock because this spinlock is already hold by secondary.
> 
> 
> 2. "eal_flags_file_prefix_autotest" is failing because the spawned process by this tests are expected to cleanup
> their hugepage traces from respective directories (eg /dev/hugepage). 
> a. Inside eal_cleanup, rte_free()->malloc_heap_free(), where element to be freed is added to the free list and
> checked if nearby elements can be joined together and form a big free chunk (malloc_elem_free()).
> b. If this free chunk is big enough than the hugepage size, respective hugepage can be uninstalled after making
> sure no allocation from this hugepage exists. (malloc_heap_free()->malloc_heap_free_pages()->eal_memalloc_free_seg())
> 
> But because of interrupt allocations made for pci intr handles (used for VFIO) and other driver specific interrupt
> handles are not cleaned up in "rte_eal_cleanup()", these hugepage files are not removed and test fails.

Sad to hear. But it's a great and thorough analysis.

> There could be more such issues, I think we should firstly fix the DPDK.
> 1. Memory management should be made independent and should be the first thing to come up in rte_eal_init()

As I have explained, buses must be able to report IOVA requirement
at this point (`get_iommu_class()` bus method).
Either `scan()` must complete before that
or `get_iommu_class()` must be able to work before `scan()` is called.

> 2. rte_eal_cleanup() should be exactly opposite to rte_eal_init(), just like bus_probe, we should have bus_remove
> to clean up all the memory allocations.

Yes. For most buses it will be just "unplug each device".
In fact, EAL could do it with `unplug()`, but it is not mandatory.

> 
> Regarding this IRQ series, I would like to fall back to our original design i.e. rte_intr_instance_alloc() should take
> an argument whether its memory should be allocated using glibc malloc or rte_malloc*.

Seems there's no other option to make it on time.

> Decision for allocation
> (malloc or rte_malloc) can be made on fact that in the existing code is the interrupt handle is shared?
> Eg.  a. In case of alarm intr_handle was global entry and not confined to any structure, so this can be allocated from
> normal malloc.
> b. PCI device, had static entry for intr_handle inside "struct rte_pci_device" and memory for struct rte_pci_device is
> via normal malloc, so it intr_handle can also be malloc'ed
> c. Some driver with intr_handle inside its priv structure, and this priv structure gets allocated via rte_malloc, so
> Intr_handle can also be rte_malloc.
> 
> Later once DPDK is fixed up, this argument can be removed and all allocations can be via rte_malloc family without
> any auto detection.
  
David Marchand Oct. 21, 2021, 1:32 p.m. UTC | #10
On Thu, Oct 21, 2021 at 2:33 PM Dmitry Kozlyuk <dmitry.kozliuk@gmail.com> wrote:
> > Hi All,
> >
> > I came across 2 issues introduced with auto detection mechanism.
> > 1. In case of primary secondary model.  Primary application is started which makes lots of allocations via
> > rte_malloc*
> >
> >     Secondary side:
> >     a. Secondary starts, in its "rte_eal_init()" it makes some allocation via rte_*, and in one of the allocation
> > request for heap expand is made as current memseg got exhausted. (malloc_heap_alloc_on_heap_id ()->
> >    alloc_more_mem_on_socket()->try_expand_heap())
> >    b. A request to primary for heap expand is sent. Please note secondary holds the spinlock while making
> > the request. (malloc_heap_alloc_on_heap_id ()->rte_spinlock_lock(&(heap->lock));)
> >
> >    Primary side:
> >    a. Primary receives the request, install a new hugepage and setups up the heap (handle_alloc_request())
> >    b. To inform all the secondaries about the new memseg, primary sends a sync notice where it sets up an
> > alarm (rte_mp_request_async ()->mp_request_async()).
> >    c. Inside alarm setup API, we register an interrupt callback.
> >    d. Inside rte_intr_callback_register(), a new interrupt instance allocation is requested for "src->intr_handle"
> >    e. Since memory management is detected as up, inside "rte_intr_instance_alloc()", call to "rte_zmalloc" for
> > allocating memory and further inside "malloc_heap_alloc_on_heap_id()", primary will experience a deadlock
> > while taking up the spinlock because this spinlock is already hold by secondary.
> >
> >
> > 2. "eal_flags_file_prefix_autotest" is failing because the spawned process by this tests are expected to cleanup
> > their hugepage traces from respective directories (eg /dev/hugepage).
> > a. Inside eal_cleanup, rte_free()->malloc_heap_free(), where element to be freed is added to the free list and
> > checked if nearby elements can be joined together and form a big free chunk (malloc_elem_free()).
> > b. If this free chunk is big enough than the hugepage size, respective hugepage can be uninstalled after making
> > sure no allocation from this hugepage exists. (malloc_heap_free()->malloc_heap_free_pages()->eal_memalloc_free_seg())
> >
> > But because of interrupt allocations made for pci intr handles (used for VFIO) and other driver specific interrupt
> > handles are not cleaned up in "rte_eal_cleanup()", these hugepage files are not removed and test fails.
>
> Sad to hear. But it's a great and thorough analysis.
>
> > There could be more such issues, I think we should firstly fix the DPDK.
> > 1. Memory management should be made independent and should be the first thing to come up in rte_eal_init()
>
> As I have explained, buses must be able to report IOVA requirement
> at this point (`get_iommu_class()` bus method).
> Either `scan()` must complete before that
> or `get_iommu_class()` must be able to work before `scan()` is called.
>
> > 2. rte_eal_cleanup() should be exactly opposite to rte_eal_init(), just like bus_probe, we should have bus_remove
> > to clean up all the memory allocations.
>
> Yes. For most buses it will be just "unplug each device".
> In fact, EAL could do it with `unplug()`, but it is not mandatory.
>
> >
> > Regarding this IRQ series, I would like to fall back to our original design i.e. rte_intr_instance_alloc() should take
> > an argument whether its memory should be allocated using glibc malloc or rte_malloc*.
>
> Seems there's no other option to make it on time.

- Sorry, my memory is too short, did we describe where we need to
share rte_intr_handle objects?

I spent some time looking at uses of rte_intr_handle objects.

In many cases intr_handle objects are referenced in malloc() objects.
The cases where rte_intr_handle are shared is in per device private
bits in drivers.

A intr_handle often contains fds.
For them to be used in mp setups, there needs to be a big machinery
with SCM_RIGHTS but I see only 3 drivers which actually reference
this.
So if intr_handle fds are accessed by multiple processes, their
content probably makes no sense wrt fds.


From these two hints, I think we are going backwards, and the main
usecase is that those rte_intr_instance objects are not used in mp.
I even think they are never accessed from other processes.
But I am not sure.


- Seeing how time is short for rc1, I am ok with
rte_intr_instance_alloc() taking a flag argument.
And we can still go back on this API later.

Can we agree on the flag name?
rte_malloc() interest is that it makes objects shared for mp, so how
about RTE_INTR_INSTANCE_F_SHARED ?
  
Harman Kalra Oct. 21, 2021, 4:05 p.m. UTC | #11
Hi Dmitry, David

Please find responses inline.

> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Thursday, October 21, 2021 7:03 PM
> To: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>; Harman Kalra
> <hkalra@marvell.com>
> Cc: Stephen Hemminger <stephen@networkplumber.org>; Thomas
> Monjalon <thomas@monjalon.net>; dev@dpdk.org; Ray Kinsella
> <mdr@ashroe.eu>
> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v3 2/7] eal/interrupts: implement
> get set APIs
> 
> On Thu, Oct 21, 2021 at 2:33 PM Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> wrote:
> > > Hi All,
> > >
> > > I came across 2 issues introduced with auto detection mechanism.
> > > 1. In case of primary secondary model.  Primary application is
> > > started which makes lots of allocations via
> > > rte_malloc*
> > >
> > >     Secondary side:
> > >     a. Secondary starts, in its "rte_eal_init()" it makes some
> > > allocation via rte_*, and in one of the allocation request for heap expand
> is made as current memseg got exhausted. (malloc_heap_alloc_on_heap_id
> ()->
> > >    alloc_more_mem_on_socket()->try_expand_heap())
> > >    b. A request to primary for heap expand is sent. Please note
> > > secondary holds the spinlock while making the request.
> > > (malloc_heap_alloc_on_heap_id ()->rte_spinlock_lock(&(heap->lock));)
> > >
> > >    Primary side:
> > >    a. Primary receives the request, install a new hugepage and setups up
> the heap (handle_alloc_request())
> > >    b. To inform all the secondaries about the new memseg, primary
> > > sends a sync notice where it sets up an alarm (rte_mp_request_async ()-
> >mp_request_async()).
> > >    c. Inside alarm setup API, we register an interrupt callback.
> > >    d. Inside rte_intr_callback_register(), a new interrupt instance allocation
> is requested for "src->intr_handle"
> > >    e. Since memory management is detected as up, inside
> > > "rte_intr_instance_alloc()", call to "rte_zmalloc" for allocating
> > > memory and further inside "malloc_heap_alloc_on_heap_id()", primary
> will experience a deadlock while taking up the spinlock because this spinlock
> is already hold by secondary.
> > >
> > >
> > > 2. "eal_flags_file_prefix_autotest" is failing because the spawned
> > > process by this tests are expected to cleanup their hugepage traces from
> respective directories (eg /dev/hugepage).
> > > a. Inside eal_cleanup, rte_free()->malloc_heap_free(), where element
> > > to be freed is added to the free list and checked if nearby elements can
> be joined together and form a big free chunk (malloc_elem_free()).
> > > b. If this free chunk is big enough than the hugepage size,
> > > respective hugepage can be uninstalled after making sure no
> > > allocation from this hugepage exists.
> > > (malloc_heap_free()->malloc_heap_free_pages()-
> >eal_memalloc_free_seg
> > > ())
> > >
> > > But because of interrupt allocations made for pci intr handles (used
> > > for VFIO) and other driver specific interrupt handles are not cleaned up in
> "rte_eal_cleanup()", these hugepage files are not removed and test fails.
> >
> > Sad to hear. But it's a great and thorough analysis.

Sad but a good learning; at least we identified areas to be worked upon.

> >
> > > There could be more such issues, I think we should firstly fix the DPDK.
> > > 1. Memory management should be made independent and should be the
> > > first thing to come up in rte_eal_init()
> >
> > As I have explained, buses must be able to report IOVA requirement at
> > this point (`get_iommu_class()` bus method).
> > Either `scan()` must complete before that or `get_iommu_class()` must
> > be able to work before `scan()` is called.
> >
> > > 2. rte_eal_cleanup() should be exactly opposite to rte_eal_init(),
> > > just like bus_probe, we should have bus_remove to clean up all the
> memory allocations.
> >
> > Yes. For most buses it will be just "unplug each device".
> > In fact, EAL could do it with `unplug()`, but it is not mandatory.

I implemented a rough bus_remove which was similar to unplug, faced
some issue. Not sure but some drivers might not be supporting hotplug, for
them unplug might be a challenge.


> >
> > >
> > > Regarding this IRQ series, I would like to fall back to our original
> > > design i.e. rte_intr_instance_alloc() should take an argument whether its
> memory should be allocated using glibc malloc or rte_malloc*.
> >
> > Seems there's no other option to make it on time.
> 
> - Sorry, my memory is too short, did we describe where we need to share
> rte_intr_handle objects?

Intr handle objects are shared in very few drivers.

> 
> I spent some time looking at uses of rte_intr_handle objects.
> 
> In many cases intr_handle objects are referenced in malloc() objects.
> The cases where rte_intr_handle are shared is in per device private bits in
> drivers.
> 

Yes, in V2 design I allocated memory using glibc malloc for such instances by
passing respective flag.

> A intr_handle often contains fds.
> For them to be used in mp setups, there needs to be a big machinery with
> SCM_RIGHTS but I see only 3 drivers which actually reference this.
> So if intr_handle fds are accessed by multiple processes, their content
> probably makes no sense wrt fds.

Those drivers will allocate using SHARED flag.

> 
> 
> From these two hints, I think we are going backwards, and the main usecase
> is that those rte_intr_instance objects are not used in mp.
> I even think they are never accessed from other processes.
> But I am not sure.
> 
> 
> - Seeing how time it short for rc1, I am ok with
> rte_intr_instance_alloc() taking a flag argument.
> And we can still go back on this API later.


Sure, I will revert back to original design and send V5 by tomorrow.

> 
> Can we agree on the flag name?
> rte_malloc() interest is that it makes objects shared for mp, so how about
> RTE_INTR_INSTANCE_F_SHARED ?

Yeah, it sounds good:
RTE_INTR_INSTANCE_F_SHARED  - rte_malloc
RTE_INTR_INSTANCE_F_PRIVATE - malloc


Thanks David, Dmitry, Thomas, Stephan for reviewing the series thoroughly and providing
inputs to improvise it.


Thanks
Harman

> 
> 
> --
> David Marchand
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 8dceb6c0e0..3782e88742 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -210,6 +210,7 @@  F: app/test/test_memzone.c
 
 Interrupt Subsystem
 M: Harman Kalra <hkalra@marvell.com>
+F: lib/eal/include/rte_epoll.h
 F: lib/eal/*/*interrupts.*
 F: app/test/test_interrupts.c
 
diff --git a/lib/eal/common/eal_common_interrupts.c b/lib/eal/common/eal_common_interrupts.c
new file mode 100644
index 0000000000..90e9c70ca3
--- /dev/null
+++ b/lib/eal/common/eal_common_interrupts.c
@@ -0,0 +1,410 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_interrupts.h>
+
+#include <malloc_heap.h>
+
+/*
+ * Validate an interrupt handle pointer: on NULL, log an error, set
+ * rte_errno to EINVAL and jump to the caller's local "fail" label.
+ * (Original comment said "valid port" — copy/paste slip.)
+ */
+#define CHECK_VALID_INTR_HANDLE(intr_handle) do { \
+	if (intr_handle == NULL) { \
+		RTE_LOG(ERR, EAL, "Interrupt instance unallocated\n"); \
+		rte_errno = EINVAL; \
+		goto fail; \
+	} \
+} while (0)
+
+/*
+ * Allocate and zero one interrupt handle instance.
+ * The allocator is chosen at run time: DPDK's rte_zmalloc() when the malloc
+ * subsystem is already initialized, plain calloc() otherwise (e.g. for
+ * early callers such as bus scanning before rte_eal_init() completes).
+ */
+struct rte_intr_handle *rte_intr_instance_alloc(void)
+{
+	struct rte_intr_handle *intr_handle;
+	bool mem_allocator;
+
+	/* Detect if DPDK malloc APIs are ready to be used. */
+	mem_allocator = rte_malloc_is_ready();
+	if (mem_allocator)
+		intr_handle = rte_zmalloc(NULL, sizeof(struct rte_intr_handle),
+					  0);
+	else
+		intr_handle = calloc(1, sizeof(struct rte_intr_handle));
+	if (!intr_handle) {
+		RTE_LOG(ERR, EAL, "Fail to allocate intr_handle\n");
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	/* Default event count; efds/elist arrays are sized to this maximum. */
+	intr_handle->nb_intr = RTE_MAX_RXTX_INTR_VEC_ID;
+	/* Remember which allocator was used so _free() can release correctly. */
+	intr_handle->mem_allocator = mem_allocator;
+
+	return intr_handle;
+}
+
+/*
+ * Copy the scalar fields and the efds/elist arrays of "src" into an
+ * already-allocated destination handle. The element count is clamped to
+ * the smaller of the two handles' nb_intr values.
+ */
+int rte_intr_instance_copy(struct rte_intr_handle *intr_handle,
+			   const struct rte_intr_handle *src)
+{
+	uint16_t nb_intr;
+
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	if (src == NULL) {
+		RTE_LOG(ERR, EAL, "Source interrupt instance unallocated\n");
+		rte_errno = EINVAL;
+		goto fail;
+	}
+
+	intr_handle->fd = src->fd;
+	intr_handle->vfio_dev_fd = src->vfio_dev_fd;
+	intr_handle->type = src->type;
+	intr_handle->max_intr = src->max_intr;
+	intr_handle->nb_efd = src->nb_efd;
+	intr_handle->efd_counter_size = src->efd_counter_size;
+
+	nb_intr = RTE_MIN(src->nb_intr, intr_handle->nb_intr);
+	/* memcpy() takes a byte count: scale by element size, otherwise only
+	 * nb_intr bytes (not nb_intr entries) of each array are copied.
+	 */
+	memcpy(intr_handle->efds, src->efds,
+	       nb_intr * sizeof(*intr_handle->efds));
+	memcpy(intr_handle->elist, src->elist,
+	       nb_intr * sizeof(*intr_handle->elist));
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+/*
+ * Release an interrupt handle instance, using the same allocator that
+ * rte_intr_instance_alloc() used (recorded in mem_allocator).
+ */
+void rte_intr_instance_free(struct rte_intr_handle *intr_handle)
+{
+	/* Follow free(NULL) semantics: a NULL handle is a no-op rather than
+	 * a NULL dereference when mem_allocator is read below.
+	 */
+	if (intr_handle == NULL)
+		return;
+	if (intr_handle->mem_allocator)
+		rte_free(intr_handle);
+	else
+		free(intr_handle);
+}
+
+/* Store a caller-provided event file descriptor in the handle. */
+int rte_intr_fd_set(struct rte_intr_handle *intr_handle, int fd)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	intr_handle->fd = fd;
+
+	return 0;
+fail:	/* reached only via CHECK_VALID_INTR_HANDLE() */
+	return -rte_errno;
+}
+
+/* Return the event file descriptor of the handle, or -1 if the handle
+ * is NULL (rte_errno is set to EINVAL by the validity check).
+ */
+int rte_intr_fd_get(const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->fd;
+fail:
+	return -1;
+}
+
+/* Record the interrupt source type (UIO, VFIO_MSIX, VDEV, ...). */
+int rte_intr_type_set(struct rte_intr_handle *intr_handle,
+		      enum rte_intr_handle_type type)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	intr_handle->type = type;
+
+	return 0;
+fail:	/* reached only via CHECK_VALID_INTR_HANDLE() */
+	return -rte_errno;
+}
+
+/* Return the recorded interrupt source type, or RTE_INTR_HANDLE_UNKNOWN
+ * for a NULL handle.
+ */
+enum rte_intr_handle_type rte_intr_type_get(
+				const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->type;
+fail:
+	return RTE_INTR_HANDLE_UNKNOWN;
+}
+
+/* Store the device fd (VFIO device fd or UIO config fd) in the handle. */
+int rte_intr_dev_fd_set(struct rte_intr_handle *intr_handle, int fd)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	intr_handle->vfio_dev_fd = fd;
+
+	return 0;
+fail:	/* reached only via CHECK_VALID_INTR_HANDLE() */
+	return -rte_errno;
+}
+
+/* Return the device fd (VFIO device fd or UIO config fd), -1 on NULL handle. */
+int rte_intr_dev_fd_get(const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->vfio_dev_fd;
+fail:
+	return -1;
+}
+
+/* Set the maximum requested interrupt count; rejected with ERANGE if it
+ * exceeds the handle's event capacity (nb_intr).
+ */
+int rte_intr_max_intr_set(struct rte_intr_handle *intr_handle,
+				 int max_intr)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	if (max_intr > intr_handle->nb_intr) {
+		RTE_LOG(ERR, EAL, "Maximum interrupt vector ID (%d) exceeds "
+			"the number of available events (%d)\n", max_intr,
+			intr_handle->nb_intr);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	intr_handle->max_intr = max_intr;
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+/* Return the maximum requested interrupt count, negative errno on failure. */
+int rte_intr_max_intr_get(const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->max_intr;
+fail:
+	return -rte_errno;
+}
+
+/* Record the number of available event fds in the handle. */
+int rte_intr_nb_efd_set(struct rte_intr_handle *intr_handle,
+				 int nb_efd)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	intr_handle->nb_efd = nb_efd;
+
+	return 0;
+fail:	/* reached only via CHECK_VALID_INTR_HANDLE() */
+	return -rte_errno;
+}
+
+/* Return the number of available event fds, negative errno on failure. */
+int rte_intr_nb_efd_get(const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->nb_efd;
+fail:
+	return -rte_errno;
+}
+
+/* Return the event capacity of the handle (default RTE_MAX_RXTX_INTR_VEC_ID),
+ * negative errno on failure.
+ */
+int rte_intr_nb_intr_get(const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->nb_intr;
+fail:
+	return -rte_errno;
+}
+
+/* Record the event-fd counter size (used by vdev interrupt emulation). */
+int rte_intr_efd_counter_size_set(struct rte_intr_handle *intr_handle,
+				 uint8_t efd_counter_size)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	intr_handle->efd_counter_size = efd_counter_size;
+
+	return 0;
+fail:	/* reached only via CHECK_VALID_INTR_HANDLE() */
+	return -rte_errno;
+}
+
+/* Return the event-fd counter size, negative errno on failure. */
+int rte_intr_efd_counter_size_get(const struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->efd_counter_size;
+fail:
+	return -rte_errno;
+}
+
+/* Return the event fd stored at "index", negative errno on failure. */
+int rte_intr_efds_index_get(const struct rte_intr_handle *intr_handle,
+			    int index)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	/* "index" is signed: a negative value would read out of bounds of
+	 * the efds[] array, so reject it along with too-large values.
+	 */
+	if (index < 0 || index >= intr_handle->nb_intr) {
+		RTE_LOG(ERR, EAL, "Invalid index %d, max limit %d\n", index,
+			intr_handle->nb_intr);
+		rte_errno = EINVAL;
+		goto fail;
+	}
+
+	return intr_handle->efds[index];
+fail:
+	return -rte_errno;
+}
+
+/* Store event fd "fd" at position "index" of the efds array. */
+int rte_intr_efds_index_set(struct rte_intr_handle *intr_handle,
+			    int index, int fd)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	/* "index" is signed: a negative value would write out of bounds of
+	 * the efds[] array, so reject it along with too-large values.
+	 */
+	if (index < 0 || index >= intr_handle->nb_intr) {
+		RTE_LOG(ERR, EAL, "Invalid index %d, max limit %d\n", index,
+			intr_handle->nb_intr);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	intr_handle->efds[index] = fd;
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+/* Return the address of the epoll event at "index", NULL on failure. */
+struct rte_epoll_event *rte_intr_elist_index_get(
+				struct rte_intr_handle *intr_handle, int index)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	/* Reject negative indexes too: "index" is signed and a negative
+	 * value would point outside the elist[] array.
+	 */
+	if (index < 0 || index >= intr_handle->nb_intr) {
+		RTE_LOG(ERR, EAL, "Invalid index %d, max limit %d\n", index,
+			intr_handle->nb_intr);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	return &intr_handle->elist[index];
+fail:
+	return NULL;
+}
+
+/* Copy epoll event "elist" into position "index" of the elist array. */
+int rte_intr_elist_index_set(struct rte_intr_handle *intr_handle,
+			     int index, struct rte_epoll_event elist)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	/* Reject negative indexes too: "index" is signed and a negative
+	 * value would write outside the elist[] array.
+	 */
+	if (index < 0 || index >= intr_handle->nb_intr) {
+		RTE_LOG(ERR, EAL, "Invalid index %d, max limit %d\n", index,
+			intr_handle->nb_intr);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	intr_handle->elist[index] = elist;
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+/* Allocate the interrupt vector list (intr_vec) with "size" entries,
+ * tagged with "name" for the DPDK allocator. A second call on a handle
+ * that already has a vector list is a successful no-op.
+ */
+int rte_intr_vec_list_alloc(struct rte_intr_handle *intr_handle,
+				   const char *name, int size)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	/* Vector list already allocated */
+	if (intr_handle->intr_vec)
+		return 0;
+
+	/* "size" is signed: reject negatives before they are converted to a
+	 * huge size_t in the allocation below.
+	 */
+	if (size < 0 || size > intr_handle->nb_intr) {
+		RTE_LOG(ERR, EAL, "Invalid size %d, max limit %d\n", size,
+		       intr_handle->nb_intr);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	intr_handle->intr_vec = rte_zmalloc(name, size * sizeof(int), 0);
+	if (intr_handle->intr_vec == NULL) {
+		/* Original code misindented this error path and its log
+		 * message lacked a trailing newline.
+		 */
+		RTE_LOG(ERR, EAL, "Failed to allocate %d intr_vec\n", size);
+		rte_errno = ENOMEM;
+		goto fail;
+	}
+
+	intr_handle->vec_list_size = size;
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+/* Return the vector number stored at "index" of the interrupt vector list,
+ * negative errno on failure.
+ */
+int rte_intr_vec_list_index_get(const struct rte_intr_handle *intr_handle,
+				int index)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	if (!intr_handle->intr_vec) {
+		RTE_LOG(ERR, EAL, "Intr vector list not allocated\n");
+		rte_errno = ENOTSUP;
+		goto fail;
+	}
+
+	/* Off-by-one fix: valid indexes are 0..vec_list_size-1; the original
+	 * "index > vec_list_size" let index == vec_list_size through and read
+	 * one element past the end. Negative indexes are rejected as well.
+	 */
+	if (index < 0 || index >= intr_handle->vec_list_size) {
+		RTE_LOG(ERR, EAL, "Index %d greater than vec list size %d\n",
+			index, intr_handle->vec_list_size);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	return intr_handle->intr_vec[index];
+fail:
+	return -rte_errno;
+}
+
+/* Store vector number "vec" at position "index" of the interrupt vector
+ * list, negative errno on failure.
+ */
+int rte_intr_vec_list_index_set(struct rte_intr_handle *intr_handle,
+				   int index, int vec)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	if (!intr_handle->intr_vec) {
+		RTE_LOG(ERR, EAL, "Intr vector list not allocated\n");
+		rte_errno = ENOTSUP;
+		goto fail;
+	}
+
+	/* Off-by-one fix: valid indexes are 0..vec_list_size-1; the original
+	 * "index > vec_list_size" let index == vec_list_size through and wrote
+	 * one element past the end. Negative indexes are rejected as well.
+	 */
+	if (index < 0 || index >= intr_handle->vec_list_size) {
+		RTE_LOG(ERR, EAL, "Index %d greater than vec list size %d\n",
+			index, intr_handle->vec_list_size);
+		rte_errno = ERANGE;
+		goto fail;
+	}
+
+	intr_handle->intr_vec[index] = vec;
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+/* Free the interrupt vector list and reset the pointer so a later
+ * rte_intr_vec_list_alloc() can reallocate it. NULL handle is a no-op.
+ */
+void rte_intr_vec_list_free(struct rte_intr_handle *intr_handle)
+{
+	if (intr_handle) {
+		rte_free(intr_handle->intr_vec);
+		intr_handle->intr_vec = NULL;
+	}
+}
+
+/* Return the Windows device driver handle, NULL on a NULL instance. */
+void *rte_intr_instance_windows_handle_get(struct rte_intr_handle *intr_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	return intr_handle->windows_handle;
+fail:
+	return NULL;
+}
+
+/* Store the Windows device driver handle; NULL handles are rejected with
+ * EINVAL since a valid driver handle is never NULL.
+ */
+int rte_intr_instance_windows_handle_set(struct rte_intr_handle *intr_handle,
+					 void *windows_handle)
+{
+	CHECK_VALID_INTR_HANDLE(intr_handle);
+
+	if (!windows_handle) {
+		RTE_LOG(ERR, EAL, "Windows handle should not be NULL\n");
+		rte_errno = EINVAL;
+		goto fail;
+	}
+
+	intr_handle->windows_handle = windows_handle;
+
+	return 0;
+fail:
+	return -rte_errno;
+}
diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
index 6d01b0f072..917758cc65 100644
--- a/lib/eal/common/meson.build
+++ b/lib/eal/common/meson.build
@@ -15,6 +15,7 @@  sources += files(
         'eal_common_errno.c',
         'eal_common_fbarray.c',
         'eal_common_hexdump.c',
+        'eal_common_interrupts.c',
         'eal_common_launch.c',
         'eal_common_lcore.c',
         'eal_common_log.c',
diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index 88a9eba12f..8e258607b8 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -19,6 +19,7 @@  headers += files(
         'rte_eal_memconfig.h',
         'rte_eal_trace.h',
         'rte_errno.h',
+        'rte_epoll.h',
         'rte_fbarray.h',
         'rte_hexdump.h',
         'rte_hypervisor.h',
diff --git a/lib/eal/include/rte_eal_interrupts.h b/lib/eal/include/rte_eal_interrupts.h
index 00bcc19b6d..6764ba3f35 100644
--- a/lib/eal/include/rte_eal_interrupts.h
+++ b/lib/eal/include/rte_eal_interrupts.h
@@ -39,32 +39,6 @@  enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_MAX           /**< count of elements */
 };
 
-#define RTE_INTR_EVENT_ADD            1UL
-#define RTE_INTR_EVENT_DEL            2UL
-
-typedef void (*rte_intr_event_cb_t)(int fd, void *arg);
-
-struct rte_epoll_data {
-	uint32_t event;               /**< event type */
-	void *data;                   /**< User data */
-	rte_intr_event_cb_t cb_fun;   /**< IN: callback fun */
-	void *cb_arg;	              /**< IN: callback arg */
-};
-
-enum {
-	RTE_EPOLL_INVALID = 0,
-	RTE_EPOLL_VALID,
-	RTE_EPOLL_EXEC,
-};
-
-/** interrupt epoll event obj, taken by epoll_event.ptr */
-struct rte_epoll_event {
-	uint32_t status;           /**< OUT: event status */
-	int fd;                    /**< OUT: event fd */
-	int epfd;       /**< OUT: epoll instance the ev associated with */
-	struct rte_epoll_data epdata;
-};
-
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	RTE_STD_C11
@@ -79,191 +53,20 @@  struct rte_intr_handle {
 			};
 			int fd;	/**< interrupt event file descriptor */
 		};
-		void *handle; /**< device driver handle (Windows) */
+		void *windows_handle; /**< device driver handle (Windows) */
 	};
+	bool mem_allocator;
 	enum rte_intr_handle_type type;  /**< handle type */
 	uint32_t max_intr;             /**< max interrupt requested */
 	uint32_t nb_efd;               /**< number of available efd(event fd) */
 	uint8_t efd_counter_size;      /**< size of efd counter, used for vdev */
+	uint16_t nb_intr;
+		/**< Max vector count, default RTE_MAX_RXTX_INTR_VEC_ID */
 	int efds[RTE_MAX_RXTX_INTR_VEC_ID];  /**< intr vectors/efds mapping */
 	struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
-				       /**< intr vector epoll event */
+						/**< intr vector epoll event */
+	uint16_t vec_list_size;
 	int *intr_vec;                 /**< intr vector number array */
 };
 
-#define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
-
-/**
- * It waits for events on the epoll instance.
- * Retries if signal received.
- *
- * @param epfd
- *   Epoll instance fd on which the caller wait for events.
- * @param events
- *   Memory area contains the events that will be available for the caller.
- * @param maxevents
- *   Up to maxevents are returned, must greater than zero.
- * @param timeout
- *   Specifying a timeout of -1 causes a block indefinitely.
- *   Specifying a timeout equal to zero cause to return immediately.
- * @return
- *   - On success, returns the number of available event.
- *   - On failure, a negative value.
- */
-int
-rte_epoll_wait(int epfd, struct rte_epoll_event *events,
-	       int maxevents, int timeout);
-
-/**
- * It waits for events on the epoll instance.
- * Does not retry if signal received.
- *
- * @param epfd
- *   Epoll instance fd on which the caller wait for events.
- * @param events
- *   Memory area contains the events that will be available for the caller.
- * @param maxevents
- *   Up to maxevents are returned, must greater than zero.
- * @param timeout
- *   Specifying a timeout of -1 causes a block indefinitely.
- *   Specifying a timeout equal to zero cause to return immediately.
- * @return
- *   - On success, returns the number of available event.
- *   - On failure, a negative value.
- */
-__rte_experimental
-int
-rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
-	       int maxevents, int timeout);
-
-/**
- * It performs control operations on epoll instance referred by the epfd.
- * It requests that the operation op be performed for the target fd.
- *
- * @param epfd
- *   Epoll instance fd on which the caller perform control operations.
- * @param op
- *   The operation be performed for the target fd.
- * @param fd
- *   The target fd on which the control ops perform.
- * @param event
- *   Describes the object linked to the fd.
- *   Note: The caller must take care the object deletion after CTL_DEL.
- * @return
- *   - On success, zero.
- *   - On failure, a negative value.
- */
-int
-rte_epoll_ctl(int epfd, int op, int fd,
-	      struct rte_epoll_event *event);
-
-/**
- * The function returns the per thread epoll instance.
- *
- * @return
- *   epfd the epoll instance referred to.
- */
-int
-rte_intr_tls_epfd(void);
-
-/**
- * @param intr_handle
- *   Pointer to the interrupt handle.
- * @param epfd
- *   Epoll instance fd which the intr vector associated to.
- * @param op
- *   The operation be performed for the vector.
- *   Operation type of {ADD, DEL}.
- * @param vec
- *   RX intr vector number added to the epoll instance wait list.
- * @param data
- *   User raw data.
- * @return
- *   - On success, zero.
- *   - On failure, a negative value.
- */
-int
-rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
-		int epfd, int op, unsigned int vec, void *data);
-
-/**
- * It deletes registered eventfds.
- *
- * @param intr_handle
- *   Pointer to the interrupt handle.
- */
-void
-rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle);
-
-/**
- * It enables the packet I/O interrupt event if it's necessary.
- * It creates event fd for each interrupt vector when MSIX is used,
- * otherwise it multiplexes a single event fd.
- *
- * @param intr_handle
- *   Pointer to the interrupt handle.
- * @param nb_efd
- *   Number of interrupt vector trying to enable.
- *   The value 0 is not allowed.
- * @return
- *   - On success, zero.
- *   - On failure, a negative value.
- */
-int
-rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
-
-/**
- * It disables the packet I/O interrupt event.
- * It deletes registered eventfds and closes the open fds.
- *
- * @param intr_handle
- *   Pointer to the interrupt handle.
- */
-void
-rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
-
-/**
- * The packet I/O interrupt on datapath is enabled or not.
- *
- * @param intr_handle
- *   Pointer to the interrupt handle.
- */
-int
-rte_intr_dp_is_en(struct rte_intr_handle *intr_handle);
-
-/**
- * The interrupt handle instance allows other causes or not.
- * Other causes stand for any none packet I/O interrupts.
- *
- * @param intr_handle
- *   Pointer to the interrupt handle.
- */
-int
-rte_intr_allow_others(struct rte_intr_handle *intr_handle);
-
-/**
- * The multiple interrupt vector capability of interrupt handle instance.
- * It returns zero if no multiple interrupt vector support.
- *
- * @param intr_handle
- *   Pointer to the interrupt handle.
- */
-int
-rte_intr_cap_multiple(struct rte_intr_handle *intr_handle);
-
-/**
- * @warning
- * @b EXPERIMENTAL: this API may change without prior notice
- *
- * @internal
- * Check if currently executing in interrupt context
- *
- * @return
- *  - non zero in case of interrupt context
- *  - zero in case of process context
- */
-__rte_experimental
-int
-rte_thread_is_intr(void);
-
 #endif /* _RTE_EAL_INTERRUPTS_H_ */
diff --git a/lib/eal/include/rte_epoll.h b/lib/eal/include/rte_epoll.h
new file mode 100644
index 0000000000..56b7b6bad6
--- /dev/null
+++ b/lib/eal/include/rte_epoll.h
@@ -0,0 +1,118 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell International Ltd.
+ */
+
+#ifndef __RTE_EPOLL_H__
+#define __RTE_EPOLL_H__
+
+/**
+ * @file
+ * The rte_epoll API provides interface functions to add or delete events,
+ * and to wait for or poll events.
+ */
+
+#include <stdint.h>
+
+#include <rte_compat.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_INTR_EVENT_ADD            1UL
+#define RTE_INTR_EVENT_DEL            2UL
+
+typedef void (*rte_intr_event_cb_t)(int fd, void *arg);
+
+struct rte_epoll_data {
+	uint32_t event;               /**< event type */
+	void *data;                   /**< User data */
+	rte_intr_event_cb_t cb_fun;   /**< IN: callback fun */
+	void *cb_arg;	              /**< IN: callback arg */
+};
+
+enum {
+	RTE_EPOLL_INVALID = 0,
+	RTE_EPOLL_VALID,
+	RTE_EPOLL_EXEC,
+};
+
+/** interrupt epoll event obj, taken by epoll_event.ptr */
+struct rte_epoll_event {
+	uint32_t status;           /**< OUT: event status */
+	int fd;                    /**< OUT: event fd */
+	int epfd;       /**< OUT: epoll instance the ev associated with */
+	struct rte_epoll_data epdata;
+};
+
+#define RTE_EPOLL_PER_THREAD        -1  /**< to hint using per thread epfd */
+
+/**
+ * It waits for events on the epoll instance.
+ * Retries if signal received.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller wait for events.
+ * @param events
+ *   Memory area contains the events that will be available for the caller.
+ * @param maxevents
+ *   Up to maxevents are returned, must greater than zero.
+ * @param timeout
+ *   Specifying a timeout of -1 causes a block indefinitely.
+ *   Specifying a timeout equal to zero cause to return immediately.
+ * @return
+ *   - On success, returns the number of available event.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout);
+
+/**
+ * It waits for events on the epoll instance.
+ * Does not retry if signal received.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller wait for events.
+ * @param events
+ *   Memory area contains the events that will be available for the caller.
+ * @param maxevents
+ *   Up to maxevents are returned, must greater than zero.
+ * @param timeout
+ *   Specifying a timeout of -1 causes a block indefinitely.
+ *   Specifying a timeout equal to zero cause to return immediately.
+ * @return
+ *   - On success, returns the number of available event.
+ *   - On failure, a negative value.
+ */
+__rte_experimental
+int
+rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
+	       int maxevents, int timeout);
+
+/**
+ * It performs control operations on epoll instance referred by the epfd.
+ * It requests that the operation op be performed for the target fd.
+ *
+ * @param epfd
+ *   Epoll instance fd on which the caller perform control operations.
+ * @param op
+ *   The operation be performed for the target fd.
+ * @param fd
+ *   The target fd on which the control ops perform.
+ * @param event
+ *   Describes the object linked to the fd.
+ *   Note: The caller must take care the object deletion after CTL_DEL.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+	      struct rte_epoll_event *event);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __RTE_EPOLL_H__ */
diff --git a/lib/eal/include/rte_interrupts.h b/lib/eal/include/rte_interrupts.h
index cc3bf45d8c..98edf774af 100644
--- a/lib/eal/include/rte_interrupts.h
+++ b/lib/eal/include/rte_interrupts.h
@@ -5,8 +5,11 @@ 
 #ifndef _RTE_INTERRUPTS_H_
 #define _RTE_INTERRUPTS_H_
 
+#include <stdbool.h>
+
 #include <rte_common.h>
 #include <rte_compat.h>
+#include <rte_epoll.h>
 
 /**
  * @file
@@ -22,6 +25,8 @@  extern "C" {
 /** Interrupt handle */
 struct rte_intr_handle;
 
+#include "rte_eal_interrupts.h"
+
 /** Function to be registered for the specific interrupt */
 typedef void (*rte_intr_callback_fn)(void *cb_arg);
 
@@ -32,8 +37,6 @@  typedef void (*rte_intr_callback_fn)(void *cb_arg);
 typedef void (*rte_intr_unregister_callback_fn)(struct rte_intr_handle *intr_handle,
 						void *cb_arg);
 
-#include "rte_eal_interrupts.h"
-
 /**
  * It registers the callback for the specific interrupt. Multiple
  * callbacks can be registered at the same time.
@@ -163,6 +166,621 @@  int rte_intr_disable(const struct rte_intr_handle *intr_handle);
 __rte_experimental
 int rte_intr_ack(const struct rte_intr_handle *intr_handle);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Check if currently executing in interrupt context
+ *
+ * @return
+ *  - non zero in case of interrupt context
+ *  - zero in case of process context
+ */
+__rte_experimental
+int
+rte_thread_is_intr(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * It allocates memory for an interrupt instance. The allocator is chosen
+ * automatically: DPDK memory management APIs are used once they are
+ * initialized, otherwise normal heap allocation is used.
+ * Default memory allocation for event fds and the event list array is done,
+ * which can be reallocated later as per the requirement.
+ *
+ * This function should be called from application or driver, before calling
+ * any of the interrupt APIs.
+ *
+ * @return
+ *  - On success, pointer to the allocated interrupt handle.
+ *  - On failure, NULL.
+ */
+__rte_experimental
+struct rte_intr_handle *
+rte_intr_instance_alloc(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * This API is used to free the memory allocated for an interrupt handle
+ * instance by rte_intr_instance_alloc().
+ *
+ * @param intr_handle
+ *  Pointer to the interrupt handle instance to free.
+ *
+ */
+__rte_experimental
+void
+rte_intr_instance_free(struct rte_intr_handle *intr_handle);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * This API is used to set the fd field of interrupt handle with user provided
+ * file descriptor.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param fd
+ *  file descriptor value provided by user.
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_experimental
+int
+rte_intr_fd_set(struct rte_intr_handle *intr_handle, int fd);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Returns the fd field of the given interrupt handle instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, fd field.
+ *  - On failure, a negative value.
+ */
+__rte_experimental
+int
+rte_intr_fd_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * This API is used to set the type field of interrupt handle with user provided
+ * interrupt type.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param type
+ *  interrupt type
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_experimental
+int
+rte_intr_type_set(struct rte_intr_handle *intr_handle,
+		  enum rte_intr_handle_type type);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Returns the type field of the given interrupt handle instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, interrupt type
+ *  - On failure, RTE_INTR_HANDLE_UNKNOWN.
+ */
+__rte_experimental
+enum rte_intr_handle_type
+rte_intr_type_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * The function returns the per thread epoll instance.
+ *
+ * @return
+ *   epfd the epoll instance referred to.
+ */
+__rte_internal
+int
+rte_intr_tls_epfd(void);
+
+/**
+ * @internal
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param epfd
+ *   Epoll instance fd which the intr vector associated to.
+ * @param op
+ *   The operation be performed for the vector.
+ *   Operation type of {ADD, DEL}.
+ * @param vec
+ *   RX intr vector number added to the epoll instance wait list.
+ * @param data
+ *   User raw data.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+		int epfd, int op, unsigned int vec, void *data);
+
+/**
+ * @internal
+ * It deletes registered eventfds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+__rte_internal
+void
+rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * It enables the packet I/O interrupt event if it's necessary.
+ * It creates event fd for each interrupt vector when MSIX is used,
+ * otherwise it multiplexes a single event fd.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ * @param nb_efd
+ *   Number of interrupt vector trying to enable.
+ *   The value 0 is not allowed.
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd);
+
+/**
+ * @internal
+ * It disables the packet I/O interrupt event.
+ * It deletes registered eventfds and closes the open fds.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+__rte_internal
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * The packet I/O interrupt on datapath is enabled or not.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+__rte_internal
+int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * The interrupt handle instance allows other causes or not.
+ * Other causes stand for any none packet I/O interrupts.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+__rte_internal
+int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * The multiple interrupt vector capability of interrupt handle instance.
+ * It returns zero if no multiple interrupt vector support.
+ *
+ * @param intr_handle
+ *   Pointer to the interrupt handle.
+ */
+__rte_internal
+int
+rte_intr_cap_multiple(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * This API is used to populate the destination interrupt handle with the
+ * fields of the source handle.
+ *
+ * @param intr_handle
+ *  Destination interrupt handle to be populated.
+ * @param src
+ *  Source interrupt handle to be cloned.
+ *
+ * @return
+ *   - On success, zero.
+ *   - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_instance_copy(struct rte_intr_handle *intr_handle,
+		       const struct rte_intr_handle *src);
+
+/**
+ * @internal
+ * This API is used to set the device fd field of interrupt handle with user
+ * provided dev fd. Device fd corresponds to VFIO device fd or UIO config fd.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param fd
+ *  device file descriptor (VFIO device fd or UIO config fd)
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_dev_fd_set(struct rte_intr_handle *intr_handle, int fd);
+
+/**
+ * @internal
+ * Returns the device fd field of the given interrupt handle instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, dev fd.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_dev_fd_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * This API is used to set the max intr field of interrupt handle with user
+ * provided max intr value.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param max_intr
+ *  maximum requested interrupt count
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_max_intr_set(struct rte_intr_handle *intr_handle, int max_intr);
+
+/**
+ * @internal
+ * Returns the max intr field of the given interrupt handle instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, max intr.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_max_intr_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * This API is used to set the no of event fd field of interrupt handle with
+ * user provided available event file descriptor value.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param nb_efd
+ *  Available event fd
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_nb_efd_set(struct rte_intr_handle *intr_handle, int nb_efd);
+
+/**
+ * @internal
+ * Returns the no of available event fd field of the given interrupt handle
+ * instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, nb_efd
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_nb_efd_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * Returns the no of interrupt vector field of the given interrupt handle
+ * instance. This field is to configured on device probe time, and based on
+ * this value efds and elist arrays are dynamically allocated. By default
+ * this value is set to RTE_MAX_RXTX_INTR_VEC_ID.
+ * For eg. in case of PCI device, its msix size is queried and efds/elist
+ * arrays are allocated accordingly.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, nb_intr
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_nb_intr_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * This API is used to set the event fd counter size field of interrupt handle
+ * with user provided efd counter size.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param efd_counter_size
+ *  size of efd counter, used for vdev
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_efd_counter_size_set(struct rte_intr_handle *intr_handle,
+			      uint8_t efd_counter_size);
+
+/**
+ * @internal
+ * Returns the event fd counter size field of the given interrupt handle
+ * instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, efd_counter_size
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_efd_counter_size_get(const struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * This API is used to set the event fd array index with the given fd.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param index
+ *  efds array index to be set
+ * @param fd
+ *  event fd
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_efds_index_set(struct rte_intr_handle *intr_handle, int index, int fd);
+
+/**
+ * @internal
+ * Returns the fd value of event fds array at a given index.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param index
+ *  efds array index to be returned
+ *
+ * @return
+ *  - On success, fd
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_efds_index_get(const struct rte_intr_handle *intr_handle, int index);
+
+/**
+ * @internal
+ * This API is used to set the event list array index with the given elist
+ * instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param index
+ *  elist array index to be set
+ * @param elist
+ *  event list instance of struct rte_epoll_event
+ *
+ * @return
+ *  - On success, zero.
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_elist_index_set(struct rte_intr_handle *intr_handle, int index,
+			 struct rte_epoll_event elist);
+
+/**
+ * @internal
+ * Returns the address of elist instance of event list array at a given index.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param index
+ *  elist array index to be returned
+ *
+ * @return
+ *  - On success, elist
+ *  - On failure, a negative value.
+ */
+__rte_internal
+struct rte_epoll_event *
+rte_intr_elist_index_get(struct rte_intr_handle *intr_handle, int index);
+
+/**
+ * @internal
+ * Allocates the memory for the interrupt vector list array, with size
+ * defining the number of elements required in the array.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param name
+ *  Name assigned to the allocation, or NULL.
+ * @param size
+ *  Number of elements required in the array.
+ *
+ * @return
+ *  - On success, zero
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_vec_list_alloc(struct rte_intr_handle *intr_handle, const char *name,
+			int size);
+
+/**
+ * @internal
+ * Sets the vector value at given index of interrupt vector list field of given
+ * interrupt handle.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param index
+ *  intr_vec array index to be set
+ * @param vec
+ *  Interrupt vector value.
+ *
+ * @return
+ *  - On success, zero
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_vec_list_index_set(struct rte_intr_handle *intr_handle, int index,
+			    int vec);
+
+/**
+ * @internal
+ * Returns the vector value at the given index of interrupt vector list array.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param index
+ *  intr_vec array index to be returned
+ *
+ * @return
+ *  - On success, interrupt vector
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_vec_list_index_get(const struct rte_intr_handle *intr_handle,
+			    int index);
+
+/**
+ * @internal
+ * Frees the memory allocated for the interrupt vector list array.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ */
+__rte_internal
+void
+rte_intr_vec_list_free(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * Reallocates the efds and elist arrays based on the size provided by the
+ * user.
+ * By default efds and elist array are allocated with default size
+ * RTE_MAX_RXTX_INTR_VEC_ID on interrupt handle array creation. Later on device
+ * probe, device may have capability of more interrupts than
+ * RTE_MAX_RXTX_INTR_VEC_ID. Hence using this API, PMDs can reallocate the
+ * arrays as per the max interrupts capability of device.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param size
+ *  efds and elist array size.
+ *
+ * @return
+ *  - On success, zero
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_event_list_update(struct rte_intr_handle *intr_handle, int size);
+
+/**
+ * @internal
+ * This API returns the Windows handle of the given interrupt instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ *
+ * @return
+ *  - On success, the Windows handle.
+ *  - On failure, NULL.
+ */
+__rte_internal
+void *
+rte_intr_instance_windows_handle_get(struct rte_intr_handle *intr_handle);
+
+/**
+ * @internal
+ * This API sets the Windows handle for the given interrupt instance.
+ *
+ * @param intr_handle
+ *  pointer to the interrupt handle.
+ * @param windows_handle
+ *  Windows handle to be set.
+ *
+ * @return
+ *  - On success, zero
+ *  - On failure, a negative value.
+ */
+__rte_internal
+int
+rte_intr_instance_windows_handle_set(struct rte_intr_handle *intr_handle,
+				     void *windows_handle);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 38f7de83e1..0ef77c3b40 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -109,18 +109,10 @@  DPDK_22 {
 	rte_hexdump;
 	rte_hypervisor_get;
 	rte_hypervisor_get_name; # WINDOWS_NO_EXPORT
-	rte_intr_allow_others;
 	rte_intr_callback_register;
 	rte_intr_callback_unregister;
-	rte_intr_cap_multiple;
-	rte_intr_disable;
-	rte_intr_dp_is_en;
-	rte_intr_efd_disable;
-	rte_intr_efd_enable;
 	rte_intr_enable;
-	rte_intr_free_epoll_fd;
-	rte_intr_rx_ctl;
-	rte_intr_tls_epfd;
+	rte_intr_disable;
 	rte_keepalive_create; # WINDOWS_NO_EXPORT
 	rte_keepalive_dispatch_pings; # WINDOWS_NO_EXPORT
 	rte_keepalive_mark_alive; # WINDOWS_NO_EXPORT
@@ -420,6 +412,14 @@  EXPERIMENTAL {
 
 	# added in 21.08
 	rte_power_monitor_multi; # WINDOWS_NO_EXPORT
+
+	# added in 21.11
+	rte_intr_fd_set; # WINDOWS_NO_EXPORT
+	rte_intr_fd_get; # WINDOWS_NO_EXPORT
+	rte_intr_type_set; # WINDOWS_NO_EXPORT
+	rte_intr_type_get; # WINDOWS_NO_EXPORT
+	rte_intr_instance_alloc; # WINDOWS_NO_EXPORT
+	rte_intr_instance_free; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {
@@ -430,4 +430,33 @@  INTERNAL {
 	rte_mem_map;
 	rte_mem_page_size;
 	rte_mem_unmap;
+	rte_intr_cap_multiple;
+	rte_intr_dp_is_en;
+	rte_intr_efd_disable;
+	rte_intr_efd_enable;
+	rte_intr_free_epoll_fd;
+	rte_intr_rx_ctl;
+	rte_intr_allow_others;
+	rte_intr_tls_epfd;
+	rte_intr_dev_fd_set; # WINDOWS_NO_EXPORT
+	rte_intr_dev_fd_get; # WINDOWS_NO_EXPORT
+	rte_intr_instance_copy; # WINDOWS_NO_EXPORT
+	rte_intr_event_list_update; # WINDOWS_NO_EXPORT
+	rte_intr_max_intr_set; # WINDOWS_NO_EXPORT
+	rte_intr_max_intr_get; # WINDOWS_NO_EXPORT
+	rte_intr_nb_efd_set; # WINDOWS_NO_EXPORT
+	rte_intr_nb_efd_get; # WINDOWS_NO_EXPORT
+	rte_intr_nb_intr_get; # WINDOWS_NO_EXPORT
+	rte_intr_efds_index_set; # WINDOWS_NO_EXPORT
+	rte_intr_efds_index_get; # WINDOWS_NO_EXPORT
+	rte_intr_elist_index_set; # WINDOWS_NO_EXPORT
+	rte_intr_elist_index_get; # WINDOWS_NO_EXPORT
+	rte_intr_efd_counter_size_set; # WINDOWS_NO_EXPORT
+	rte_intr_efd_counter_size_get; # WINDOWS_NO_EXPORT
+	rte_intr_vec_list_alloc; # WINDOWS_NO_EXPORT
+	rte_intr_vec_list_index_set; # WINDOWS_NO_EXPORT
+	rte_intr_vec_list_index_get; # WINDOWS_NO_EXPORT
+	rte_intr_vec_list_free; # WINDOWS_NO_EXPORT
+	rte_intr_instance_windows_handle_get;
+	rte_intr_instance_windows_handle_set;
 };