[1/4] lib: introduce IF Proxy library

Message ID 20200306164104.15528-2-aostruszka@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series Introduce IF proxy library |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-testing fail Testing issues
ci/iol-mellanox-Performance success Performance Testing PASS
ci/Intel-compilation fail Compilation issues

Commit Message

Andrzej Ostruszka [C] March 6, 2020, 4:41 p.m. UTC
  This library allows designating ports visible to the system (such as
Tun/Tap or KNI) as port representors serving as proxies for other DPDK
ports.  When such a proxy is configured, this library initially queries
the network configuration from the system and later monitors its changes.

The information gathered is passed to the application either via a set
of user-registered callbacks or as an event added to the configured
notification queue (or a combination of these two mechanisms).  This way
the user can use normal network utilities (like those from the iproute2
suite) to configure DPDK ports.

Signed-off-by: Andrzej Ostruszka <aostruszka@marvell.com>
---
 MAINTAINERS                                   |   3 +
 config/common_base                            |   5 +
 config/common_linux                           |   1 +
 lib/Makefile                                  |   2 +
 .../common/include/rte_eal_interrupts.h       |   2 +
 lib/librte_eal/linux/eal/eal_interrupts.c     |  14 +-
 lib/librte_if_proxy/Makefile                  |  29 +
 lib/librte_if_proxy/if_proxy_common.c         | 494 +++++++++++++++
 lib/librte_if_proxy/if_proxy_priv.h           |  97 +++
 lib/librte_if_proxy/linux/Makefile            |   4 +
 lib/librte_if_proxy/linux/if_proxy.c          | 552 +++++++++++++++++
 lib/librte_if_proxy/meson.build               |  19 +
 lib/librte_if_proxy/rte_if_proxy.h            | 561 ++++++++++++++++++
 lib/librte_if_proxy/rte_if_proxy_version.map  |  19 +
 lib/meson.build                               |   2 +-
 15 files changed, 1799 insertions(+), 5 deletions(-)
 create mode 100644 lib/librte_if_proxy/Makefile
 create mode 100644 lib/librte_if_proxy/if_proxy_common.c
 create mode 100644 lib/librte_if_proxy/if_proxy_priv.h
 create mode 100644 lib/librte_if_proxy/linux/Makefile
 create mode 100644 lib/librte_if_proxy/linux/if_proxy.c
 create mode 100644 lib/librte_if_proxy/meson.build
 create mode 100644 lib/librte_if_proxy/rte_if_proxy.h
 create mode 100644 lib/librte_if_proxy/rte_if_proxy_version.map
  

Comments

Harman Kalra March 31, 2020, 12:36 p.m. UTC | #1
On Fri, Mar 06, 2020 at 05:41:01PM +0100, Andrzej Ostruszka wrote:
> This library allows to designate ports visible to the system (such as
> Tun/Tap or KNI) as port representors serving as proxies for other DPDK
> ports.  When such a proxy is configured this library initially queries
> network configuration from the system and later monitors its changes.
> 
> The information gathered is passed to the application either via a set
> of user registered callbacks or as an event added to the configured
> notification queue (or a combination of these two mechanisms).  This way
> user can use normal network utilities (like those from the iproute2
> suite) to configure DPDK ports.
> 
> Signed-off-by: Andrzej Ostruszka <aostruszka@marvell.com>
> ---
>  MAINTAINERS                                   |   3 +
>  config/common_base                            |   5 +
>  config/common_linux                           |   1 +
>  lib/Makefile                                  |   2 +
>  .../common/include/rte_eal_interrupts.h       |   2 +
>  lib/librte_eal/linux/eal/eal_interrupts.c     |  14 +-
>  lib/librte_if_proxy/Makefile                  |  29 +
>  lib/librte_if_proxy/if_proxy_common.c         | 494 +++++++++++++++
>  lib/librte_if_proxy/if_proxy_priv.h           |  97 +++
>  lib/librte_if_proxy/linux/Makefile            |   4 +
>  lib/librte_if_proxy/linux/if_proxy.c          | 552 +++++++++++++++++
>  lib/librte_if_proxy/meson.build               |  19 +
>  lib/librte_if_proxy/rte_if_proxy.h            | 561 ++++++++++++++++++
>  lib/librte_if_proxy/rte_if_proxy_version.map  |  19 +
>  lib/meson.build                               |   2 +-
>  15 files changed, 1799 insertions(+), 5 deletions(-)
>  create mode 100644 lib/librte_if_proxy/Makefile
>  create mode 100644 lib/librte_if_proxy/if_proxy_common.c
>  create mode 100644 lib/librte_if_proxy/if_proxy_priv.h
>  create mode 100644 lib/librte_if_proxy/linux/Makefile
>  create mode 100644 lib/librte_if_proxy/linux/if_proxy.c
>  create mode 100644 lib/librte_if_proxy/meson.build
>  create mode 100644 lib/librte_if_proxy/rte_if_proxy.h
>  create mode 100644 lib/librte_if_proxy/rte_if_proxy_version.map
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f4e0ed8e0..aec7326ca 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1469,6 +1469,9 @@ F: examples/bpf/
>  F: app/test/test_bpf.c
>  F: doc/guides/prog_guide/bpf_lib.rst
>  
> +IF Proxy - EXPERIMENTAL
> +M: Andrzej Ostruszka <aostruszka@marvell.com>
> +F: lib/librte_if_proxy/
>  
>  Test Applications
>  -----------------
> diff --git a/config/common_base b/config/common_base
> index 7ca2f28b1..dcc0a0650 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -1075,6 +1075,11 @@ CONFIG_RTE_LIBRTE_BPF_ELF=n
>  #
>  CONFIG_RTE_LIBRTE_IPSEC=y
>  
> +#
> +# Compile librte_if_proxy
> +#
> +CONFIG_RTE_LIBRTE_IF_PROXY=n
> +
>  #
>  # Compile the test application
>  #
> diff --git a/config/common_linux b/config/common_linux
> index 816810671..1244eb0ae 100644
> --- a/config/common_linux
> +++ b/config/common_linux
> @@ -16,6 +16,7 @@ CONFIG_RTE_LIBRTE_VHOST_NUMA=y
>  CONFIG_RTE_LIBRTE_VHOST_POSTCOPY=n
>  CONFIG_RTE_LIBRTE_PMD_VHOST=y
>  CONFIG_RTE_LIBRTE_IFC_PMD=y
> +CONFIG_RTE_LIBRTE_IF_PROXY=y
>  CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
>  CONFIG_RTE_LIBRTE_PMD_MEMIF=y
>  CONFIG_RTE_LIBRTE_PMD_SOFTNIC=y
> diff --git a/lib/Makefile b/lib/Makefile
> index 46b91ae1a..6a20806f1 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -118,6 +118,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
>  DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
>  DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu
>  DEPDIRS-librte_rcu := librte_eal
> +DIRS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += librte_if_proxy
> +DEPDIRS-librte_if_proxy := librte_eal librte_ethdev
>  
>  ifeq ($(CONFIG_RTE_EXEC_ENV_LINUX),y)
>  DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
> diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
> index 773a34a42..296a3853d 100644
> --- a/lib/librte_eal/common/include/rte_eal_interrupts.h
> +++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
> @@ -36,6 +36,8 @@ enum rte_intr_handle_type {
>  	RTE_INTR_HANDLE_VDEV,         /**< virtual device */
>  	RTE_INTR_HANDLE_DEV_EVENT,    /**< device event handle */
>  	RTE_INTR_HANDLE_VFIO_REQ,     /**< VFIO request handle */
> +	RTE_INTR_HANDLE_NETLINK,      /**< netlink notification handle */
> +
>  	RTE_INTR_HANDLE_MAX           /**< count of elements */
>  };
>  
> diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
> index cb8e10709..16236a8c4 100644
> --- a/lib/librte_eal/linux/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linux/eal/eal_interrupts.c
> @@ -680,6 +680,9 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
>  		break;
>  	/* not used at this moment */
>  	case RTE_INTR_HANDLE_ALARM:
> +#if RTE_LIBRTE_IF_PROXY
> +	case RTE_INTR_HANDLE_NETLINK:
> +#endif
>  		return -1;
>  #ifdef VFIO_PRESENT
>  	case RTE_INTR_HANDLE_VFIO_MSIX:
> @@ -796,6 +799,9 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
>  		break;
>  	/* not used at this moment */
>  	case RTE_INTR_HANDLE_ALARM:
> +#if RTE_LIBRTE_IF_PROXY
> +	case RTE_INTR_HANDLE_NETLINK:
> +#endif
>  		return -1;
>  #ifdef VFIO_PRESENT
>  	case RTE_INTR_HANDLE_VFIO_MSIX:
> @@ -889,12 +895,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
>  			break;
>  #endif
>  #endif
> -		case RTE_INTR_HANDLE_VDEV:
>  		case RTE_INTR_HANDLE_EXT:
> -			bytes_read = 0;
> -			call = true;
> -			break;
> +		case RTE_INTR_HANDLE_VDEV:
>  		case RTE_INTR_HANDLE_DEV_EVENT:
> +#if RTE_LIBRTE_IF_PROXY
> +		case RTE_INTR_HANDLE_NETLINK:
> +#endif
>  			bytes_read = 0;
>  			call = true;
>  			break;
> diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
> new file mode 100644
> index 000000000..43cb702a2
> --- /dev/null
> +++ b/lib/librte_if_proxy/Makefile
> @@ -0,0 +1,29 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2020 Marvell International Ltd.
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +# library name
> +LIB = librte_if_proxy.a
> +
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +CFLAGS += -O3
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
> +LDLIBS += -lrte_eal -lrte_ethdev
> +
> +EXPORT_MAP := rte_if_proxy_version.map
> +
> +LIBABIVER := 1
> +
> +# all source are stored in SRCS-y
> +SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := if_proxy_common.c
> +
> +SYSDIR := $(patsubst "%app",%,$(CONFIG_RTE_EXEC_ENV))
> +include $(SRCDIR)/$(SYSDIR)/Makefile
> +
> +SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += $(addprefix $(SYSDIR)/,$(SRCS))
> +
> +# install this header file
> +SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
> +
> +include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/lib/librte_if_proxy/if_proxy_common.c b/lib/librte_if_proxy/if_proxy_common.c
> new file mode 100644
> index 000000000..230727d0c
> --- /dev/null
> +++ b/lib/librte_if_proxy/if_proxy_common.c
> @@ -0,0 +1,494 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2020 Marvell International Ltd.
> + */
> +
> +#include <if_proxy_priv.h>
> +#include <rte_string_fns.h>
> +
> +
> +/* Definitions of data mentioned in if_proxy_priv.h and local ones. */
> +int ifpx_log_type;
> +
> +uint16_t ifpx_ports[RTE_MAX_ETHPORTS];
> +
> +rte_spinlock_t ifpx_lock = RTE_SPINLOCK_INITIALIZER;
> +
> +struct ifpx_proxies_head ifpx_proxies = TAILQ_HEAD_INITIALIZER(ifpx_proxies);
> +
> +struct ifpx_queue_node {
> +	TAILQ_ENTRY(ifpx_queue_node) elem;
> +	uint16_t state;
> +	struct rte_ring *r;
> +};
> +static
> +TAILQ_HEAD(ifpx_queues_head, ifpx_queue_node) ifpx_queues =
> +		TAILQ_HEAD_INITIALIZER(ifpx_queues);
> +
> +/* All function pointers have the same size - so use this one to typecast
> + * different callbacks in rte_ifpx_callbacks and test their presence in a
> + * generic way.
> + */
> +union cb_ptr_t {
> +	int (*f_ptr)(void*);   /* type for normal event notification */
> +	int (*cfg_done)(void); /* lib notification for finished config */
> +};
> +union {
> +	struct rte_ifpx_callbacks cbs;
> +	union cb_ptr_t funcs[RTE_IFPX_NUM_EVENTS];
> +} ifpx_callbacks;
> +
> +uint64_t rte_ifpx_events_available(void)
> +{
> +	/* All events are supported on Linux. */
> +	return (1ULL << RTE_IFPX_NUM_EVENTS) - 1;
> +}
> +
> +uint16_t rte_ifpx_proxy_create(enum rte_ifpx_proxy_type type)
> +{
> +	char devargs[16] = { '\0' };
> +	int dev_cnt = 0, nlen;
> +	uint16_t port_id;
> +
> +	switch (type) {
> +	case RTE_IFPX_DEFAULT:
> +	case RTE_IFPX_TAP:
> +		nlen = strlcpy(devargs, "net_tap", sizeof(devargs));
> +		break;
> +	case RTE_IFPX_KNI:
> +		nlen = strlcpy(devargs, "net_kni", sizeof(devargs));
> +		break;
> +	default:
> +		IFPX_LOG(ERR, "Unknown proxy type: %d", type);
> +		return RTE_MAX_ETHPORTS;
> +	}
> +
> +	RTE_ETH_FOREACH_DEV(port_id) {
> +		if (strcmp(rte_eth_devices[port_id].device->driver->name,
> +			   devargs) == 0)
> +			++dev_cnt;
> +	}
> +	snprintf(devargs+nlen, sizeof(devargs)-nlen, "%d", dev_cnt);
> +
> +	return rte_ifpx_proxy_create_by_devarg(devargs);
> +}
> +
> +uint16_t rte_ifpx_proxy_create_by_devarg(const char *devarg)
> +{
> +	uint16_t port_id = RTE_MAX_ETHPORTS;
> +	struct rte_dev_iterator iter;
> +
> +	if (rte_dev_probe(devarg) < 0) {
> +		IFPX_LOG(ERR, "Failed to create proxy port %s\n", devarg);
> +		return RTE_MAX_ETHPORTS;
> +	}
> +
> +	if (rte_eth_iterator_init(&iter, devarg) == 0) {
> +		port_id = rte_eth_iterator_next(&iter);
> +		if (port_id != RTE_MAX_ETHPORTS)
> +			rte_eth_iterator_cleanup(&iter);
> +	}
> +
> +	return port_id;
> +}
> +
> +int ifpx_proxy_destroy(struct ifpx_proxy_node *px)
> +{
> +	unsigned int i;
> +	uint16_t proxy_id = px->proxy_id;
> +
> +	TAILQ_REMOVE(&ifpx_proxies, px, elem);
> +	free(px);
> +
> +	/* Clear any bindings for this proxy. */
> +	for (i = 0; i < RTE_DIM(ifpx_ports); ++i) {
> +		if (ifpx_ports[i] == proxy_id) {
> +			if (i == proxy_id) /* this entry is for proxy itself */
> +				ifpx_ports[i] = RTE_MAX_ETHPORTS;
> +			else
> +				rte_ifpx_port_unbind(i);
> +		}
> +	}
> +
> +	return rte_dev_remove(rte_eth_devices[proxy_id].device);
> +}
> +
> +int rte_ifpx_proxy_destroy(uint16_t proxy_id)
> +{
> +	struct ifpx_proxy_node *px;
> +	int ec = 0;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->proxy_id != proxy_id)
> +			continue;
> +	}
> +	if (!px) {
> +		ec = -EINVAL;
> +		goto exit;
> +	}
> +	if (px->state & IN_USE)
> +		px->state |= DEL_PENDING;
> +	else
> +		ec = ifpx_proxy_destroy(px);
> +exit:
> +	rte_spinlock_unlock(&ifpx_lock);
> +	return ec;
> +}
> +
> +int rte_ifpx_queue_add(struct rte_ring *r)
> +{
> +	struct ifpx_queue_node *node;
> +	int ec = 0;
> +
> +	if (!r)
> +		return -EINVAL;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(node, &ifpx_queues, elem) {
> +		if (node->r == r) {
> +			ec = -EEXIST;
> +			goto exit;
> +		}
> +	}
> +
> +	node = malloc(sizeof(*node));
> +	if (!node) {
> +		ec = -ENOMEM;
> +		goto exit;
> +	}
> +
> +	node->r = r;
> +	TAILQ_INSERT_TAIL(&ifpx_queues, node, elem);
> +exit:
> +	rte_spinlock_unlock(&ifpx_lock);
> +
> +	return ec;
> +}
> +
> +int rte_ifpx_queue_remove(struct rte_ring *r)
> +{
> +	struct ifpx_queue_node *node, *next;
> +	int ec = -EINVAL;
> +
> +	if (!r)
> +		return ec;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	for (node = TAILQ_FIRST(&ifpx_queues); node; node = next) {
> +		next = TAILQ_NEXT(node, elem);
> +		if (node->r != r)
> +			continue;
> +		TAILQ_REMOVE(&ifpx_queues, node, elem);
> +		free(node);
> +		ec = 0;
> +		break;
> +	}
> +	rte_spinlock_unlock(&ifpx_lock);
> +
> +	return ec;
> +}
> +
> +int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id)
> +{
> +	struct rte_eth_dev_info proxy_eth_info;
> +	struct ifpx_proxy_node *px;
> +	int ec;
> +
> +	if (port_id >= RTE_MAX_ETHPORTS || proxy_id >= RTE_MAX_ETHPORTS ||
> +	    /* port is a proxy */
> +	    ifpx_ports[port_id] == port_id) {
> +		IFPX_LOG(ERR, "Invalid port_id: %d", port_id);
> +		return -EINVAL;
> +	}
> +
> +	/* Do automatic rebinding but issue a warning since this is not
> +	 * considered to be a valid behaviour.
> +	 */
> +	if (ifpx_ports[port_id] != RTE_MAX_ETHPORTS) {
> +		IFPX_LOG(WARNING, "Port already bound: %d -> %d", port_id,
> +			 ifpx_ports[port_id]);
> +	}
> +
> +	/* Search for existing proxy - if not found add one to the list. */
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->proxy_id == proxy_id)
> +			break;
> +	}
> +	if (!px) {
> +		ec = rte_eth_dev_info_get(proxy_id, &proxy_eth_info);
> +		if (ec < 0 || proxy_eth_info.if_index == 0) {
> +			IFPX_LOG(ERR, "Invalid proxy: %d", proxy_id);
> +			rte_spinlock_unlock(&ifpx_lock);
> +			return ec < 0 ? ec : -EINVAL;
> +		}
> +		px = malloc(sizeof(*px));
> +		if (!px) {
> +			rte_spinlock_unlock(&ifpx_lock);
> +			return -ENOMEM;
> +		}
> +		px->proxy_id = proxy_id;
> +		px->info.if_index = proxy_eth_info.if_index;
> +		rte_eth_dev_get_mtu(proxy_id, &px->info.mtu);
> +		rte_eth_macaddr_get(proxy_id, &px->info.mac);
> +		memset(px->info.if_name, 0, sizeof(px->info.if_name));
> +		TAILQ_INSERT_TAIL(&ifpx_proxies, px, elem);
> +		ifpx_ports[proxy_id] = proxy_id;
> +	}
> +	rte_spinlock_unlock(&ifpx_lock);
> +	ifpx_ports[port_id] = proxy_id;
> +
> +	/* Add proxy MAC to the port - since port will often just forward
> +	 * packets from the proxy/system they will be sent with proxy MAC as
> +	 * src.  In order to pass communication in other direction we should be
> +	 * accepting packets with proxy MAC as dst.
> +	 */
> +	rte_eth_dev_mac_addr_add(port_id, &px->info.mac, 0);
> +
> +	if (ifpx_platform.get_info)
> +		ifpx_platform.get_info(px->info.if_index);
> +
> +	return 0;
> +}
> +
> +int rte_ifpx_port_unbind(uint16_t port_id)
> +{
> +	if (port_id >= RTE_MAX_ETHPORTS ||
> +	    ifpx_ports[port_id] == RTE_MAX_ETHPORTS ||
> +	    /* port is a proxy */
> +	    ifpx_ports[port_id] == port_id)
> +		return -EINVAL;
> +
> +	ifpx_ports[port_id] = RTE_MAX_ETHPORTS;
> +	/* Proxy without any port bound is OK - that is the state of the proxy
> +	 * that has just been created, and it can still report routing
> +	 * information.  So we do not even check if this is the case.
> +	 */
> +
> +	return 0;
> +}
> +
> +int rte_ifpx_callbacks_register(const struct rte_ifpx_callbacks *cbs)
> +{
> +	if (!cbs)
> +		return -EINVAL;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	ifpx_callbacks.cbs = *cbs;
> +	rte_spinlock_unlock(&ifpx_lock);
> +
> +	return 0;
> +}
> +
> +void rte_ifpx_callbacks_unregister(void)
> +{
> +	rte_spinlock_lock(&ifpx_lock);
> +	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
> +	rte_spinlock_unlock(&ifpx_lock);
> +}
> +
> +uint16_t rte_ifpx_proxy_get(uint16_t port_id)
> +{
> +	if (port_id >= RTE_MAX_ETHPORTS)
> +		return RTE_MAX_ETHPORTS;
> +
> +	return ifpx_ports[port_id];
> +}
> +
> +unsigned int rte_ifpx_port_get(uint16_t proxy_id,
> +			       uint16_t *ports, unsigned int num)
> +{
> +	unsigned int p, cnt = 0;
> +
> +	for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +		if (ifpx_ports[p] == proxy_id && ifpx_ports[p] != p) {
> +			++cnt;
> +			if (ports && num > 0) {
> +				*ports++ = p;
> +				--num;
> +			}
> +		}
> +	}
> +	return cnt;
> +}
> +
> +const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
> +{
> +	struct ifpx_proxy_node *px;
> +
> +	if (port_id >= RTE_MAX_ETHPORTS ||
> +	    ifpx_ports[port_id] == RTE_MAX_ETHPORTS)
> +		return NULL;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->proxy_id == ifpx_ports[port_id])
> +			break;
> +	}
> +	rte_spinlock_unlock(&ifpx_lock);
> +	RTE_ASSERT(px && "Internal IF Proxy library error");
> +
> +	return &px->info;
> +}
> +
> +static
> +void queue_event(const struct rte_ifpx_event *ev, struct rte_ring *r)
> +{
> +	struct rte_ifpx_event *e = malloc(sizeof(*ev));
> +
> +	if (!e) {
> +		IFPX_LOG(ERR, "Failed to allocate event!");
> +		return;
> +	}
> +	RTE_ASSERT(r);
> +
> +	*e = *ev;
> +	rte_ring_sp_enqueue(r, e);
> +}
> +
> +void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px)
> +{
> +	struct ifpx_queue_node *q;
> +	int done = 0;
> +	uint16_t p, proxy_id;
> +
> +	if (px) {
> +		if (px->state & DEL_PENDING)
> +			return;
> +		proxy_id = px->proxy_id;
> +		RTE_ASSERT(proxy_id != RTE_MAX_ETHPORTS);
> +		px->state |= IN_USE;
> +	} else
> +		proxy_id = RTE_MAX_ETHPORTS;
> +
> +	RTE_ASSERT(ev);
> +	/* This function is expected to be called with a lock held. */
> +	RTE_ASSERT(rte_spinlock_trylock(&ifpx_lock) == 0);
> +
> +	if (ifpx_callbacks.funcs[ev->type].f_ptr) {
> +		union cb_ptr_t cb = ifpx_callbacks.funcs[ev->type];
> +
> +		/* Drop the lock for the time of callback call. */
> +		rte_spinlock_unlock(&ifpx_lock);
> +		if (px) {
> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +				if (ifpx_ports[p] != proxy_id ||
> +				    ifpx_ports[p] == p)
> +					continue;
> +				ev->data.port_id = p;
> +				done = cb.f_ptr(&ev->data) || done;
Since callbacks are handled as DPDK interrupts, I hope no event can get
lost. We cannot afford to lose a route change event, as the kernel
might not send it again.

> +			}
> +		} else {
> +			RTE_ASSERT(ev->type == RTE_IFPX_CFG_DONE);
> +			done = cb.cfg_done();
> +		}
> +		rte_spinlock_lock(&ifpx_lock);
> +	}
> +	if (done)
> +		goto exit;
> +
> +	/* Event not "consumed" yet so try to notify via queues. */
> +	TAILQ_FOREACH(q, &ifpx_queues, elem) {
> +		if (px) {
> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +				if (ifpx_ports[p] != proxy_id ||
> +				    ifpx_ports[p] == p)
> +					continue;
> +				/* Set the port_id - the remaining params should
> +				 * be filled before calling this function.
> +				 */
> +				ev->data.port_id = p;
> +				queue_event(ev, q->r);
> +			}
> +		} else
> +			queue_event(ev, q->r);
> +	}
> +exit:
> +	if (px)
> +		px->state &= ~IN_USE;
> +}
> +
> +void ifpx_cleanup_proxies(void)
> +{
> +	struct ifpx_proxy_node *px, *next;
> +	for (px = TAILQ_FIRST(&ifpx_proxies); px; px = next) {
> +		next = TAILQ_NEXT(px, elem);
> +		if (px->state & DEL_PENDING)
> +			ifpx_proxy_destroy(px);
> +	}
> +}
> +
> +int rte_ifpx_listen(void)
> +{
> +	int ec;
> +
> +	if (!ifpx_platform.listen)
> +		return -ENOTSUP;
> +
> +	ec = ifpx_platform.listen();
> +	if (ec == 0 && ifpx_platform.get_info)
> +		ifpx_platform.get_info(0);
nlink_get_info calls request_info with an if_index. Passing 0 might
be fine in the current scenario, but a valid index should be passed to
get_info.

> +
> +	return ec;
> +}
> +
> +int rte_ifpx_close(void)
> +{
> +	struct ifpx_proxy_node *px;
> +	struct ifpx_queue_node *q;
> +	unsigned int p;
> +	int ec = 0;
> +
> +	if (ifpx_platform.close) {
> +		ec = ifpx_platform.close();
> +		if (ec != 0)
> +			IFPX_LOG(ERR, "Platform 'close' calback failed.");
> +	}
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	/* Remove queues. */
> +	while (!TAILQ_EMPTY(&ifpx_queues)) {
> +		q = TAILQ_FIRST(&ifpx_queues);
> +		TAILQ_REMOVE(&ifpx_queues, q, elem);
> +		free(q);
> +	}
> +
> +	/* Clear callbacks. */
> +	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
> +
> +	/* Unbind ports. */
> +	for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +		if (ifpx_ports[p] == RTE_MAX_ETHPORTS)
> +			continue;
> +		if (ifpx_ports[p] == p)
> +			/* port is a proxy - just clear entry */
> +			ifpx_ports[p] = RTE_MAX_ETHPORTS;
> +		else
> +			rte_ifpx_port_unbind(p);
> +	}
> +
> +	/* Clear proxies. */
> +	while (!TAILQ_EMPTY(&ifpx_proxies)) {
> +		px = TAILQ_FIRST(&ifpx_proxies);
> +		TAILQ_REMOVE(&ifpx_proxies, px, elem);
> +		free(px);
> +	}
> +
> +	rte_spinlock_unlock(&ifpx_lock);
> +
> +	return ec;
> +}
> +
> +RTE_INIT(if_proxy_init)
> +{
> +	unsigned int i;
> +	for (i = 0; i < RTE_DIM(ifpx_ports); ++i)
> +		ifpx_ports[i] = RTE_MAX_ETHPORTS;
> +
> +	ifpx_log_type = rte_log_register("lib.if_proxy");
> +	if (ifpx_log_type >= 0)
> +		rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
> +
> +	if (ifpx_platform.init)
> +		ifpx_platform.init();
> +}
> diff --git a/lib/librte_if_proxy/if_proxy_priv.h b/lib/librte_if_proxy/if_proxy_priv.h
> new file mode 100644
> index 000000000..2fbf9127a
> --- /dev/null
> +++ b/lib/librte_if_proxy/if_proxy_priv.h
> @@ -0,0 +1,97 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2020 Marvell International Ltd.
> + */
> +#ifndef _IF_PROXY_PRIV_H_
> +#define _IF_PROXY_PRIV_H_
> +
> +#include <rte_if_proxy.h>
> +#include <rte_spinlock.h>
> +
> +extern int ifpx_log_type;
> +#define IFPX_LOG(level, fmt, args...) \
> +	rte_log(RTE_LOG_ ## level, ifpx_log_type, "%s(): " fmt "\n", \
> +		__func__, ##args)
> +
> +/* Table keeping mapping between port and their proxies. */
> +extern
> +uint16_t ifpx_ports[RTE_MAX_ETHPORTS];
> +
> +/* Callbacks and proxies are kept in linked lists.  Since this library is really
> + * a slow/config path we guard them with a lock - and only one for all of them
> + * should be enough.  We don't expect a need to protect other data structures -
> + * e.g. data for given port is expected be accessed/modified from single thread.
> + */
> +extern rte_spinlock_t ifpx_lock;
> +
> +enum ifpx_node_status {
> +	IN_USE		= 1U << 0,
> +	DEL_PENDING	= 1U << 1,
> +};
> +
> +/* List of configured proxies */
> +struct ifpx_proxy_node {
> +	TAILQ_ENTRY(ifpx_proxy_node) elem;
> +	uint16_t proxy_id;
> +	uint16_t state;
> +	struct rte_ifpx_info info;
> +};
> +extern
> +TAILQ_HEAD(ifpx_proxies_head, ifpx_proxy_node) ifpx_proxies;
> +
> +/* This function should be called by the implementation whenever it notices
> + * change in the network configuration.  The arguments are:
> + * - ev : pointer to filled event data structure (all fields are expected to be
> + *     filled, with the exception of 'port_id' for all proxy/port related
> + *     events: this function clones the event notification for each bound port
> + *     and fills 'port_id' appropriately).
> + * - px : proxy node when given event is proxy/port related, otherwise pass NULL
> + */
> +void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px);
> +
> +/* This function should be called by the implementation whenever it is done with
> + * notification about network configuration change.  It is only really needed
> + * for the case of callback based API - from the callback user might to attempt
> + * to remove callbacks/proxies.  Removing of callbacks is handled by the
> + * ifpx_notify_event() function above, however only implementation really knows
> + * when notification for given proxy is finished so it is a duty of it to call
> + * this function to cleanup all proxies that has been marked for deletion.
> + */
> +void ifpx_cleanup_proxies(void);
> +
> +/* This is the internal function removing the proxy from the list.  It is
> + * related to the notification function above and intended to be used by the
> + * platform implementation for the case of callback based API.
> + * During notification via callback the internal lock is released so that
> + * operation would not deadlock on an attempt to take a lock.  However
> + * modification (destruction) is not really performed - instead the
> + * callbacks/proxies are marked as "to be deleted".
> + * Handling of callbacks that are "to be deleted" is done by the
> + * ifpx_notify_event() function itself however it cannot delete the proxies (in
> + * particular the proxy passed as an argument) since they might still be refered
> + * by the calling function.  So it is a responsibility of the platform
> + * implementation to check after calling notification function if there are any
> + * proxies to be removed and use ifpx_proxy_destroy() to actually release them.
> + */
> +int ifpx_proxy_destroy(struct ifpx_proxy_node *px);
> +
> +/* Every implementation should provide definition of this structure:
> + * - init : called during library initialization (NULL when not needed)
> + * - listen : this function should start service listening to the network
> + *     configuration events/changes,
> + * - close : this function should close the service started by listen()
> + * - get_info : this function should query system for current configuration of
> + *     interface with index 'if_index'.  After successful initialization of
> + *     listening service this function is calle with 0 as an argument.  In that
> + *     case configuration of all ports should be obtained - and when this
> + *     procedure completes a RTE_IFPX_CFG_DONE event should be signaled via
> + *     ifpx_notify_event().
> + */
> +extern
> +struct ifpx_platform_callbacks {
> +	void (*init)(void);
> +	int (*listen)(void);
> +	int (*close)(void);
> +	void (*get_info)(int if_index);
> +} ifpx_platform;
> +
> +#endif /* _IF_PROXY_PRIV_H_ */
> diff --git a/lib/librte_if_proxy/linux/Makefile b/lib/librte_if_proxy/linux/Makefile
> new file mode 100644
> index 000000000..275b7e1e3
> --- /dev/null
> +++ b/lib/librte_if_proxy/linux/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2020 Marvell International Ltd.
> +
> +SRCS += if_proxy.c
> diff --git a/lib/librte_if_proxy/linux/if_proxy.c b/lib/librte_if_proxy/linux/if_proxy.c
> new file mode 100644
> index 000000000..bf851c096
> --- /dev/null
> +++ b/lib/librte_if_proxy/linux/if_proxy.c
> @@ -0,0 +1,552 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2020 Marvell International Ltd.
> + */
> +#include <if_proxy_priv.h>
> +#include <rte_interrupts.h>
> +#include <rte_string_fns.h>
> +
> +#include <stdbool.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/socket.h>
> +#include <linux/rtnetlink.h>
> +#include <linux/if.h>
> +
> +static
> +struct rte_intr_handle ifpx_irq = {
> +	.type = RTE_INTR_HANDLE_NETLINK,
> +	.fd = -1,
> +};
> +
> +static
> +unsigned int ifpx_pid;
> +
> +static
> +int request_info(int type, int index)
> +{
> +	static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER;
> +	struct info_get {
> +		struct nlmsghdr h;
> +		union {
> +			struct ifinfomsg ifm;
> +			struct ifaddrmsg ifa;
> +			struct rtmsg rtm;
> +			struct ndmsg ndm;
> +		} __rte_aligned(NLMSG_ALIGNTO);
> +	} info_req;
> +	int ret;
> +
> +	memset(&info_req, 0, sizeof(info_req));
> +	/* First byte of these messages is family, so just make sure that this
> +	 * memset is enough to get all families.
> +	 */
> +	RTE_ASSERT(AF_UNSPEC == 0);
> +
> +	info_req.h.nlmsg_pid = ifpx_pid;
> +	info_req.h.nlmsg_type = type;
> +	info_req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	info_req.h.nlmsg_len = offsetof(struct info_get, ifm);
> +
> +	switch (type) {
> +	case RTM_GETLINK:
> +		info_req.h.nlmsg_len += sizeof(info_req.ifm);
> +		info_req.ifm.ifi_index = index;
> +		break;
> +	case RTM_GETADDR:
> +		info_req.h.nlmsg_len += sizeof(info_req.ifa);
> +		info_req.ifa.ifa_index = index;
> +		break;
> +	case RTM_GETROUTE:
> +		info_req.h.nlmsg_len += sizeof(info_req.rtm);
> +		break;
> +	case RTM_GETNEIGH:
> +		info_req.h.nlmsg_len += sizeof(info_req.ndm);
> +		break;
> +	default:
> +		IFPX_LOG(WARNING, "Unhandled message type: %d", type);
> +		return -EINVAL;
> +	}
> +	/* Store request type (and if it is global or link specific) in 'seq'.
> +	 * Later it is used during handling of reply to continue requesting of
> +	 * information dump from system - if needed.
> +	 */
> +	info_req.h.nlmsg_seq = index << 8 | type;
> +
> +	IFPX_LOG(DEBUG, "\tRequesting msg %d for: %u", type, index);
> +
> +	rte_spinlock_lock(&send_lock);
> +	ret = send(ifpx_irq.fd, &info_req, info_req.h.nlmsg_len, 0);
> +	if (ret < 0) {
> +		IFPX_LOG(ERR, "Failed to send netlink msg: %d", errno);
> +		rte_errno = errno;
> +	}
> +	rte_spinlock_unlock(&send_lock);
> +
> +	return ret;
> +}
> +
> +/*
> + * Handle RTM_NEWLINK/RTM_DELLINK netlink message.
> + *
> + * Messages for interfaces that are not registered proxies are dropped.  For
> + * a known proxy the stored MTU/MAC are refreshed and the corresponding
> + * link/MTU/MAC change events are passed to ifpx_notify_event().  A reply to
> + * our own per-interface RTM_GETLINK request is followed up with RTM_GETADDR
> + * request for that interface, a reply to the global dump is used to store the
> + * interface name.
> + */
> +static
> +void handle_link(const struct nlmsghdr *h)
> +{
> +	const struct ifinfomsg *ifi = NLMSG_DATA(h);
> +	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
> +	const struct rtattr *attrs[IFLA_MAX+1] = { NULL };
> +	const struct rtattr *attr;
> +	struct ifpx_proxy_node *px;
> +	struct rte_ifpx_event ev;
> +
> +	IFPX_LOG(DEBUG, "\tLink action (%u): %u, 0x%x/0x%x (flags/changed)",
> +		 ifi->ifi_index, h->nlmsg_type, ifi->ifi_flags,
> +		 ifi->ifi_change);
> +
> +	/* Zero the event (as handle_route()/handle_neigh() do) so that no
> +	 * indeterminate bytes are handed over with the notification.
> +	 */
> +	memset(&ev, 0, sizeof(ev));
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->info.if_index == (unsigned int)ifi->ifi_index)
> +			break;
> +	}
> +
> +	/* Drop messages that are not associated with any proxy */
> +	if (!px)
> +		goto exit;
> +	/* When message is a reply to request for specific interface then keep
> +	 * it only when it contains info for this interface.
> +	 */
> +	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
> +	    (h->nlmsg_seq >> 8) != (unsigned int)ifi->ifi_index)
> +		goto exit;
> +
> +	/* Index the attributes by type, ignoring the ones we do not know. */
> +	for (attr = IFLA_RTA(ifi); RTA_OK(attr, alen);
> +				   attr = RTA_NEXT(attr, alen)) {
> +		if (attr->rta_type > IFLA_MAX)
> +			continue;
> +		attrs[attr->rta_type] = attr;
> +	}
> +
> +	if (ifi->ifi_change & IFF_UP) {
> +		ev.type = RTE_IFPX_LINK_CHANGE;
> +		ev.link_change.is_up = ifi->ifi_flags & IFF_UP;
> +		ifpx_notify_event(&ev, px);
> +	}
> +	if (attrs[IFLA_MTU]) {
> +		uint16_t mtu = *(const int *)RTA_DATA(attrs[IFLA_MTU]);
> +		if (mtu != px->info.mtu) {
> +			px->info.mtu = mtu;
> +			ev.type = RTE_IFPX_MTU_CHANGE;
> +			ev.mtu_change.mtu = mtu;
> +			ifpx_notify_event(&ev, px);
> +		}
> +	}
> +	/* The address is handled only when it has the size of Ethernet MAC.
> +	 * This is a run-time check (not just an assert that is compiled out
> +	 * in release builds) since a fixed number of bytes is copied below.
> +	 */
> +	if (attrs[IFLA_ADDRESS] &&
> +	    RTA_PAYLOAD(attrs[IFLA_ADDRESS]) == RTE_ETHER_ADDR_LEN) {
> +		const struct rte_ether_addr *mac =
> +				RTA_DATA(attrs[IFLA_ADDRESS]);
> +
> +		if (memcmp(mac, &px->info.mac, RTE_ETHER_ADDR_LEN) != 0) {
> +			rte_ether_addr_copy(mac, &px->info.mac);
> +			ev.type = RTE_IFPX_MAC_CHANGE;
> +			rte_ether_addr_copy(mac, &ev.mac_change.mac);
> +			ifpx_notify_event(&ev, px);
> +		}
> +	}
> +	if (h->nlmsg_pid == ifpx_pid) {
> +		RTE_ASSERT((h->nlmsg_seq & 0xFF) == RTM_GETLINK);
> +		/* If this is reply for specific link request (not initial
> +		 * global dump) then follow up with address request, otherwise
> +		 * just store the interface name.
> +		 */
> +		if (h->nlmsg_seq >> 8)
> +			request_info(RTM_GETADDR, ifi->ifi_index);
> +		else if (!px->info.if_name[0] && attrs[IFLA_IFNAME])
> +			strlcpy(px->info.if_name, RTA_DATA(attrs[IFLA_IFNAME]),
> +				sizeof(px->info.if_name));
> +	}
> +
> +	ifpx_cleanup_proxies();
> +exit:
> +	rte_spinlock_unlock(&ifpx_lock);
> +}
> +
> +/*
> + * Handle RTM_NEWADDR/RTM_DELADDR netlink message.
> + *
> + * Only IPv4/IPv6 addresses of registered proxy interfaces are reported (as
> + * ADDR/ADDR6 ADD or DEL event depending on 'needs_del'), everything else is
> + * silently dropped.
> + */
> +static
> +void handle_addr(const struct nlmsghdr *h, bool needs_del)
> +{
> +	const struct ifaddrmsg *ifa = NLMSG_DATA(h);
> +	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifa));
> +	const struct rtattr *attrs[IFA_MAX+1] = { NULL };
> +	const struct rtattr *attr;
> +	struct ifpx_proxy_node *px;
> +	struct rte_ifpx_event ev;
> +	const uint8_t *ip;
> +	size_t ip_len;
> +
> +	IFPX_LOG(DEBUG, "\tAddr action (%u): %u, family: %u",
> +		 ifa->ifa_index, h->nlmsg_type, ifa->ifa_family);
> +
> +	/* Events can describe only IPv4/IPv6 addresses - do not interpret
> +	 * address of any other family as an IPv6 one.
> +	 */
> +	if (ifa->ifa_family == AF_INET)
> +		ip_len = 4;
> +	else if (ifa->ifa_family == AF_INET6)
> +		ip_len = 16;
> +	else
> +		return;
> +
> +	/* Zero the event (as handle_route()/handle_neigh() do) so that no
> +	 * indeterminate bytes are handed over with the notification.
> +	 */
> +	memset(&ev, 0, sizeof(ev));
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->info.if_index == ifa->ifa_index)
> +			break;
> +	}
> +
> +	/* Drop messages that are not associated with any proxy */
> +	if (!px)
> +		goto exit;
> +	/* When message is a reply to request for specific interface then keep
> +	 * it only when it contains info for this interface.
> +	 */
> +	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
> +	    (h->nlmsg_seq >> 8) != ifa->ifa_index)
> +		goto exit;
> +
> +	/* Index the attributes by type, ignoring the ones we do not know. */
> +	for (attr = IFA_RTA(ifa); RTA_OK(attr, alen);
> +				  attr = RTA_NEXT(attr, alen)) {
> +		if (attr->rta_type > IFA_MAX)
> +			continue;
> +		attrs[attr->rta_type] = attr;
> +	}
> +
> +	/* Make sure the payload really holds the whole address before fixed
> +	 * number of bytes is read from it.
> +	 */
> +	if (attrs[IFA_ADDRESS] &&
> +	    (size_t)RTA_PAYLOAD(attrs[IFA_ADDRESS]) >= ip_len) {
> +		ip = RTA_DATA(attrs[IFA_ADDRESS]);
> +		if (ifa->ifa_family == AF_INET) {
> +			ev.type = needs_del ? RTE_IFPX_ADDR_DEL
> +					    : RTE_IFPX_ADDR_ADD;
> +			ev.addr_change.ip =
> +					RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
> +		} else {
> +			ev.type = needs_del ? RTE_IFPX_ADDR6_DEL
> +					    : RTE_IFPX_ADDR6_ADD;
> +			memcpy(ev.addr6_change.ip, ip, 16);
> +		}
> +		ifpx_notify_event(&ev, px);
> +		ifpx_cleanup_proxies();
> +	}
> +exit:
> +	rte_spinlock_unlock(&ifpx_lock);
> +}
> +
> +/*
> + * Handle RTM_NEWROUTE/RTM_DELROUTE netlink message.
> + *
> + * Only IPv4/IPv6 routes that have both the output interface being one of the
> + * registered proxies and the destination attribute are reported (as
> + * ROUTE/ROUTE6 ADD or DEL event depending on 'needs_del').  Gateway is filled
> + * in when present, otherwise it is left zeroed.
> + */
> +static
> +void handle_route(const struct nlmsghdr *h, bool needs_del)
> +{
> +	const struct rtmsg *r = NLMSG_DATA(h);
> +	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
> +	const struct rtattr *attrs[RTA_MAX+1] = { NULL };
> +	const struct rtattr *attr;
> +	struct rte_ifpx_event ev;
> +	struct ifpx_proxy_node *px = NULL;
> +	const uint8_t *ip;
> +	size_t ip_len;
> +
> +	IFPX_LOG(DEBUG, "\tRoute action: %u, family: %u",
> +		 h->nlmsg_type, r->rtm_family);
> +
> +	/* Events can describe only IPv4/IPv6 routes - do not interpret route
> +	 * of any other family as an IPv6 one.
> +	 */
> +	if (r->rtm_family == AF_INET)
> +		ip_len = 4;
> +	else if (r->rtm_family == AF_INET6)
> +		ip_len = 16;
> +	else
> +		return;
> +
> +	/* Index the attributes by type, ignoring the ones we do not know. */
> +	for (attr = RTM_RTA(r); RTA_OK(attr, alen);
> +				attr = RTA_NEXT(attr, alen)) {
> +		if (attr->rta_type > RTA_MAX)
> +			continue;
> +		attrs[attr->rta_type] = attr;
> +	}
> +
> +	memset(&ev, 0, sizeof(ev));
> +	ev.type = RTE_IFPX_NUM_EVENTS;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	if (attrs[RTA_OIF]) {
> +		/* Keep the const qualifier - the message is read only. */
> +		int if_index = *(const int32_t *)RTA_DATA(attrs[RTA_OIF]);
> +
> +		if (if_index > 0) {
> +			TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +				if (px->info.if_index == (uint32_t)if_index)
> +					break;
> +			}
> +		}
> +	}
> +	/* We are only interested in routes related to the proxy interfaces and
> +	 * we need to have (complete) dst - otherwise skip the message.
> +	 */
> +	if (!px || !attrs[RTA_DST] ||
> +	    (size_t)RTA_PAYLOAD(attrs[RTA_DST]) < ip_len)
> +		goto exit;
> +
> +	ip = RTA_DATA(attrs[RTA_DST]);
> +	/* This is common to both IPv4/6. */
> +	ev.route_change.depth = r->rtm_dst_len;
> +	if (r->rtm_family == AF_INET) {
> +		ev.type = needs_del ? RTE_IFPX_ROUTE_DEL
> +				    : RTE_IFPX_ROUTE_ADD;
> +		ev.route_change.ip =
> +				RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
> +	} else {
> +		ev.type = needs_del ? RTE_IFPX_ROUTE6_DEL
> +				    : RTE_IFPX_ROUTE6_ADD;
> +		memcpy(ev.route6_change.ip, ip, 16);
> +	}
> +	if (attrs[RTA_GATEWAY] &&
> +	    (size_t)RTA_PAYLOAD(attrs[RTA_GATEWAY]) >= ip_len) {
> +		ip = RTA_DATA(attrs[RTA_GATEWAY]);
> +		if (r->rtm_family == AF_INET)
> +			ev.route_change.gateway =
> +					RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
> +		else
> +			memcpy(ev.route6_change.gateway, ip, 16);
> +	}
> +
> +	ifpx_notify_event(&ev, px);
> +	/* Let's check for proxies to remove here too - just in case somebody
> +	 * removed the non-proxy related callback.
> +	 */
> +	ifpx_cleanup_proxies();
> +exit:
> +	rte_spinlock_unlock(&ifpx_lock);
> +}
> +
> +/* Link, addr and route related messages seem to have a macro returning the
> + * first attribute of the message defined, but the neighbour one does not.
> + * Define it if it is missing.  The const qualifiers were added just to silence
> + * the compiler, which complains here about a (char*) cast on a pointer to
> + * const - for some reason this is not needed in the equivalent macros for the
> + * other messages.
> + */
> +#ifndef NDA_RTA
> +#define NDA_RTA(r) ((const struct rtattr*)(((const char*)(r)) + \
> +			NLMSG_ALIGN(sizeof(struct ndmsg))))
> +#endif
> +
> +/*
> + * Handle RTM_NEWNEIGH/RTM_DELNEIGH netlink message.
> + *
> + * Only IPv4/IPv6 neighbours of registered proxy interfaces are reported (as
> + * NEIGH/NEIGH6 ADD or DEL event depending on 'needs_del').  The destination
> + * address is always required, the link layer address is required for
> + * anything but deletion.
> + */
> +static
> +void handle_neigh(const struct nlmsghdr *h, bool needs_del)
> +{
> +	const struct ndmsg *n = NLMSG_DATA(h);
> +	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*n));
> +	const struct rtattr *attrs[NDA_MAX+1] = { NULL };
> +	const struct rtattr *attr;
> +	const struct rtattr *lladdr;
> +	struct ifpx_proxy_node *px;
> +	struct rte_ifpx_event ev;
> +	const uint8_t *ip;
> +	size_t ip_len;
> +
> +	IFPX_LOG(DEBUG, "\tNeighbour action: %u, family: %u, state: %u, if: %d",
> +		 h->nlmsg_type, n->ndm_family, n->ndm_state, n->ndm_ifindex);
> +
> +	/* Events can describe only IPv4/IPv6 neighbours - do not interpret
> +	 * entry of any other family as an IPv6 one.
> +	 */
> +	if (n->ndm_family == AF_INET)
> +		ip_len = 4;
> +	else if (n->ndm_family == AF_INET6)
> +		ip_len = 16;
> +	else
> +		return;
> +
> +	/* Index the attributes by type, ignoring the ones we do not know. */
> +	for (attr = NDA_RTA(n); RTA_OK(attr, alen);
> +				attr = RTA_NEXT(attr, alen)) {
> +		if (attr->rta_type > NDA_MAX)
> +			continue;
> +		attrs[attr->rta_type] = attr;
> +	}
> +
> +	/* Link layer address is usable only when it has the size of Ethernet
> +	 * MAC - fixed number of bytes is copied from it below.
> +	 */
> +	lladdr = attrs[NDA_LLADDR];
> +	if (lladdr && RTA_PAYLOAD(lladdr) != RTE_ETHER_ADDR_LEN)
> +		lladdr = NULL;
> +
> +	memset(&ev, 0, sizeof(ev));
> +	ev.type = RTE_IFPX_NUM_EVENTS;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->info.if_index == (unsigned int)n->ndm_ifindex)
> +			break;
> +	}
> +	/* We need only subset of neighbourhood related to proxy interfaces.
> +	 * Messages other than deletion (that is RTM_NEWNEIGH ones) are used
> +	 * only when they carry lladdr, for deletion dst alone is enough.
> +	 */
> +	if (!px || !attrs[NDA_DST] ||
> +	    (size_t)RTA_PAYLOAD(attrs[NDA_DST]) < ip_len ||
> +	    (!needs_del && !lladdr))
> +		goto exit;
> +
> +	ip = RTA_DATA(attrs[NDA_DST]);
> +	if (n->ndm_family == AF_INET) {
> +		ev.type = needs_del ? RTE_IFPX_NEIGH_DEL
> +				    : RTE_IFPX_NEIGH_ADD;
> +		ev.neigh_change.ip =
> +				RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
> +	} else {
> +		ev.type = needs_del ? RTE_IFPX_NEIGH6_DEL
> +				    : RTE_IFPX_NEIGH6_ADD;
> +		memcpy(ev.neigh6_change.ip, ip, 16);
> +	}
> +	if (lladdr)
> +		rte_ether_addr_copy(RTA_DATA(lladdr),
> +				    &ev.neigh_change.mac);
> +
> +	ifpx_notify_event(&ev, px);
> +	/* Let's check for proxies to remove here too - just in case somebody
> +	 * removed the non-proxy related callback.
> +	 */
> +	ifpx_cleanup_proxies();
> +exit:
> +	rte_spinlock_unlock(&ifpx_lock);
> +}
> +
> +/*
> + * Interrupt callback registered for the netlink socket.
> + *
> + * Reads one datagram from the socket, dispatches every netlink message found
> + * in it to the link/addr/route/neigh handler and drives the chain of initial
> + * global dump requests (link -> addr -> route -> neigh -> "config done").
> + */
> +static
> +void if_proxy_intr_callback(void *arg __rte_unused)
> +{
> +	struct nlmsghdr *h;
> +	struct sockaddr_nl addr;
> +	socklen_t addr_len;
> +	char buf[8192];
> +	ssize_t len;
> +
> +restart:
> +	/* addr_len is a value-result argument of recvfrom() - on input it has
> +	 * to hold the size of the address buffer (so set it on every retry).
> +	 */
> +	addr_len = sizeof(addr);
> +	len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0,
> +		       (struct sockaddr *)&addr, &addr_len);
> +	if (len < 0) {
> +		if (errno == EINTR) {
> +			IFPX_LOG(DEBUG, "recvfrom() interrupted");
> +			goto restart;
> +		}
> +		IFPX_LOG(ERR, "Failed to read netlink msg: %zd (errno %d)",
> +			 len, errno);
> +		return;
> +	}
> +	if (addr_len != sizeof(addr)) {
> +		IFPX_LOG(ERR, "Invalid netlink addr size: %u", addr_len);
> +		return;
> +	}
> +	IFPX_LOG(DEBUG, "Read %zd bytes (buf %zu) from %u/%u", len,
> +		 sizeof(buf), addr.nl_pid, addr.nl_groups);
> +
> +	for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len);
> +					 h = NLMSG_NEXT(h, len)) {
> +		IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)",
> +			 h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags,
> +			 h->nlmsg_pid);
> +
> +		switch (h->nlmsg_type) {
> +		case RTM_NEWLINK:
> +		case RTM_DELLINK:
> +			handle_link(h);
> +			break;
> +		case RTM_NEWADDR:
> +		case RTM_DELADDR:
> +			handle_addr(h, h->nlmsg_type == RTM_DELADDR);
> +			break;
> +		case RTM_NEWROUTE:
> +		case RTM_DELROUTE:
> +			handle_route(h, h->nlmsg_type == RTM_DELROUTE);
> +			break;
> +		case RTM_NEWNEIGH:
> +		case RTM_DELNEIGH:
> +			handle_neigh(h, h->nlmsg_type == RTM_DELNEIGH);
> +			break;
> +		default:
> +			/* Other messages (e.g. NLMSG_DONE) carry no
> +			 * configuration data to handle.
> +			 */
> +			break;
> +		}
> +
> +		/* If this is a reply for global request then follow up with
> +		 * additional requests and notify about finish.
> +		 *
> +		 * The reply to a dump request is a multipart message that is
> +		 * terminated with NLMSG_DONE - this is the place where the
> +		 * end of our own (pid matches) global (index part of the
> +		 * sequence number is 0) dump is detected.  The low byte of
> +		 * the sequence number holds the type of the request that has
> +		 * just been completed.
> +		 */
> +		if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 &&
> +		    h->nlmsg_type == NLMSG_DONE) {
Sorry, but in what scenario will the flow reach here?

> +			if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK)
> +				request_info(RTM_GETADDR, 0);
> +			else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR)
> +				request_info(RTM_GETROUTE, 0);
> +			else if ((h->nlmsg_seq & 0xFF) == RTM_GETROUTE)
> +				request_info(RTM_GETNEIGH, 0);
> +			else {
> +				struct rte_ifpx_event ev = {
> +					.type = RTE_IFPX_CFG_DONE
> +				};
> +
> +				RTE_ASSERT((h->nlmsg_seq & 0xFF) ==
> +						RTM_GETNEIGH);
> +				rte_spinlock_lock(&ifpx_lock);
> +				ifpx_notify_event(&ev, NULL);
> +				rte_spinlock_unlock(&ifpx_lock);
> +			}
> +		}
> +	}
> +	IFPX_LOG(DEBUG, "Finished msg loop: %zd bytes left", len);
> +}
> +
> +/*
> + * Create netlink socket subscribed to link, neighbour, address and route
> + * notifications and register interrupt callback for it.
> + *
> + * Returns 0 on success.  On failure -1 is returned, rte_errno is set and the
> + * socket (if it has been created) is closed.
> + */
> +static
> +int nlink_listen(void)
> +{
> +	struct sockaddr_nl addr = {
> +		.nl_family = AF_NETLINK,
> +		.nl_pid = 0,
> +	};
> +	socklen_t addr_len = sizeof(addr);
> +	int ret;
> +	/* Error code of the failing call - it is saved right after the call
> +	 * since anything called later (logging included) may change errno.
> +	 */
> +	int ec;
> +
> +	if (ifpx_irq.fd != -1) {
> +		rte_errno = EBUSY;
> +		return -1;
> +	}
> +
> +	addr.nl_groups = 1 << (RTNLGRP_LINK-1)
> +			| 1 << (RTNLGRP_NEIGH-1)
> +			| 1 << (RTNLGRP_IPV4_IFADDR-1)
> +			| 1 << (RTNLGRP_IPV6_IFADDR-1)
> +			| 1 << (RTNLGRP_IPV4_ROUTE-1)
> +			| 1 << (RTNLGRP_IPV6_ROUTE-1);
> +
> +	ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
> +				 NETLINK_ROUTE);
> +	if (ifpx_irq.fd == -1) {
> +		ec = errno;
> +		IFPX_LOG(ERR, "Failed to create netlink socket: %d", ec);
> +		goto error;
> +	}
> +	/* Starting with kernel 4.19 you can request dump for a specific
> +	 * interface and kernel will filter out and send only relevant info.
> +	 * Otherwise NLM_F_DUMP will generate info for all interfaces and you
> +	 * need to filter them yourself.
> +	 *
> +	 * NOTE(review): NETLINK_DUMP_STRICT_CHK is a netlink level option
> +	 * (SOL_NETLINK), so with SOL_SOCKET used below strict checking is most
> +	 * likely not being enabled at all - confirm.  Before changing the
> +	 * level verify that all requests built by request_info() pass the
> +	 * strict validation done by the kernel and decide whether a failure
> +	 * to set this option (older kernel) should really be fatal.
> +	 */
> +#ifdef NETLINK_DUMP_STRICT_CHK
> +	ret = 1; /* use this var also as an input param */
> +	ret = setsockopt(ifpx_irq.fd, SOL_SOCKET, NETLINK_DUMP_STRICT_CHK,
> +			 &ret, sizeof(ret));
> +	if (ret < 0) {
> +		ec = errno;
> +		IFPX_LOG(ERR, "Failed to set socket option: %d", ec);
> +		goto error;
> +	}
> +#endif
> +
> +	ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len);
> +	if (ret < 0) {
> +		ec = errno;
> +		IFPX_LOG(ERR, "Failed to bind socket: %d", ec);
> +		goto error;
> +	}
> +	ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len);
> +	if (ret < 0) {
> +		ec = errno;
> +		IFPX_LOG(ERR, "Failed to get socket addr: %d", ec);
> +		goto error;
> +	}
> +	ifpx_pid = addr.nl_pid;
> +	IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid);
> +
> +	ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback,
> +					 NULL);
> +	if (ret == 0)
> +		return 0;
> +
> +	/* errno is not related to this failure - use the returned value
> +	 * (presumably a negated errno code - TODO confirm).
> +	 */
> +	ec = -ret;
> +	/* We are not listening so do not keep the port ID of dead socket. */
> +	ifpx_pid = 0;
> +
> +error:
> +	rte_errno = ec;
> +	if (ifpx_irq.fd != -1) {
> +		close(ifpx_irq.fd);
> +		ifpx_irq.fd = -1;
> +	}
> +	return -1;
> +}
> +
> +/*
> + * Stop listening: unregister the interrupt callback, close the netlink socket
> + * and forget the assigned port ID.
> + *
> + * Returns 0 on success, -EBADFD when there is no open socket.
> + */
> +static
> +int nlink_close(void)
> +{
> +	int rc;
> +
> +	if (ifpx_irq.fd < 0)
> +		return -EBADFD;
> +
> +	/* Retry for as long as the unregistration reports -EAGAIN (unlikely
> +	 * but possible - at least I think so).
> +	 */
> +	for (;;) {
> +		rc = rte_intr_callback_unregister(&ifpx_irq,
> +						  if_proxy_intr_callback,
> +						  NULL);
> +		if (rc != -EAGAIN)
> +			break;
> +	}
> +
> +	close(ifpx_irq.fd);
> +	ifpx_irq.fd = -1;
> +	ifpx_pid = 0;
> +
> +	return 0;
> +}
> +
> +/*
> + * Request link information for the given interface index - nothing is done
> + * when the netlink socket is not open.
> + */
> +static
> +void nlink_get_info(int if_index)
> +{
> +	if (ifpx_irq.fd == -1)
> +		return;
> +
> +	request_info(RTM_GETLINK, if_index);
> +}
> +
> +/* Netlink based implementation of the platform callbacks - there is no
> + * separate init step (.init is NULL).
> + */
> +struct ifpx_platform_callbacks ifpx_platform = {
> +	.init = NULL,
> +	.listen = nlink_listen,
> +	.close = nlink_close,
> +	.get_info = nlink_get_info,
> +};
> diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
> new file mode 100644
> index 000000000..f0c1a6e15
> --- /dev/null
> +++ b/lib/librte_if_proxy/meson.build
> @@ -0,0 +1,19 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2020 Marvell International Ltd.
> +
> +# Currently only implemented on Linux
> +if not is_linux
> +	build = false
> +	reason = 'only supported on linux'
> +endif
> +
> +version = 1
> +allow_experimental_apis = true
> +
> +deps += ['ethdev']
> +sources = files('if_proxy_common.c')
> +headers = files('rte_if_proxy.h')
> +
> +# Platform specific part - the netlink based implementation is Linux only.
> +if is_linux
> +	sources += files('linux/if_proxy.c')
> +endif
> diff --git a/lib/librte_if_proxy/rte_if_proxy.h b/lib/librte_if_proxy/rte_if_proxy.h
> new file mode 100644
> index 000000000..e620319b3
> --- /dev/null
> +++ b/lib/librte_if_proxy/rte_if_proxy.h
> @@ -0,0 +1,561 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2020 Marvell International Ltd.
> + */
> +
> +#ifndef _RTE_IF_PROXY_H_
> +#define _RTE_IF_PROXY_H_
> +
> +/**
> + * @file
> + * RTE IF Proxy library
> + *
> + * The IF Proxy library allows for monitoring of system network configuration
> + * and configuration of DPDK ports by using usual system utilities (like the
> + * ones from iproute2 package).
> + *
> + * It is based on the notion of "proxy interface" which actually can be any DPDK
> + * port which is also visible to the system - that is it has non-zero 'if_index'
> + * field in 'rte_eth_dev_info' structure.
> + *
> + * If application doesn't have any such port (or doesn't want to use it for
> + * proxy) it can create one by calling:
> + *
> + *   proxy_id = rte_ifpx_proxy_create(RTE_IFPX_DEFAULT);
> + *
> + * This function is just a wrapper that constructs valid 'devargs' string based
> + * on the proxy type chosen (currently Tap or KNI) and creates the interface by
> + * calling rte_ifpx_proxy_create_by_devarg().
> + *
> + * Once one has a DPDK port capable of being a proxy one can bind the target
> + * DPDK port to it by calling:
> + *
> + *   rte_ifpx_port_bind(port_id, proxy_id);
> + *
> + * This binding is a logical one - there is no automatic packet forwarding
> + * between port and its proxy since the library doesn't know the structure of
> + * application's packet processing.  It remains application responsibility to
> + * forward the packets from/to proxy port (by calling the usual DPDK RX/TX burst
> + * API).  However when the library notes some change to the proxy interface it
> + * will simply call appropriate callback with 'port_id' of the DPDK port that is
> + * bound to this proxy interface.  The binding can be 1 to many - that is many
> + * ports can point to one proxy - in that case registered callbacks will be
> + * called for every bound port.
> + *
> + * The callbacks that are used for notifications are described by the
> + * 'rte_ifpx_callbacks' structure and they are registered by calling:
> + *
> + *   rte_ifpx_callbacks_register(&cbs);
> + *
> + * Finally the application should call:
> + *
> + *   rte_ifpx_listen();
> + *
> + * which will query system for present network configuration and start listening
> + * to its changes.
> + */
> +
> +#include <rte_eal.h>
> +#include <rte_ethdev.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Enum naming the type of proxy to create.
> + *
> + * @see rte_ifpx_proxy_create()
> + */
> +enum rte_ifpx_proxy_type {
> +	RTE_IFPX_DEFAULT,	/**< Use default proxy type for given arch. */
> +	RTE_IFPX_TAP,		/**< Use Tap based port for proxy. */
> +	RTE_IFPX_KNI		/**< Use KNI based port for proxy. */
> +};
> +
> +/**
> + * Create DPDK port that can serve as an interface proxy.
> + *
> + * This function is just a wrapper around rte_ifpx_proxy_create_by_devarg()
> + * that constructs its 'devarg' argument based on type of proxy requested.
> + *
> + * @param type
> + *   A type of proxy to create.
> + *
> + * @return
> + *   DPDK port id on success, RTE_MAX_ETHPORTS otherwise.
> + *
> + * @see enum rte_ifpx_proxy_type
> + * @see rte_ifpx_proxy_create_by_devarg()
> + */
> +__rte_experimental
> +uint16_t rte_ifpx_proxy_create(enum rte_ifpx_proxy_type type);
> +
> +/**
> + * Create DPDK port that can serve as an interface proxy.
> + *
> + * @param devarg
> + *   A string passed to rte_dev_probe() to create proxy port.
> + *
> + * @return
> + *   DPDK port id on success, RTE_MAX_ETHPORTS otherwise.
> + */
> +__rte_experimental
> +uint16_t rte_ifpx_proxy_create_by_devarg(const char *devarg);
> +
> +/**
> + * Remove DPDK proxy port.
> + *
> + * In addition to removing the proxy port the bindings (if any) are cleared.
> + *
> + * @param proxy_id
> + *   Port id of the proxy that should be removed.
> + *
> + * @return
> + *   0 on success, negative on error.
> + */
> +__rte_experimental
> +int rte_ifpx_proxy_destroy(uint16_t proxy_id);
> +
> +/**
> + * The rte_ifpx_event_type enum lists all possible event types that can be
> + * signaled by this library.  To learn what events are supported on your
> + * platform call rte_ifpx_events_available().
> + *
> + * NOTE - do not reorder these enums freely, their values need to correspond to
> + * the order of the callbacks in struct rte_ifpx_callbacks.
> + */
> +enum rte_ifpx_event_type {
> +	RTE_IFPX_MAC_CHANGE,  /**< @see struct rte_ifpx_mac_change */
> +	RTE_IFPX_MTU_CHANGE,  /**< @see struct rte_ifpx_mtu_change */
> +	RTE_IFPX_LINK_CHANGE, /**< @see struct rte_ifpx_link_change */
> +	RTE_IFPX_ADDR_ADD,    /**< @see struct rte_ifpx_addr_change */
> +	RTE_IFPX_ADDR_DEL,    /**< @see struct rte_ifpx_addr_change */
> +	RTE_IFPX_ADDR6_ADD,   /**< @see struct rte_ifpx_addr6_change */
> +	RTE_IFPX_ADDR6_DEL,   /**< @see struct rte_ifpx_addr6_change */
> +	RTE_IFPX_ROUTE_ADD,   /**< @see struct rte_ifpx_route_change */
> +	RTE_IFPX_ROUTE_DEL,   /**< @see struct rte_ifpx_route_change */
> +	RTE_IFPX_ROUTE6_ADD,  /**< @see struct rte_ifpx_route6_change */
> +	RTE_IFPX_ROUTE6_DEL,  /**< @see struct rte_ifpx_route6_change */
> +	RTE_IFPX_NEIGH_ADD,   /**< @see struct rte_ifpx_neigh_change */
> +	RTE_IFPX_NEIGH_DEL,   /**< @see struct rte_ifpx_neigh_change */
> +	RTE_IFPX_NEIGH6_ADD,  /**< @see struct rte_ifpx_neigh6_change */
> +	RTE_IFPX_NEIGH6_DEL,  /**< @see struct rte_ifpx_neigh6_change */
> +	RTE_IFPX_CFG_DONE,    /**< This event is a lib specific event - it is
> +			       * signaled when initial network configuration
> +			       * query is finished and has no event data.
> +			       */
> +	RTE_IFPX_NUM_EVENTS,  /**< Number of event types defined above - not
> +			       * an event type by itself.
> +			       */
> +};
> +
> +/**
> + * Get the bit mask of implemented events/callbacks for this platform.
> + *
> + * @return
> + *   Bit mask of events/callbacks implemented: each event type can be tested by
> + *   checking bit (1 << ev) where 'ev' is one of the rte_ifpx_event_type enum
> + *   values.
> + * @see enum rte_ifpx_event_type
> + */
> +__rte_experimental
> +uint64_t rte_ifpx_events_available(void);
> +
> +/**
> + * The rte_ifpx_event defines structure used to pass notification event to
> + * application.  Each event type has its own dedicated inner structure - these
> + * structures are also used when using callbacks notifications.
> + */
> +struct rte_ifpx_event {
> +	enum rte_ifpx_event_type type;
> +	union {
> +		/** Structure used to pass notification about MAC change of the
> +		 * proxy interface.
> +		 * @see RTE_IFPX_MAC_CHANGE
> +		 */
> +		struct rte_ifpx_mac_change {
> +			uint16_t port_id;
> +			struct rte_ether_addr mac;
> +		} mac_change;
> +		/** Structure used to pass notification about MTU change.
> +		 * @see RTE_IFPX_MTU_CHANGE
> +		 */
> +		struct rte_ifpx_mtu_change {
> +			uint16_t port_id;
> +			uint16_t mtu;
> +		} mtu_change;
> +		/** Structure used to pass notification about link going
> +		 * up/down.
> +		 * @see RTE_IFPX_LINK_CHANGE
> +		 */
> +		struct rte_ifpx_link_change {
> +			uint16_t port_id;
> +			int is_up;
> +		} link_change;
> +		/** Structure used to pass notification about IPv4 address being
> +		 * added/removed.  All IPv4 addresses reported by this library
> +		 * are in host order.
> +		 * @see RTE_IFPX_ADDR_ADD
> +		 * @see RTE_IFPX_ADDR_DEL
> +		 */
> +		struct rte_ifpx_addr_change {
> +			uint16_t port_id;
> +			uint32_t ip;
> +		} addr_change;
> +		/** Structure used to pass notification about IPv6 address being
> +		 * added/removed.
> +		 * @see RTE_IFPX_ADDR6_ADD
> +		 * @see RTE_IFPX_ADDR6_DEL
> +		 */
> +		struct rte_ifpx_addr6_change {
> +			uint16_t port_id;
> +			uint8_t ip[16];
> +		} addr6_change;
> +		/** Structure used to pass notification about IPv4 route being
> +		 * added/removed.
> +		 * @see RTE_IFPX_ROUTE_ADD
> +		 * @see RTE_IFPX_ROUTE_DEL
> +		 */
> +		struct rte_ifpx_route_change {
> +			uint16_t port_id;
> +			uint8_t depth;
> +			uint32_t ip;
> +			uint32_t gateway;
> +		} route_change;
> +		/** Structure used to pass notification about IPv6 route being
> +		 * added/removed.
> +		 * @see RTE_IFPX_ROUTE6_ADD
> +		 * @see RTE_IFPX_ROUTE6_DEL
> +		 */
> +		struct rte_ifpx_route6_change {
> +			uint16_t port_id;
> +			uint8_t depth;
> +			uint8_t ip[16];
> +			uint8_t gateway[16];
> +		} route6_change;
> +		/** Structure used to pass notification about IPv4 neighbour
> +		 * info changes.
> +		 * @see RTE_IFPX_NEIGH_ADD
> +		 * @see RTE_IFPX_NEIGH_DEL
> +		 */
> +		struct rte_ifpx_neigh_change {
> +			uint16_t port_id;
> +			struct rte_ether_addr mac;
> +			uint32_t ip;
> +		} neigh_change;
> +		/** Structure used to pass notification about IPv6 neighbour
> +		 * info changes.
> +		 * @see RTE_IFPX_NEIGH6_ADD
> +		 * @see RTE_IFPX_NEIGH6_DEL
> +		 */
> +		struct rte_ifpx_neigh6_change {
> +			uint16_t port_id;
> +			struct rte_ether_addr mac;
> +			uint8_t ip[16];
> +		} neigh6_change;
> +		/* This structure is used internally - to abstract common parts
> +		 * of proxy/port related events and to be able to refer to this
> +		 * union without giving it a name.
> +		 */
> +		struct {
> +			uint16_t port_id;
> +		} data;
> +	};
> +};
> +
> +/**
> + * This library can deliver notification about network configuration changes
> + * either by the use of registered callbacks and/or by queueing change events to
> + * configured notification queues.  The logic used is:
> + * 1. If there is callback registered for given event type it is called.  In
> + *   case of many ports to one proxy binding, this callback is called for every
> + *   port bound.
> + * 2. If this callback returns non-zero value (for any of ports in case of
> + *   many-1 bindings) the handling of an event is considered as complete.
> + * 3. Otherwise the event is added to each configured event queue.  The event is
> + *   allocated with malloc() so after dequeueing and handling the application
> + *   should deallocate it with free().
> + *
> + * This dual notification mechanism is meant to provide some flexibility to
> + * application writer.  For example, if you store your data in a single writer/
> + * many readers coherent data structure you could just update this structure
> + * from the callback.  If you keep separate copy per lcore/port you could make
> + * some common preparations (if applicable) in the callback, return 0 and use
> + * notification queues to pick up the change and update data structures.  Or you
> + * could skip the callbacks altogether and just use notification queues - and
> + * configure them at the level appropriate for your application design (one
> + * global / one per lcore / one per port ...).
> + */
> +
> +/**
> + * Add notification queue to the list of queues.
> + *
> + * @param r
> + *   Ring used for queueing of notification events - application can assume that
> + *   there is only one producer.
> + * @return
> + *   0 on success, negative otherwise.
> + */
> +__rte_experimental
> +int rte_ifpx_queue_add(struct rte_ring *r);
> +
> +/**
> + * Remove notification queue from the list of queues.
> + *
> + * @param r
> + *   Notification ring used for queueing of notification events (previously
> + *   added via rte_ifpx_queue_add()).
> + * @return
> + *   0 on success, negative otherwise.
> + */
> +__rte_experimental
> +int rte_ifpx_queue_remove(struct rte_ring *r);
> +
> +/**
> + * This structure groups the callbacks that might be called as a notification
> + * events for changing network configuration.  Not every platform might
> + * implement all of them and you can query the availability with
> + * rte_ifpx_events_available() function.
> + * @see rte_ifpx_events_available()
> + * @see rte_ifpx_callbacks_register()
> + */
> +struct rte_ifpx_callbacks {
> +	int (*mac_change)(const struct rte_ifpx_mac_change *event);
> +	/**< Callback for notification about MAC change of the proxy interface.
> +	 * This callback (as all other port related callbacks) is called for
> +	 * each port (with its port_id stored in the event structure) bound to
> +	 * the proxy interface for which change has been observed.
> +	 * @see struct rte_ifpx_mac_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*mtu_change)(const struct rte_ifpx_mtu_change *event);
> +	/**< Callback for notification about MTU change.
> +	 * @see struct rte_ifpx_mtu_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*link_change)(const struct rte_ifpx_link_change *event);
> +	/**< Callback for notification about link going up/down.
> +	 * @see struct rte_ifpx_link_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*addr_add)(const struct rte_ifpx_addr_change *event);
> +	/**< Callback for notification about IPv4 address being added.
> +	 * @see struct rte_ifpx_addr_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*addr_del)(const struct rte_ifpx_addr_change *event);
> +	/**< Callback for notification about IPv4 address removal.
> +	 * @see struct rte_ifpx_addr_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*addr6_add)(const struct rte_ifpx_addr6_change *event);
> +	/**< Callback for notification about IPv6 address being added.
> +	 * @see struct rte_ifpx_addr6_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*addr6_del)(const struct rte_ifpx_addr6_change *event);
> +	/**< Callback for notification about IPv6 address removal.
> +	 * @see struct rte_ifpx_addr6_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	/* Please note that "route" callbacks might be also called when user
> +	 * adds address to the interface (that is in addition to address related
> +	 * callbacks).
> +	 */
> +	int (*route_add)(const struct rte_ifpx_route_change *event);
> +	/**< Callback for notification about IPv4 route being added.
> +	 * @see struct rte_ifpx_route_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*route_del)(const struct rte_ifpx_route_change *event);
> +	/**< Callback for notification about IPv4 route removal.
> +	 * @see struct rte_ifpx_route_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*route6_add)(const struct rte_ifpx_route6_change *event);
> +	/**< Callback for notification about IPv6 route being added.
> +	 * @see struct rte_ifpx_route6_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*route6_del)(const struct rte_ifpx_route6_change *event);
> +	/**< Callback for notification about IPv6 route removal.
> +	 * @see struct rte_ifpx_route6_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*neigh_add)(const struct rte_ifpx_neigh_change *event);
> +	/**< Callback for notification about IPv4 neighbour being added.
> +	 * @see struct rte_ifpx_neigh_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*neigh_del)(const struct rte_ifpx_neigh_change *event);
> +	/**< Callback for notification about IPv4 neighbour removal.
> +	 * @see struct rte_ifpx_neigh_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*neigh6_add)(const struct rte_ifpx_neigh6_change *event);
> +	/**< Callback for notification about IPv6 neighbour being added.
> +	 * @see struct rte_ifpx_neigh6_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*neigh6_del)(const struct rte_ifpx_neigh6_change *event);
> +	/**< Callback for notification about IPv6 neighbour removal.
> +	 * @see struct rte_ifpx_neigh6_change
> +	 * @return non-zero if event handling is finished
> +	 */
> +	int (*cfg_done)(void);
> +	/**< Lib specific callback - called when initial network configuration
> +	 * query is finished.
> +	 * @return non-zero if event handling is finished
> +	 */
> +};
> +
> +/**
> + * Register proxy callbacks.
> + *
> + * This function registers callbacks to be called upon appropriate network
> + * event notification.
> + *
> + * @param cbs
> + *   Set of callbacks that will be called.  The library does not take any
> + *   ownership of the pointer passed - the callbacks are stored internally.
> + *
> + * @return
> + *   0 on success, negative otherwise.
> + */
> +__rte_experimental
> +int rte_ifpx_callbacks_register(const struct rte_ifpx_callbacks *cbs);
> +
> +/**
> + * Unregister proxy callbacks.
> + *
> + * This function unregisters callbacks previously registered with
> + * rte_ifpx_callbacks_register().  It takes no arguments and returns no value.
> + */
> +__rte_experimental
> +void rte_ifpx_callbacks_unregister(void);
> +
> +/**
> + * Bind the port to its proxy.
> + *
> + * After calling this function all network configuration of the proxy (and its
> + * changes) will be passed to given port by calling registered callbacks with
> + * 'port_id' as an argument.
> + *
> + * Note: since both arguments are of the same type in order to not mix them and
> + * ease remembering the order the first one is kept the same for bind/unbind.
> + *
> + * @param port_id
> + *   Id of the port to be bound.
> + * @param proxy_id
> + *   Id of the proxy the port needs to be bound to.
> + * @return
> + *   0 on success, negative on error.
> + */
> +__rte_experimental
> +int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id);
> +
> +/**
> + * Unbind the port from its proxy.
> + *
> + * After calling this function registered callbacks will no longer be called for
> + * this port (but they might be called for other ports in one to many binding
> + * scenario).
> + *
> + * @param port_id
> + *   Id of the port to unbind.
> + * @return
> + *   0 on success, negative on error.
> + */
> +__rte_experimental
> +int rte_ifpx_port_unbind(uint16_t port_id);
> +
> +/**
> + * Get the system network configuration and start listening to its changes.
> + *
> + * @return
> + *   0 on success, negative otherwise.
> + */
> +__rte_experimental
> +int rte_ifpx_listen(void);
> +
> +/**
> + * Remove all bindings/callbacks and stop listening to network configuration.
> + *
> + * @return
> + *   0 on success, negative otherwise.
> + */
> +__rte_experimental
> +int rte_ifpx_close(void);
> +
> +/**
> + * Get the id of the proxy the port is bound to.
> + *
> + * @param port_id
> + *   Id of the port for which to get proxy.
> + * @return
> + *   Port id of the proxy on success, RTE_MAX_ETHPORTS on error.
> + */
> +__rte_experimental
> +uint16_t rte_ifpx_proxy_get(uint16_t port_id);
> +
> +/**
> + * Check whether given port acts as a proxy, that is whether the proxy reported
> + * for this port is the port itself.
> + *
> + * @param port_id
> + *   Id of the port.
> + * @return
> + *   1 if port acts as a proxy, 0 otherwise.
> + */
> +static inline
> +int rte_ifpx_is_proxy(uint16_t port_id)
> +{
> +	const uint16_t proxy_id = rte_ifpx_proxy_get(port_id);
> +
> +	return proxy_id == port_id;
> +}
> +
> +/**
> + * Get the ids of the ports bound to the proxy.
> + *
> + * @param proxy_id
> + *   Id of the proxy for which to get ports.
> + * @param ports
> + *   Array where to store the port ids.
> + * @param num
> + *   Size of the 'ports' array.
> + * @return
> + *   The number of ports bound to given proxy.  Note that bound ports are filled
> + *   in 'ports' array up to its size but the return value is always the total
> + *   number of ports bound - so you can make call first with NULL/0 to query for
> + *   the size of the buffer to create or call it with the buffer you have and
> + *   later check if it was large enough.
> + */
> +__rte_experimental
> +unsigned int rte_ifpx_port_get(uint16_t proxy_id,
> +			       uint16_t *ports, unsigned int num);
> +
> +/**
> + * The structure containing some properties of the proxy interface.
> + */
> +struct rte_ifpx_info {
> +	unsigned int if_index; /* entry valid iff if_index != 0 */
> +	uint16_t mtu;
> +	struct rte_ether_addr mac;
> +	char if_name[RTE_ETH_NAME_MAX_LEN];
> +};
> +
> +/**
> + * Get the properties of the proxy interface.  Argument can be either id of the
> + * proxy or an id of a port that is bound to it.
> + *
> + * @param port_id
> + *   Id of the port (or proxy) for which to get proxy properties.
> + * @return
> + *   Pointer to the proxy information structure.
> + */
> +__rte_experimental
> +const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_IF_PROXY_H_ */
> diff --git a/lib/librte_if_proxy/rte_if_proxy_version.map b/lib/librte_if_proxy/rte_if_proxy_version.map
> new file mode 100644
> index 000000000..e2093137d
> --- /dev/null
> +++ b/lib/librte_if_proxy/rte_if_proxy_version.map
> @@ -0,0 +1,19 @@
> +EXPERIMENTAL {
> +	global:
> +
> +	 rte_ifpx_proxy_create;
> +	 rte_ifpx_proxy_create_by_devarg;
> +	 rte_ifpx_proxy_destroy;
> +	 rte_ifpx_events_available;
> +	 rte_ifpx_callbacks_register;
> +	 rte_ifpx_callbacks_unregister;
> +	 rte_ifpx_port_bind;
> +	 rte_ifpx_port_unbind;
> +	 rte_ifpx_listen;
> +	 rte_ifpx_close;
> +	 rte_ifpx_proxy_get;
> +	 rte_ifpx_port_get;
> +	 rte_ifpx_info_get;
> +
> +	local: *;
> +};
> diff --git a/lib/meson.build b/lib/meson.build
> index 0af3efab2..c913b33dd 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -19,7 +19,7 @@ libraries = [
>  	'acl', 'bbdev', 'bitratestats', 'cfgfile',
>  	'compressdev', 'cryptodev',
>  	'distributor', 'efd', 'eventdev',
> -	'gro', 'gso', 'ip_frag', 'jobstats',
> +	'gro', 'gso', 'if_proxy', 'ip_frag', 'jobstats',
>  	'kni', 'latencystats', 'lpm', 'member',
>  	'power', 'pdump', 'rawdev',
>  	'rcu', 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
> -- 
> 2.17.1
>
  
Andrzej Ostruszka [C] March 31, 2020, 3:37 p.m. UTC | #2
On 3/31/20 2:36 PM, Harman Kalra wrote:
> On Fri, Mar 06, 2020 at 05:41:01PM +0100, Andrzej Ostruszka wrote:
[...]
>> +void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px)
>> +{
>> +	struct ifpx_queue_node *q;
>> +	int done = 0;
>> +	uint16_t p, proxy_id;
>> +
>> +	if (px) {
>> +		if (px->state & DEL_PENDING)
>> +			return;
>> +		proxy_id = px->proxy_id;
>> +		RTE_ASSERT(proxy_id != RTE_MAX_ETHPORTS);
>> +		px->state |= IN_USE;
>> +	} else
>> +		proxy_id = RTE_MAX_ETHPORTS;
>> +
>> +	RTE_ASSERT(ev);
>> +	/* This function is expected to be called with a lock held. */
>> +	RTE_ASSERT(rte_spinlock_trylock(&ifpx_lock) == 0);
>> +
>> +	if (ifpx_callbacks.funcs[ev->type].f_ptr) {
>> +		union cb_ptr_t cb = ifpx_callbacks.funcs[ev->type];
>> +
>> +		/* Drop the lock for the time of callback call. */
>> +		rte_spinlock_unlock(&ifpx_lock);
>> +		if (px) {
>> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
>> +				if (ifpx_ports[p] != proxy_id ||
>> +				    ifpx_ports[p] == p)
>> +					continue;
>> +				ev->data.port_id = p;
>> +				done = cb.f_ptr(&ev->data) || done;
> Since callbacks are handled as DPDK interrupts, hope there is no event
> which gets lost. Cannot afford to lose a route change event as kernel
> might not send it again.

We have some protection against this in form of netlink socket buffer.
In general, callbacks (as noted previously by Morten) can't block so
this should be fine - we might need to play around with SO_RCVBUF socket
option of the netlink socket but so far I have not experienced any problem.

> 
>> +			}
>> +		} else {
>> +			RTE_ASSERT(ev->type == RTE_IFPX_CFG_DONE);
>> +			done = cb.cfg_done();
>> +		}
>> +		rte_spinlock_lock(&ifpx_lock);
>> +	}
>> +	if (done)
>> +		goto exit;
>> +
>> +	/* Event not "consumed" yet so try to notify via queues. */
>> +	TAILQ_FOREACH(q, &ifpx_queues, elem) {
>> +		if (px) {
>> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
>> +				if (ifpx_ports[p] != proxy_id ||
>> +				    ifpx_ports[p] == p)
>> +					continue;
>> +				/* Set the port_id - the remaining params should
>> +				 * be filled before calling this function.
>> +				 */
>> +				ev->data.port_id = p;
>> +				queue_event(ev, q->r);
>> +			}
>> +		} else
>> +			queue_event(ev, q->r);
>> +	}
>> +exit:
>> +	if (px)
>> +		px->state &= ~IN_USE;
>> +}
>> +
>> +void ifpx_cleanup_proxies(void)
>> +{
>> +	struct ifpx_proxy_node *px, *next;
>> +	for (px = TAILQ_FIRST(&ifpx_proxies); px; px = next) {
>> +		next = TAILQ_NEXT(px, elem);
>> +		if (px->state & DEL_PENDING)
>> +			ifpx_proxy_destroy(px);
>> +	}
>> +}
>> +
>> +int rte_ifpx_listen(void)
>> +{
>> +	int ec;
>> +
>> +	if (!ifpx_platform.listen)
>> +		return -ENOTSUP;
>> +
>> +	ec = ifpx_platform.listen();
>> +	if (ec == 0 && ifpx_platform.get_info)
>> +		ifpx_platform.get_info(0);
> nlink_get_info calls request_info with a if_index, passing 0 might
> be good in current scenario but valid index should be passed to
> get_info.

0 is an invalid if_index (on Windows too) so I've used it to encode "all
interfaces".  This is related to your next comment.  So I'll expand on
this there.

[...]
>> +static
>> +int request_info(int type, int index)
>> +{
>> +	static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER;
>> +	struct info_get {
>> +		struct nlmsghdr h;
>> +		union {
>> +			struct ifinfomsg ifm;
>> +			struct ifaddrmsg ifa;
>> +			struct rtmsg rtm;
>> +			struct ndmsg ndm;
>> +		} __rte_aligned(NLMSG_ALIGNTO);
>> +	} info_req;
>> +	int ret;
>> +
>> +	memset(&info_req, 0, sizeof(info_req));
>> +	/* First byte of these messages is family, so just make sure that this
>> +	 * memset is enough to get all families.
>> +	 */
>> +	RTE_ASSERT(AF_UNSPEC == 0);
>> +
>> +	info_req.h.nlmsg_pid = ifpx_pid;
>> +	info_req.h.nlmsg_type = type;
>> +	info_req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
>> +	info_req.h.nlmsg_len = offsetof(struct info_get, ifm);
>> +
>> +	switch (type) {
>> +	case RTM_GETLINK:
>> +		info_req.h.nlmsg_len += sizeof(info_req.ifm);
>> +		info_req.ifm.ifi_index = index;
>> +		break;
>> +	case RTM_GETADDR:
>> +		info_req.h.nlmsg_len += sizeof(info_req.ifa);
>> +		info_req.ifa.ifa_index = index;
>> +		break;
>> +	case RTM_GETROUTE:
>> +		info_req.h.nlmsg_len += sizeof(info_req.rtm);
>> +		break;
>> +	case RTM_GETNEIGH:
>> +		info_req.h.nlmsg_len += sizeof(info_req.ndm);
>> +		break;
>> +	default:
>> +		IFPX_LOG(WARNING, "Unhandled message type: %d", type);
>> +		return -EINVAL;
>> +	}
>> +	/* Store request type (and if it is global or link specific) in 'seq'.
>> +	 * Later it is used during handling of reply to continue requesting of
>> +	 * information dump from system - if needed.
>> +	 */
>> +	info_req.h.nlmsg_seq = index << 8 | type;
>> +
>> +	IFPX_LOG(DEBUG, "\tRequesting msg %d for: %u", type, index);
>> +
>> +	rte_spinlock_lock(&send_lock);
>> +	ret = send(ifpx_irq.fd, &info_req, info_req.h.nlmsg_len, 0);
>> +	if (ret < 0) {
>> +		IFPX_LOG(ERR, "Failed to send netlink msg: %d", errno);
>> +		rte_errno = errno;
>> +	}
>> +	rte_spinlock_unlock(&send_lock);
>> +
>> +	return ret;
>> +}

[...]

>> +static
>> +void if_proxy_intr_callback(void *arg __rte_unused)
>> +{
>> +	struct nlmsghdr *h;
>> +	struct sockaddr_nl addr;
>> +	socklen_t addr_len;
>> +	char buf[8192];
>> +	ssize_t len;
>> +
>> +restart:
>> +	len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0,
>> +		       (struct sockaddr *)&addr, &addr_len);
>> +	if (len < 0) {
>> +		if (errno == EINTR) {
>> +			IFPX_LOG(DEBUG, "recvmsg() interrupted");
>> +			goto restart;
>> +		}
>> +		IFPX_LOG(ERR, "Failed to read netlink msg: %ld (errno %d)",
>> +			 len, errno);
>> +		return;
>> +	}
>> +	if (addr_len != sizeof(addr)) {
>> +		IFPX_LOG(ERR, "Invalid netlink addr size: %d", addr_len);
>> +		return;
>> +	}
>> +	IFPX_LOG(DEBUG, "Read %lu bytes (buf %lu) from %u/%u", len,
>> +		 sizeof(buf), addr.nl_pid, addr.nl_groups);
>> +
>> +	for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len);
>> +					 h = NLMSG_NEXT(h, len)) {
>> +		IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)",
>> +			 h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags,
>> +			 h->nlmsg_pid);
>> +
>> +		switch (h->nlmsg_type) {
>> +		case RTM_NEWLINK:
>> +		case RTM_DELLINK:
>> +			handle_link(h);
>> +			break;
>> +		case RTM_NEWADDR:
>> +		case RTM_DELADDR:
>> +			handle_addr(h, h->nlmsg_type == RTM_DELADDR);
>> +			break;
>> +		case RTM_NEWROUTE:
>> +		case RTM_DELROUTE:
>> +			handle_route(h, h->nlmsg_type == RTM_DELROUTE);
>> +			break;
>> +		case RTM_NEWNEIGH:
>> +		case RTM_DELNEIGH:
>> +			handle_neigh(h, h->nlmsg_type == RTM_DELNEIGH);
>> +			break;
>> +		}
>> +
>> +		/* If this is a reply for global request then follow up with
>> +		 * additional requests and notify about finish.
>> +		 */
>> +		if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 &&
>> +		    h->nlmsg_type == NLMSG_DONE) {
> Sorry, but in what scenario will the flow reach here.

OK.  So let me describe the initialization flow on Linux (the only
available implementation right now).  When we start listening we first
request dumping of the whole configuration.  We call get_info(0).  Again
this '0' is an invalid if_index so it is used as the "all interfaces" value.

This index is written in Netlink msg headers and is coupled with
possible filtering of messages on kernel side (see comment in
nlink_listen() below).  When we request info we always use REQUEST|DUMP
flags but on newer kernels there is an option (when if_index is
non-zero) to send out only information for that interface instead of
dumping all info.  In addition it is encoded in nlmsg_seq.

So there are different types of info we get from kernel:
link/address/routing/neighbouring.  Instead of requesting them all at
once I do that sequentially and in get_info() I start with a request for
link info.

This code that you asked about above is a check that:
- this message is a direct reply to us (pid)
- and reply for global request (index = 0)
- and this is the last part of multi-segmented message (this is how
Linux dumps info - sends couple of messages with the additional "DONE"
msg at the end).

And the logic below is sequencing LINK->ADDR->ROUTE->NEIGH-> we are done
so notify the user about that.  That way we have at most one active
"transaction" with kernel.

>> +			if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK)
>> +				request_info(RTM_GETADDR, 0);
>> +			else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR)
>> +				request_info(RTM_GETROUTE, 0);
>> +			else if ((h->nlmsg_seq & 0xFF) == RTM_GETROUTE)
>> +				request_info(RTM_GETNEIGH, 0);
>> +			else {
>> +				struct rte_ifpx_event ev = {
>> +					.type = RTE_IFPX_CFG_DONE
>> +				};
>> +
>> +				RTE_ASSERT((h->nlmsg_seq & 0xFF) ==
>> +						RTM_GETNEIGH);
>> +				rte_spinlock_lock(&ifpx_lock);
>> +				ifpx_notify_event(&ev, NULL);
>> +				rte_spinlock_unlock(&ifpx_lock);
>> +			}
>> +		}
>> +	}
>> +	IFPX_LOG(DEBUG, "Finished msg loop: %ld bytes left", len);
>> +}
>> +
>> +static
>> +int nlink_listen(void)
>> +{
>> +	struct sockaddr_nl addr = {
>> +		.nl_family = AF_NETLINK,
>> +		.nl_pid = 0,
>> +	};
>> +	socklen_t addr_len = sizeof(addr);
>> +	int ret;
>> +
>> +	if (ifpx_irq.fd != -1) {
>> +		rte_errno = EBUSY;
>> +		return -1;
>> +	}
>> +
>> +	addr.nl_groups = 1 << (RTNLGRP_LINK-1)
>> +			| 1 << (RTNLGRP_NEIGH-1)
>> +			| 1 << (RTNLGRP_IPV4_IFADDR-1)
>> +			| 1 << (RTNLGRP_IPV6_IFADDR-1)
>> +			| 1 << (RTNLGRP_IPV4_ROUTE-1)
>> +			| 1 << (RTNLGRP_IPV6_ROUTE-1);
>> +
>> +	ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
>> +				 NETLINK_ROUTE);
>> +	if (ifpx_irq.fd == -1) {
>> +		IFPX_LOG(ERR, "Failed to create netlink socket: %d", errno);
>> +		goto error;
>> +	}
>> +	/* Starting with kernel 4.19 you can request dump for a specific
>> +	 * interface and kernel will filter out and send only relevant info.
>> +	 * Otherwise NLM_F_DUMP will generate info for all interfaces and you
>> +	 * need to filter them yourself.
>> +	 */
>> +#ifdef NETLINK_DUMP_STRICT_CHK
>> +	ret = 1; /* use this var also as an input param */
>> +	ret = setsockopt(ifpx_irq.fd, SOL_SOCKET, NETLINK_DUMP_STRICT_CHK,
>> +			 &ret, sizeof(ret));
>> +	if (ret < 0) {
>> +		IFPX_LOG(ERR, "Failed to set socket option: %d", errno);
>> +		goto error;
>> +	}
>> +#endif
>> +
>> +	ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len);
>> +	if (ret < 0) {
>> +		IFPX_LOG(ERR, "Failed to bind socket: %d", errno);
>> +		goto error;
>> +	}
>> +	ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len);
>> +	if (ret < 0) {
>> +		IFPX_LOG(ERR, "Failed to get socket addr: %d", errno);
>> +		goto error;
>> +	} else {
>> +		ifpx_pid = addr.nl_pid;
>> +		IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid);
>> +	}
>> +
>> +	ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback,
>> +					 NULL);
>> +	if (ret == 0)
>> +		return 0;
>> +
>> +error:
>> +	rte_errno = errno;
>> +	if (ifpx_irq.fd != -1) {
>> +		close(ifpx_irq.fd);
>> +		ifpx_irq.fd = -1;
>> +	}
>> +	return -1;
>> +}
[...]
If you are playing with this library (running test case or the exemplary
application) and would like to have better view what is going on you can
add "--log=lib.if_proxy:debug" to the arguments list.

Thanks for taking a look at this.  The more people do this the better
this should be.  E.g. explaining initialization flow to you made me
realize that there is another case where I request info which is not
handled well - normally user should bind the proxies and start
listening.  But if for some reason user binds proxy later, during
listening, I request info for that particular interface but
implementation will request only link level and will not follow with
request for other info.  I will fix this in the next version.

With regards
Andrzej Ostruszka
  
Varghese, Vipin April 1, 2020, 5:29 a.m. UTC | #3
snipped
> diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
> new file mode 100644
> index 000000000..43cb702a2
> --- /dev/null
> +++ b/lib/librte_if_proxy/Makefile
> @@ -0,0 +1,29 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2020 Marvell International Ltd.
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +# library name
> +LIB = librte_if_proxy.a
> +
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +CFLAGS += -O3
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
> +LDLIBS += -lrte_eal -lrte_ethdev
> +
> +EXPORT_MAP := rte_if_proxy_version.map
> +
> +LIBABIVER := 1
> +
> +# all source are stored in SRCS-y
> +SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := if_proxy_common.c
> +
> +SYSDIR := $(patsubst "%app",%,$(CONFIG_RTE_EXEC_ENV))
> +include $(SRCDIR)/$(SYSDIR)/Makefile
> +
Should there be checks such as `ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)` and `ifeq ($(CONFIG_RTE_LIBRTE_TAP),y)`?

> +SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += $(addprefix $(SYSDIR)/,$(SRCS))
> +
> +# install this header file
> +SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
> +
> +include $(RTE_SDK)/mk/rte.lib.mk
Snipped

> +
> +uint64_t rte_ifpx_events_available(void)
> +{
> +	/* All events are supported on Linux. */
> +	return (1ULL << RTE_IFPX_NUM_EVENTS) - 1;
Should we give the available from the used count?

> +}
> +

Snipped

> +
> +void rte_ifpx_callbacks_unregister(void)
> +{
> +	rte_spinlock_lock(&ifpx_lock);
> +	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
What would happen to pending events? Are we agreeing to drop them all?

> +	rte_spinlock_unlock(&ifpx_lock);
> +}
> +
> +uint16_t rte_ifpx_proxy_get(uint16_t port_id)
> +{
> +	if (port_id >= RTE_MAX_ETHPORTS)
> +		return RTE_MAX_ETHPORTS;
In the init function, the default value is set with RTE_MAX_ETHPORTS. Will there be a scenario port_id can be greater?

> +
> +	return ifpx_ports[port_id];
> +}
> +
> +unsigned int rte_ifpx_port_get(uint16_t proxy_id,
> +			       uint16_t *ports, unsigned int num)
> +{
> +	unsigned int p, cnt = 0;
> +
> +	for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +		if (ifpx_ports[p] == proxy_id && ifpx_ports[p] != p) {
> +			++cnt;
> +			if (ports && num > 0) {
> +				*ports++ = p;
> +				--num;
> +			}
> +		}
> +	}
Applications can dynamically add ports to DPDK. If this is correct, will this require a lock to make it thread safe?

> +	return cnt;
> +}
> +
> +const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
> +{
> +	struct ifpx_proxy_node *px;
> +
> +	if (port_id >= RTE_MAX_ETHPORTS ||
> +	    ifpx_ports[port_id] == RTE_MAX_ETHPORTS)
> +		return NULL;
> +
> +	rte_spinlock_lock(&ifpx_lock);
> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
> +		if (px->proxy_id == ifpx_ports[port_id])
> +			break;
> +	}
> +	rte_spinlock_unlock(&ifpx_lock);
> +	RTE_ASSERT(px && "Internal IF Proxy library error");

Can you help me understand the assert logic with const string?

> +
> +	return &px->info;
> +}
> +
> +static
> +void queue_event(const struct rte_ifpx_event *ev, struct rte_ring *r)
> +{
> +	struct rte_ifpx_event *e = malloc(sizeof(*ev));
Is there a specific reason not to use rte_malloc?

> +
> +	if (!e) {
> +		IFPX_LOG(ERR, "Failed to allocate event!");
> +		return;
> +	}
> +	RTE_ASSERT(r);
> +
> +	*e = *ev;
> +	rte_ring_sp_enqueue(r, e);
> +}
> +
> +void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px)
> +{
> +	struct ifpx_queue_node *q;
> +	int done = 0;
> +	uint16_t p, proxy_id;
> +
> +	if (px) {
> +		if (px->state & DEL_PENDING)
> +			return;
> +		proxy_id = px->proxy_id;
> +		RTE_ASSERT(proxy_id != RTE_MAX_ETHPORTS);
> +		px->state |= IN_USE;
> +	} else
> +		proxy_id = RTE_MAX_ETHPORTS;
> +
> +	RTE_ASSERT(ev);
> +	/* This function is expected to be called with a lock held. */
> +	RTE_ASSERT(rte_spinlock_trylock(&ifpx_lock) == 0);
> +
> +	if (ifpx_callbacks.funcs[ev->type].f_ptr) {
> +		union cb_ptr_t cb = ifpx_callbacks.funcs[ev->type];
> +
> +		/* Drop the lock for the time of callback call. */
> +		rte_spinlock_unlock(&ifpx_lock);
> +		if (px) {
> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +				if (ifpx_ports[p] != proxy_id ||
> +				    ifpx_ports[p] == p)
> +					continue;
> +				ev->data.port_id = p;
> +				done = cb.f_ptr(&ev->data) || done;
> +			}
> +		} else {
> +			RTE_ASSERT(ev->type == RTE_IFPX_CFG_DONE);
> +			done = cb.cfg_done();
> +		}
> +		rte_spinlock_lock(&ifpx_lock);
> +	}
> +	if (done)
> +		goto exit;
> +
> +	/* Event not "consumed" yet so try to notify via queues. */

Is there a chance when trying to use queues the events are consumed by method above by listener?

> +	TAILQ_FOREACH(q, &ifpx_queues, elem) {
> +		if (px) {
> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
> +				if (ifpx_ports[p] != proxy_id ||
> +				    ifpx_ports[p] == p)
> +					continue;
> +				/* Set the port_id - the remaining params
> should
> +				 * be filled before calling this function.
> +				 */
> +				ev->data.port_id = p;
> +				queue_event(ev, q->r);
> +			}
> +		} else
> +			queue_event(ev, q->r);
> +	}
> +exit:
> +	if (px)
> +		px->state &= ~IN_USE;
> +}

Snipped

> +
> +RTE_INIT(if_proxy_init)
> +{
> +	unsigned int i;

Is IF_PROXY supported for vdev also? 

> +	for (i = 0; i < RTE_DIM(ifpx_ports); ++i)
> +		ifpx_ports[i] = RTE_MAX_ETHPORTS;
> +
> +	ifpx_log_type = rte_log_register("lib.if_proxy");
> +	if (ifpx_log_type >= 0)
> +		rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
> +
> +	if (ifpx_platform.init)
> +		ifpx_platform.init();
> +}
Snipped

> +SRCS += if_proxy.c
> diff --git a/lib/librte_if_proxy/linux/if_proxy.c
> b/lib/librte_if_proxy/linux/if_proxy.c
> new file mode 100644
> index 000000000..bf851c096
> --- /dev/null
> +++ b/lib/librte_if_proxy/linux/if_proxy.c
> @@ -0,0 +1,552 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2020 Marvell International Ltd.
> + */

Are we assuming all the events are executed `if and only if` the current process is Primary? If it is secondary, certain `rte_eth_api` calls will fail for a physical interface. Can we have a check so that the events are processed for primary only?

Snipped

> diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
> new file mode 100644
> index 000000000..f0c1a6e15
> --- /dev/null
> +++ b/lib/librte_if_proxy/meson.build
> @@ -0,0 +1,19 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2020 Marvell International Ltd.
> +
> +# Currently only implemented on Linux
> +if not is_linux
> +	build = false
> +	reason = 'only supported on linux'
> +endif
> +
> +version = 1
> +allow_experimental_apis = true
> +
> +deps += ['ethdev']
> +sources = files('if_proxy_common.c')
> +headers = files('rte_if_proxy.h')

Does if_proxy have a dependency on TAP and KNI? Should we not add checks such as ` if dpdk_conf.has('RTE_LIBRTE_KNI')` and ` if dpdk_conf.has('RTE_LIBRTE_TAP')`?

> +
> +if is_linux
> +	sources += files('linux/if_proxy.c')
> +endif

Snipped
  
Andrzej Ostruszka [C] April 1, 2020, 8:08 p.m. UTC | #4
First of all thank you Vipin for taking a look at this.

On 4/1/20 7:29 AM, Varghese, Vipin wrote:
> snipped
>> diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
>> new file mode 100644
>> index 000000000..43cb702a2
>> --- /dev/null
>> +++ b/lib/librte_if_proxy/Makefile
>> @@ -0,0 +1,29 @@
>> +# SPDX-License-Identifier: BSD-3-Clause
>> +# Copyright(C) 2020 Marvell International Ltd.
>> +
>> +include $(RTE_SDK)/mk/rte.vars.mk
>> +
>> +# library name
>> +LIB = librte_if_proxy.a
>> +
>> +CFLAGS += -DALLOW_EXPERIMENTAL_API
>> +CFLAGS += -O3
>> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
>> +LDLIBS += -lrte_eal -lrte_ethdev
>> +
>> +EXPORT_MAP := rte_if_proxy_version.map
>> +
>> +LIBABIVER := 1
>> +
>> +# all source are stored in SRCS-y
>> +SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := if_proxy_common.c
>> +
>> +SYSDIR := $(patsubst "%app",%,$(CONFIG_RTE_EXEC_ENV))
>> +include $(SRCDIR)/$(SYSDIR)/Makefile
>> +
> Should there be check `ifeq ($(CONFIG_RTE_LIBRTE_KNI),y)` and `ifeq ($(CONFIG_RTE_LIBRTE_TAP),y)`?

Might not be necessary.  While it is true that if you want to create
proxy via this lib, then currently it is only KNI or TAP.  However any
DPDK port can act as a proxy - as long as it is visible to the system
and reports non-zero if_index in its dev_info.

However it is true that if we allow building of if_proxy even if TAP/KNI
is not enabled then I should add conditionals to the proxy creation
function that would show some meaningful warning when they are not
enabled.  Will take a look at this.

>> +SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += $(addprefix $(SYSDIR)/,$(SRCS))
>> +
>> +# install this header file
>> +SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
>> +
>> +include $(RTE_SDK)/mk/rte.lib.mk
> Snipped
> 
>> +
>> +uint64_t rte_ifpx_events_available(void)
>> +{
>> +	/* All events are supported on Linux. */
>> +	return (1ULL << RTE_IFPX_NUM_EVENTS) - 1;
> Should we give the available from the used count?

I'm not sure I follow what you wanted to ask.  I want to return bitmask
with each bit being lit for every event type.  I could go with or'ing of
all (1ULL << RTE_IFPX_MAC_CHANGE) | (1ULL << RTE_IFPX_MTU_CHANGE) ...
but deemed that this would be simpler.

>> +}
>> +
> 
> Snipped
> 
>> +
>> +void rte_ifpx_callbacks_unregister(void)
>> +{
>> +	rte_spinlock_lock(&ifpx_lock);
>> +	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
> What would happen to pending events, are agreeing to drop all?

ifpx_notify_event() is called under the same lock.  So either someone
calls this unregister and then notify will not find any callback or the
other way.  Note that notify drops the lock for the time of callback
call (to allow modifications from the callback) but the pointer is first
copied - so the behaviour would be as if the unregister was called later.

I'm not sure I answered your question though - if not then please ask
again with some more details.

>> +	rte_spinlock_unlock(&ifpx_lock);
>> +}
>> +
>> +uint16_t rte_ifpx_proxy_get(uint16_t port_id)
>> +{
>> +	if (port_id >= RTE_MAX_ETHPORTS)
>> +		return RTE_MAX_ETHPORTS;
> In the init function, the default value is set with RTE_MAX_ETHPORTS. Will there be a scenario port_id can be greater?

Here port_id is an input from user - (s)he can make an error.
Internally this should never happen.

>> +
>> +	return ifpx_ports[port_id];
>> +}
>> +
>> +unsigned int rte_ifpx_port_get(uint16_t proxy_id,
>> +			       uint16_t *ports, unsigned int num)
>> +{
>> +	unsigned int p, cnt = 0;
>> +
>> +	for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
>> +		if (ifpx_ports[p] == proxy_id && ifpx_ports[p] != p) {
>> +			++cnt;
>> +			if (ports && num > 0) {
>> +				*ports++ = p;
>> +				--num;
>> +			}
>> +		}
>> +	}
> Application can dynamically ports to DPDK. if this is correct, will this require lock to make this thread safe?

This is a good point.  Currently ifpx_ports is not protected by the
lock.  Since this is a slow/control path I'll go with moving this under
lock instead of trying to make this lockless.

>> +	return cnt;
>> +}
>> +
>> +const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
>> +{
>> +	struct ifpx_proxy_node *px;
>> +
>> +	if (port_id >= RTE_MAX_ETHPORTS ||
>> +	    ifpx_ports[port_id] == RTE_MAX_ETHPORTS)
>> +		return NULL;
>> +
>> +	rte_spinlock_lock(&ifpx_lock);
>> +	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
>> +		if (px->proxy_id == ifpx_ports[port_id])
>> +			break;
>> +	}
>> +	rte_spinlock_unlock(&ifpx_lock);
>> +	RTE_ASSERT(px && "Internal IF Proxy library error");
> 
> Can you help me understand the assert logic with const string?

This is a practice sometimes used to have a meaningful error message
printed (together with an expression) while assertion fires.  The value
of expression does not depend on this string but the expression is
"stringified" in macro and printed on console so that way you can add
some message to the condition being checked.  I think this is the only
public function where I've used this - all internal ASSERTS have no
message so I might drop it here if you want.

>> +
>> +	return &px->info;
>> +}
>> +
>> +static
>> +void queue_event(const struct rte_ifpx_event *ev, struct rte_ring *r)
>> +{
>> +	struct rte_ifpx_event *e = malloc(sizeof(*ev));
> Is there specific reason not to use rte_malloc?

Not really - that was actually a question of mine recently on this list.
This is a slow/control path, so maybe we should save hugepage memory for
the fast path?  I have no strong opinion here and can switch to
rte_malloc() if that is thought as a better option.

>> +
>> +	if (!e) {
>> +		IFPX_LOG(ERR, "Failed to allocate event!");
>> +		return;
>> +	}
>> +	RTE_ASSERT(r);
>> +
>> +	*e = *ev;
>> +	rte_ring_sp_enqueue(r, e);
>> +}
>> +
>> +void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px)
>> +{
>> +	struct ifpx_queue_node *q;
>> +	int done = 0;
>> +	uint16_t p, proxy_id;
>> +
>> +	if (px) {
>> +		if (px->state & DEL_PENDING)
>> +			return;
>> +		proxy_id = px->proxy_id;
>> +		RTE_ASSERT(proxy_id != RTE_MAX_ETHPORTS);
>> +		px->state |= IN_USE;
>> +	} else
>> +		proxy_id = RTE_MAX_ETHPORTS;
>> +
>> +	RTE_ASSERT(ev);
>> +	/* This function is expected to be called with a lock held. */
>> +	RTE_ASSERT(rte_spinlock_trylock(&ifpx_lock) == 0);
>> +
>> +	if (ifpx_callbacks.funcs[ev->type].f_ptr) {
>> +		union cb_ptr_t cb = ifpx_callbacks.funcs[ev->type];
>> +
>> +		/* Drop the lock for the time of callback call. */
>> +		rte_spinlock_unlock(&ifpx_lock);
>> +		if (px) {
>> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
>> +				if (ifpx_ports[p] != proxy_id ||
>> +				    ifpx_ports[p] == p)
>> +					continue;
>> +				ev->data.port_id = p;
>> +				done = cb.f_ptr(&ev->data) || done;
>> +			}
>> +		} else {
>> +			RTE_ASSERT(ev->type == RTE_IFPX_CFG_DONE);
>> +			done = cb.cfg_done();
>> +		}
>> +		rte_spinlock_lock(&ifpx_lock);
>> +	}
>> +	if (done)
>> +		goto exit;
>> +
>> +	/* Event not "consumed" yet so try to notify via queues. */
> 
> Is there a chance when trying to use queues the events are consumed by method above by listener?

This is fully under control of application - if application wants
certain events to be notified by the queues then either it should not
register callback for that event type or, if it registers, then this
callback should not return non-zero value (just do some common
preparation or something like that).

>> +	TAILQ_FOREACH(q, &ifpx_queues, elem) {
>> +		if (px) {
>> +			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
>> +				if (ifpx_ports[p] != proxy_id ||
>> +				    ifpx_ports[p] == p)
>> +					continue;
>> +				/* Set the port_id - the remaining params
>> should
>> +				 * be filled before calling this function.
>> +				 */
>> +				ev->data.port_id = p;
>> +				queue_event(ev, q->r);
>> +			}
>> +		} else
>> +			queue_event(ev, q->r);
>> +	}
>> +exit:
>> +	if (px)
>> +		px->state &= ~IN_USE;
>> +}
> 
> Snipped
> 
>> +
>> +RTE_INIT(if_proxy_init)
>> +{
>> +	unsigned int i;
> 
> Is IF_PROXY supported for vdev also?

I'm not sure I understand the question here.  Any port can be bound to a
proxy (vdev or not) and any port visible to system (having non-zero
if_index in dev_info) can be used as a proxy.  Does that answers your
question?  If not please explain.

>> +	for (i = 0; i < RTE_DIM(ifpx_ports); ++i)
>> +		ifpx_ports[i] = RTE_MAX_ETHPORTS;
>> +
>> +	ifpx_log_type = rte_log_register("lib.if_proxy");
>> +	if (ifpx_log_type >= 0)
>> +		rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
>> +
>> +	if (ifpx_platform.init)
>> +		ifpx_platform.init();
>> +}
> Snipped
> 
>> +SRCS += if_proxy.c
>> diff --git a/lib/librte_if_proxy/linux/if_proxy.c
>> b/lib/librte_if_proxy/linux/if_proxy.c
>> new file mode 100644
>> index 000000000..bf851c096
>> --- /dev/null
>> +++ b/lib/librte_if_proxy/linux/if_proxy.c
>> @@ -0,0 +1,552 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2020 Marvell International Ltd.
>> + */
> 
> Assuming all the events are executed `if and only if` the current process if Primary? If it is secondary for physical interface certain `rte_eth_api` will fail. Can we have check the events are processed for primary only?

Yes that was my assumption however at the moment I'm using:
- rte_eth_iterator_init/next/cleanup()
- rte_eth_dev_info_get()
- rte_eth_dev_get_mtu()
- rte_eth_macaddr_get()
- rte_eth_dev_mac_addr_add()
- rte_dev_probe/remove()

Is there a problem with these?  If it is, then I'll think about adding
check for secondary.

> Snipped
> 
>> diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
>> new file mode 100644
>> index 000000000..f0c1a6e15
>> --- /dev/null
>> +++ b/lib/librte_if_proxy/meson.build
>> @@ -0,0 +1,19 @@
>> +# SPDX-License-Identifier: BSD-3-Clause
>> +# Copyright(C) 2020 Marvell International Ltd.
>> +
>> +# Currently only implemented on Linux
>> +if not is_linux
>> +	build = false
>> +	reason = 'only supported on linux'
>> +endif
>> +
>> +version = 1
>> +allow_experimental_apis = true
>> +
>> +deps += ['ethdev']
>> +sources = files('if_proxy_common.c')
>> +headers = files('rte_if_proxy.h')
> 
> Does the if_proxy have dependency on TAP and KNI. Should not we add check as ` if dpdk_conf.has('RTE_LIBRTE_KNI')` and ` if dpdk_conf.has('RTE_LIBRTE_TAP')`?

This is the same as for Makefile - I think I'll go with allowing it to
build but adding conditionals in proxy creation.  However if you and/or
others think it would be better to skip build then I will adapt.

>> +
>> +if is_linux
>> +	sources += files('linux/if_proxy.c')
>> +endif
> 
> Snipped
> 

Thanks for reviewing this.

With regards
Andrzej
  
Varghese, Vipin April 8, 2020, 3:04 a.m. UTC | #5
Hi Andrzej,

Thanks for the reply. Please find explanations for some of the queries 

snipped
> >> +uint64_t rte_ifpx_events_available(void) {
> >> +	/* All events are supported on Linux. */
> >> +	return (1ULL << RTE_IFPX_NUM_EVENTS) - 1;
> > Should we give the available from the used count?
> 
> I'm not sure I follow what you wanted to ask.  I want to return bitmask with
> each bit being lit for every event type.  I could go with or'ing of all (1ULL <<
> RTE_IFPX_MAC_CHANGE) | (1ULL << RTE_IFPX_MTU_CHANGE) ...
> but deemed that this would be simpler.

I assume the function `rte_ifpx_events_available` returns current available events. That is at time t0, if we have used 3 events the return of function will give back ` return ((1ULL << RTE_IFPX_NUM_EVENTS) - 1 -  ifpx_consumed_events);`.

Snipped
> >
> >> +
> >> +void rte_ifpx_callbacks_unregister(void)
> >> +{
> >> +	rte_spinlock_lock(&ifpx_lock);
> >> +	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
> > What would happen to pending events, are agreeing to drop all?
> 
> ifpx_events_notify() is called under the same lock.  So either someone calls this
> unregister and then notify will not find any callback or the other way.  Note
> that notify drops the lock for the time of callback call (to allow modifications
> from the callback) but the pointer is first copied - so the behaviour would be as
> if the unregister was called later.
> 
> I'm not sure I answered your question though - if not then please ask again
> with some more details.

Let us assume we have 3 callbacks to service for event_a, namely cb-1, cb-2, and cb-3. So with the tail-list cb-1->cb-2->cb-3, the user invokes unregister. What will happen to the 3 events? Should we finish the 3 callback handlers and then remove them?

snipped
> > Assuming all the events are executed `if and only if` the current process if
> Primary? If it is secondary for physical interface certain `rte_eth_api` will fail.
> Can we have check the events are processed for primary only?
> 
> Yes that was my assumption however at the moment I'm using:
> - rte_eth_iterator_init/next/cleanup()
> - rte_eth_dev_info_get()
> - rte_eth_dev_get_mtu()
> - rte_eth_macaddr_get()
> - rte_eth_dev_mac_addr_add()
> - rte_dev_probe/remove()
> 
> Is there a problem with these?  If it is, then I'll think about adding check for
> secondary.
Based on my limited testing with PF and VF, certain functions work and others do not. In the case of the TUN PMD, set/get mac_addr is not present.
  
Andrzej Ostruszka [C] April 8, 2020, 6:13 p.m. UTC | #6
On 4/8/20 5:04 AM, Varghese, Vipin wrote:
> Hi Andrzej,
> 
> Thanks for the reply. Please find explanations for some of the queries 
> 
> snipped
>>>> +uint64_t rte_ifpx_events_available(void) {
>>>> +	/* All events are supported on Linux. */
>>>> +	return (1ULL << RTE_IFPX_NUM_EVENTS) - 1;
>>> Should we give the available from the used count?
>>
>> I'm not sure I follow what you wanted to ask.  I want to return bitmask with
>> each bit being lit for every event type.  I could go with or'ing of all (1ULL <<
>> RTE_IFPX_MAC_CHANGE) | (1ULL << RTE_IFPX_MTU_CHANGE) ...
>> but deemed that this would be simpler.
> 
> I assume the function `rte_ifpx_events_available` returns current available events. That is at time t0, if we have used 3 events the return of function will give back ` return ((1ULL << RTE_IFPX_NUM_EVENTS) - 1 -  ifpx_consumed_events);`.

It returns events available on given platform - static thing, dependent
on the implementation of IF Proxy (currently only Linux supported though
- and it supports all events defined so far).

>>>> +
>>>> +void rte_ifpx_callbacks_unregister(void)
>>>> +{
>>>> +	rte_spinlock_lock(&ifpx_lock);
>>>> +	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
>>> What would happen to pending events, are agreeing to drop all?
>>
>> ifpx_events_notify() is called under the same lock.  So either someone calls this
>> unregister and then notify will not find any callback or the other way.  Note
>> that notify drops the lock for the time of callback call (to allow modifications
>> from the callback) but the pointer is first copied - so the behaviour would be as
>> if the unregister was called later.
>>
>> I'm not sure I answered your question though - if not then please ask again
>> with some more details.
> 
> Let us assume we have 3 callbacks to service for event_a namely cb-1, cb-2, and cb-3. So tail-list cb-1->cb-2->cb3, the user invoked unregister. What will happen to the 3 events? Should we finish the 3 callback handler and then remove.

Hhhmmm, have you been reviewing latest version?  With the introduction
of event queues there is now only one global set of callbacks (no list),
so only 1 callback for each possible event type.

>>> Assuming all the events are executed `if and only if` the current process if
>> Primary? If it is secondary for physical interface certain `rte_eth_api` will fail.
>> Can we have check the events are processed for primary only?
>>
>> Yes that was my assumption however at the moment I'm using:
>> - rte_eth_iterator_init/next/cleanup()
>> - rte_eth_dev_info_get()
>> - rte_eth_dev_get_mtu()
>> - rte_eth_macaddr_get()
>> - rte_eth_dev_mac_addr_add()
>> - rte_dev_probe/remove()
>>
>> Is there a problem with these?  If it is, then I'll think about adding check for
>> secondary.
> Based on my limited testing with PF and VF, certain functions works and other do not. In case of TUN PMD set/get mac_addr is not present.

TUN is not being used (for that reason) - only TAP.  I could add check
for PRIMARY, but that way I would be artificially excluding cases where
that would work without the change.  So for now I intend to leave things
like they are and address the actual problem (if it pops up).  Note also
that I'm not checking errors for the mac_get/set so if given
functionality is not supported nothing will happen.

With regards
Andrzej Ostruszka
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index f4e0ed8e0..aec7326ca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1469,6 +1469,9 @@  F: examples/bpf/
 F: app/test/test_bpf.c
 F: doc/guides/prog_guide/bpf_lib.rst
 
+IF Proxy - EXPERIMENTAL
+M: Andrzej Ostruszka <aostruszka@marvell.com>
+F: lib/librte_if_proxy/
 
 Test Applications
 -----------------
diff --git a/config/common_base b/config/common_base
index 7ca2f28b1..dcc0a0650 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1075,6 +1075,11 @@  CONFIG_RTE_LIBRTE_BPF_ELF=n
 #
 CONFIG_RTE_LIBRTE_IPSEC=y
 
+#
+# Compile librte_if_proxy
+#
+CONFIG_RTE_LIBRTE_IF_PROXY=n
+
 #
 # Compile the test application
 #
diff --git a/config/common_linux b/config/common_linux
index 816810671..1244eb0ae 100644
--- a/config/common_linux
+++ b/config/common_linux
@@ -16,6 +16,7 @@  CONFIG_RTE_LIBRTE_VHOST_NUMA=y
 CONFIG_RTE_LIBRTE_VHOST_POSTCOPY=n
 CONFIG_RTE_LIBRTE_PMD_VHOST=y
 CONFIG_RTE_LIBRTE_IFC_PMD=y
+CONFIG_RTE_LIBRTE_IF_PROXY=y
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
 CONFIG_RTE_LIBRTE_PMD_MEMIF=y
 CONFIG_RTE_LIBRTE_PMD_SOFTNIC=y
diff --git a/lib/Makefile b/lib/Makefile
index 46b91ae1a..6a20806f1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -118,6 +118,8 @@  DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
 DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
 DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu
 DEPDIRS-librte_rcu := librte_eal
+DIRS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += librte_if_proxy
+DEPDIRS-librte_if_proxy := librte_eal librte_ethdev
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUX),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
index 773a34a42..296a3853d 100644
--- a/lib/librte_eal/common/include/rte_eal_interrupts.h
+++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
@@ -36,6 +36,8 @@  enum rte_intr_handle_type {
 	RTE_INTR_HANDLE_VDEV,         /**< virtual device */
 	RTE_INTR_HANDLE_DEV_EVENT,    /**< device event handle */
 	RTE_INTR_HANDLE_VFIO_REQ,     /**< VFIO request handle */
+	RTE_INTR_HANDLE_NETLINK,      /**< netlink notification handle */
+
 	RTE_INTR_HANDLE_MAX           /**< count of elements */
 };
 
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
index cb8e10709..16236a8c4 100644
--- a/lib/librte_eal/linux/eal/eal_interrupts.c
+++ b/lib/librte_eal/linux/eal/eal_interrupts.c
@@ -680,6 +680,9 @@  rte_intr_enable(const struct rte_intr_handle *intr_handle)
 		break;
 	/* not used at this moment */
 	case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+	case RTE_INTR_HANDLE_NETLINK:
+#endif
 		return -1;
 #ifdef VFIO_PRESENT
 	case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -796,6 +799,9 @@  rte_intr_disable(const struct rte_intr_handle *intr_handle)
 		break;
 	/* not used at this moment */
 	case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+	case RTE_INTR_HANDLE_NETLINK:
+#endif
 		return -1;
 #ifdef VFIO_PRESENT
 	case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -889,12 +895,12 @@  eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 			break;
 #endif
 #endif
-		case RTE_INTR_HANDLE_VDEV:
 		case RTE_INTR_HANDLE_EXT:
-			bytes_read = 0;
-			call = true;
-			break;
+		case RTE_INTR_HANDLE_VDEV:
 		case RTE_INTR_HANDLE_DEV_EVENT:
+#if RTE_LIBRTE_IF_PROXY
+		case RTE_INTR_HANDLE_NETLINK:
+#endif
 			bytes_read = 0;
 			call = true;
 			break;
diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
new file mode 100644
index 000000000..43cb702a2
--- /dev/null
+++ b/lib/librte_if_proxy/Makefile
@@ -0,0 +1,29 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2020 Marvell International Ltd.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_if_proxy.a
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+LDLIBS += -lrte_eal -lrte_ethdev
+
+EXPORT_MAP := rte_if_proxy_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := if_proxy_common.c
+
+SYSDIR := $(patsubst "%app",%,$(CONFIG_RTE_EXEC_ENV))
+include $(SRCDIR)/$(SYSDIR)/Makefile
+
+SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += $(addprefix $(SYSDIR)/,$(SRCS))
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_if_proxy/if_proxy_common.c b/lib/librte_if_proxy/if_proxy_common.c
new file mode 100644
index 000000000..230727d0c
--- /dev/null
+++ b/lib/librte_if_proxy/if_proxy_common.c
@@ -0,0 +1,494 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#include <if_proxy_priv.h>
+#include <rte_string_fns.h>
+
+
+/* Definitions of data mentioned in if_proxy_priv.h and local ones. */
+int ifpx_log_type;
+
+uint16_t ifpx_ports[RTE_MAX_ETHPORTS];
+
+rte_spinlock_t ifpx_lock = RTE_SPINLOCK_INITIALIZER;
+
+struct ifpx_proxies_head ifpx_proxies = TAILQ_HEAD_INITIALIZER(ifpx_proxies);
+
+/* Node of the list of event queues (rings) added by the application via
+ * rte_ifpx_queue_add().
+ */
+struct ifpx_queue_node {
+	TAILQ_ENTRY(ifpx_queue_node) elem;
+	uint16_t state;
+	struct rte_ring *r; /* ring that events are enqueued to */
+};
+/* Head of the list of registered event queues - local to this file and
+ * accessed with ifpx_lock held.
+ */
+static
+TAILQ_HEAD(ifpx_queues_head, ifpx_queue_node) ifpx_queues =
+		TAILQ_HEAD_INITIALIZER(ifpx_queues);
+
+/* All function pointers have the same size - so use this one to typecast
+ * different callbacks in rte_ifpx_callbacks and test their presence in a
+ * generic way.
+ */
+union cb_ptr_t {
+	int (*f_ptr)(void*);   /* type for normal event notification */
+	int (*cfg_done)(void); /* lib notification for finished config */
+};
+/* The single, global set of callbacks.  'cbs' is the view used when the
+ * callbacks are registered or cleared, 'funcs' is the view used to look up
+ * a callback by event type.
+ * NOTE(review): this presumes the members of struct rte_ifpx_callbacks are
+ * laid out in event type order - confirm against rte_if_proxy.h.
+ */
+union {
+	struct rte_ifpx_callbacks cbs;
+	union cb_ptr_t funcs[RTE_IFPX_NUM_EVENTS];
+} ifpx_callbacks;
+
+/* Return the bitmask of event types supported by this implementation. */
+uint64_t rte_ifpx_events_available(void)
+{
+	/* Every event type is supported on Linux, so light up the
+	 * RTE_IFPX_NUM_EVENTS lowest bits of the mask.
+	 */
+	const uint64_t all_events = (1ULL << RTE_IFPX_NUM_EVENTS) - 1;
+
+	return all_events;
+}
+
+/* Create a proxy port of the requested type.  The device name is built from
+ * the driver name followed by the number of already existing ports of that
+ * driver.  Returns the value of rte_ifpx_proxy_create_by_devarg() or
+ * RTE_MAX_ETHPORTS when the type is not known.
+ */
+uint16_t rte_ifpx_proxy_create(enum rte_ifpx_proxy_type type)
+{
+	const char *drv_name;
+	char devargs[16] = { '\0' };
+	uint16_t port_id;
+	int nlen, dev_cnt = 0;
+
+	/* Translate the proxy type into the name of its driver. */
+	if (type == RTE_IFPX_DEFAULT || type == RTE_IFPX_TAP) {
+		drv_name = "net_tap";
+	} else if (type == RTE_IFPX_KNI) {
+		drv_name = "net_kni";
+	} else {
+		IFPX_LOG(ERR, "Unknown proxy type: %d", type);
+		return RTE_MAX_ETHPORTS;
+	}
+	nlen = strlcpy(devargs, drv_name, sizeof(devargs));
+
+	/* Count the ports that are already driven by that driver. */
+	RTE_ETH_FOREACH_DEV(port_id) {
+		const char *name =
+			rte_eth_devices[port_id].device->driver->name;
+
+		if (strcmp(name, devargs) != 0)
+			continue;
+		dev_cnt++;
+	}
+	/* Append the count as the suffix of the new device name. */
+	snprintf(devargs + nlen, sizeof(devargs) - nlen, "%d", dev_cnt);
+
+	return rte_ifpx_proxy_create_by_devarg(devargs);
+}
+
+/* Probe the device described by 'devarg' and return the id of the first
+ * ethdev port matching it.  RTE_MAX_ETHPORTS is returned when the probe
+ * fails or when no matching port is found.
+ */
+uint16_t rte_ifpx_proxy_create_by_devarg(const char *devarg)
+{
+	uint16_t port_id = RTE_MAX_ETHPORTS;
+	struct rte_dev_iterator iter;
+
+	if (rte_dev_probe(devarg) < 0) {
+		/* No trailing newline here - IFPX_LOG() appends one. */
+		IFPX_LOG(ERR, "Failed to create proxy port %s", devarg);
+		return RTE_MAX_ETHPORTS;
+	}
+
+	if (rte_eth_iterator_init(&iter, devarg) == 0) {
+		port_id = rte_eth_iterator_next(&iter);
+		/* Explicit cleanup is done only when the iteration is
+		 * abandoned before it reached its end.
+		 */
+		if (port_id != RTE_MAX_ETHPORTS)
+			rte_eth_iterator_cleanup(&iter);
+	}
+
+	return port_id;
+}
+
+/* Unlink and release the proxy node, clear all bindings that refer to the
+ * proxy and remove the underlying device.  Returns the result of
+ * rte_dev_remove().
+ */
+int ifpx_proxy_destroy(struct ifpx_proxy_node *px)
+{
+	/* Remember the id - the node is released right below. */
+	const uint16_t id = px->proxy_id;
+	unsigned int port;
+
+	TAILQ_REMOVE(&ifpx_proxies, px, elem);
+	free(px);
+
+	/* Drop every binding that refers to the proxy being destroyed. */
+	for (port = 0; port < RTE_DIM(ifpx_ports); ++port) {
+		if (ifpx_ports[port] != id)
+			continue;
+		if (port != id)
+			rte_ifpx_port_unbind(port);
+		else /* this entry is for proxy itself */
+			ifpx_ports[port] = RTE_MAX_ETHPORTS;
+	}
+
+	return rte_dev_remove(rte_eth_devices[id].device);
+}
+
+/* Destroy the proxy with the given id.  When the proxy is in use (an event
+ * notification for it is in progress) it is only marked for deletion,
+ * otherwise it is destroyed immediately.
+ * Returns 0 on success (or the result of ifpx_proxy_destroy()), -EINVAL when
+ * there is no such proxy.
+ */
+int rte_ifpx_proxy_destroy(uint16_t proxy_id)
+{
+	struct ifpx_proxy_node *px;
+	int ec = 0;
+
+	rte_spinlock_lock(&ifpx_lock);
+	/* Stop at the matching node - when the list is exhausted without a
+	 * match TAILQ_FOREACH leaves 'px' set to NULL.
+	 */
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	if (!px) {
+		ec = -EINVAL;
+		goto exit;
+	}
+	if (px->state & IN_USE)
+		px->state |= DEL_PENDING;
+	else
+		ec = ifpx_proxy_destroy(px);
+exit:
+	rte_spinlock_unlock(&ifpx_lock);
+	return ec;
+}
+
+/* Add ring 'r' to the list of queues that events are delivered to.
+ * Returns 0 on success, -EINVAL for NULL ring, -EEXIST when the ring is
+ * already on the list and -ENOMEM when the list node cannot be allocated.
+ */
+int rte_ifpx_queue_add(struct rte_ring *r)
+{
+	struct ifpx_queue_node *node;
+	int ec = 0;
+
+	if (!r)
+		return -EINVAL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	/* Each ring can be registered only once. */
+	TAILQ_FOREACH(node, &ifpx_queues, elem) {
+		if (node->r == r) {
+			ec = -EEXIST;
+			goto exit;
+		}
+	}
+
+	node = malloc(sizeof(*node));
+	if (!node) {
+		ec = -ENOMEM;
+		goto exit;
+	}
+
+	/* Initialize every field - malloc() does not clear the memory. */
+	node->state = 0;
+	node->r = r;
+	TAILQ_INSERT_TAIL(&ifpx_queues, node, elem);
+exit:
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return ec;
+}
+
+/* Remove ring 'r' from the list of event queues.  Returns 0 when the ring
+ * has been found and removed, -EINVAL otherwise (including NULL ring).
+ */
+int rte_ifpx_queue_remove(struct rte_ring *r)
+{
+	struct ifpx_queue_node *q;
+	int ec = -EINVAL;
+
+	if (r == NULL)
+		return -EINVAL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	/* Locate the node holding this ring ('q' is NULL if none does). */
+	TAILQ_FOREACH(q, &ifpx_queues, elem) {
+		if (q->r == r)
+			break;
+	}
+	if (q != NULL) {
+		TAILQ_REMOVE(&ifpx_queues, q, elem);
+		free(q);
+		ec = 0;
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return ec;
+}
+
+/* Bind port 'port_id' to the proxy 'proxy_id'.  When the proxy is not yet on
+ * the list of proxies a node for it is created and filled with the interface
+ * index, MTU and MAC of the proxy port.
+ * Returns 0 on success, -EINVAL for invalid port/proxy, -ENOMEM when proxy
+ * node cannot be allocated or the error of rte_eth_dev_info_get().
+ */
+int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id)
+{
+	struct rte_eth_dev_info proxy_eth_info;
+	struct ifpx_proxy_node *px;
+	int ec;
+
+	if (port_id >= RTE_MAX_ETHPORTS || proxy_id >= RTE_MAX_ETHPORTS ||
+	    /* port is a proxy */
+	    ifpx_ports[port_id] == port_id) {
+		/* Report both ids - either of them might be the culprit. */
+		IFPX_LOG(ERR, "Invalid port_id/proxy_id: %d/%d", port_id,
+			 proxy_id);
+		return -EINVAL;
+	}
+
+	/* Do automatic rebinding but issue a warning since this is not
+	 * considered to be a valid behaviour.
+	 */
+	if (ifpx_ports[port_id] != RTE_MAX_ETHPORTS) {
+		IFPX_LOG(WARNING, "Port already bound: %d -> %d", port_id,
+			 ifpx_ports[port_id]);
+	}
+
+	/* Search for existing proxy - if not found add one to the list. */
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == proxy_id)
+			break;
+	}
+	if (!px) {
+		ec = rte_eth_dev_info_get(proxy_id, &proxy_eth_info);
+		/* Proxy has to be visible to the system (have an index). */
+		if (ec < 0 || proxy_eth_info.if_index == 0) {
+			IFPX_LOG(ERR, "Invalid proxy: %d", proxy_id);
+			rte_spinlock_unlock(&ifpx_lock);
+			return ec < 0 ? ec : -EINVAL;
+		}
+		px = malloc(sizeof(*px));
+		if (!px) {
+			rte_spinlock_unlock(&ifpx_lock);
+			return -ENOMEM;
+		}
+		px->proxy_id = proxy_id;
+		/* Start with no flags set - malloc() does not clear the memory
+		 * and the state is tested for IN_USE/DEL_PENDING later on.
+		 */
+		px->state = 0;
+		px->info.if_index = proxy_eth_info.if_index;
+		rte_eth_dev_get_mtu(proxy_id, &px->info.mtu);
+		rte_eth_macaddr_get(proxy_id, &px->info.mac);
+		memset(px->info.if_name, 0, sizeof(px->info.if_name));
+		TAILQ_INSERT_TAIL(&ifpx_proxies, px, elem);
+		/* Proxy is marked by being "bound" to itself. */
+		ifpx_ports[proxy_id] = proxy_id;
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+	ifpx_ports[port_id] = proxy_id;
+
+	/* Add proxy MAC to the port - since port will often just forward
+	 * packets from the proxy/system they will be sent with proxy MAC as
+	 * src.  In order to pass communication in other direction we should be
+	 * accepting packets with proxy MAC as dst.
+	 */
+	rte_eth_dev_mac_addr_add(port_id, &px->info.mac, 0);
+
+	/* Ask the platform for the current configuration of the proxy. */
+	if (ifpx_platform.get_info)
+		ifpx_platform.get_info(px->info.if_index);
+
+	return 0;
+}
+
+/* Remove the binding of the given port.  Returns 0 on success and -EINVAL
+ * when the port id is out of range, the port is not bound or it is a proxy.
+ */
+int rte_ifpx_port_unbind(uint16_t port_id)
+{
+	uint16_t bound_to;
+
+	if (port_id >= RTE_MAX_ETHPORTS)
+		return -EINVAL;
+
+	bound_to = ifpx_ports[port_id];
+	/* Neither unbound ports nor proxies (bound to self) qualify. */
+	if (bound_to == RTE_MAX_ETHPORTS || bound_to == port_id)
+		return -EINVAL;
+
+	/* Proxy without any port bound is OK - that is the state of the proxy
+	 * that has just been created, and it can still report routing
+	 * information.  So we do not even check if this is the case.
+	 */
+	ifpx_ports[port_id] = RTE_MAX_ETHPORTS;
+
+	return 0;
+}
+
+/* Install the given set of callbacks as the global one (the previous set is
+ * overwritten).  Returns 0 on success, -EINVAL when 'cbs' is NULL.
+ */
+int rte_ifpx_callbacks_register(const struct rte_ifpx_callbacks *cbs)
+{
+	int ec = -EINVAL;
+
+	if (cbs != NULL) {
+		/* Copy the whole set while holding the library lock. */
+		rte_spinlock_lock(&ifpx_lock);
+		ifpx_callbacks.cbs = *cbs;
+		rte_spinlock_unlock(&ifpx_lock);
+		ec = 0;
+	}
+
+	return ec;
+}
+
+/* Clear the global set of callbacks (all pointers are zeroed). */
+void rte_ifpx_callbacks_unregister(void)
+{
+	struct rte_ifpx_callbacks *cbs = &ifpx_callbacks.cbs;
+
+	rte_spinlock_lock(&ifpx_lock);
+	memset(cbs, 0, sizeof(*cbs));
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Return the entry of the port->proxy table for the given port, or
+ * RTE_MAX_ETHPORTS when the port id is out of range.
+ */
+uint16_t rte_ifpx_proxy_get(uint16_t port_id)
+{
+	return port_id < RTE_MAX_ETHPORTS ? ifpx_ports[port_id]
+					  : RTE_MAX_ETHPORTS;
+}
+
+/* Count the ports bound to the given proxy.  When 'ports' is not NULL up to
+ * 'num' ids of such ports are stored there.  The returned count covers all
+ * bound ports - it can be bigger than 'num'.
+ */
+unsigned int rte_ifpx_port_get(uint16_t proxy_id,
+			       uint16_t *ports, unsigned int num)
+{
+	unsigned int port, found = 0, stored = 0;
+
+	for (port = 0; port < RTE_DIM(ifpx_ports); ++port) {
+		/* Skip ports bound elsewhere and the proxy's own entry. */
+		if (ifpx_ports[port] != proxy_id || port == proxy_id)
+			continue;
+		found++;
+		if (ports != NULL && stored < num)
+			ports[stored++] = port;
+	}
+
+	return found;
+}
+
+/* Return the info of the proxy that the given port is bound to (or of the
+ * proxy itself when 'port_id' is a proxy).  NULL is returned when the port id
+ * is out of range, the port is not bound or no proxy node can be found.
+ */
+const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
+{
+	struct ifpx_proxy_node *px;
+
+	if (port_id >= RTE_MAX_ETHPORTS ||
+	    ifpx_ports[port_id] == RTE_MAX_ETHPORTS)
+		return NULL;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->proxy_id == ifpx_ports[port_id])
+			break;
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+	RTE_ASSERT(px && "Internal IF Proxy library error");
+	/* The assertion might be compiled out - never form a pointer from
+	 * a NULL node.
+	 */
+	if (!px)
+		return NULL;
+
+	return &px->info;
+}
+
+/* Enqueue a heap allocated copy of the event to the ring 'r'.  Failures
+ * (no memory, no room in the ring) are logged and the event is dropped.
+ */
+static
+void queue_event(const struct rte_ifpx_event *ev, struct rte_ring *r)
+{
+	struct rte_ifpx_event *e;
+
+	RTE_ASSERT(r);
+
+	e = malloc(sizeof(*e));
+	if (!e) {
+		IFPX_LOG(ERR, "Failed to allocate event!");
+		return;
+	}
+
+	*e = *ev;
+	/* On success the copy belongs to the ring, otherwise it has to be
+	 * released here or it would leak.
+	 */
+	if (rte_ring_sp_enqueue(r, e) != 0) {
+		IFPX_LOG(ERR, "Failed to enqueue event!");
+		free(e);
+	}
+}
+
+/* Deliver the event to the application: first via the callback registered
+ * for the event type (if any) and then - unless the callback reported a
+ * non-zero result - via all registered event queues.
+ * When 'px' is given the event is delivered once for each port bound to that
+ * proxy with 'port_id' of the event data set accordingly; events for proxies
+ * marked as DEL_PENDING are dropped.  With NULL 'px' the event is delivered
+ * once.
+ */
+void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px)
+{
+	struct ifpx_queue_node *q;
+	int done = 0;
+	uint16_t p, proxy_id;
+
+	if (px) {
+		if (px->state & DEL_PENDING)
+			return;
+		proxy_id = px->proxy_id;
+		RTE_ASSERT(proxy_id != RTE_MAX_ETHPORTS);
+		/* Mark the node as used so that a destroy request made in the
+		 * meantime only sets DEL_PENDING instead of freeing it.
+		 */
+		px->state |= IN_USE;
+	} else
+		proxy_id = RTE_MAX_ETHPORTS;
+
+	RTE_ASSERT(ev);
+	/* This function is expected to be called with a lock held. */
+	RTE_ASSERT(rte_spinlock_trylock(&ifpx_lock) == 0);
+
+	if (ifpx_callbacks.funcs[ev->type].f_ptr) {
+		/* Copy the pointer - the registered set can be changed once
+		 * the lock is dropped.
+		 */
+		union cb_ptr_t cb = ifpx_callbacks.funcs[ev->type];
+
+		/* Drop the lock for the time of callback call. */
+		rte_spinlock_unlock(&ifpx_lock);
+		if (px) {
+			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
+				/* Only ports bound to this proxy, excluding
+				 * the entry of the proxy itself.
+				 */
+				if (ifpx_ports[p] != proxy_id ||
+				    ifpx_ports[p] == p)
+					continue;
+				ev->data.port_id = p;
+				/* Callback is on the left side of '||' so it
+				 * is called for every bound port.
+				 */
+				done = cb.f_ptr(&ev->data) || done;
+			}
+		} else {
+			RTE_ASSERT(ev->type == RTE_IFPX_CFG_DONE);
+			done = cb.cfg_done();
+		}
+		rte_spinlock_lock(&ifpx_lock);
+	}
+	if (done)
+		goto exit;
+
+	/* Event not "consumed" yet so try to notify via queues. */
+	TAILQ_FOREACH(q, &ifpx_queues, elem) {
+		if (px) {
+			for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
+				if (ifpx_ports[p] != proxy_id ||
+				    ifpx_ports[p] == p)
+					continue;
+				/* Set the port_id - the remaining params should
+				 * be filled before calling this function.
+				 */
+				ev->data.port_id = p;
+				queue_event(ev, q->r);
+			}
+		} else
+			queue_event(ev, q->r);
+	}
+exit:
+	/* Notification finished - the node is no longer in use. */
+	if (px)
+		px->state &= ~IN_USE;
+}
+
+/* Destroy all proxies that have been marked with DEL_PENDING. */
+void ifpx_cleanup_proxies(void)
+{
+	struct ifpx_proxy_node *cur = TAILQ_FIRST(&ifpx_proxies);
+
+	while (cur != NULL) {
+		/* Fetch the successor first - destroying releases 'cur'. */
+		struct ifpx_proxy_node *following = TAILQ_NEXT(cur, elem);
+
+		if (cur->state & DEL_PENDING)
+			ifpx_proxy_destroy(cur);
+		cur = following;
+	}
+}
+
+/* Start the platform listening service and - when that succeeds - request
+ * the info with index 0 from the platform.  Returns -ENOTSUP when platform
+ * has no 'listen' callback, otherwise the result of that callback.
+ */
+int rte_ifpx_listen(void)
+{
+	int ec;
+
+	if (ifpx_platform.listen == NULL)
+		return -ENOTSUP;
+
+	ec = ifpx_platform.listen();
+	if (ec != 0)
+		return ec;
+
+	/* Service is up - kick off the initial query. */
+	if (ifpx_platform.get_info != NULL)
+		ifpx_platform.get_info(0);
+
+	return 0;
+}
+
+/* Close the platform service (if platform provides 'close' callback) and
+ * release the library state: registered queues, callbacks, port bindings and
+ * proxy nodes.  Returns the result of the platform callback (0 when there is
+ * none) - the state is released regardless of that result.
+ * NOTE(review): proxy nodes are only freed here, the proxy devices are not
+ * removed - confirm this is the intention.
+ */
+int rte_ifpx_close(void)
+{
+	struct ifpx_proxy_node *px;
+	struct ifpx_queue_node *q;
+	unsigned int p;
+	int ec = 0;
+
+	if (ifpx_platform.close) {
+		ec = ifpx_platform.close();
+		if (ec != 0)
+			IFPX_LOG(ERR, "Platform 'close' callback failed.");
+	}
+
+	rte_spinlock_lock(&ifpx_lock);
+	/* Remove queues. */
+	while (!TAILQ_EMPTY(&ifpx_queues)) {
+		q = TAILQ_FIRST(&ifpx_queues);
+		TAILQ_REMOVE(&ifpx_queues, q, elem);
+		free(q);
+	}
+
+	/* Clear callbacks. */
+	memset(&ifpx_callbacks.cbs, 0, sizeof(ifpx_callbacks.cbs));
+
+	/* Unbind ports. */
+	for (p = 0; p < RTE_DIM(ifpx_ports); ++p) {
+		if (ifpx_ports[p] == RTE_MAX_ETHPORTS)
+			continue;
+		if (ifpx_ports[p] == p)
+			/* port is a proxy - just clear entry */
+			ifpx_ports[p] = RTE_MAX_ETHPORTS;
+		else
+			rte_ifpx_port_unbind(p);
+	}
+
+	/* Clear proxies. */
+	while (!TAILQ_EMPTY(&ifpx_proxies)) {
+		px = TAILQ_FIRST(&ifpx_proxies);
+		TAILQ_REMOVE(&ifpx_proxies, px, elem);
+		free(px);
+	}
+
+	rte_spinlock_unlock(&ifpx_lock);
+
+	return ec;
+}
+
+/* Library constructor: registers the log type, marks all ports as not bound
+ * and runs the platform 'init' callback when there is one.
+ */
+RTE_INIT(if_proxy_init)
+{
+	unsigned int port;
+
+	/* Library logs with WARNING level by default. */
+	ifpx_log_type = rte_log_register("lib.if_proxy");
+	if (ifpx_log_type >= 0)
+		rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
+
+	/* RTE_MAX_ETHPORTS stands for "no proxy" in the bindings table. */
+	for (port = 0; port < RTE_DIM(ifpx_ports); ++port)
+		ifpx_ports[port] = RTE_MAX_ETHPORTS;
+
+	if (ifpx_platform.init != NULL)
+		ifpx_platform.init();
+}
diff --git a/lib/librte_if_proxy/if_proxy_priv.h b/lib/librte_if_proxy/if_proxy_priv.h
new file mode 100644
index 000000000..2fbf9127a
--- /dev/null
+++ b/lib/librte_if_proxy/if_proxy_priv.h
@@ -0,0 +1,97 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+#ifndef _IF_PROXY_PRIV_H_
+#define _IF_PROXY_PRIV_H_
+
+#include <rte_if_proxy.h>
+#include <rte_spinlock.h>
+
+/* Log type of the library (defined in if_proxy_common.c). */
+extern int ifpx_log_type;
+/* Logging helper: the message is prefixed with the name of the calling
+ * function and terminated with a newline - so 'fmt' should not end with one.
+ */
+#define IFPX_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, ifpx_log_type, "%s(): " fmt "\n", \
+		__func__, ##args)
+
+/* Table keeping mapping between port and their proxies. */
+extern
+uint16_t ifpx_ports[RTE_MAX_ETHPORTS];
+
+/* Callbacks and proxies are kept in linked lists.  Since this library is really
+ * a slow/config path we guard them with a lock - and only one for all of them
+ * should be enough.  We don't expect a need to protect other data structures -
+ * e.g. data for given port is expected be accessed/modified from single thread.
+ */
+extern rte_spinlock_t ifpx_lock;
+
+/* Flags kept in the 'state' field of a proxy node. */
+enum ifpx_node_status {
+	/* set by ifpx_notify_event() for the time of notification */
+	IN_USE		= 1U << 0,
+	/* destroy has been requested while the node was IN_USE */
+	DEL_PENDING	= 1U << 1,
+};
+
+/* List of configured proxies */
+struct ifpx_proxy_node {
+	TAILQ_ENTRY(ifpx_proxy_node) elem;
+	uint16_t proxy_id;          /* port id of the proxy */
+	uint16_t state;             /* bitmask of enum ifpx_node_status */
+	struct rte_ifpx_info info;  /* data returned by rte_ifpx_info_get() */
+};
+/* Head of the list - the definition is in if_proxy_common.c. */
+extern
+TAILQ_HEAD(ifpx_proxies_head, ifpx_proxy_node) ifpx_proxies;
+
+/* This function should be called by the implementation whenever it notices
+ * change in the network configuration.  The arguments are:
+ * - ev : pointer to filled event data structure (all fields are expected to be
+ *     filled, with the exception of 'port_id' for all proxy/port related
+ *     events: this function clones the event notification for each bound port
+ *     and fills 'port_id' appropriately).
+ * - px : proxy node when given event is proxy/port related, otherwise pass NULL
+ */
+void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node *px);
+
+/* This function should be called by the implementation whenever it is done with
+ * notification about network configuration change.  It is only really needed
+ * for the case of callback based API - from the callback user might attempt
+ * to remove callbacks/proxies.  Removing of callbacks is handled by the
+ * ifpx_notify_event() function above, however only implementation really knows
+ * when notification for given proxy is finished so it is a duty of it to call
+ * this function to cleanup all proxies that have been marked for deletion.
+ */
+void ifpx_cleanup_proxies(void);
+
+/* This is the internal function removing the proxy from the list.  It is
+ * related to the notification function above and intended to be used by the
+ * platform implementation for the case of callback based API.
+ * During notification via callback the internal lock is released so that
+ * operation would not deadlock on an attempt to take a lock.  However
+ * modification (destruction) is not really performed - instead the
+ * callbacks/proxies are marked as "to be deleted".
+ * Handling of callbacks that are "to be deleted" is done by the
+ * ifpx_notify_event() function itself however it cannot delete the proxies (in
+ * particular the proxy passed as an argument) since they might still be referred
+ * by the calling function.  So it is a responsibility of the platform
+ * implementation to check after calling notification function if there are any
+ * proxies to be removed and use ifpx_proxy_destroy() to actually release them.
+ */
+int ifpx_proxy_destroy(struct ifpx_proxy_node *px);
+
+/* Every implementation should provide definition of this structure:
+ * - init : called during library initialization (NULL when not needed)
+ * - listen : this function should start service listening to the network
+ *     configuration events/changes,
+ * - close : this function should close the service started by listen()
+ * - get_info : this function should query system for current configuration of
+ *     interface with index 'if_index'.  After successful initialization of
+ *     listening service this function is called with 0 as an argument.  In
+ *     that case configuration of all ports should be obtained - and when this
+ *     procedure completes a RTE_IFPX_CFG_DONE event should be signaled via
+ *     ifpx_notify_event().
+ * The common code checks each of these pointers for NULL before using it.
+ */
+extern
+struct ifpx_platform_callbacks {
+	void (*init)(void);
+	int (*listen)(void);
+	int (*close)(void);
+	void (*get_info)(int if_index);
+} ifpx_platform;
+
+#endif /* _IF_PROXY_PRIV_H_ */
diff --git a/lib/librte_if_proxy/linux/Makefile b/lib/librte_if_proxy/linux/Makefile
new file mode 100644
index 000000000..275b7e1e3
--- /dev/null
+++ b/lib/librte_if_proxy/linux/Makefile
@@ -0,0 +1,4 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2020 Marvell International Ltd.
+
+SRCS += if_proxy.c
diff --git a/lib/librte_if_proxy/linux/if_proxy.c b/lib/librte_if_proxy/linux/if_proxy.c
new file mode 100644
index 000000000..bf851c096
--- /dev/null
+++ b/lib/librte_if_proxy/linux/if_proxy.c
@@ -0,0 +1,552 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+#include <if_proxy_priv.h>
+#include <rte_interrupts.h>
+#include <rte_string_fns.h>
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+
+static
+struct rte_intr_handle ifpx_irq = { /* interrupt handle holding the netlink socket */
+	.type = RTE_INTR_HANDLE_NETLINK,
+	.fd = -1, /* -1 == no socket; opened by nlink_listen(), reset by nlink_close() */
+};
+
+static
+unsigned int ifpx_pid; /* netlink port id of our socket (read back via getsockname()) */
+
+/* Send a netlink dump request to the kernel.
+ *
+ * - type : RTM_GETLINK, RTM_GETADDR, RTM_GETROUTE or RTM_GETNEIGH,
+ * - index : interface index of interest (0 for a global query).  It is put in
+ *     the message body for link/address requests only, but it is always
+ *     encoded (together with 'type') in the sequence number.
+ *
+ * Returns the result of send() (bytes sent, or negative with rte_errno set) or
+ * -EINVAL when 'type' is not one of the supported requests.
+ */
+static
+int request_info(int type, int index)
+{
+	static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER;
+	struct info_get {
+		struct nlmsghdr h;
+		union {
+			struct ifinfomsg ifm;
+			struct ifaddrmsg ifa;
+			struct rtmsg rtm;
+			struct ndmsg ndm;
+		} __rte_aligned(NLMSG_ALIGNTO);
+	} info_req;
+	int ret;
+
+	memset(&info_req, 0, sizeof(info_req));
+	/* First byte of these messages is family, so just make sure that this
+	 * memset is enough to get all families.
+	 */
+	RTE_ASSERT(AF_UNSPEC == 0);
+
+	info_req.h.nlmsg_pid = ifpx_pid;
+	info_req.h.nlmsg_type = type;
+	info_req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	info_req.h.nlmsg_len = offsetof(struct info_get, ifm);
+
+	switch (type) {
+	case RTM_GETLINK:
+		info_req.h.nlmsg_len += sizeof(info_req.ifm);
+		info_req.ifm.ifi_index = index;
+		break;
+	case RTM_GETADDR:
+		info_req.h.nlmsg_len += sizeof(info_req.ifa);
+		info_req.ifa.ifa_index = index;
+		break;
+	case RTM_GETROUTE:
+		info_req.h.nlmsg_len += sizeof(info_req.rtm);
+		break;
+	case RTM_GETNEIGH:
+		info_req.h.nlmsg_len += sizeof(info_req.ndm);
+		break;
+	default:
+		IFPX_LOG(WARNING, "Unhandled message type: %d", type);
+		return -EINVAL;
+	}
+	/* Store request type (and if it is global or link specific) in 'seq'.
+	 * Later it is used during handling of reply to continue requesting of
+	 * information dump from system - if needed.  The shift is done on an
+	 * unsigned value since 'nlmsg_seq' is unsigned and shifting a signed
+	 * int into/over the sign bit is undefined.
+	 */
+	info_req.h.nlmsg_seq = (uint32_t)index << 8 | (uint32_t)type;
+
+	IFPX_LOG(DEBUG, "\tRequesting msg %d for: %u", type, index);
+
+	rte_spinlock_lock(&send_lock);
+	ret = send(ifpx_irq.fd, &info_req, info_req.h.nlmsg_len, 0);
+	if (ret < 0) {
+		/* Latch errno before logging - the log call may overwrite it. */
+		int err = errno;
+
+		rte_errno = err;
+		IFPX_LOG(ERR, "Failed to send netlink msg: %d", err);
+	}
+	rte_spinlock_unlock(&send_lock);
+
+	return ret;
+}
+
+/* Handle RTM_NEWLINK/RTM_DELLINK message.
+ *
+ * Looks up the proxy whose interface index matches the message and - when
+ * found - signals link state, MTU and MAC changes via ifpx_notify_event().
+ * For replies to our own requests it either follows up with an address request
+ * (link specific request) or stores the interface name (global dump).
+ */
+static
+void handle_link(const struct nlmsghdr *h)
+{
+	const struct ifinfomsg *ifi = NLMSG_DATA(h);
+	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	const struct rtattr *attrs[IFLA_MAX+1] = { NULL };
+	const struct rtattr *attr;
+	struct ifpx_proxy_node *px;
+	struct rte_ifpx_event ev;
+
+	IFPX_LOG(DEBUG, "\tLink action (%u): %u, 0x%x/0x%x (flags/changed)",
+		 ifi->ifi_index, h->nlmsg_type, ifi->ifi_flags,
+		 ifi->ifi_change);
+
+	/* Start from a fully defined event (handle_route()/handle_neigh() do
+	 * the same) so that no indeterminate stack bytes travel with it.
+	 */
+	memset(&ev, 0, sizeof(ev));
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == (unsigned int)ifi->ifi_index)
+			break;
+	}
+
+	/* Drop messages that are not associated with any proxy */
+	if (!px)
+		goto exit;
+	/* When message is a reply to request for specific interface then keep
+	 * it only when it contains info for this interface.
+	 */
+	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
+	    (h->nlmsg_seq >> 8) != (unsigned)ifi->ifi_index)
+		goto exit;
+
+	/* Index the attributes by type; unknown (newer) types are ignored. */
+	for (attr = IFLA_RTA(ifi); RTA_OK(attr, alen);
+				   attr = RTA_NEXT(attr, alen)) {
+		if (attr->rta_type > IFLA_MAX)
+			continue;
+		attrs[attr->rta_type] = attr;
+	}
+
+	if (ifi->ifi_change & IFF_UP) {
+		ev.type = RTE_IFPX_LINK_CHANGE;
+		ev.link_change.is_up = ifi->ifi_flags & IFF_UP;
+		ifpx_notify_event(&ev, px);
+	}
+	if (attrs[IFLA_MTU]) {
+		uint16_t mtu = *(const int *)RTA_DATA(attrs[IFLA_MTU]);
+		if (mtu != px->info.mtu) {
+			px->info.mtu = mtu;
+			ev.type = RTE_IFPX_MTU_CHANGE;
+			ev.mtu_change.mtu = mtu;
+			ifpx_notify_event(&ev, px);
+		}
+	}
+	/* Only a 6 byte (Ethernet) hardware address can be compared with and
+	 * copied into 'struct rte_ether_addr'.  Anything else is skipped at
+	 * run time instead of reading past the attribute payload.
+	 */
+	if (attrs[IFLA_ADDRESS] &&
+	    RTA_PAYLOAD(attrs[IFLA_ADDRESS]) == RTE_ETHER_ADDR_LEN) {
+		const struct rte_ether_addr *mac =
+				RTA_DATA(attrs[IFLA_ADDRESS]);
+
+		if (memcmp(mac, &px->info.mac, RTE_ETHER_ADDR_LEN) != 0) {
+			rte_ether_addr_copy(mac, &px->info.mac);
+			ev.type = RTE_IFPX_MAC_CHANGE;
+			rte_ether_addr_copy(mac, &ev.mac_change.mac);
+			ifpx_notify_event(&ev, px);
+		}
+	}
+	if (h->nlmsg_pid == ifpx_pid) {
+		RTE_ASSERT((h->nlmsg_seq & 0xFF) == RTM_GETLINK);
+		/* If this is reply for specific link request (not initial
+		 * global dump) then follow up with address request, otherwise
+		 * just store the interface name.
+		 */
+		if (h->nlmsg_seq >> 8)
+			request_info(RTM_GETADDR, ifi->ifi_index);
+		else if (!px->info.if_name[0] && attrs[IFLA_IFNAME])
+			strlcpy(px->info.if_name, RTA_DATA(attrs[IFLA_IFNAME]),
+				sizeof(px->info.if_name));
+	}
+
+	ifpx_cleanup_proxies();
+exit:
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Handle RTM_NEWADDR/RTM_DELADDR message.
+ *
+ * - h : netlink message carrying 'struct ifaddrmsg',
+ * - needs_del : true when the address is being removed (RTM_DELADDR).
+ *
+ * When the message refers to one of the proxies and carries IFA_ADDRESS the
+ * matching RTE_IFPX_ADDR(6)_ADD/DEL event is signaled via ifpx_notify_event().
+ */
+static
+void handle_addr(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct ifaddrmsg *ifa = NLMSG_DATA(h);
+	int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifa));
+	const struct rtattr *attrs[IFA_MAX+1] = { NULL };
+	const struct rtattr *attr;
+	struct ifpx_proxy_node *px;
+	struct rte_ifpx_event ev;
+	const uint8_t *ip;
+
+	IFPX_LOG(DEBUG, "\tAddr action (%u): %u, family: %u",
+		 ifa->ifa_index, h->nlmsg_type, ifa->ifa_family);
+
+	/* Start from a fully defined event (handle_route()/handle_neigh() do
+	 * the same) so that no indeterminate stack bytes travel with it.
+	 */
+	memset(&ev, 0, sizeof(ev));
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == ifa->ifa_index)
+			break;
+	}
+
+	/* Drop messages that are not associated with any proxy */
+	if (!px)
+		goto exit;
+	/* When message is a reply to request for specific interface then keep
+	 * it only when it contains info for this interface.
+	 */
+	if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
+	    (h->nlmsg_seq >> 8) != ifa->ifa_index)
+		goto exit;
+
+	/* Index the attributes by type; unknown (newer) types are ignored. */
+	for (attr = IFA_RTA(ifa); RTA_OK(attr, alen);
+				  attr = RTA_NEXT(attr, alen)) {
+		if (attr->rta_type > IFA_MAX)
+			continue;
+		attrs[attr->rta_type] = attr;
+	}
+
+	if (attrs[IFA_ADDRESS]) {
+		ip = RTA_DATA(attrs[IFA_ADDRESS]);
+		if (ifa->ifa_family == AF_INET) {
+			ev.type = needs_del ? RTE_IFPX_ADDR_DEL
+					    : RTE_IFPX_ADDR_ADD;
+			/* The attribute holds the address in network byte
+			 * order - assemble it byte by byte.
+			 */
+			ev.addr_change.ip =
+					RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
+		} else {
+			ev.type = needs_del ? RTE_IFPX_ADDR6_DEL
+					    : RTE_IFPX_ADDR6_ADD;
+			memcpy(ev.addr6_change.ip, ip, 16);
+		}
+		ifpx_notify_event(&ev, px);
+		ifpx_cleanup_proxies();
+	}
+exit:
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Handle RTM_NEWROUTE/RTM_DELROUTE message.
+ *
+ * - h : netlink message carrying 'struct rtmsg',
+ * - needs_del : true when the route is being removed (RTM_DELROUTE).
+ *
+ * Only routes that have a destination and whose output interface is one of
+ * the proxies are turned into RTE_IFPX_ROUTE(6)_ADD/DEL events.
+ */
+static
+void handle_route(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct rtmsg *rt = NLMSG_DATA(h);
+	const struct rtattr *tb[RTA_MAX+1] = { NULL };
+	const struct rtattr *rta;
+	int left = h->nlmsg_len - NLMSG_LENGTH(sizeof(*rt));
+	struct ifpx_proxy_node *px = NULL;
+	struct rte_ifpx_event ev;
+	const uint8_t *bytes;
+
+	IFPX_LOG(DEBUG, "\tRoute action: %u, family: %u",
+		 h->nlmsg_type, rt->rtm_family);
+
+	/* Index the attributes by type, skipping the ones we do not know. */
+	rta = RTM_RTA(rt);
+	while (RTA_OK(rta, left)) {
+		if (rta->rta_type <= RTA_MAX)
+			tb[rta->rta_type] = rta;
+		rta = RTA_NEXT(rta, left);
+	}
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = RTE_IFPX_NUM_EVENTS;
+
+	rte_spinlock_lock(&ifpx_lock);
+	if (tb[RTA_OIF]) {
+		int oif = *((int32_t*)RTA_DATA(tb[RTA_OIF]));
+
+		if (oif > 0) {
+			/* 'px' ends up NULL when no proxy matches. */
+			TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+				if (px->info.if_index == (uint32_t)oif)
+					break;
+			}
+		}
+	}
+	/* Routes not bound to a proxy interface, or without a destination,
+	 * are of no interest.
+	 */
+	if (px && tb[RTA_DST]) {
+		bytes = RTA_DATA(tb[RTA_DST]);
+		/* 'depth' is shared by the IPv4 and IPv6 flavours. */
+		ev.route_change.depth = rt->rtm_dst_len;
+		if (rt->rtm_family == AF_INET) {
+			if (needs_del)
+				ev.type = RTE_IFPX_ROUTE_DEL;
+			else
+				ev.type = RTE_IFPX_ROUTE_ADD;
+			ev.route_change.ip = RTE_IPV4(bytes[0], bytes[1],
+						      bytes[2], bytes[3]);
+		} else {
+			if (needs_del)
+				ev.type = RTE_IFPX_ROUTE6_DEL;
+			else
+				ev.type = RTE_IFPX_ROUTE6_ADD;
+			memcpy(ev.route6_change.ip, bytes, 16);
+		}
+		if (tb[RTA_GATEWAY]) {
+			bytes = RTA_DATA(tb[RTA_GATEWAY]);
+			if (rt->rtm_family == AF_INET)
+				ev.route_change.gateway =
+					RTE_IPV4(bytes[0], bytes[1],
+						 bytes[2], bytes[3]);
+			else
+				memcpy(ev.route6_change.gateway, bytes, 16);
+		}
+
+		ifpx_notify_event(&ev, px);
+		/* A callback may have scheduled proxies for removal - release
+		 * them now.
+		 */
+		ifpx_cleanup_proxies();
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Unlike link, address and route messages the neighbour ones seem to have no
+ * ready made macro locating the first attribute, so provide one when it is
+ * missing.  The const qualifiers are there only to keep the compiler quiet
+ * about a (char*) cast applied to a pointer to const.
+ */
+#ifndef NDA_RTA
+#define NDA_RTA(r) \
+	((const struct rtattr*)((const char*)(r) + \
+				NLMSG_ALIGN(sizeof(struct ndmsg))))
+#endif
+
+/* Handle RTM_NEWNEIGH/RTM_DELNEIGH message.
+ *
+ * - h : netlink message carrying 'struct ndmsg',
+ * - needs_del : true when the entry is being removed (RTM_DELNEIGH).
+ *
+ * Entries that belong to a proxy interface and carry a destination address
+ * are turned into RTE_IFPX_NEIGH(6)_ADD/DEL events.
+ */
+static
+void handle_neigh(const struct nlmsghdr *h, bool needs_del)
+{
+	const struct ndmsg *nd = NLMSG_DATA(h);
+	const struct rtattr *tb[NDA_MAX+1] = { NULL };
+	const struct rtattr *rta;
+	int left = h->nlmsg_len - NLMSG_LENGTH(sizeof(*nd));
+	struct ifpx_proxy_node *px;
+	struct rte_ifpx_event ev;
+	const uint8_t *bytes;
+
+	IFPX_LOG(DEBUG, "\tNeighbour action: %u, family: %u, state: %u, if: %d",
+		 h->nlmsg_type, nd->ndm_family, nd->ndm_state, nd->ndm_ifindex);
+
+	/* Index the attributes by type, skipping the ones we do not know. */
+	rta = NDA_RTA(nd);
+	while (RTA_OK(rta, left)) {
+		if (rta->rta_type <= NDA_MAX)
+			tb[rta->rta_type] = rta;
+		rta = RTA_NEXT(rta, left);
+	}
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = RTE_IFPX_NUM_EVENTS;
+
+	rte_spinlock_lock(&ifpx_lock);
+	TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+		if (px->info.if_index == (unsigned)nd->ndm_ifindex)
+			break;
+	}
+	/* Only neighbours of proxy interfaces matter and a destination is
+	 * always required.  The link layer address is mandatory only when the
+	 * entry is not being deleted.
+	 */
+	if (px && tb[NDA_DST] && (needs_del || tb[NDA_LLADDR])) {
+		bytes = RTA_DATA(tb[NDA_DST]);
+		if (nd->ndm_family == AF_INET) {
+			if (needs_del)
+				ev.type = RTE_IFPX_NEIGH_DEL;
+			else
+				ev.type = RTE_IFPX_NEIGH_ADD;
+			ev.neigh_change.ip = RTE_IPV4(bytes[0], bytes[1],
+						      bytes[2], bytes[3]);
+		} else {
+			if (needs_del)
+				ev.type = RTE_IFPX_NEIGH6_DEL;
+			else
+				ev.type = RTE_IFPX_NEIGH6_ADD;
+			memcpy(ev.neigh6_change.ip, bytes, 16);
+		}
+		if (tb[NDA_LLADDR])
+			rte_ether_addr_copy(RTA_DATA(tb[NDA_LLADDR]),
+					    &ev.neigh_change.mac);
+
+		ifpx_notify_event(&ev, px);
+		/* A callback may have scheduled proxies for removal - release
+		 * them now.
+		 */
+		ifpx_cleanup_proxies();
+	}
+	rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Interrupt callback registered for the netlink socket.
+ *
+ * Reads one datagram from the socket and dispatches every netlink message in
+ * it to the matching handler.  For replies to our own global (index 0) dump
+ * requests the next dump is chained on NLMSG_DONE (link -> addr -> route ->
+ * neigh) and after the neighbour dump RTE_IFPX_CFG_DONE is signaled.
+ */
+static
+void if_proxy_intr_callback(void *arg __rte_unused)
+{
+	struct nlmsghdr *h;
+	struct sockaddr_nl addr;
+	socklen_t addr_len;
+	char buf[8192];
+	ssize_t len;
+
+restart:
+	/* 'addr_len' is a value-result argument of recvfrom(): on entry it
+	 * has to hold the size of the buffer provided for the sender address.
+	 */
+	addr_len = sizeof(addr);
+	len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0,
+		       (struct sockaddr *)&addr, &addr_len);
+	if (len < 0) {
+		if (errno == EINTR) {
+			IFPX_LOG(DEBUG, "recvmsg() interrupted");
+			goto restart;
+		}
+		IFPX_LOG(ERR, "Failed to read netlink msg: %zd (errno %d)",
+			 len, errno);
+		return;
+	}
+	if (addr_len != sizeof(addr)) {
+		IFPX_LOG(ERR, "Invalid netlink addr size: %u", addr_len);
+		return;
+	}
+	IFPX_LOG(DEBUG, "Read %zd bytes (buf %zu) from %u/%u", len,
+		 sizeof(buf), addr.nl_pid, addr.nl_groups);
+
+	for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len);
+					 h = NLMSG_NEXT(h, len)) {
+		IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)",
+			 h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags,
+			 h->nlmsg_pid);
+
+		switch (h->nlmsg_type) {
+		case RTM_NEWLINK:
+		case RTM_DELLINK:
+			handle_link(h);
+			break;
+		case RTM_NEWADDR:
+		case RTM_DELADDR:
+			handle_addr(h, h->nlmsg_type == RTM_DELADDR);
+			break;
+		case RTM_NEWROUTE:
+		case RTM_DELROUTE:
+			handle_route(h, h->nlmsg_type == RTM_DELROUTE);
+			break;
+		case RTM_NEWNEIGH:
+		case RTM_DELNEIGH:
+			handle_neigh(h, h->nlmsg_type == RTM_DELNEIGH);
+			break;
+		default:
+			/* Nothing to dispatch - NLMSG_DONE is checked below. */
+			break;
+		}
+
+		/* If this is a reply for global request then follow up with
+		 * additional requests and notify about finish.
+		 */
+		if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 &&
+		    h->nlmsg_type == NLMSG_DONE) {
+			if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK)
+				request_info(RTM_GETADDR, 0);
+			else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR)
+				request_info(RTM_GETROUTE, 0);
+			else if ((h->nlmsg_seq & 0xFF) == RTM_GETROUTE)
+				request_info(RTM_GETNEIGH, 0);
+			else {
+				struct rte_ifpx_event ev = {
+					.type = RTE_IFPX_CFG_DONE
+				};
+
+				RTE_ASSERT((h->nlmsg_seq & 0xFF) ==
+						RTM_GETNEIGH);
+				rte_spinlock_lock(&ifpx_lock);
+				ifpx_notify_event(&ev, NULL);
+				rte_spinlock_unlock(&ifpx_lock);
+			}
+		}
+	}
+	IFPX_LOG(DEBUG, "Finished msg loop: %zd bytes left", len);
+}
+
+/* Open the netlink socket, subscribe to link, neighbour, address and route
+ * notifications and register if_proxy_intr_callback() for it.
+ *
+ * Returns 0 on success.  On failure -1 is returned, rte_errno is set, and the
+ * socket (if it was opened) is closed again.  When a socket is already open
+ * rte_errno is set to EBUSY.
+ */
+static
+int nlink_listen(void)
+{
+	struct sockaddr_nl addr = {
+		.nl_family = AF_NETLINK,
+		.nl_pid = 0,
+	};
+	socklen_t addr_len = sizeof(addr);
+	int ret;
+	int err; /* error code latched right at the failing call */
+
+	if (ifpx_irq.fd != -1) {
+		rte_errno = EBUSY;
+		return -1;
+	}
+
+	addr.nl_groups = 1 << (RTNLGRP_LINK-1)
+			| 1 << (RTNLGRP_NEIGH-1)
+			| 1 << (RTNLGRP_IPV4_IFADDR-1)
+			| 1 << (RTNLGRP_IPV6_IFADDR-1)
+			| 1 << (RTNLGRP_IPV4_ROUTE-1)
+			| 1 << (RTNLGRP_IPV6_ROUTE-1);
+
+	ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+				 NETLINK_ROUTE);
+	if (ifpx_irq.fd == -1) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to create netlink socket: %d", err);
+		goto error;
+	}
+	/* Starting with kernel 4.19 you can request dump for a specific
+	 * interface and kernel will filter out and send only relevant info.
+	 * Otherwise NLM_F_DUMP will generate info for all interfaces and you
+	 * need to filter them yourself.
+	 *
+	 * NOTE(review): the kernel UAPI option for strict dump checking
+	 * appears to be NETLINK_GET_STRICT_CHK at level SOL_NETLINK - if so
+	 * this section is never compiled in.  Confirm, and check that the
+	 * requests sent by request_info() pass strict validation, before
+	 * relying on kernel side filtering.
+	 */
+#ifdef NETLINK_DUMP_STRICT_CHK
+	ret = 1; /* use this var also as an input param */
+	ret = setsockopt(ifpx_irq.fd, SOL_SOCKET, NETLINK_DUMP_STRICT_CHK,
+			 &ret, sizeof(ret));
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to set socket option: %d", err);
+		goto error;
+	}
+#endif
+
+	ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len);
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to bind socket: %d", err);
+		goto error;
+	}
+	/* Learn the port id the kernel assigned to us - it is used to tell
+	 * replies to our requests from unsolicited notifications.
+	 */
+	ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len);
+	if (ret < 0) {
+		err = errno;
+		IFPX_LOG(ERR, "Failed to get socket addr: %d", err);
+		goto error;
+	}
+	ifpx_pid = addr.nl_pid;
+	IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid);
+
+	ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback,
+					 NULL);
+	if (ret == 0)
+		return 0;
+	/* NOTE(review): registration presumably reports failure as a negative
+	 * errno style value rather than via errno - confirm.
+	 */
+	err = -ret;
+
+error:
+	if (ifpx_irq.fd != -1) {
+		close(ifpx_irq.fd);
+		ifpx_irq.fd = -1;
+	}
+	/* Do not leave a stale port id behind (nlink_close() resets it too). */
+	ifpx_pid = 0;
+	/* Set last so that neither logging nor close() can disturb it. */
+	rte_errno = err;
+	return -1;
+}
+
+/* Unregister the interrupt callback and close the netlink socket.
+ *
+ * Returns 0 on success or -EBADFD when there is no socket open.
+ */
+static
+int nlink_close(void)
+{
+	int rc = -EAGAIN;
+
+	if (ifpx_irq.fd < 0)
+		return -EBADFD;
+
+	/* Keep trying for as long as the unregistration asks us to retry. */
+	while (rc == -EAGAIN)
+		rc = rte_intr_callback_unregister(&ifpx_irq,
+						  if_proxy_intr_callback,
+						  NULL);
+
+	close(ifpx_irq.fd);
+	ifpx_irq.fd = -1;
+	ifpx_pid = 0;
+
+	return 0;
+}
+
+/* Request link information for 'if_index' - a no-op when the netlink socket
+ * is not open.
+ */
+static
+void nlink_get_info(int if_index)
+{
+	if (ifpx_irq.fd == -1)
+		return;
+
+	request_info(RTM_GETLINK, if_index);
+}
+
+/* Netlink based implementation of the platform callbacks. */
+struct ifpx_platform_callbacks ifpx_platform = {
+	.get_info = nlink_get_info,
+	.close = nlink_close,
+	.listen = nlink_listen,
+	.init = NULL, /* no initialization hook is provided here */
+};
diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
new file mode 100644
index 000000000..f0c1a6e15
--- /dev/null
+++ b/lib/librte_if_proxy/meson.build
@@ -0,0 +1,19 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2020 Marvell International Ltd.
+
+# Currently only implemented on Linux
+if not is_linux
+	build = false
+	reason = 'only supported on linux'
+endif
+
+version = 1
+allow_experimental_apis = true
+
+deps += ['ethdev']
+sources = files('if_proxy_common.c')
+headers = files('rte_if_proxy.h')
+
+if is_linux
+	sources += files('linux/if_proxy.c')
+endif
diff --git a/lib/librte_if_proxy/rte_if_proxy.h b/lib/librte_if_proxy/rte_if_proxy.h
new file mode 100644
index 000000000..e620319b3
--- /dev/null
+++ b/lib/librte_if_proxy/rte_if_proxy.h
@@ -0,0 +1,561 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#ifndef _RTE_IF_PROXY_H_
+#define _RTE_IF_PROXY_H_
+
+/**
+ * @file
+ * RTE IF Proxy library
+ *
+ * The IF Proxy library allows for monitoring of system network configuration
+ * and configuration of DPDK ports by using usual system utilities (like the
+ * ones from iproute2 package).
+ *
+ * It is based on the notion of "proxy interface" which actually can be any DPDK
+ * port which is also visible to the system - that is it has non-zero 'if_index'
+ * field in 'rte_eth_dev_info' structure.
+ *
+ * If application doesn't have any such port (or doesn't want to use it for
+ * proxy) it can create one by calling:
+ *
+ *   proxy_id = rte_ifpx_proxy_create(RTE_IFPX_DEFAULT);
+ *
+ * This function is just a wrapper that constructs valid 'devargs' string based
+ * on the proxy type chosen (currently Tap or KNI) and creates the interface by
+ * calling rte_ifpx_proxy_create_by_devarg().
+ *
+ * Once one has a DPDK port capable of being a proxy one can bind a target DPDK
+ * port to it by calling:
+ *
+ *   rte_ifpx_port_bind(port_id, proxy_id);
+ *
+ * This binding is a logical one - there is no automatic packet forwarding
+ * between port and its proxy since the library doesn't know the structure of
+ * application's packet processing.  It remains application responsibility to
+ * forward the packets from/to proxy port (by calling the usual DPDK RX/TX burst
+ * API).  However when the library notes some change to the proxy interface it
+ * will simply call appropriate callback with 'port_id' of the DPDK port that is
+ * bound to this proxy interface.  The binding can be 1 to many - that is many
+ * ports can point to one proxy - in that case registered callbacks will be
+ * called for every bound port.
+ *
+ * The callbacks that are used for notifications are described by the
+ * 'rte_ifpx_callbacks' structure and they are registered by calling:
+ *
+ *   rte_ifpx_callbacks_register(&cbs);
+ *
+ * Finally the application should call:
+ *
+ *   rte_ifpx_listen();
+ *
+ * which will query system for present network configuration and start listening
+ * to its changes.
+ */
+
+#include <rte_eal.h>
+#include <rte_ethdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Enum naming the type of proxy to create.
+ *
+ * @see rte_ifpx_proxy_create()
+ */
+enum rte_ifpx_proxy_type {
+	RTE_IFPX_DEFAULT,	/**< Use default proxy type for given arch. */
+	RTE_IFPX_TAP,		/**< Use Tap based port for proxy. */
+	RTE_IFPX_KNI		/**< Use KNI based port for proxy. */
+};
+
+/**
+ * Create DPDK port that can serve as an interface proxy.
+ *
+ * This function is just a wrapper around rte_ifpx_proxy_create_by_devarg() that
+ * constructs its 'devarg' argument based on type of proxy requested.
+ *
+ * @param type
+ *   A type of proxy to create.
+ *
+ * @return
+ *   DPDK port id on success, RTE_MAX_ETHPORTS otherwise.
+ *
+ * @see enum rte_ifpx_proxy_type
+ * @see rte_ifpx_proxy_create_by_devarg()
+ */
+__rte_experimental
+uint16_t rte_ifpx_proxy_create(enum rte_ifpx_proxy_type type);
+
+/**
+ * Create DPDK port that can serve as an interface proxy.
+ *
+ * @param devarg
+ *   A string passed to rte_dev_probe() to create proxy port.
+ *
+ * @return
+ *   DPDK port id on success, RTE_MAX_ETHPORTS otherwise.
+ */
+__rte_experimental
+uint16_t rte_ifpx_proxy_create_by_devarg(const char *devarg);
+
+/**
+ * Remove DPDK proxy port.
+ *
+ * In addition to removing the proxy port the bindings (if any) are cleared.
+ *
+ * @param proxy_id
+ *   Port id of the proxy that should be removed.
+ *
+ * @return
+ *   0 on success, negative on error.
+ */
+__rte_experimental
+int rte_ifpx_proxy_destroy(uint16_t proxy_id);
+
+/**
+ * The rte_ifpx_event_type enum lists all possible event types that can be
+ * signaled by this library.  To learn what events are supported on your
+ * platform call rte_ifpx_events_available().
+ *
+ * NOTE - do not reorder these enums freely, their values need to correspond to
+ * the order of the callbacks in struct rte_ifpx_callbacks.
+ */
+enum rte_ifpx_event_type {
+	RTE_IFPX_MAC_CHANGE,  /**< @see struct rte_ifpx_mac_change */
+	RTE_IFPX_MTU_CHANGE,  /**< @see struct rte_ifpx_mtu_change */
+	RTE_IFPX_LINK_CHANGE, /**< @see struct rte_ifpx_link_change */
+	RTE_IFPX_ADDR_ADD,    /**< @see struct rte_ifpx_addr_change */
+	RTE_IFPX_ADDR_DEL,    /**< @see struct rte_ifpx_addr_change */
+	RTE_IFPX_ADDR6_ADD,   /**< @see struct rte_ifpx_addr6_change */
+	RTE_IFPX_ADDR6_DEL,   /**< @see struct rte_ifpx_addr6_change */
+	RTE_IFPX_ROUTE_ADD,   /**< @see struct rte_ifpx_route_change */
+	RTE_IFPX_ROUTE_DEL,   /**< @see struct rte_ifpx_route_change */
+	RTE_IFPX_ROUTE6_ADD,  /**< @see struct rte_ifpx_route6_change */
+	RTE_IFPX_ROUTE6_DEL,  /**< @see struct rte_ifpx_route6_change */
+	RTE_IFPX_NEIGH_ADD,   /**< @see struct rte_ifpx_neigh_change */
+	RTE_IFPX_NEIGH_DEL,   /**< @see struct rte_ifpx_neigh_change */
+	RTE_IFPX_NEIGH6_ADD,  /**< @see struct rte_ifpx_neigh6_change */
+	RTE_IFPX_NEIGH6_DEL,  /**< @see struct rte_ifpx_neigh6_change */
+	RTE_IFPX_CFG_DONE,    /**< This event is a lib specific event - it is
+                               * signaled when initial network configuration
+			       * query is finished and has no event data.
+			       */
+	RTE_IFPX_NUM_EVENTS,
+};
+
+/**
+ * Get the bit mask of implemented events/callbacks for this platform.
+ *
+ * @return
+ *   Bit mask of events/callbacks implemented: each event type can be tested by
+ *   checking bit (1 << ev) where 'ev' is one of the rte_ifpx_event_type enum
+ *   values.
+ * @see enum rte_ifpx_event_type
+ */
+__rte_experimental
+uint64_t rte_ifpx_events_available(void);
+
+/**
+ * The rte_ifpx_event defines structure used to pass notification event to
+ * application.  Each event type has its own dedicated inner structure - these
+ * structures are also used when using callbacks notifications.
+ */
+struct rte_ifpx_event {
+	enum rte_ifpx_event_type type;
+	union {
+		/** Structure used to pass notification about MAC change of the
+		 * proxy interface.
+		 * @see RTE_IFPX_MAC_CHANGE
+		 */
+		struct rte_ifpx_mac_change {
+			uint16_t port_id;
+			struct rte_ether_addr mac;
+		} mac_change;
+		/** Structure used to pass notification about MTU change.
+		 * @see RTE_IFPX_MTU_CHANGE
+		 */
+		struct rte_ifpx_mtu_change {
+			uint16_t port_id;
+			uint16_t mtu;
+		} mtu_change;
+		/** Structure used to pass notification about link going
+		 * up/down.
+		 * @see RTE_IFPX_LINK_CHANGE
+		 */
+		struct rte_ifpx_link_change {
+			uint16_t port_id;
+			int is_up;
+		} link_change;
+		/** Structure used to pass notification about IPv4 address being
+		 * added/removed.  All IPv4 addresses reported by this library
+		 * are in host order.
+		 * @see RTE_IFPX_ADDR_ADD
+		 * @see RTE_IFPX_ADDR_DEL
+		 */
+		struct rte_ifpx_addr_change {
+			uint16_t port_id;
+			uint32_t ip;
+		} addr_change;
+		/** Structure used to pass notification about IPv6 address being
+		 * added/removed.
+		 * @see RTE_IFPX_ADDR6_ADD
+		 * @see RTE_IFPX_ADDR6_DEL
+		 */
+		struct rte_ifpx_addr6_change {
+			uint16_t port_id;
+			uint8_t ip[16];
+		} addr6_change;
+		/** Structure used to pass notification about IPv4 route being
+		 * added/removed.
+		 * @see RTE_IFPX_ROUTE_ADD
+		 * @see RTE_IFPX_ROUTE_DEL
+		 */
+		struct rte_ifpx_route_change {
+			uint16_t port_id;
+			uint8_t depth;
+			uint32_t ip;
+			uint32_t gateway;
+		} route_change;
+		/** Structure used to pass notification about IPv6 route being
+		 * added/removed.
+		 * @see RTE_IFPX_ROUTE6_ADD
+		 * @see RTE_IFPX_ROUTE6_DEL
+		 */
+		struct rte_ifpx_route6_change {
+			uint16_t port_id;
+			uint8_t depth;
+			uint8_t ip[16];
+			uint8_t gateway[16];
+		} route6_change;
+		/** Structure used to pass notification about IPv4 neighbour
+		 * info changes.
+		 * @see RTE_IFPX_NEIGH_ADD
+		 * @see RTE_IFPX_NEIGH_DEL
+		 */
+		struct rte_ifpx_neigh_change {
+			uint16_t port_id;
+			struct rte_ether_addr mac;
+			uint32_t ip;
+		} neigh_change;
+		/** Structure used to pass notification about IPv6 neighbour
+		 * info changes.
+		 * @see RTE_IFPX_NEIGH6_ADD
+		 * @see RTE_IFPX_NEIGH6_DEL
+		 */
+		struct rte_ifpx_neigh6_change {
+			uint16_t port_id;
+			struct rte_ether_addr mac;
+			uint8_t ip[16];
+		} neigh6_change;
+		/* This structure is used internally - to abstract common parts
+		 * of proxy/port related events and to be able to refer to this
+		 * union without giving it a name.
+		 */
+		struct {
+			uint16_t port_id;
+		} data;
+	};
+};
+
+/**
+ * This library can deliver notification about network configuration changes
+ * either by the use of registered callbacks and/or by queueing change events to
+ * configured notification queues.  The logic used is:
+ * 1. If there is callback registered for given event type it is called.  In
+ *   case of many ports to one proxy binding, this callback is called for every
+ *   port bound.
+ * 2. If this callback returns non-zero value (for any of ports in case of
+ *   many-1 bindings) the handling of an event is considered as complete.
+ * 3. Otherwise the event is added to each configured event queue.  The event is
+ *   allocated with malloc() so after dequeueing and handling the application
+ *   should deallocate it with free().
+ *
+ * This dual notification mechanism is meant to provide some flexibility to
+ * application writer.  For example, if you store your data in a single writer/
+ * many readers coherent data structure you could just update this structure
+ * from the callback.  If you keep separate copy per lcore/port you could make
+ * some common preparations (if applicable) in the callback, return 0 and use
+ * notification queues to pick up the change and update data structures.  Or you
+ * could skip the callbacks altogether and just use notification queues - and
+ * configure them at the level appropriate for your application design (one
+ * global / one per lcore / one per port ...).
+ */
+
+/**
+ * Add notification queue to the list of queues.
+ *
+ * NOTE(review): unlike the other functions declared in this header this one is
+ * not tagged __rte_experimental and it is not listed in
+ * rte_if_proxy_version.map - confirm whether it is meant to be exported.
+ *
+ * @param r
+ *   Ring used for queueing of notification events - application can assume that
+ *   there is only one producer.
+ * @return
+ *   0 on success, negative otherwise.
+ */
+int rte_ifpx_queue_add(struct rte_ring *r);
+
+/**
+ * Remove notification queue from the list of queues.
+ *
+ * NOTE(review): unlike the other functions declared in this header this one is
+ * not tagged __rte_experimental and it is not listed in
+ * rte_if_proxy_version.map - confirm whether it is meant to be exported.
+ *
+ * @param r
+ *   Notification ring used for queueing of notification events (previously
+ *   added via rte_ifpx_queue_add()).
+ * @return
+ *   0 on success, negative otherwise.
+ */
+int rte_ifpx_queue_remove(struct rte_ring *r);
+
+/**
+ * This structure groups the callbacks that might be called as notification
+ * events for changing network configuration.  Not every platform might
+ * implement all of them and you can query the availability with
+ * rte_ifpx_events_available() function.
+ * @see rte_ifpx_events_available()
+ * @see rte_ifpx_callbacks_register()
+ */
+struct rte_ifpx_callbacks {
+	int (*mac_change)(const struct rte_ifpx_mac_change *event);
+	/**< Callback for notification about MAC change of the proxy interface.
+	 * This callback (as all other port related callbacks) is called for
+	 * each port (with its port_id stored in the event) bound to the proxy
+	 * interface for which change has been observed.
+	 * @see struct rte_ifpx_mac_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*mtu_change)(const struct rte_ifpx_mtu_change *event);
+	/**< Callback for notification about MTU change.
+	 * @see struct rte_ifpx_mtu_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*link_change)(const struct rte_ifpx_link_change *event);
+	/**< Callback for notification about link going up/down.
+	 * @see struct rte_ifpx_link_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*addr_add)(const struct rte_ifpx_addr_change *event);
+	/**< Callback for notification about IPv4 address being added.
+	 * @see struct rte_ifpx_addr_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*addr_del)(const struct rte_ifpx_addr_change *event);
+	/**< Callback for notification about IPv4 address removal.
+	 * @see struct rte_ifpx_addr_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*addr6_add)(const struct rte_ifpx_addr6_change *event);
+	/**< Callback for notification about IPv6 address being added.
+	 * @see struct rte_ifpx_addr6_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*addr6_del)(const struct rte_ifpx_addr6_change *event);
+	/**< Callback for notification about IPv6 address removal.
+	 * @see struct rte_ifpx_addr6_change
+	 * @return non-zero if event handling is finished
+	 */
+	/* Please note that "route" callbacks might be also called when user
+	 * adds address to the interface (that is in addition to address related
+	 * callbacks).
+	 */
+	int (*route_add)(const struct rte_ifpx_route_change *event);
+	/**< Callback for notification about IPv4 route being added.
+	 * @see struct rte_ifpx_route_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*route_del)(const struct rte_ifpx_route_change *event);
+	/**< Callback for notification about IPv4 route removal.
+	 * @see struct rte_ifpx_route_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*route6_add)(const struct rte_ifpx_route6_change *event);
+	/**< Callback for notification about IPv6 route being added.
+	 * @see struct rte_ifpx_route6_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*route6_del)(const struct rte_ifpx_route6_change *event);
+	/**< Callback for notification about IPv6 route removal.
+	 * @see struct rte_ifpx_route6_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*neigh_add)(const struct rte_ifpx_neigh_change *event);
+	/**< Callback for notification about IPv4 neighbour being added.
+	 * @see struct rte_ifpx_neigh_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*neigh_del)(const struct rte_ifpx_neigh_change *event);
+	/**< Callback for notification about IPv4 neighbour removal.
+	 * @see struct rte_ifpx_neigh_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*neigh6_add)(const struct rte_ifpx_neigh6_change *event);
+	/**< Callback for notification about IPv6 neighbour being added.
+	 * @see struct rte_ifpx_neigh6_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*neigh6_del)(const struct rte_ifpx_neigh6_change *event);
+	/**< Callback for notification about IPv6 neighbour removal.
+	 * @see struct rte_ifpx_neigh6_change
+	 * @return non-zero if event handling is finished
+	 */
+	int (*cfg_done)(void);
+	/**< Lib specific callback - called when initial network configuration
+	 * query is finished.
+	 * @return non-zero if event handling is finished
+	 */
+};
+
+/**
+ * Register proxy callbacks.
+ *
+ * This function registers callbacks to be called upon appropriate network
+ * event notification.
+ *
+ * @param cbs
+ *   Set of callbacks that will be called.  The library does not take any
+ *   ownership of the pointer passed - the callbacks are stored internally.
+ *
+ * @return
+ *   0 on success, negative otherwise.
+ */
+__rte_experimental
+int rte_ifpx_callbacks_register(const struct rte_ifpx_callbacks *cbs);
+
+/**
+ * Unregister proxy callbacks.
+ *
+ * This function unregisters callbacks previously registered with
+ * rte_ifpx_callbacks_register().  It takes no arguments and returns nothing.
+ */
+__rte_experimental
+void rte_ifpx_callbacks_unregister(void);
+
+/**
+ * Bind the port to its proxy.
+ *
+ * After calling this function all network configuration of the proxy (and its
+ * changes) will be passed to given port by calling registered callbacks with
+ * 'port_id' as an argument.
+ *
+ * Note: since both arguments are of the same type in order to not mix them and
+ * ease remembering the order the first one is kept the same for bind/unbind.
+ *
+ * @param port_id
+ *   Id of the port to be bound.
+ * @param proxy_id
+ *   Id of the proxy the port needs to be bound to.
+ * @return
+ *   0 on success, negative on error.
+ */
+__rte_experimental
+int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id);
+
+/**
+ * Unbind the port from its proxy.
+ *
+ * After calling this function registered callbacks will no longer be called for
+ * this port (but they might be called for other ports in one to many binding
+ * scenario).
+ *
+ * @param port_id
+ *   Id of the port to unbind.
+ * @return
+ *   0 on success, negative on error.
+ */
+__rte_experimental
+int rte_ifpx_port_unbind(uint16_t port_id);
+
+/**
+ * Get the system network configuration and start listening to its changes.
+ *
+ * @return
+ *   0 on success, negative otherwise.
+ */
+__rte_experimental
+int rte_ifpx_listen(void);
+
+/**
+ * Remove all bindings/callbacks and stop listening to network configuration.
+ *
+ * @return
+ *   0 on success, negative otherwise.
+ */
+__rte_experimental
+int rte_ifpx_close(void);
+
+/**
+ * Get the id of the proxy the port is bound to.
+ *
+ * @param port_id
+ *   Id of the port for which to get proxy.
+ * @return
+ *   Port id of the proxy on success, RTE_MAX_ETHPORTS on error.
+ */
+__rte_experimental
+uint16_t rte_ifpx_proxy_get(uint16_t port_id);
+
+/**
+ * Check whether the port acts as a proxy, that is whether the proxy reported
+ * for it by rte_ifpx_proxy_get() is the port itself.
+ *
+ * @param port_id
+ *   Id of the port.
+ * @return
+ *   1 if port acts as a proxy, 0 otherwise.
+ */
+static inline
+int rte_ifpx_is_proxy(uint16_t port_id)
+{
+	const uint16_t proxy_id = rte_ifpx_proxy_get(port_id);
+
+	return proxy_id == port_id;
+}
+
+/**
+ * Get the ids of the ports bound to the proxy.
+ *
+ * @param proxy_id
+ *   Id of the proxy for which to get ports.
+ * @param ports
+ *   Array where to store the port ids.
+ * @param num
+ *   Size of the 'ports' array.
+ * @return
+ *   The number of ports bound to given proxy.  Note that bound ports are filled
+ *   in 'ports' array up to its size but the return value is always the total
+ *   number of ports bound - so you can make call first with NULL/0 to query for
+ *   the size of the buffer to create or call it with the buffer you have and
+ *   later check if it was large enough.
+ */
+__rte_experimental
+unsigned int rte_ifpx_port_get(uint16_t proxy_id,
+			       uint16_t *ports, unsigned int num);
+
+/**
+ * The structure containing some properties of the proxy interface.
+ */
+struct rte_ifpx_info {
+	unsigned int if_index; /* entry valid iff if_index != 0 */
+	uint16_t mtu;
+	struct rte_ether_addr mac;
+	char if_name[RTE_ETH_NAME_MAX_LEN];
+};
+
+/**
+ * Get the properties of the proxy interface.  Argument can be either id of the
+ * proxy or an id of a port that is bound to it.
+ *
+ * @param port_id
+ *   Id of the port (or proxy) for which to get proxy properties.
+ * @return
+ *   Pointer to the proxy information structure.
+ */
+__rte_experimental
+const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_IF_PROXY_H_ */
diff --git a/lib/librte_if_proxy/rte_if_proxy_version.map b/lib/librte_if_proxy/rte_if_proxy_version.map
new file mode 100644
index 000000000..e2093137d
--- /dev/null
+++ b/lib/librte_if_proxy/rte_if_proxy_version.map
@@ -0,0 +1,19 @@ 
+EXPERIMENTAL {
+	global:
+
+	 rte_ifpx_proxy_create;
+	 rte_ifpx_proxy_create_by_devarg;
+	 rte_ifpx_proxy_destroy;
+	 rte_ifpx_events_available;
+	 rte_ifpx_callbacks_register;
+	 rte_ifpx_callbacks_unregister;
+	 rte_ifpx_port_bind;
+	 rte_ifpx_port_unbind;
+	 rte_ifpx_listen;
+	 rte_ifpx_close;
+	 rte_ifpx_proxy_get;
+	 rte_ifpx_port_get;
+	 rte_ifpx_info_get;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index 0af3efab2..c913b33dd 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -19,7 +19,7 @@  libraries = [
 	'acl', 'bbdev', 'bitratestats', 'cfgfile',
 	'compressdev', 'cryptodev',
 	'distributor', 'efd', 'eventdev',
-	'gro', 'gso', 'ip_frag', 'jobstats',
+	'gro', 'gso', 'if_proxy', 'ip_frag', 'jobstats',
 	'kni', 'latencystats', 'lpm', 'member',
 	'power', 'pdump', 'rawdev',
 	'rcu', 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',