[7/8] net/mlx5: add VXLAN encap support to switch flow rules

Message ID 20180831092038.23051-8-adrien.mazarguil@6wind.com
State Superseded, archived
Headers show
Series
  • net/mlx5: add switch offload for VXLAN encap/decap
Related show

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch warning coding style issues

Commit Message

Adrien Mazarguil Aug. 31, 2018, 9:57 a.m.
This patch is huge because support for VXLAN encapsulation in switch flow
rules involves configuration of virtual network interfaces on the host
system including source addresses, routes and neighbor entries for flow
rules to be offloadable by TC. All of this is done through Netlink.

VXLAN interfaces are dynamically created for each combination of local UDP
port and outer network interface associated with flow rules, then used as
targets for TC "flower" filters in order to perform encapsulation.

To automatically create and remove these interfaces on an as-needed basis
according to the applied flow rules, the PMD maintains global resources
shared between all PMD instances of the primary process.

Testpmd example:

- Setting up outer properties of VXLAN tunnel:

  set vxlan ip-version ipv4 vni 0x112233 udp-src 4242 udp-dst 4789
    ip-src 1.1.1.1 ip-dst 2.2.2.2
    eth-src 00:11:22:33:44:55 eth-dst 66:77:88:99:aa:bb

- Creating a flow rule on port ID 2 performing VXLAN encapsulation with the
  above properties and directing the resulting traffic to port ID 1:

  flow create 2 ingress transfer pattern eth src is 00:11:22:33:44:55 /
     ipv4 / udp dst is 5566 / end actions vxlan_encap / port_id id 1 / end

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/Makefile       |   10 +
 drivers/net/mlx5/mlx5_nl_flow.c | 1198 +++++++++++++++++++++++++++++++++-
 2 files changed, 1204 insertions(+), 4 deletions(-)

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 2e70dec5b..1ba4ce612 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -384,6 +384,16 @@  mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		/usr/include/assert.h \
 		define static_assert \
 		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_ACT_TUNNEL_KEY \
+		linux/tc_act/tc_tunnel_key.h \
+		define TCA_ACT_TUNNEL_KEY \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT \
+		linux/tc_act/tc_tunnel_key.h \
+		enum TCA_TUNNEL_KEY_ENC_DST_PORT \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx5_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
index 91ff90a13..672f92863 100644
--- a/drivers/net/mlx5/mlx5_nl_flow.c
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -6,7 +6,31 @@ 
 #include <assert.h>
 #include <errno.h>
 #include <libmnl/libmnl.h>
+/*
+ * Older versions of linux/if.h do not have the required safeties to coexist
+ * with net/if.h. This causes a compilation failure due to symbol
+ * redefinitions even when including the latter first.
+ *
+ * One workaround is to prevent net/if.h from defining conflicting symbols
+ * by removing __USE_MISC, and maintaining it undefined while including
+ * linux/if.h.
+ *
+ * Alphabetical order cannot be preserved since net/if.h must always be
+ * included before linux/if.h regardless.
+ */
+#ifdef __USE_MISC
+#undef __USE_MISC
+#define RESTORE_USE_MISC
+#endif
+#include <net/if.h>
+#include <linux/if.h>
+#ifdef RESTORE_USE_MISC
+#undef RESTORE_USE_MISC
+#define __USE_MISC 1
+#endif
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
+#include <linux/if_link.h>
 #include <linux/netlink.h>
 #include <linux/pkt_cls.h>
 #include <linux/pkt_sched.h>
@@ -14,11 +38,13 @@ 
 #include <linux/tc_act/tc_gact.h>
 #include <linux/tc_act/tc_mirred.h>
 #include <netinet/in.h>
+#include <pthread.h>
 #include <stdalign.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <sys/queue.h>
 #include <sys/socket.h>
 
 #include <rte_byteorder.h>
@@ -52,6 +78,34 @@  struct tc_vlan {
 
 #endif /* HAVE_TC_ACT_VLAN */
 
+#ifdef HAVE_TC_ACT_TUNNEL_KEY
+
+#include <linux/tc_act/tc_tunnel_key.h>
+
+#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
+#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
+#endif
+
+#else /* HAVE_TC_ACT_TUNNEL_KEY */
+
+#define TCA_ACT_TUNNEL_KEY 17
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE 2
+#define TCA_TUNNEL_KEY_PARMS 2
+#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
+#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
+#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
+#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
+#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
+#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
+
+struct tc_tunnel_key {
+	tc_gen;
+	int t_action;
+};
+
+#endif /* HAVE_TC_ACT_TUNNEL_KEY */
+
 /* Normally found in linux/netlink.h. */
 #ifndef NETLINK_CAP_ACK
 #define NETLINK_CAP_ACK 10
@@ -148,6 +202,71 @@  struct tc_vlan {
 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
 #endif
 
+#define BIT(b) (1 << (b))
+#define BIT_ENCAP(e) BIT(MLX5_NL_FLOW_ENCAP_ ## e)
+
+/** Flags used for @p mask in struct mlx5_nl_flow_encap. */
+enum mlx5_nl_flow_encap_flag {
+	MLX5_NL_FLOW_ENCAP_ETH_SRC,
+	MLX5_NL_FLOW_ENCAP_ETH_DST,
+	MLX5_NL_FLOW_ENCAP_IPV4_SRC,
+	MLX5_NL_FLOW_ENCAP_IPV4_DST,
+	MLX5_NL_FLOW_ENCAP_IPV6_SRC,
+	MLX5_NL_FLOW_ENCAP_IPV6_DST,
+	MLX5_NL_FLOW_ENCAP_UDP_SRC,
+	MLX5_NL_FLOW_ENCAP_UDP_DST,
+	MLX5_NL_FLOW_ENCAP_VXLAN_VNI,
+};
+
+/** Encapsulation structure with fixed format for convenience. */
+struct mlx5_nl_flow_encap {
+	uint32_t mask;
+	struct {
+		struct ether_addr src;
+		struct ether_addr dst;
+	} eth;
+	struct mlx5_nl_flow_encap_ip {
+		union mlx5_nl_flow_encap_ip_addr {
+			struct in_addr v4;
+			struct in6_addr v6;
+		} src;
+		union mlx5_nl_flow_encap_ip_addr dst;
+	} ip;
+	struct {
+		rte_be16_t src;
+		rte_be16_t dst;
+	} udp;
+	struct {
+		rte_be32_t vni;
+	} vxlan;
+};
+
+/** Generic address descriptor for encapsulation resources. */
+struct mlx5_nl_flow_encap_addr {
+	LIST_ENTRY(mlx5_nl_flow_encap_addr) next;
+	uint32_t refcnt;
+	uint32_t mask;
+	struct mlx5_nl_flow_encap_ip ip;
+};
+
+/** VXLAN-specific encapsulation resources. */
+struct mlx5_nl_flow_encap_vxlan {
+	LIST_ENTRY(mlx5_nl_flow_encap_vxlan) next;
+	uint32_t refcnt;
+	rte_be16_t port;
+	unsigned int inner;
+};
+
+/** Encapsulation interface descriptor. */
+struct mlx5_nl_flow_encap_ifindex {
+	LIST_ENTRY(mlx5_nl_flow_encap_ifindex) next;
+	uint32_t refcnt;
+	unsigned int outer;
+	LIST_HEAD(, mlx5_nl_flow_encap_vxlan) vxlan;
+	LIST_HEAD(, mlx5_nl_flow_encap_addr) local;
+	LIST_HEAD(, mlx5_nl_flow_encap_addr) neigh;
+};
+
 /** Context object required by most functions. */
 struct mlx5_nl_flow_ctx {
 	int socket; /**< NUMA socket for memory allocations. */
@@ -159,8 +278,10 @@  struct mlx5_nl_flow_ctx {
 struct mlx5_nl_flow {
 	uint32_t size; /**< Size of this object. */
 	uint32_t applied:1; /**< Whether rule is currently applied. */
+	unsigned int encap_ifindex; /**< Interface to use with @p encap. */
 	unsigned int *ifindex_src; /**< Source interface. */
 	unsigned int *ifindex_dst; /**< Destination interface. */
+	struct mlx5_nl_flow_encap *encap; /**< Encapsulation properties. */
 	alignas(struct nlmsghdr)
 	uint8_t msg[]; /**< Netlink message data. */
 };
@@ -179,6 +300,7 @@  enum mlx5_nl_flow_trans {
 	ITEM_IPV6,
 	ITEM_TCP,
 	ITEM_UDP,
+	ITEM_VXLAN,
 	ACTIONS,
 	ACTION_VOID,
 	ACTION_PORT_ID,
@@ -187,6 +309,8 @@  enum mlx5_nl_flow_trans {
 	ACTION_OF_PUSH_VLAN,
 	ACTION_OF_SET_VLAN_VID,
 	ACTION_OF_SET_VLAN_PCP,
+	ACTION_VXLAN_ENCAP,
+	ACTION_VXLAN_DECAP,
 	END,
 };
 
@@ -196,7 +320,8 @@  enum mlx5_nl_flow_trans {
 	ITEM_VOID, ITEM_PORT_ID, ACTIONS
 #define ACTIONS_COMMON \
 	ACTION_VOID, ACTION_OF_POP_VLAN, ACTION_OF_PUSH_VLAN, \
-	ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP
+	ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP, \
+	ACTION_VXLAN_ENCAP, ACTION_VXLAN_DECAP
 #define ACTIONS_FATE \
 	ACTION_PORT_ID, ACTION_DROP
 
@@ -213,7 +338,8 @@  static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
 	[ITEM_IPV4] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
 	[ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
 	[ITEM_TCP] = TRANS(PATTERN_COMMON),
-	[ITEM_UDP] = TRANS(PATTERN_COMMON),
+	[ITEM_UDP] = TRANS(ITEM_VXLAN, PATTERN_COMMON),
+	[ITEM_VXLAN] = TRANS(PATTERN_COMMON),
 	[ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[ACTION_VOID] = TRANS(BACK),
 	[ACTION_PORT_ID] = TRANS(ACTION_VOID, END),
@@ -222,6 +348,21 @@  static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
 	[ACTION_OF_PUSH_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[ACTION_OF_SET_VLAN_VID] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[ACTION_OF_SET_VLAN_PCP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+	[ACTION_VXLAN_ENCAP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+	[ACTION_VXLAN_DECAP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+	[END] = NULL,
+};
+
+/** Parser state transitions used by mlx5_nl_flow_encap_reap(). */
+static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_encap_reap_trans[] = {
+	[INVALID] = NULL,
+	[BACK] = NULL,
+	[ITEM_VOID] = TRANS(BACK),
+	[ITEM_ETH] = TRANS(ITEM_IPV4, ITEM_IPV6, ITEM_VOID),
+	[ITEM_IPV4] = TRANS(ITEM_UDP, ITEM_VOID),
+	[ITEM_IPV6] = TRANS(ITEM_UDP, ITEM_VOID),
+	[ITEM_UDP] = TRANS(ITEM_VXLAN, ITEM_VOID),
+	[ITEM_VXLAN] = TRANS(END),
 	[END] = NULL,
 };
 
@@ -234,6 +375,7 @@  static const union {
 	struct rte_flow_item_ipv6 ipv6;
 	struct rte_flow_item_tcp tcp;
 	struct rte_flow_item_udp udp;
+	struct rte_flow_item_vxlan vxlan;
 } mlx5_nl_flow_mask_empty;
 
 #define ETHER_ADDR_MASK "\xff\xff\xff\xff\xff\xff"
@@ -242,6 +384,7 @@  static const union {
 	"\xff\xff\xff\xff\xff\xff\xff\xff" \
 	"\xff\xff\xff\xff\xff\xff\xff\xff"
 #define BE16_MASK RTE_BE16(0xffff)
+#define VXLAN_VNI_MASK "\xff\xff\xff"
 
 /** Supported masks for known item types. */
 static const struct {
@@ -286,6 +429,35 @@  static const struct {
 	},
 };
 
+/** Supported masks for known encapsulation item types. */
+static const struct {
+	struct rte_flow_item_eth eth;
+	struct rte_flow_item_ipv4 ipv4;
+	struct rte_flow_item_ipv6 ipv6;
+	struct rte_flow_item_udp udp;
+	struct rte_flow_item_vxlan vxlan;
+} mlx5_nl_flow_encap_mask_supported = {
+	.eth = {
+		.dst.addr_bytes = ETHER_ADDR_MASK,
+		.src.addr_bytes = ETHER_ADDR_MASK,
+	},
+	.ipv4.hdr = {
+		.src_addr = IN_ADDR_MASK,
+		.dst_addr = IN_ADDR_MASK,
+	},
+	.ipv6.hdr = {
+		.src_addr = IN6_ADDR_MASK,
+		.dst_addr = IN6_ADDR_MASK,
+	},
+	.udp.hdr = {
+		.src_port = BE16_MASK,
+		.dst_port = BE16_MASK,
+	},
+	.vxlan = {
+		.vni = VXLAN_VNI_MASK,
+	},
+};
+
 /**
  * Retrieve mask for pattern item.
  *
@@ -361,6 +533,227 @@  mlx5_nl_flow_item_mask(const struct rte_flow_item *item,
 }
 
 /**
+ * Convert VXLAN VNI to 32-bit integer.
+ *
+ * @param[in] vni
+ *   VXLAN VNI in 24-bit wire format.
+ *
+ * @return
+ *   VXLAN VNI as a 32-bit integer value in network endian.
+ */
+static rte_be32_t
+vxlan_vni_as_be32(const uint8_t vni[3])
+{
+	return (volatile union { uint8_t u8[4]; rte_be32_t u32; })
+		{ { 0, vni[0], vni[1], vni[2] } }.u32;
+}
+
+/**
+ * Populate consolidated encapsulation object from list of pattern items.
+ *
+ * Helper function to process configuration of generic actions such as
+ * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP.
+ *
+ * @param[out] dst
+ *   Destination object.
+ * @param[in] src
+ *   List of pattern items to gather data from.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_encap_reap(struct mlx5_nl_flow_encap *dst,
+			const struct rte_flow_item *src,
+			struct rte_flow_error *error)
+{
+	struct mlx5_nl_flow_encap tmp = {
+		.mask = 0,
+	};
+	unsigned int n = 0;
+	const enum mlx5_nl_flow_trans *trans = TRANS(ITEM_ETH);
+	const enum mlx5_nl_flow_trans *back = trans;
+
+trans:
+	switch (trans[n++]) {
+		union {
+			const struct rte_flow_item_eth *eth;
+			const struct rte_flow_item_ipv4 *ipv4;
+			const struct rte_flow_item_ipv6 *ipv6;
+			const struct rte_flow_item_udp *udp;
+			const struct rte_flow_item_vxlan *vxlan;
+		} spec, mask;
+
+	default:
+	case INVALID:
+		goto error_encap;
+	case BACK:
+		trans = back;
+		n = 0;
+		goto trans;
+	case ITEM_VOID:
+		if (src->type != RTE_FLOW_ITEM_TYPE_VOID)
+			goto trans;
+		++src;
+		break;
+	case ITEM_ETH:
+		if (src->type != RTE_FLOW_ITEM_TYPE_ETH)
+			goto trans;
+		mask.eth = mlx5_nl_flow_item_mask
+			(src, &rte_flow_item_eth_mask,
+			 &mlx5_nl_flow_encap_mask_supported.eth,
+			 &mlx5_nl_flow_mask_empty.eth,
+			 sizeof(rte_flow_item_eth_mask), error);
+		if (!mask.eth)
+			return -rte_errno;
+		if (mask.eth == &mlx5_nl_flow_mask_empty.eth)
+			goto error_spec;
+		spec.eth = src->spec;
+		if (!is_zero_ether_addr(&mask.eth->src)) {
+			if (!is_broadcast_ether_addr(&mask.eth->src))
+				goto error_mask;
+			tmp.eth.src = spec.eth->src;
+			tmp.mask |= BIT_ENCAP(ETH_SRC);
+		}
+		if (!is_zero_ether_addr(&mask.eth->dst)) {
+			if (!is_broadcast_ether_addr(&mask.eth->dst))
+				goto error_mask;
+			tmp.eth.dst = spec.eth->dst;
+			tmp.mask |= BIT_ENCAP(ETH_DST);
+		}
+		++src;
+		break;
+	case ITEM_IPV4:
+		if (src->type != RTE_FLOW_ITEM_TYPE_IPV4)
+			goto trans;
+		mask.ipv4 = mlx5_nl_flow_item_mask
+			(src, &rte_flow_item_ipv4_mask,
+			 &mlx5_nl_flow_encap_mask_supported.ipv4,
+			 &mlx5_nl_flow_mask_empty.ipv4,
+			 sizeof(rte_flow_item_ipv4_mask), error);
+		if (!mask.ipv4)
+			return -rte_errno;
+		if (mask.ipv4 == &mlx5_nl_flow_mask_empty.ipv4)
+			goto error_spec;
+		spec.ipv4 = src->spec;
+		if (mask.ipv4->hdr.src_addr) {
+			if (mask.ipv4->hdr.src_addr != IN_ADDR_MASK)
+				goto error_mask;
+			tmp.ip.src.v4.s_addr = spec.ipv4->hdr.src_addr;
+			tmp.mask |= BIT_ENCAP(IPV4_SRC);
+		}
+		if (mask.ipv4->hdr.dst_addr) {
+			if (mask.ipv4->hdr.dst_addr != IN_ADDR_MASK)
+				goto error_mask;
+			tmp.ip.dst.v4.s_addr = spec.ipv4->hdr.dst_addr;
+			tmp.mask |= BIT_ENCAP(IPV4_DST);
+		}
+		++src;
+		break;
+	case ITEM_IPV6:
+		if (src->type != RTE_FLOW_ITEM_TYPE_IPV6)
+			goto trans;
+		mask.ipv6 = mlx5_nl_flow_item_mask
+			(src, &rte_flow_item_ipv6_mask,
+			 &mlx5_nl_flow_encap_mask_supported.ipv6,
+			 &mlx5_nl_flow_mask_empty.ipv6,
+			 sizeof(rte_flow_item_ipv6_mask), error);
+		if (!mask.ipv6)
+			return -rte_errno;
+		if (mask.ipv6 == &mlx5_nl_flow_mask_empty.ipv6)
+			goto error_spec;
+		spec.ipv6 = src->spec;
+		if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
+			if (memcmp(mask.ipv6->hdr.src_addr, IN6_ADDR_MASK, 16))
+				goto error_mask;
+			tmp.ip.src.v6 =	*(const struct in6_addr *)
+				spec.ipv6->hdr.src_addr;
+			tmp.mask |= BIT_ENCAP(IPV6_SRC);
+		}
+		if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
+			if (memcmp(mask.ipv6->hdr.dst_addr, IN6_ADDR_MASK, 16))
+				goto error_mask;
+			tmp.ip.dst.v6 =	*(const struct in6_addr *)
+				spec.ipv6->hdr.dst_addr;
+			tmp.mask |= BIT_ENCAP(IPV6_DST);
+		}
+		++src;
+		break;
+	case ITEM_UDP:
+		if (src->type != RTE_FLOW_ITEM_TYPE_UDP)
+			goto trans;
+		mask.udp = mlx5_nl_flow_item_mask
+			(src, &rte_flow_item_udp_mask,
+			 &mlx5_nl_flow_encap_mask_supported.udp,
+			 &mlx5_nl_flow_mask_empty.udp,
+			 sizeof(rte_flow_item_udp_mask), error);
+		if (!mask.udp)
+			return -rte_errno;
+		if (mask.udp == &mlx5_nl_flow_mask_empty.udp)
+			goto error_spec;
+		spec.udp = src->spec;
+		if (mask.udp->hdr.src_port) {
+			if (mask.udp->hdr.src_port != BE16_MASK)
+				goto error_mask;
+			tmp.udp.src = spec.udp->hdr.src_port;
+			tmp.mask |= BIT_ENCAP(UDP_SRC);
+		}
+		if (mask.udp->hdr.dst_port) {
+			if (mask.udp->hdr.dst_port != BE16_MASK)
+				goto error_mask;
+			tmp.udp.dst = spec.udp->hdr.dst_port;
+			tmp.mask |= BIT_ENCAP(UDP_DST);
+		}
+		++src;
+		break;
+	case ITEM_VXLAN:
+		if (src->type != RTE_FLOW_ITEM_TYPE_VXLAN)
+			goto trans;
+		mask.vxlan = mlx5_nl_flow_item_mask
+			(src, &rte_flow_item_vxlan_mask,
+			 &mlx5_nl_flow_encap_mask_supported.vxlan,
+			 &mlx5_nl_flow_mask_empty.vxlan,
+			 sizeof(rte_flow_item_vxlan_mask), error);
+		if (!mask.vxlan)
+			return -rte_errno;
+		if (mask.vxlan == &mlx5_nl_flow_mask_empty.vxlan)
+			goto error_spec;
+		spec.vxlan = src->spec;
+		if (vxlan_vni_as_be32(mask.vxlan->vni)) {
+			if (memcmp(mask.vxlan->vni, VXLAN_VNI_MASK, 3))
+				goto error_mask;
+			tmp.vxlan.vni = vxlan_vni_as_be32(spec.vxlan->vni);
+			tmp.mask |= BIT_ENCAP(VXLAN_VNI);
+		}
+		++src;
+		break;
+	case END:
+		if (src->type != RTE_FLOW_ITEM_TYPE_END)
+			goto trans;
+		*dst = tmp;
+		return 0;
+	}
+	back = trans;
+	trans = mlx5_nl_flow_encap_reap_trans[trans[n - 1]];
+	n = 0;
+	goto trans;
+error_encap:
+	return rte_flow_error_set
+		(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+		 "unsupported encapsulation format");
+error_spec:
+	return rte_flow_error_set
+		(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+		 "a specification structure is required for encapsulation");
+error_mask:
+	return rte_flow_error_set
+		(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+		 "partial masks are not supported for encapsulation");
+}
+
+/**
  * Transpose flow rule description to rtnetlink message.
  *
  * This function transposes a flow rule description to a traffic control
@@ -412,6 +805,7 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 	bool vlan_present;
 	bool vlan_eth_type_set;
 	bool ip_proto_set;
+	struct mlx5_nl_flow_encap encap;
 	struct nlattr *na_flower;
 	struct nlattr *na_flower_act;
 	struct nlattr *na_vlan_id;
@@ -425,8 +819,10 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		goto error_nobufs;
 	nl_flow->size = offsetof(struct mlx5_nl_flow, msg);
 	nl_flow->applied = 0;
+	nl_flow->encap_ifindex = 0;
 	nl_flow->ifindex_src = NULL;
 	nl_flow->ifindex_dst = NULL;
+	nl_flow->encap = NULL;
 	size -= nl_flow->size;
 	item = pattern;
 	action = actions;
@@ -437,6 +833,7 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 	vlan_present = false;
 	vlan_eth_type_set = false;
 	ip_proto_set = false;
+	memset(&encap, 0, sizeof(encap));
 	na_flower = NULL;
 	na_flower_act = NULL;
 	na_vlan_id = NULL;
@@ -461,6 +858,7 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 				of_set_vlan_vid;
 			const struct rte_flow_action_of_set_vlan_pcp *
 				of_set_vlan_pcp;
+			const struct rte_flow_action_vxlan_encap *vxlan_encap;
 		} conf;
 		struct nlmsghdr *nlh;
 		struct tcmsg *tcm;
@@ -887,6 +1285,12 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 			goto error_nobufs;
 		++item;
 		break;
+	case ITEM_VXLAN:
+		if (item->type != RTE_FLOW_ITEM_TYPE_VXLAN)
+			goto trans;
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+			 "VXLAN header matching is not supported yet");
 	case ACTIONS:
 		if (item->type != RTE_FLOW_ITEM_TYPE_END)
 			goto trans;
@@ -1042,6 +1446,77 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		}
 		++action;
 		break;
+	case ACTION_VXLAN_ENCAP:
+		if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP)
+			goto trans;
+		conf.vxlan_encap = action->conf;
+		if (mlx5_nl_flow_encap_reap(&encap,
+					    conf.vxlan_encap->definition,
+					    error))
+			return -rte_errno;
+		act_index =
+			mnl_attr_nest_start_check(buf, size, act_index_cur++);
+		if (!act_index ||
+		    !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND,
+					     "tunnel_key"))
+			goto error_nobufs;
+		act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
+		if (!act)
+			goto error_nobufs;
+		if (!mnl_attr_put_check(buf, size, TCA_TUNNEL_KEY_PARMS,
+					sizeof(struct tc_tunnel_key),
+					&(struct tc_tunnel_key){
+						.action = TC_ACT_PIPE,
+						.t_action =
+							TCA_TUNNEL_KEY_ACT_SET,
+					}))
+			goto error_nobufs;
+		if (encap.mask & BIT_ENCAP(IPV4_SRC) &&
+		    !mnl_attr_put_u32_check
+		    (buf, size, TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+		     encap.ip.src.v4.s_addr))
+			goto error_nobufs;
+		if (encap.mask & BIT_ENCAP(IPV4_DST) &&
+		    !mnl_attr_put_u32_check
+		    (buf, size, TCA_TUNNEL_KEY_ENC_IPV4_DST,
+		     encap.ip.dst.v4.s_addr))
+			goto error_nobufs;
+		if (encap.mask & BIT_ENCAP(IPV6_SRC) &&
+		    !mnl_attr_put_check
+		    (buf, size, TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+		     sizeof(encap.ip.src.v6), &encap.ip.src.v6))
+			goto error_nobufs;
+		if (encap.mask & BIT_ENCAP(IPV6_DST) &&
+		    !mnl_attr_put_check
+		    (buf, size, TCA_TUNNEL_KEY_ENC_IPV6_DST,
+		     sizeof(encap.ip.dst.v6), &encap.ip.dst.v6))
+			goto error_nobufs;
+		if (encap.mask & BIT_ENCAP(UDP_SRC) &&
+		    nl_flow != (void *)buf_tmp)
+			DRV_LOG(WARNING,
+				"UDP source port cannot be forced"
+				" for VXLAN encap; parameter ignored");
+		if (encap.mask & BIT_ENCAP(UDP_DST) &&
+		    !mnl_attr_put_u16_check
+		    (buf, size, TCA_TUNNEL_KEY_ENC_DST_PORT, encap.udp.dst))
+			goto error_nobufs;
+		if (!(encap.mask & BIT_ENCAP(VXLAN_VNI)))
+			return rte_flow_error_set
+				(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+				 conf.vxlan_encap, "VXLAN VNI is missing");
+		if (!mnl_attr_put_u32_check
+		    (buf, size, TCA_TUNNEL_KEY_ENC_KEY_ID, encap.vxlan.vni))
+			goto error_nobufs;
+		mnl_attr_nest_end(buf, act);
+		mnl_attr_nest_end(buf, act_index);
+		++action;
+		break;
+	case ACTION_VXLAN_DECAP:
+		if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_DECAP)
+			goto trans;
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, action,
+			 "VXLAN decap is not supported yet");
 	case END:
 		if (item->type != RTE_FLOW_ITEM_TYPE_END ||
 		    action->type != RTE_FLOW_ACTION_TYPE_END)
@@ -1054,6 +1529,21 @@  mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		buf = NULL;
 		size -= nlh->nlmsg_len;
 		nl_flow->size += nlh->nlmsg_len;
+		if (!encap.mask)
+			return nl_flow->size;
+		i = RTE_ALIGN_CEIL(nl_flow->size,
+				   alignof(struct mlx5_nl_flow_encap));
+		i -= nl_flow->size;
+		if (size < i + sizeof(encap))
+			goto error_nobufs;
+		nl_flow->size += i;
+		buf = (void *)((uintptr_t)nl_flow + nl_flow->size);
+		size -= i;
+		nl_flow->encap = buf;
+		*nl_flow->encap = encap;
+		buf = NULL;
+		size -= sizeof(*nl_flow->encap);
+		nl_flow->size += sizeof(*nl_flow->encap);
 		return nl_flow->size;
 	}
 	back = trans;
@@ -1151,6 +1641,671 @@  mlx5_nl_flow_chat(struct mlx5_nl_flow_ctx *ctx, struct nlmsghdr *nlh,
 	return -err;
 }
 
+/** Data structure used by mlx5_nl_flow_init_vxlan_cb(). */
+struct mlx5_nl_flow_init_vxlan_data {
+	unsigned int ifindex; /**< Base interface index. */
+	rte_be16_t vxlan_port; /**< Remote UDP port. */
+	unsigned int *collect; /**< Collected interfaces. */
+	unsigned int collect_n; /**< Number of collected interfaces. */
+};
+
+/**
+ * Collect indices of VXLAN encap/decap interfaces associated with device.
+ *
+ * @param nlh
+ *   Pointer to reply header.
+ * @param arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_init_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct mlx5_nl_flow_init_vxlan_data *data = arg;
+	struct ifinfomsg *ifm;
+	struct nlattr *na;
+	struct nlattr *na_info = NULL;
+	struct nlattr *na_vxlan = NULL;
+	struct nlattr *na_vxlan_port = NULL;
+	bool found = false;
+	unsigned int *collect;
+
+	if (nlh->nlmsg_type != RTM_NEWLINK)
+		goto error_inval;
+	ifm = mnl_nlmsg_get_payload(nlh);
+	mnl_attr_for_each(na, nlh, sizeof(*ifm))
+		if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
+			na_info = na;
+			break;
+		}
+	if (!na_info)
+		return 1;
+	mnl_attr_for_each_nested(na, na_info) {
+		switch (mnl_attr_get_type(na)) {
+		case IFLA_INFO_KIND:
+			if (!strncmp("vxlan", mnl_attr_get_str(na),
+				     mnl_attr_get_len(na)))
+				found = true;
+			break;
+		case IFLA_INFO_DATA:
+			na_vxlan = na;
+			break;
+		}
+		if (found && na_vxlan)
+			break;
+	}
+	if (!found || !na_vxlan)
+		return 1;
+	found = false;
+	mnl_attr_for_each_nested(na, na_vxlan) {
+		switch (mnl_attr_get_type(na)) {
+		case IFLA_VXLAN_LINK:
+			if (mnl_attr_get_u32(na) == data->ifindex)
+				found = true;
+			break;
+		case IFLA_VXLAN_PORT:
+			na_vxlan_port = na;
+			break;
+		}
+		if (found && na_vxlan_port)
+			break;
+	}
+	if (!found ||
+	    (na_vxlan_port &&
+	     mnl_attr_get_u16(na_vxlan_port) != data->vxlan_port))
+		return 1;
+	if (!ifm->ifi_index)
+		goto error_inval;
+	collect = realloc(data->collect,
+			  (data->collect_n + 1) * sizeof(*data->collect));
+	if (!collect) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	collect[data->collect_n] = ifm->ifi_index;
+	data->collect = collect;
+	data->collect_n += 1;
+	return 1;
+error_inval:
+	rte_errno = EINVAL;
+	return -rte_errno;
+}
+
+/**
+ * Clean up and generate VXLAN encap/decap interface.
+ *
+ * @param ctx
+ *   Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param ifindex
+ *   Network interface index to associate VXLAN encap/decap with.
+ * @param vxlan_port
+ *   Remote UDP port.
+ * @param enable
+ *   If disabled, stop after initial clean up.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   Interface index on success, zero otherwise and rte_errno is set.
+ *
+ *   If @p enable is set, the returned ifindex is that of the new VXLAN
+ *   interface, otherwise @p ifindex is simply returned as is.
+ */
+static unsigned int
+mlx5_nl_flow_ifindex_vxlan(struct mlx5_nl_flow_ctx *ctx, unsigned int ifindex,
+			   rte_be16_t vxlan_port, int enable,
+			   struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ifm) + 256)];
+	unsigned int ifindex_vxlan = 0;
+	struct mlx5_nl_flow_init_vxlan_data data = {
+		.ifindex = ifindex,
+		.vxlan_port = vxlan_port,
+		.collect = NULL,
+		.collect_n = 0,
+	};
+	char name[IF_NAMESIZE];
+	struct nlattr *na_info;
+	struct nlattr *na_vxlan;
+	unsigned int i;
+	int ret;
+
+	if (!ifindex) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	/*
+	 * Seek and destroy leftover VXLAN encap/decap interfaces with
+	 * matching properties.
+	 */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_GETLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ret = mlx5_nl_flow_chat(ctx, nlh, mlx5_nl_flow_init_vxlan_cb, &data);
+	if (ret)
+		goto exit;
+	nlh->nlmsg_type = RTM_DELLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	for (i = 0; i != data.collect_n; ++i) {
+		ifm->ifi_index = data.collect[i];
+		DRV_LOG(DEBUG, "cleaning up VXLAN encap/decap ifindex %u",
+			ifm->ifi_index);
+		ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+		if (ret)
+			goto exit;
+	}
+	if (!enable)
+		return ifindex;
+	/* Add fresh VXLAN encap/decap interface. */
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE;
+	ifm->ifi_type = ARPHRD_ETHER;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	if (snprintf(name, sizeof(name), "vxlan_%u_%u",
+		     rte_be_to_cpu_16(vxlan_port), ifindex) == -1) {
+		ret = -errno;
+		goto exit;
+	}
+	ret = -ENOBUFS;
+	if (!mnl_attr_put_strz_check(nlh, sizeof(buf), IFLA_IFNAME, name))
+		goto exit;
+	na_info = mnl_attr_nest_start_check(nlh, sizeof(buf), IFLA_LINKINFO);
+	if (!na_info)
+		goto exit;
+	if (!mnl_attr_put_strz_check(nlh, sizeof(buf), IFLA_INFO_KIND, "vxlan"))
+		goto exit;
+	na_vxlan = mnl_attr_nest_start_check(nlh, sizeof(buf), IFLA_INFO_DATA);
+	if (!na_vxlan)
+		goto exit;
+	if (!mnl_attr_put_u32_check(nlh, sizeof(buf), IFLA_VXLAN_LINK, ifindex))
+		goto exit;
+	if (!mnl_attr_put_u8_check(nlh, sizeof(buf),
+				   IFLA_VXLAN_COLLECT_METADATA, 1))
+		goto exit;
+	/*
+	 * When destination port or VNI are either undefined or set to fixed
+	 * values, kernel complains with EEXIST ("A VXLAN device with the
+	 * specified VNI already exist") when creating subsequent VXLAN
+	 * interfaces with the same properties, even if linked with
+	 * different physical devices.
+	 *
+	 * Also since only destination ports assigned to existing VXLAN
+	 * interfaces can be offloaded to the switch, the above limitation
+	 * cannot be worked around by picking a random value here and using
+	 * a different one when creating flow rules later.
+	 *
+	 * Therefore request a hopefully unique VNI based on the interface
+	 * index in order to work around EEXIST. VNI will be overridden
+	 * later on a flow rule basis thanks to IFLA_VXLAN_COLLECT_METADATA.
+	 */
+	if (!mnl_attr_put_u16_check(nlh, sizeof(buf), IFLA_VXLAN_PORT,
+				    vxlan_port))
+		goto exit;
+	if (!mnl_attr_put_u32_check(nlh, sizeof(buf), IFLA_VXLAN_ID, ifindex))
+		goto exit;
+	mnl_attr_nest_end(nlh, na_vxlan);
+	mnl_attr_nest_end(nlh, na_info);
+	ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+	if (ret)
+		goto exit;
+	/* Lastly, retrieve its ifindex value. */
+	nlh->nlmsg_type = RTM_GETLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	data.collect_n = 0;
+	ret = mlx5_nl_flow_chat(ctx, nlh, mlx5_nl_flow_init_vxlan_cb, &data);
+	if (ret)
+		goto exit;
+	ret = -ENXIO;
+	if (data.collect_n != 1 || !*data.collect)
+		goto exit;
+	ifindex_vxlan = *data.collect;
+	DRV_LOG(DEBUG, "created VXLAN encap/decap ifindex %u (%s)",
+		ifindex_vxlan, name);
+	ret = mlx5_nl_flow_ifindex_init(ctx, ifindex_vxlan, error);
+	if (ret) {
+		mlx5_nl_flow_ifindex_vxlan(ctx, ifindex_vxlan, vxlan_port,
+					   false, NULL);
+		ifindex_vxlan = 0;
+		goto exit;
+	}
+	ret = 0;
+exit:
+	free(data.collect);
+	if (ret)
+		rte_flow_error_set
+			(error, -ret, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "netlink: failed to request VXLAN encap/decap"
+			 " interface creation/deletion");
+	return ifindex_vxlan;
+}
+
+/**
+ * Emit Netlink message to add/remove local address.
+ *
+ * Note that an implicit route is maintained by the kernel due to the
+ * presence of a peer address (IFA_ADDRESS).
+ *
+ * @param ctx
+ *   Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param[in] encap
+ *   Encapsulation properties (source address).
+ * @param ifindex
+ *   Network interface.
+ * @param enable
+ *   Toggle between add and remove.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_encap_local(struct mlx5_nl_flow_ctx *ctx,
+			 const struct mlx5_nl_flow_encap *encap,
+			 unsigned int ifindex,
+			 bool enable,
+			 struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ifaddrmsg *ifa;
+	/* Room for the address message header plus a few attributes. */
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
+
+	/* RTM_NEWADDR assigns the address, RTM_DELADDR removes it. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
+	nlh->nlmsg_flags =
+		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
+	nlh->nlmsg_seq = 0;
+	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
+	/* Full-width prefix (/32 or /128): address stands for one host. */
+	if (encap->mask & BIT_ENCAP(IPV4_SRC)) {
+		ifa->ifa_family = AF_INET;
+		ifa->ifa_prefixlen = 32;
+	} else if (encap->mask & BIT_ENCAP(IPV6_SRC)) {
+		ifa->ifa_family = AF_INET6;
+		ifa->ifa_prefixlen = 128;
+	} else {
+		ifa->ifa_family = AF_UNSPEC;
+		ifa->ifa_prefixlen = 0;
+	}
+	ifa->ifa_flags = IFA_F_PERMANENT;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ifa->ifa_index = ifindex;
+	/*
+	 * Source address goes in IFA_LOCAL, destination in IFA_ADDRESS.
+	 * NOTE(review): rtnetlink interprets IFA_ADDRESS as the peer
+	 * address on point-to-point links — presumably the intent here;
+	 * confirm against the kernel's ip/addr handling.
+	 */
+	if (encap->mask & BIT_ENCAP(IPV4_SRC) &&
+	    !mnl_attr_put_u32_check(nlh, sizeof(buf), IFA_LOCAL,
+				    encap->ip.src.v4.s_addr))
+		goto error_nobufs;
+	if (encap->mask & BIT_ENCAP(IPV6_SRC) &&
+	    !mnl_attr_put_check(nlh, sizeof(buf), IFA_LOCAL,
+				sizeof(encap->ip.src.v6), &encap->ip.src.v6))
+		goto error_nobufs;
+	if (encap->mask & BIT_ENCAP(IPV4_DST) &&
+	    !mnl_attr_put_u32_check(nlh, sizeof(buf), IFA_ADDRESS,
+				    encap->ip.dst.v4.s_addr))
+		goto error_nobufs;
+	if (encap->mask & BIT_ENCAP(IPV6_DST) &&
+	    !mnl_attr_put_check(nlh, sizeof(buf), IFA_ADDRESS,
+				sizeof(encap->ip.dst.v6), &encap->ip.dst.v6))
+		goto error_nobufs;
+	/* Synchronous exchange; a zero return means the kernel accepted. */
+	if (!mlx5_nl_flow_chat(ctx, nlh, NULL, NULL))
+		return 0;
+	return rte_flow_error_set
+		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "cannot complete IFA request");
+error_nobufs:
+	return rte_flow_error_set
+		(error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "generated IFA message is too large");
+}
+
+/**
+ * Emit Netlink message to add/remove neighbor.
+ *
+ * A static (NUD_PERMANENT) neighbor entry maps the tunnel destination IP
+ * address to its Ethernet address on @p ifindex.
+ *
+ * @param ctx
+ *   Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param[in] encap
+ *   Encapsulation properties (destination address).
+ * @param ifindex
+ *   Network interface.
+ * @param enable
+ *   Toggle between add and remove.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_encap_neigh(struct mlx5_nl_flow_ctx *ctx,
+			 const struct mlx5_nl_flow_encap *encap,
+			 unsigned int ifindex,
+			 bool enable,
+			 struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	/* Room for the neighbor message header plus a few attributes. */
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
+
+	/* RTM_NEWNEIGH installs the entry, RTM_DELNEIGH removes it. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
+	nlh->nlmsg_flags =
+		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
+	nlh->nlmsg_seq = 0;
+	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
+	if (encap->mask & BIT_ENCAP(IPV4_DST))
+		ndm->ndm_family = AF_INET;
+	else if (encap->mask & BIT_ENCAP(IPV6_DST))
+		ndm->ndm_family = AF_INET6;
+	else
+		ndm->ndm_family = AF_UNSPEC;
+	ndm->ndm_ifindex = ifindex;
+	/* Permanent entry: never expires, never re-resolved by ARP/ND. */
+	ndm->ndm_state = NUD_PERMANENT;
+	ndm->ndm_flags = 0;
+	ndm->ndm_type = 0;
+	if (encap->mask & BIT_ENCAP(IPV4_DST) &&
+	    !mnl_attr_put_u32_check(nlh, sizeof(buf), NDA_DST,
+				    encap->ip.dst.v4.s_addr))
+		goto error_nobufs;
+	if (encap->mask & BIT_ENCAP(IPV6_DST) &&
+	    !mnl_attr_put_check(nlh, sizeof(buf), NDA_DST,
+				sizeof(encap->ip.dst.v6), &encap->ip.dst.v6))
+		goto error_nobufs;
+	/*
+	 * The outgoing source MAC is always that of the interface itself;
+	 * warn only on creation so removal stays silent.
+	 */
+	if (encap->mask & BIT_ENCAP(ETH_SRC) && enable)
+		DRV_LOG(WARNING,
+			"Ethernet source address cannot be forced"
+			" for VXLAN encap; parameter ignored");
+	/* Destination MAC is carried as the link-layer address (NDA_LLADDR). */
+	if (encap->mask & BIT_ENCAP(ETH_DST) &&
+	    !mnl_attr_put_check(nlh, sizeof(buf), NDA_LLADDR,
+				sizeof(encap->eth.dst), &encap->eth.dst))
+		goto error_nobufs;
+	if (!mlx5_nl_flow_chat(ctx, nlh, NULL, NULL))
+		return 0;
+	return rte_flow_error_set
+		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "cannot complete ND request");
+error_nobufs:
+	return rte_flow_error_set
+		(error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "generated ND message is too large");
+}
+
+/**
+ * Look for matching IP source/destination properties.
+ *
+ * Every field selected by @p what_mask must be both present in
+ * @p bag_mask and equal between @p what and @p bag for a match.
+ *
+ * @param[in] bag
+ *   Search target.
+ * @param bag_mask
+ *   Bit-mask for valid fields in @p bag.
+ * @param[in] what
+ *   Properties to look for in @p bag.
+ * @param what_mask
+ *   Bit-mask for valid fields in @p what.
+ *
+ * @return
+ *   True if @p what is found in @p bag, false otherwise.
+ */
+static bool
+mlx5_nl_flow_encap_ip_search(const struct mlx5_nl_flow_encap_ip *bag,
+			     uint32_t bag_mask,
+			     const struct mlx5_nl_flow_encap_ip *what,
+			     uint32_t what_mask)
+{
+	if (what_mask & BIT_ENCAP(IPV4_SRC)) {
+		if (!(bag_mask & BIT_ENCAP(IPV4_SRC)) ||
+		    bag->src.v4.s_addr != what->src.v4.s_addr)
+			return false;
+	}
+	if (what_mask & BIT_ENCAP(IPV4_DST)) {
+		if (!(bag_mask & BIT_ENCAP(IPV4_DST)) ||
+		    bag->dst.v4.s_addr != what->dst.v4.s_addr)
+			return false;
+	}
+	if (what_mask & BIT_ENCAP(IPV6_SRC)) {
+		if (!(bag_mask & BIT_ENCAP(IPV6_SRC)) ||
+		    memcmp(&bag->src.v6, &what->src.v6, sizeof(bag->src.v6)))
+			return false;
+	}
+	if (what_mask & BIT_ENCAP(IPV6_DST)) {
+		if (!(bag_mask & BIT_ENCAP(IPV6_DST)) ||
+		    memcmp(&bag->dst.v6, &what->dst.v6, sizeof(bag->dst.v6)))
+			return false;
+	}
+	return true;
+}
+
+/**
+ * Interface resources list common to all driver instances of a given
+ * process. It is protected by a standard mutex because resource allocation
+ * is slow and involves system calls.
+ *
+ * The list and the per-interface sub-lists hanging off its entries must
+ * only be accessed while holding mlx5_nl_flow_encap_ifindex_list_lock.
+ */
+static LIST_HEAD(, mlx5_nl_flow_encap_ifindex) mlx5_nl_flow_encap_ifindex_list =
+	LIST_HEAD_INITIALIZER();
+static pthread_mutex_t mlx5_nl_flow_encap_ifindex_list_lock =
+	PTHREAD_MUTEX_INITIALIZER;
+
+/**
+ * Retrieve target interface index for encapsulation.
+ *
+ * Resources (VXLAN interface, local address, neighbor entry) are
+ * reference-counted and automatically allocated and released as
+ * necessary. The global list lock is held for the whole operation.
+ *
+ * @param ctx
+ *   Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param[in] encap
+ *   Encapsulation properties.
+ * @param ifindex
+ *   Outer network interface.
+ * @param enable
+ *   Toggle whether resources are allocated or released.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   Interface index on success, zero otherwise and rte_errno is set.
+ *
+ *   If @p enable is set, the returned ifindex is that of the inner
+ *   interface, otherwise @p ifindex is simply returned as is.
+ */
+static unsigned int
+mlx5_nl_flow_encap_ifindex(struct mlx5_nl_flow_ctx *ctx,
+			   const struct mlx5_nl_flow_encap *encap,
+			   unsigned int ifindex,
+			   bool enable,
+			   struct rte_flow_error *error)
+{
+	struct mlx5_nl_flow_encap_ifindex *encap_ifindex = NULL;
+	struct mlx5_nl_flow_encap_vxlan *encap_vxlan = NULL;
+	struct mlx5_nl_flow_encap_addr *encap_local = NULL;
+	struct mlx5_nl_flow_encap_addr *encap_neigh = NULL;
+	unsigned int ifindex_inner = ifindex;
+	int ret;
+
+	pthread_mutex_lock(&mlx5_nl_flow_encap_ifindex_list_lock);
+	/* Interface descriptor. */
+	LIST_FOREACH(encap_ifindex, &mlx5_nl_flow_encap_ifindex_list, next) {
+		if (encap_ifindex->outer != ifindex)
+			continue;
+		if (enable)
+			++encap_ifindex->refcnt;
+		break;
+	}
+	if (enable && !encap_ifindex) {
+		encap_ifindex =
+			rte_zmalloc_socket(__func__, sizeof(*encap_ifindex),
+					   0, ctx->socket);
+		if (!encap_ifindex) {
+			rte_flow_error_set
+				(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				 NULL, "missing ifindex encap data");
+			goto release;
+		}
+		*encap_ifindex = (struct mlx5_nl_flow_encap_ifindex){
+			.refcnt = 1,
+			.outer = ifindex,
+			.vxlan = LIST_HEAD_INITIALIZER(),
+			.local = LIST_HEAD_INITIALIZER(),
+			.neigh = LIST_HEAD_INITIALIZER(),
+		};
+		LIST_INSERT_HEAD(&mlx5_nl_flow_encap_ifindex_list,
+				 encap_ifindex, next);
+	}
+	if (!encap_ifindex) {
+		/* Nothing to release on the disable path either. */
+		if (!enable)
+			goto release;
+		rte_flow_error_set
+			(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "nonexistent interface");
+		goto release;
+	}
+	/* VXLAN descriptor, keyed by outer UDP source port. */
+	if (!(encap->mask & BIT_ENCAP(VXLAN_VNI)) ||
+	    !(encap->mask & BIT_ENCAP(UDP_SRC)))
+		goto skip_vxlan;
+	LIST_FOREACH(encap_vxlan, &encap_ifindex->vxlan, next) {
+		if (encap->udp.src != encap_vxlan->port)
+			continue;
+		if (enable)
+			++encap_vxlan->refcnt;
+		break;
+	}
+	if (enable && !encap_vxlan) {
+		encap_vxlan =
+			rte_zmalloc_socket(__func__, sizeof(*encap_vxlan),
+					   0, ctx->socket);
+		if (!encap_vxlan) {
+			rte_flow_error_set
+				(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				 NULL, "missing VXLAN encap data");
+			goto release;
+		}
+		*encap_vxlan = (struct mlx5_nl_flow_encap_vxlan){
+			.refcnt = 1,
+			.port = encap->udp.src,
+			.inner = mlx5_nl_flow_ifindex_vxlan
+				(ctx, ifindex, encap->udp.src, true, error),
+		};
+		if (!encap_vxlan->inner) {
+			rte_free(encap_vxlan);
+			encap_vxlan = NULL;
+			goto release;
+		}
+		LIST_INSERT_HEAD(&encap_ifindex->vxlan, encap_vxlan, next);
+	}
+	/*
+	 * May still be NULL on the disable path when the VXLAN entry is
+	 * already gone (e.g. repeated destruction); the inner ifindex is
+	 * irrelevant in that case, do not dereference.
+	 */
+	if (encap_vxlan)
+		ifindex_inner = encap_vxlan->inner;
+skip_vxlan:
+	/* Local address descriptor (source). */
+	LIST_FOREACH(encap_local, &encap_ifindex->local, next) {
+		if (!mlx5_nl_flow_encap_ip_search
+		    (&encap->ip, encap->mask,
+		     &encap_local->ip, encap_local->mask &
+		     (BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV6_SRC))))
+			continue;
+		if (enable)
+			++encap_local->refcnt;
+		break;
+	}
+	if (enable && !encap_local &&
+	    encap->mask & (BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV6_SRC))) {
+		encap_local =
+			rte_zmalloc_socket(__func__, sizeof(*encap_local),
+					   0, ctx->socket);
+		if (!encap_local) {
+			rte_flow_error_set
+				(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				 NULL, "missing local encap data");
+			goto release;
+		}
+		encap_local->refcnt = 1;
+		encap_local->mask =
+			encap->mask &
+			(BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV6_SRC));
+		if (encap->mask & BIT_ENCAP(IPV4_SRC))
+			encap_local->ip.src.v4 = encap->ip.src.v4;
+		if (encap->mask & BIT_ENCAP(IPV6_SRC))
+			encap_local->ip.src.v6 = encap->ip.src.v6;
+		ret = mlx5_nl_flow_encap_local(ctx, encap, ifindex, true,
+					       error);
+		if (ret) {
+			rte_free(encap_local);
+			encap_local = NULL;
+			goto release;
+		}
+		LIST_INSERT_HEAD(&encap_ifindex->local, encap_local, next);
+	}
+	/*
+	 * Neighbor descriptor (destination). Note the lookup compares
+	 * against the stored neighbor entry itself, not the local one.
+	 */
+	LIST_FOREACH(encap_neigh, &encap_ifindex->neigh, next) {
+		if (!mlx5_nl_flow_encap_ip_search
+		    (&encap->ip, encap->mask,
+		     &encap_neigh->ip, encap_neigh->mask &
+		     (BIT_ENCAP(IPV4_DST) | BIT_ENCAP(IPV6_DST))))
+			continue;
+		if (enable)
+			++encap_neigh->refcnt;
+		break;
+	}
+	if (enable && !encap_neigh &&
+	    encap->mask & (BIT_ENCAP(IPV4_DST) | BIT_ENCAP(IPV6_DST))) {
+		encap_neigh =
+			rte_zmalloc_socket(__func__, sizeof(*encap_neigh),
+					   0, ctx->socket);
+		if (!encap_neigh) {
+			rte_flow_error_set
+				(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				 NULL, "missing neigh encap data");
+			goto release;
+		}
+		encap_neigh->refcnt = 1;
+		encap_neigh->mask =
+			encap->mask &
+			(BIT_ENCAP(IPV4_DST) | BIT_ENCAP(IPV6_DST));
+		if (encap->mask & BIT_ENCAP(IPV4_DST))
+			encap_neigh->ip.dst.v4 = encap->ip.dst.v4;
+		if (encap->mask & BIT_ENCAP(IPV6_DST))
+			encap_neigh->ip.dst.v6 = encap->ip.dst.v6;
+		ret = mlx5_nl_flow_encap_neigh(ctx, encap, ifindex, true,
+					       error);
+		if (ret) {
+			rte_free(encap_neigh);
+			encap_neigh = NULL;
+			goto release;
+		}
+		LIST_INSERT_HEAD(&encap_ifindex->neigh, encap_neigh, next);
+	}
+	if (!enable)
+		goto release;
+	pthread_mutex_unlock(&mlx5_nl_flow_encap_ifindex_list_lock);
+	return ifindex_inner;
+release:
+	/*
+	 * Shared by the disable path and error unwinding: drop one
+	 * reference on each descriptor involved and destroy the system
+	 * resource once the last reference is gone.
+	 */
+	ret = rte_errno;
+	if (encap_neigh && !--encap_neigh->refcnt) {
+		LIST_REMOVE(encap_neigh, next);
+		mlx5_nl_flow_encap_neigh(ctx, encap, ifindex, false, NULL);
+		rte_free(encap_neigh);
+	}
+	if (encap_local && !--encap_local->refcnt) {
+		LIST_REMOVE(encap_local, next);
+		mlx5_nl_flow_encap_local(ctx, encap, ifindex, false, NULL);
+		rte_free(encap_local);
+	}
+	if (encap_vxlan && !--encap_vxlan->refcnt) {
+		LIST_REMOVE(encap_vxlan, next);
+		mlx5_nl_flow_ifindex_vxlan
+			(ctx, ifindex, encap_vxlan->port, false, NULL);
+		rte_free(encap_vxlan);
+	}
+	if (encap_ifindex && !--encap_ifindex->refcnt) {
+		LIST_REMOVE(encap_ifindex, next);
+		rte_free(encap_ifindex);
+	}
+	pthread_mutex_unlock(&mlx5_nl_flow_encap_ifindex_list_lock);
+	if (!enable)
+		return ifindex;
+	rte_errno = ret;
+	return 0;
+}
+
 /**
  * Create a Netlink flow rule.
  *
@@ -1169,17 +2324,35 @@  mlx5_nl_flow_create(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
 		    struct rte_flow_error *error)
 {
 	struct nlmsghdr *nlh = (void *)nl_flow->msg;
+	struct mlx5_nl_flow_encap *encap =
+		nl_flow->encap && nl_flow->ifindex_dst ?
+		nl_flow->encap : NULL;
+	unsigned int ifindex = encap ? *nl_flow->ifindex_dst : 0;
+	int ret;
 
 	if (nl_flow->applied)
 		return 0;
 	nlh->nlmsg_type = RTM_NEWTFILTER;
 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
-	if (!mlx5_nl_flow_chat(ctx, nlh, NULL, NULL)) {
+	if (encap) {
+		nl_flow->encap_ifindex = mlx5_nl_flow_encap_ifindex
+			(ctx, encap, ifindex, true, error);
+		if (!nl_flow->encap_ifindex)
+			return -rte_errno;
+		*nl_flow->ifindex_dst = nl_flow->encap_ifindex;
+	}
+	ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+	if (encap)
+		*nl_flow->ifindex_dst = ifindex;
+	if (!ret) {
 		nl_flow->applied = 1;
 		return 0;
 	}
+	ret = rte_errno;
+	if (nl_flow->encap_ifindex)
+		mlx5_nl_flow_encap_ifindex(ctx, encap, ifindex, false, NULL);
 	return rte_flow_error_set
-		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		(error, ret, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 		 "netlink: failed to create TC flow rule");
 }
 
@@ -1204,14 +2377,31 @@  mlx5_nl_flow_destroy(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
 		     struct rte_flow_error *error)
 {
 	struct nlmsghdr *nlh = (void *)nl_flow->msg;
+	struct mlx5_nl_flow_encap *encap =
+		nl_flow->encap && nl_flow->ifindex_dst ?
+		nl_flow->encap : NULL;
+	unsigned int ifindex = encap ? *nl_flow->ifindex_dst : 0;
+	int err = 0;
 	int ret;
 
 	if (!nl_flow->applied)
 		return 0;
 	nlh->nlmsg_type = RTM_DELTFILTER;
 	nlh->nlmsg_flags = NLM_F_REQUEST;
+	if (encap) {
+		if (!mlx5_nl_flow_encap_ifindex
+		    (ctx, encap, ifindex, false, error))
+			err = rte_errno;
+		*nl_flow->ifindex_dst = nl_flow->encap_ifindex;
+	}
 	ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+	if (encap)
+		*nl_flow->ifindex_dst = ifindex;
 	nl_flow->applied = 0;
+	if (err) {
+		rte_errno = err;
+		return -rte_errno;
+	}
 	if (!ret)
 		return 0;
 	return rte_flow_error_set