@@ -384,6 +384,16 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
/usr/include/assert.h \
define static_assert \
$(AUTOCONF_OUTPUT)
+ $Q sh -- '$<' '$@' \
+ HAVE_TC_ACT_TUNNEL_KEY \
+ linux/tc_act/tc_tunnel_key.h \
+ define TCA_ACT_TUNNEL_KEY \
+ $(AUTOCONF_OUTPUT)
+ $Q sh -- '$<' '$@' \
+ HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT \
+ linux/tc_act/tc_tunnel_key.h \
+ enum TCA_TUNNEL_KEY_ENC_DST_PORT \
+ $(AUTOCONF_OUTPUT)
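For reference, assuming both new checks pass against the installed kernel headers, the generated mlx5_autoconf.h should carry definitions along these lines (a sketch; the exact output is produced by buildtools/auto-config-h.sh):

    #define HAVE_TC_ACT_TUNNEL_KEY 1
    #define HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT 1

These are the macros the compatibility #ifdef blocks further down depend on.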
# Create mlx5_autoconf.h or update it in case it differs from the new one.
@@ -6,7 +6,31 @@
#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
+/*
+ * Older versions of linux/if.h do not have the required safeties to coexist
+ * with net/if.h. This causes a compilation failure due to symbol
+ * redefinitions even when including the latter first.
+ *
+ * One workaround is to prevent net/if.h from defining conflicting symbols
+ * by removing __USE_MISC, and maintaining it undefined while including
+ * linux/if.h.
+ *
+ * Alphabetical order cannot be preserved since net/if.h must always be
+ * included before linux/if.h regardless.
+ */
+#ifdef __USE_MISC
+#undef __USE_MISC
+#define RESTORE_USE_MISC
+#endif
+#include <net/if.h>
+#include <linux/if.h>
+#ifdef RESTORE_USE_MISC
+#undef RESTORE_USE_MISC
+#define __USE_MISC 1
+#endif
+#include <linux/if_arp.h>
#include <linux/if_ether.h>
+#include <linux/if_link.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
@@ -14,11 +38,13 @@
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
+#include <pthread.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
+#include <sys/queue.h>
#include <sys/socket.h>
#include <rte_byteorder.h>
@@ -52,6 +78,34 @@ struct tc_vlan {
#endif /* HAVE_TC_ACT_VLAN */
+#ifdef HAVE_TC_ACT_TUNNEL_KEY
+
+#include <linux/tc_act/tc_tunnel_key.h>
+
+#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
+#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
+#endif
+
+#else /* HAVE_TC_ACT_TUNNEL_KEY */
+
+#define TCA_ACT_TUNNEL_KEY 17
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE 2
+#define TCA_TUNNEL_KEY_PARMS 2
+#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
+#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
+#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
+#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
+#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
+#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
+
+struct tc_tunnel_key {
+ tc_gen;
+ int t_action;
+};
+
+#endif /* HAVE_TC_ACT_TUNNEL_KEY */
+
/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
@@ -148,6 +202,71 @@ struct tc_vlan {
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
+#define BIT(b) (1 << (b))
+#define BIT_ENCAP(e) BIT(MLX5_NL_FLOW_ENCAP_ ## e)
+
+/** Flags used for @p mask in struct mlx5_nl_flow_encap. */
+enum mlx5_nl_flow_encap_flag {
+ MLX5_NL_FLOW_ENCAP_ETH_SRC,
+ MLX5_NL_FLOW_ENCAP_ETH_DST,
+ MLX5_NL_FLOW_ENCAP_IPV4_SRC,
+ MLX5_NL_FLOW_ENCAP_IPV4_DST,
+ MLX5_NL_FLOW_ENCAP_IPV6_SRC,
+ MLX5_NL_FLOW_ENCAP_IPV6_DST,
+ MLX5_NL_FLOW_ENCAP_UDP_SRC,
+ MLX5_NL_FLOW_ENCAP_UDP_DST,
+ MLX5_NL_FLOW_ENCAP_VXLAN_VNI,
+};
+
+/** Encapsulation structure with fixed format for convenience. */
+struct mlx5_nl_flow_encap {
+ uint32_t mask;
+ struct {
+ struct ether_addr src;
+ struct ether_addr dst;
+ } eth;
+ struct mlx5_nl_flow_encap_ip {
+ union mlx5_nl_flow_encap_ip_addr {
+ struct in_addr v4;
+ struct in6_addr v6;
+ } src;
+ union mlx5_nl_flow_encap_ip_addr dst;
+ } ip;
+ struct {
+ rte_be16_t src;
+ rte_be16_t dst;
+ } udp;
+ struct {
+ rte_be32_t vni;
+ } vxlan;
+};
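For illustration, a minimal sketch of a fully populated object describing an IPv4/UDP/VXLAN tunnel, using the definitions above; names and addresses are made up, and @p mask advertises which fields are meaningful:

    /* Hypothetical example; addresses and VNI are arbitrary. */
    static const struct mlx5_nl_flow_encap encap_example = {
            .mask = BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV4_DST) |
                    BIT_ENCAP(UDP_DST) | BIT_ENCAP(VXLAN_VNI),
            .ip.src.v4.s_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
            .ip.dst.v4.s_addr = RTE_BE32(0x0a000002), /* 10.0.0.2 */
            .udp.dst = RTE_BE16(4789), /* IANA-assigned VXLAN port. */
            .vxlan.vni = RTE_BE32(42),
    };

Consumers only trust fields whose BIT_ENCAP() flag is present in mask, e.g. they test (encap_example.mask & BIT_ENCAP(VXLAN_VNI)) before reading encap_example.vxlan.vni.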
+
+/** Generic address descriptor for encapsulation resources. */
+struct mlx5_nl_flow_encap_addr {
+ LIST_ENTRY(mlx5_nl_flow_encap_addr) next;
+ uint32_t refcnt;
+ uint32_t mask;
+ struct mlx5_nl_flow_encap_ip ip;
+};
+
+/** VXLAN-specific encapsulation resources. */
+struct mlx5_nl_flow_encap_vxlan {
+ LIST_ENTRY(mlx5_nl_flow_encap_vxlan) next;
+ uint32_t refcnt;
+ rte_be16_t port;
+ unsigned int inner;
+};
+
+/** Encapsulation interface descriptor. */
+struct mlx5_nl_flow_encap_ifindex {
+ LIST_ENTRY(mlx5_nl_flow_encap_ifindex) next;
+ uint32_t refcnt;
+ unsigned int outer;
+ LIST_HEAD(, mlx5_nl_flow_encap_vxlan) vxlan;
+ LIST_HEAD(, mlx5_nl_flow_encap_addr) local;
+ LIST_HEAD(, mlx5_nl_flow_encap_addr) neigh;
+};
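The three structures above form a small per-process resource tree. A rough sketch of how mlx5_nl_flow_encap_ifindex() (defined further below) ties them together:

    mlx5_nl_flow_encap_ifindex_list
    `-- mlx5_nl_flow_encap_ifindex (one per outer device, refcounted)
        |-- vxlan (one per tunnel UDP destination port, each owning the
        |          ifindex of the VXLAN device created for it)
        |-- local (source addresses installed through RTM_NEWADDR)
        `-- neigh (destination addresses/MACs installed through RTM_NEWNEIGH)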
+
/** Context object required by most functions. */
struct mlx5_nl_flow_ctx {
int socket; /**< NUMA socket for memory allocations. */
@@ -159,8 +278,10 @@ struct mlx5_nl_flow_ctx {
struct mlx5_nl_flow {
uint32_t size; /**< Size of this object. */
uint32_t applied:1; /**< Whether rule is currently applied. */
+ unsigned int encap_ifindex; /**< Interface to use with @p encap. */
unsigned int *ifindex_src; /**< Source interface. */
unsigned int *ifindex_dst; /**< Destination interface. */
+ struct mlx5_nl_flow_encap *encap; /**< Encapsulation properties. */
alignas(struct nlmsghdr)
uint8_t msg[]; /**< Netlink message data. */
};
@@ -179,6 +300,7 @@ enum mlx5_nl_flow_trans {
ITEM_IPV6,
ITEM_TCP,
ITEM_UDP,
+ ITEM_VXLAN,
ACTIONS,
ACTION_VOID,
ACTION_PORT_ID,
@@ -187,6 +309,8 @@ enum mlx5_nl_flow_trans {
ACTION_OF_PUSH_VLAN,
ACTION_OF_SET_VLAN_VID,
ACTION_OF_SET_VLAN_PCP,
+ ACTION_VXLAN_ENCAP,
+ ACTION_VXLAN_DECAP,
END,
};
@@ -196,7 +320,8 @@ enum mlx5_nl_flow_trans {
ITEM_VOID, ITEM_PORT_ID, ACTIONS
#define ACTIONS_COMMON \
ACTION_VOID, ACTION_OF_POP_VLAN, ACTION_OF_PUSH_VLAN, \
- ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP
+ ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP, \
+ ACTION_VXLAN_ENCAP, ACTION_VXLAN_DECAP
#define ACTIONS_FATE \
ACTION_PORT_ID, ACTION_DROP
@@ -213,7 +338,8 @@ static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
[ITEM_IPV4] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
[ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
[ITEM_TCP] = TRANS(PATTERN_COMMON),
- [ITEM_UDP] = TRANS(PATTERN_COMMON),
+ [ITEM_UDP] = TRANS(ITEM_VXLAN, PATTERN_COMMON),
+ [ITEM_VXLAN] = TRANS(PATTERN_COMMON),
[ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
[ACTION_VOID] = TRANS(BACK),
[ACTION_PORT_ID] = TRANS(ACTION_VOID, END),
@@ -222,6 +348,21 @@ static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
[ACTION_OF_PUSH_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
[ACTION_OF_SET_VLAN_VID] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
[ACTION_OF_SET_VLAN_PCP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+ [ACTION_VXLAN_ENCAP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+ [ACTION_VXLAN_DECAP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+ [END] = NULL,
+};
+
+/** Parser state transitions used by mlx5_nl_flow_encap_reap(). */
+static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_encap_reap_trans[] = {
+ [INVALID] = NULL,
+ [BACK] = NULL,
+ [ITEM_VOID] = TRANS(BACK),
+ [ITEM_ETH] = TRANS(ITEM_IPV4, ITEM_IPV6, ITEM_VOID),
+ [ITEM_IPV4] = TRANS(ITEM_UDP, ITEM_VOID),
+ [ITEM_IPV6] = TRANS(ITEM_UDP, ITEM_VOID),
+ [ITEM_UDP] = TRANS(ITEM_VXLAN, ITEM_VOID),
+ [ITEM_VXLAN] = TRANS(END),
[END] = NULL,
};
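To make the difference with the main table concrete: for a well-formed encap definition, mlx5_nl_flow_encap_reap() below walks

    ITEM_ETH -> ITEM_IPV4 (or ITEM_IPV6) -> ITEM_UDP -> ITEM_VXLAN -> END

with ITEM_VOID tolerated at any point, while any other item type has no transition, falls through to INVALID and is rejected as an unsupported encapsulation format.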
@@ -234,6 +375,7 @@ static const union {
struct rte_flow_item_ipv6 ipv6;
struct rte_flow_item_tcp tcp;
struct rte_flow_item_udp udp;
+ struct rte_flow_item_vxlan vxlan;
} mlx5_nl_flow_mask_empty;
#define ETHER_ADDR_MASK "\xff\xff\xff\xff\xff\xff"
@@ -242,6 +384,7 @@ static const union {
"\xff\xff\xff\xff\xff\xff\xff\xff" \
"\xff\xff\xff\xff\xff\xff\xff\xff"
#define BE16_MASK RTE_BE16(0xffff)
+#define VXLAN_VNI_MASK "\xff\xff\xff"
/** Supported masks for known item types. */
static const struct {
@@ -286,6 +429,35 @@ static const struct {
},
};
+/** Supported masks for known encapsulation item types. */
+static const struct {
+ struct rte_flow_item_eth eth;
+ struct rte_flow_item_ipv4 ipv4;
+ struct rte_flow_item_ipv6 ipv6;
+ struct rte_flow_item_udp udp;
+ struct rte_flow_item_vxlan vxlan;
+} mlx5_nl_flow_encap_mask_supported = {
+ .eth = {
+ .dst.addr_bytes = ETHER_ADDR_MASK,
+ .src.addr_bytes = ETHER_ADDR_MASK,
+ },
+ .ipv4.hdr = {
+ .src_addr = IN_ADDR_MASK,
+ .dst_addr = IN_ADDR_MASK,
+ },
+ .ipv6.hdr = {
+ .src_addr = IN6_ADDR_MASK,
+ .dst_addr = IN6_ADDR_MASK,
+ },
+ .udp.hdr = {
+ .src_port = BE16_MASK,
+ .dst_port = BE16_MASK,
+ },
+ .vxlan = {
+ .vni = VXLAN_VNI_MASK,
+ },
+};
+
/**
* Retrieve mask for pattern item.
*
@@ -361,6 +533,227 @@ mlx5_nl_flow_item_mask(const struct rte_flow_item *item,
}
/**
+ * Convert VXLAN VNI to 32-bit integer.
+ *
+ * @param[in] vni
+ * VXLAN VNI in 24-bit wire format.
+ *
+ * @return
+ * VXLAN VNI as a 32-bit integer value in network endian.
+ */
+static rte_be32_t
+vxlan_vni_as_be32(const uint8_t vni[3])
+{
+ return (volatile union { uint8_t u8[4]; rte_be32_t u32; })
+ { { 0, vni[0], vni[1], vni[2] } }.u32;
+}
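A quick sanity check of the conversion (a hypothetical helper, not part of the patch) relying only on assert() and RTE_BE32(), both already available here: wire-order VNI bytes { 0x12, 0x34, 0x56 } must map to the 32-bit big-endian value 0x00123456.

    /* Hypothetical check: VNI 0x123456 in wire order as a 32-bit value. */
    static void
    vxlan_vni_as_be32_check(void)
    {
            static const uint8_t vni[3] = { 0x12, 0x34, 0x56 };

            assert(vxlan_vni_as_be32(vni) == RTE_BE32(0x00123456));
    }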
+
+/**
+ * Populate consolidated encapsulation object from list of pattern items.
+ *
+ * Helper function to process configuration of generic actions such as
+ * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP.
+ *
+ * @param[out] dst
+ * Destination object.
+ * @param[in] src
+ * List of pattern items to gather data from.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_encap_reap(struct mlx5_nl_flow_encap *dst,
+ const struct rte_flow_item *src,
+ struct rte_flow_error *error)
+{
+ struct mlx5_nl_flow_encap tmp = {
+ .mask = 0,
+ };
+ unsigned int n = 0;
+ const enum mlx5_nl_flow_trans *trans = TRANS(ITEM_ETH);
+ const enum mlx5_nl_flow_trans *back = trans;
+
+trans:
+ switch (trans[n++]) {
+ union {
+ const struct rte_flow_item_eth *eth;
+ const struct rte_flow_item_ipv4 *ipv4;
+ const struct rte_flow_item_ipv6 *ipv6;
+ const struct rte_flow_item_udp *udp;
+ const struct rte_flow_item_vxlan *vxlan;
+ } spec, mask;
+
+ default:
+ case INVALID:
+ goto error_encap;
+ case BACK:
+ trans = back;
+ n = 0;
+ goto trans;
+ case ITEM_VOID:
+ if (src->type != RTE_FLOW_ITEM_TYPE_VOID)
+ goto trans;
+ ++src;
+ break;
+ case ITEM_ETH:
+ if (src->type != RTE_FLOW_ITEM_TYPE_ETH)
+ goto trans;
+ mask.eth = mlx5_nl_flow_item_mask
+ (src, &rte_flow_item_eth_mask,
+ &mlx5_nl_flow_encap_mask_supported.eth,
+ &mlx5_nl_flow_mask_empty.eth,
+ sizeof(rte_flow_item_eth_mask), error);
+ if (!mask.eth)
+ return -rte_errno;
+ if (mask.eth == &mlx5_nl_flow_mask_empty.eth)
+ goto error_spec;
+ spec.eth = src->spec;
+ if (!is_zero_ether_addr(&mask.eth->src)) {
+ if (!is_broadcast_ether_addr(&mask.eth->src))
+ goto error_mask;
+ tmp.eth.src = spec.eth->src;
+ tmp.mask |= BIT_ENCAP(ETH_SRC);
+ }
+ if (!is_zero_ether_addr(&mask.eth->dst)) {
+ if (!is_broadcast_ether_addr(&mask.eth->dst))
+ goto error_mask;
+ tmp.eth.dst = spec.eth->dst;
+ tmp.mask |= BIT_ENCAP(ETH_DST);
+ }
+ ++src;
+ break;
+ case ITEM_IPV4:
+ if (src->type != RTE_FLOW_ITEM_TYPE_IPV4)
+ goto trans;
+ mask.ipv4 = mlx5_nl_flow_item_mask
+ (src, &rte_flow_item_ipv4_mask,
+ &mlx5_nl_flow_encap_mask_supported.ipv4,
+ &mlx5_nl_flow_mask_empty.ipv4,
+ sizeof(rte_flow_item_ipv4_mask), error);
+ if (!mask.ipv4)
+ return -rte_errno;
+ if (mask.ipv4 == &mlx5_nl_flow_mask_empty.ipv4)
+ goto error_spec;
+ spec.ipv4 = src->spec;
+ if (mask.ipv4->hdr.src_addr) {
+ if (mask.ipv4->hdr.src_addr != IN_ADDR_MASK)
+ goto error_mask;
+ tmp.ip.src.v4.s_addr = spec.ipv4->hdr.src_addr;
+ tmp.mask |= BIT_ENCAP(IPV4_SRC);
+ }
+ if (mask.ipv4->hdr.dst_addr) {
+ if (mask.ipv4->hdr.dst_addr != IN_ADDR_MASK)
+ goto error_mask;
+ tmp.ip.dst.v4.s_addr = spec.ipv4->hdr.dst_addr;
+ tmp.mask |= BIT_ENCAP(IPV4_DST);
+ }
+ ++src;
+ break;
+ case ITEM_IPV6:
+ if (src->type != RTE_FLOW_ITEM_TYPE_IPV6)
+ goto trans;
+ mask.ipv6 = mlx5_nl_flow_item_mask
+ (src, &rte_flow_item_ipv6_mask,
+ &mlx5_nl_flow_encap_mask_supported.ipv6,
+ &mlx5_nl_flow_mask_empty.ipv6,
+ sizeof(rte_flow_item_ipv6_mask), error);
+ if (!mask.ipv6)
+ return -rte_errno;
+ if (mask.ipv6 == &mlx5_nl_flow_mask_empty.ipv6)
+ goto error_spec;
+ spec.ipv6 = src->spec;
+ if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
+ if (memcmp(mask.ipv6->hdr.src_addr, IN6_ADDR_MASK, 16))
+ goto error_mask;
+ tmp.ip.src.v6 = *(const struct in6_addr *)
+ spec.ipv6->hdr.src_addr;
+ tmp.mask |= BIT_ENCAP(IPV6_SRC);
+ }
+ if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
+ if (memcmp(mask.ipv6->hdr.dst_addr, IN6_ADDR_MASK, 16))
+ goto error_mask;
+ tmp.ip.dst.v6 = *(const struct in6_addr *)
+ spec.ipv6->hdr.dst_addr;
+ tmp.mask |= BIT_ENCAP(IPV6_DST);
+ }
+ ++src;
+ break;
+ case ITEM_UDP:
+ if (src->type != RTE_FLOW_ITEM_TYPE_UDP)
+ goto trans;
+ mask.udp = mlx5_nl_flow_item_mask
+ (src, &rte_flow_item_udp_mask,
+ &mlx5_nl_flow_encap_mask_supported.udp,
+ &mlx5_nl_flow_mask_empty.udp,
+ sizeof(rte_flow_item_udp_mask), error);
+ if (!mask.udp)
+ return -rte_errno;
+ if (mask.udp == &mlx5_nl_flow_mask_empty.udp)
+ goto error_spec;
+ spec.udp = src->spec;
+ if (mask.udp->hdr.src_port) {
+ if (mask.udp->hdr.src_port != BE16_MASK)
+ goto error_mask;
+ tmp.udp.src = spec.udp->hdr.src_port;
+ tmp.mask |= BIT_ENCAP(UDP_SRC);
+ }
+ if (mask.udp->hdr.dst_port) {
+ if (mask.udp->hdr.dst_port != BE16_MASK)
+ goto error_mask;
+ tmp.udp.dst = spec.udp->hdr.dst_port;
+ tmp.mask |= BIT_ENCAP(UDP_DST);
+ }
+ ++src;
+ break;
+ case ITEM_VXLAN:
+ if (src->type != RTE_FLOW_ITEM_TYPE_VXLAN)
+ goto trans;
+ mask.vxlan = mlx5_nl_flow_item_mask
+ (src, &rte_flow_item_vxlan_mask,
+ &mlx5_nl_flow_encap_mask_supported.vxlan,
+ &mlx5_nl_flow_mask_empty.vxlan,
+ sizeof(rte_flow_item_vxlan_mask), error);
+ if (!mask.vxlan)
+ return -rte_errno;
+ if (mask.vxlan == &mlx5_nl_flow_mask_empty.vxlan)
+ goto error_spec;
+ spec.vxlan = src->spec;
+ if (vxlan_vni_as_be32(mask.vxlan->vni)) {
+ if (memcmp(mask.vxlan->vni, VXLAN_VNI_MASK, 3))
+ goto error_mask;
+ tmp.vxlan.vni = vxlan_vni_as_be32(spec.vxlan->vni);
+ tmp.mask |= BIT_ENCAP(VXLAN_VNI);
+ }
+ ++src;
+ break;
+ case END:
+ if (src->type != RTE_FLOW_ITEM_TYPE_END)
+ goto trans;
+ *dst = tmp;
+ return 0;
+ }
+ back = trans;
+ trans = mlx5_nl_flow_encap_reap_trans[trans[n - 1]];
+ n = 0;
+ goto trans;
+error_encap:
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+ "unsupported encapsulation format");
+error_spec:
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+ "a specification structure is required for encapsulation");
+error_mask:
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, src,
+ "partial masks are not supported for encapsulation");
+}
+
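For context, a minimal sketch (hypothetical names, arbitrary addresses) of the kind of item list an application hands over through the definition field of struct rte_flow_action_vxlan_encap, which mlx5_nl_flow_encap_reap() above consolidates into a struct mlx5_nl_flow_encap:

    /* Hypothetical example; spec values are arbitrary. */
    static const struct rte_flow_item_eth encap_def_eth = {
            .dst.addr_bytes = "\x00\x11\x22\x33\x44\x55",
    };
    static const struct rte_flow_item_ipv4 encap_def_ipv4 = {
            .hdr.src_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
            .hdr.dst_addr = RTE_BE32(0x0a000002), /* 10.0.0.2 */
    };
    static const struct rte_flow_item_udp encap_def_udp = {
            .hdr.dst_port = RTE_BE16(4789),
    };
    static const struct rte_flow_item_vxlan encap_def_vxlan = {
            .vni = "\x00\x00\x2a", /* VNI 42. */
    };
    static struct rte_flow_item encap_def[] = {
            { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &encap_def_eth },
            { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &encap_def_ipv4 },
            { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &encap_def_udp },
            { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &encap_def_vxlan },
            { .type = RTE_FLOW_ITEM_TYPE_END },
    };

Items without an explicit mask get the default rte_flow masks; fields left at zero in the spec (such as the UDP source port here) are still collected but later flagged as ignored by the translation code.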
+/**
* Transpose flow rule description to rtnetlink message.
*
* This function transposes a flow rule description to a traffic control
@@ -412,6 +805,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
bool vlan_present;
bool vlan_eth_type_set;
bool ip_proto_set;
+ struct mlx5_nl_flow_encap encap;
struct nlattr *na_flower;
struct nlattr *na_flower_act;
struct nlattr *na_vlan_id;
@@ -425,8 +819,10 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
goto error_nobufs;
nl_flow->size = offsetof(struct mlx5_nl_flow, msg);
nl_flow->applied = 0;
+ nl_flow->encap_ifindex = 0;
nl_flow->ifindex_src = NULL;
nl_flow->ifindex_dst = NULL;
+ nl_flow->encap = NULL;
size -= nl_flow->size;
item = pattern;
action = actions;
@@ -437,6 +833,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
vlan_present = false;
vlan_eth_type_set = false;
ip_proto_set = false;
+ memset(&encap, 0, sizeof(encap));
na_flower = NULL;
na_flower_act = NULL;
na_vlan_id = NULL;
@@ -461,6 +858,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
of_set_vlan_vid;
const struct rte_flow_action_of_set_vlan_pcp *
of_set_vlan_pcp;
+ const struct rte_flow_action_vxlan_encap *vxlan_encap;
} conf;
struct nlmsghdr *nlh;
struct tcmsg *tcm;
@@ -887,6 +1285,12 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
goto error_nobufs;
++item;
break;
+ case ITEM_VXLAN:
+ if (item->type != RTE_FLOW_ITEM_TYPE_VXLAN)
+ goto trans;
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+ "VXLAN header matching is not supported yet");
case ACTIONS:
if (item->type != RTE_FLOW_ITEM_TYPE_END)
goto trans;
@@ -1042,6 +1446,77 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
}
++action;
break;
+ case ACTION_VXLAN_ENCAP:
+ if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP)
+ goto trans;
+ conf.vxlan_encap = action->conf;
+ if (mlx5_nl_flow_encap_reap(&encap,
+ conf.vxlan_encap->definition,
+ error))
+ return -rte_errno;
+ act_index =
+ mnl_attr_nest_start_check(buf, size, act_index_cur++);
+ if (!act_index ||
+ !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND,
+ "tunnel_key"))
+ goto error_nobufs;
+ act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
+ if (!act)
+ goto error_nobufs;
+ if (!mnl_attr_put_check(buf, size, TCA_TUNNEL_KEY_PARMS,
+ sizeof(struct tc_tunnel_key),
+ &(struct tc_tunnel_key){
+ .action = TC_ACT_PIPE,
+ .t_action =
+ TCA_TUNNEL_KEY_ACT_SET,
+ }))
+ goto error_nobufs;
+ if (encap.mask & BIT_ENCAP(IPV4_SRC) &&
+ !mnl_attr_put_u32_check
+ (buf, size, TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+ encap.ip.src.v4.s_addr))
+ goto error_nobufs;
+ if (encap.mask & BIT_ENCAP(IPV4_DST) &&
+ !mnl_attr_put_u32_check
+ (buf, size, TCA_TUNNEL_KEY_ENC_IPV4_DST,
+ encap.ip.dst.v4.s_addr))
+ goto error_nobufs;
+ if (encap.mask & BIT_ENCAP(IPV6_SRC) &&
+ !mnl_attr_put_check
+ (buf, size, TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+ sizeof(encap.ip.src.v6), &encap.ip.src.v6))
+ goto error_nobufs;
+ if (encap.mask & BIT_ENCAP(IPV6_DST) &&
+ !mnl_attr_put_check
+ (buf, size, TCA_TUNNEL_KEY_ENC_IPV6_DST,
+ sizeof(encap.ip.dst.v6), &encap.ip.dst.v6))
+ goto error_nobufs;
+ if (encap.mask & BIT_ENCAP(UDP_SRC) &&
+ nl_flow != (void *)buf_tmp)
+ DRV_LOG(WARNING,
+ "UDP source port cannot be forced"
+ " for VXLAN encap; parameter ignored");
+ if (encap.mask & BIT_ENCAP(UDP_DST) &&
+ !mnl_attr_put_u16_check
+ (buf, size, TCA_TUNNEL_KEY_ENC_DST_PORT, encap.udp.dst))
+ goto error_nobufs;
+ if (!(encap.mask & BIT_ENCAP(VXLAN_VNI)))
+ return rte_flow_error_set
+ (error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+ conf.vxlan_encap, "VXLAN VNI is missing");
+ if (!mnl_attr_put_u32_check
+ (buf, size, TCA_TUNNEL_KEY_ENC_KEY_ID, encap.vxlan.vni))
+ goto error_nobufs;
+ mnl_attr_nest_end(buf, act);
+ mnl_attr_nest_end(buf, act_index);
+ ++action;
+ break;
+ case ACTION_VXLAN_DECAP:
+ if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_DECAP)
+ goto trans;
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, action,
+ "VXLAN decap is not supported yet");
case END:
if (item->type != RTE_FLOW_ITEM_TYPE_END ||
action->type != RTE_FLOW_ACTION_TYPE_END)
@@ -1054,6 +1529,21 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
buf = NULL;
size -= nlh->nlmsg_len;
nl_flow->size += nlh->nlmsg_len;
+ if (!encap.mask)
+ return nl_flow->size;
+ i = RTE_ALIGN_CEIL(nl_flow->size,
+ alignof(struct mlx5_nl_flow_encap));
+ i -= nl_flow->size;
+ if (size < i + sizeof(encap))
+ goto error_nobufs;
+ nl_flow->size += i;
+ buf = (void *)((uintptr_t)nl_flow + nl_flow->size);
+ size -= i;
+ nl_flow->encap = buf;
+ *nl_flow->encap = encap;
+ buf = NULL;
+ size -= sizeof(*nl_flow->encap);
+ nl_flow->size += sizeof(*nl_flow->encap);
return nl_flow->size;
}
back = trans;
@@ -1151,6 +1641,671 @@ mlx5_nl_flow_chat(struct mlx5_nl_flow_ctx *ctx, struct nlmsghdr *nlh,
return -err;
}
+/** Data structure used by mlx5_nl_flow_init_vxlan_cb(). */
+struct mlx5_nl_flow_init_vxlan_data {
+ unsigned int ifindex; /**< Base interface index. */
+ rte_be16_t vxlan_port; /**< Remote UDP port. */
+ unsigned int *collect; /**< Collected interfaces. */
+ unsigned int collect_n; /**< Number of collected interfaces. */
+};
+
+/**
+ * Collect indices of VXLAN encap/decap interfaces associated with device.
+ *
+ * @param nlh
+ * Pointer to reply header.
+ * @param arg
+ * Opaque data pointer for this callback.
+ *
+ * @return
+ * A positive, nonzero value on success, negative errno value otherwise
+ * and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_init_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
+{
+ struct mlx5_nl_flow_init_vxlan_data *data = arg;
+ struct ifinfomsg *ifm;
+ struct nlattr *na;
+ struct nlattr *na_info = NULL;
+ struct nlattr *na_vxlan = NULL;
+ struct nlattr *na_vxlan_port = NULL;
+ bool found = false;
+ unsigned int *collect;
+
+ if (nlh->nlmsg_type != RTM_NEWLINK)
+ goto error_inval;
+ ifm = mnl_nlmsg_get_payload(nlh);
+ mnl_attr_for_each(na, nlh, sizeof(*ifm))
+ if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
+ na_info = na;
+ break;
+ }
+ if (!na_info)
+ return 1;
+ mnl_attr_for_each_nested(na, na_info) {
+ switch (mnl_attr_get_type(na)) {
+ case IFLA_INFO_KIND:
+ if (!strncmp("vxlan", mnl_attr_get_str(na),
+ mnl_attr_get_len(na)))
+ found = true;
+ break;
+ case IFLA_INFO_DATA:
+ na_vxlan = na;
+ break;
+ }
+ if (found && na_vxlan)
+ break;
+ }
+ if (!found || !na_vxlan)
+ return 1;
+ found = false;
+ mnl_attr_for_each_nested(na, na_vxlan) {
+ switch (mnl_attr_get_type(na)) {
+ case IFLA_VXLAN_LINK:
+ if (mnl_attr_get_u32(na) == data->ifindex)
+ found = true;
+ break;
+ case IFLA_VXLAN_PORT:
+ na_vxlan_port = na;
+ break;
+ }
+ if (found && na_vxlan_port)
+ break;
+ }
+ if (!found ||
+ (na_vxlan_port &&
+ mnl_attr_get_u16(na_vxlan_port) != data->vxlan_port))
+ return 1;
+ if (!ifm->ifi_index)
+ goto error_inval;
+ collect = realloc(data->collect,
+ (data->collect_n + 1) * sizeof(*data->collect));
+ if (!collect) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ collect[data->collect_n] = ifm->ifi_index;
+ data->collect = collect;
+ data->collect_n += 1;
+ return 1;
+error_inval:
+ rte_errno = EINVAL;
+ return -rte_errno;
+}
+
+/**
+ * Clean up and generate VXLAN encap/decap interface.
+ *
+ * @param ctx
+ * Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param ifindex
+ * Network interface index to associate VXLAN encap/decap with.
+ * @param vxlan_port
+ * Remote UDP port.
+ * @param enable
+ * If disabled, stop after initial clean up.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * Interface index on success, zero otherwise and rte_errno is set.
+ *
+ * If @p enable is set, the returned ifindex is that of the new VXLAN
+ * interface, otherwise @p ifindex is simply returned as is.
+ */
+static unsigned int
+mlx5_nl_flow_ifindex_vxlan(struct mlx5_nl_flow_ctx *ctx, unsigned int ifindex,
+ rte_be16_t vxlan_port, int enable,
+ struct rte_flow_error *error)
+{
+ struct nlmsghdr *nlh;
+ struct ifinfomsg *ifm;
+ alignas(struct nlmsghdr)
+ uint8_t buf[mnl_nlmsg_size(sizeof(*ifm) + 256)];
+ unsigned int ifindex_vxlan = 0;
+ struct mlx5_nl_flow_init_vxlan_data data = {
+ .ifindex = ifindex,
+ .vxlan_port = vxlan_port,
+ .collect = NULL,
+ .collect_n = 0,
+ };
+ char name[IF_NAMESIZE];
+ struct nlattr *na_info;
+ struct nlattr *na_vxlan;
+ unsigned int i;
+ int ret;
+
+ if (!ifindex) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ /*
+ * Seek and destroy leftover VXLAN encap/decap interfaces with
+ * matching properties.
+ */
+ nlh = mnl_nlmsg_put_header(buf);
+ nlh->nlmsg_type = RTM_GETLINK;
+ nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+ ifm->ifi_family = AF_UNSPEC;
+ ret = mlx5_nl_flow_chat(ctx, nlh, mlx5_nl_flow_init_vxlan_cb, &data);
+ if (ret)
+ goto exit;
+ nlh->nlmsg_type = RTM_DELLINK;
+ nlh->nlmsg_flags = NLM_F_REQUEST;
+ for (i = 0; i != data.collect_n; ++i) {
+ ifm->ifi_index = data.collect[i];
+ DRV_LOG(DEBUG, "cleaning up VXLAN encap/decap ifindex %u",
+ ifm->ifi_index);
+ ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+ if (ret)
+ goto exit;
+ }
+ if (!enable)
+ return ifindex;
+ /* Add fresh VXLAN encap/decap interface. */
+ nlh->nlmsg_type = RTM_NEWLINK;
+ nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE;
+ ifm->ifi_type = ARPHRD_ETHER;
+ ifm->ifi_index = 0;
+ ifm->ifi_flags = IFF_UP;
+ ifm->ifi_change = 0xffffffff;
+ if (snprintf(name, sizeof(name), "vxlan_%u_%u",
+ rte_be_to_cpu_16(vxlan_port), ifindex) == -1) {
+ ret = -errno;
+ goto exit;
+ }
+ ret = -ENOBUFS;
+ if (!mnl_attr_put_strz_check(nlh, sizeof(buf), IFLA_IFNAME, name))
+ goto exit;
+ na_info = mnl_attr_nest_start_check(nlh, sizeof(buf), IFLA_LINKINFO);
+ if (!na_info)
+ goto exit;
+ if (!mnl_attr_put_strz_check(nlh, sizeof(buf), IFLA_INFO_KIND, "vxlan"))
+ goto exit;
+ na_vxlan = mnl_attr_nest_start_check(nlh, sizeof(buf), IFLA_INFO_DATA);
+ if (!na_vxlan)
+ goto exit;
+ if (!mnl_attr_put_u32_check(nlh, sizeof(buf), IFLA_VXLAN_LINK, ifindex))
+ goto exit;
+ if (!mnl_attr_put_u8_check(nlh, sizeof(buf),
+ IFLA_VXLAN_COLLECT_METADATA, 1))
+ goto exit;
+ /*
+	 * When the destination port or VNI is either undefined or set to a
+	 * fixed value, the kernel complains with EEXIST ("A VXLAN device
+	 * with the specified VNI already exists") when creating subsequent
+	 * VXLAN interfaces with the same properties, even if they are
+	 * linked with different physical devices.
+ *
+ * Also since only destination ports assigned to existing VXLAN
+ * interfaces can be offloaded to the switch, the above limitation
+ * cannot be worked around by picking a random value here and using
+ * a different one when creating flow rules later.
+ *
+ * Therefore request a hopefully unique VNI based on the interface
+ * index in order to work around EEXIST. VNI will be overridden
+ * later on a flow rule basis thanks to IFLA_VXLAN_COLLECT_METADATA.
+ */
+ if (!mnl_attr_put_u16_check(nlh, sizeof(buf), IFLA_VXLAN_PORT,
+ vxlan_port))
+ goto exit;
+ if (!mnl_attr_put_u32_check(nlh, sizeof(buf), IFLA_VXLAN_ID, ifindex))
+ goto exit;
+ mnl_attr_nest_end(nlh, na_vxlan);
+ mnl_attr_nest_end(nlh, na_info);
+ ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+ if (ret)
+ goto exit;
+ /* Lastly, retrieve its ifindex value. */
+ nlh->nlmsg_type = RTM_GETLINK;
+ nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ data.collect_n = 0;
+ ret = mlx5_nl_flow_chat(ctx, nlh, mlx5_nl_flow_init_vxlan_cb, &data);
+ if (ret)
+ goto exit;
+ ret = -ENXIO;
+ if (data.collect_n != 1 || !*data.collect)
+ goto exit;
+ ifindex_vxlan = *data.collect;
+ DRV_LOG(DEBUG, "created VXLAN encap/decap ifindex %u (%s)",
+ ifindex_vxlan, name);
+ ret = mlx5_nl_flow_ifindex_init(ctx, ifindex_vxlan, error);
+ if (ret) {
+ mlx5_nl_flow_ifindex_vxlan(ctx, ifindex_vxlan, vxlan_port,
+ false, NULL);
+ ifindex_vxlan = 0;
+ goto exit;
+ }
+ ret = 0;
+exit:
+ free(data.collect);
+ if (ret)
+ rte_flow_error_set
+ (error, -ret, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "netlink: failed to request VXLAN encap/decap"
+ " interface creation/deletion");
+ return ifindex_vxlan;
+}
+
+/**
+ * Emit Netlink message to add/remove local address.
+ *
+ * Note that an implicit route is maintained by the kernel due to the
+ * presence of a peer address (IFA_ADDRESS).
+ *
+ * @param ctx
+ * Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param[in] encap
+ * Encapsulation properties (source address).
+ * @param ifindex
+ * Network interface.
+ * @param enable
+ * Toggle between add and remove.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_encap_local(struct mlx5_nl_flow_ctx *ctx,
+ const struct mlx5_nl_flow_encap *encap,
+ unsigned int ifindex,
+ bool enable,
+ struct rte_flow_error *error)
+{
+ struct nlmsghdr *nlh;
+ struct ifaddrmsg *ifa;
+ alignas(struct nlmsghdr)
+ uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
+
+ nlh = mnl_nlmsg_put_header(buf);
+ nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
+ nlh->nlmsg_flags =
+ NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
+ nlh->nlmsg_seq = 0;
+ ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
+ if (encap->mask & BIT_ENCAP(IPV4_SRC)) {
+ ifa->ifa_family = AF_INET;
+ ifa->ifa_prefixlen = 32;
+ } else if (encap->mask & BIT_ENCAP(IPV6_SRC)) {
+ ifa->ifa_family = AF_INET6;
+ ifa->ifa_prefixlen = 128;
+ } else {
+ ifa->ifa_family = AF_UNSPEC;
+ ifa->ifa_prefixlen = 0;
+ }
+ ifa->ifa_flags = IFA_F_PERMANENT;
+ ifa->ifa_scope = RT_SCOPE_LINK;
+ ifa->ifa_index = ifindex;
+ if (encap->mask & BIT_ENCAP(IPV4_SRC) &&
+ !mnl_attr_put_u32_check(nlh, sizeof(buf), IFA_LOCAL,
+ encap->ip.src.v4.s_addr))
+ goto error_nobufs;
+ if (encap->mask & BIT_ENCAP(IPV6_SRC) &&
+ !mnl_attr_put_check(nlh, sizeof(buf), IFA_LOCAL,
+ sizeof(encap->ip.src.v6), &encap->ip.src.v6))
+ goto error_nobufs;
+ if (encap->mask & BIT_ENCAP(IPV4_DST) &&
+ !mnl_attr_put_u32_check(nlh, sizeof(buf), IFA_ADDRESS,
+ encap->ip.dst.v4.s_addr))
+ goto error_nobufs;
+ if (encap->mask & BIT_ENCAP(IPV6_DST) &&
+ !mnl_attr_put_check(nlh, sizeof(buf), IFA_ADDRESS,
+ sizeof(encap->ip.dst.v6), &encap->ip.dst.v6))
+ goto error_nobufs;
+ if (!mlx5_nl_flow_chat(ctx, nlh, NULL, NULL))
+ return 0;
+ return rte_flow_error_set
+ (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot complete IFA request");
+error_nobufs:
+ return rte_flow_error_set
+ (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "generated IFA message is too large");
+}
+
+/**
+ * Emit Netlink message to add/remove neighbor.
+ *
+ * @param ctx
+ * Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param[in] encap
+ * Encapsulation properties (destination address).
+ * @param ifindex
+ * Network interface.
+ * @param enable
+ * Toggle between add and remove.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_encap_neigh(struct mlx5_nl_flow_ctx *ctx,
+ const struct mlx5_nl_flow_encap *encap,
+ unsigned int ifindex,
+ bool enable,
+ struct rte_flow_error *error)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+ alignas(struct nlmsghdr)
+ uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
+
+ nlh = mnl_nlmsg_put_header(buf);
+ nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
+ nlh->nlmsg_flags =
+ NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
+ nlh->nlmsg_seq = 0;
+ ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
+ if (encap->mask & BIT_ENCAP(IPV4_DST))
+ ndm->ndm_family = AF_INET;
+ else if (encap->mask & BIT_ENCAP(IPV6_DST))
+ ndm->ndm_family = AF_INET6;
+ else
+ ndm->ndm_family = AF_UNSPEC;
+ ndm->ndm_ifindex = ifindex;
+ ndm->ndm_state = NUD_PERMANENT;
+ ndm->ndm_flags = 0;
+ ndm->ndm_type = 0;
+ if (encap->mask & BIT_ENCAP(IPV4_DST) &&
+ !mnl_attr_put_u32_check(nlh, sizeof(buf), NDA_DST,
+ encap->ip.dst.v4.s_addr))
+ goto error_nobufs;
+ if (encap->mask & BIT_ENCAP(IPV6_DST) &&
+ !mnl_attr_put_check(nlh, sizeof(buf), NDA_DST,
+ sizeof(encap->ip.dst.v6), &encap->ip.dst.v6))
+ goto error_nobufs;
+ if (encap->mask & BIT_ENCAP(ETH_SRC) && enable)
+ DRV_LOG(WARNING,
+ "Ethernet source address cannot be forced"
+ " for VXLAN encap; parameter ignored");
+ if (encap->mask & BIT_ENCAP(ETH_DST) &&
+ !mnl_attr_put_check(nlh, sizeof(buf), NDA_LLADDR,
+ sizeof(encap->eth.dst), &encap->eth.dst))
+ goto error_nobufs;
+ if (!mlx5_nl_flow_chat(ctx, nlh, NULL, NULL))
+ return 0;
+ return rte_flow_error_set
+ (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "cannot complete ND request");
+error_nobufs:
+ return rte_flow_error_set
+ (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "generated ND message is too large");
+}
+
+/**
+ * Look for matching IP source/destination properties.
+ *
+ * @param[in] bag
+ * Search target.
+ * @param bag_mask
+ * Bit-mask for valid fields in @p bag.
+ * @param[in] what
+ * Properties to look for in @p bag.
+ * @param what_mask
+ * Bit-mask for valid fields in @p what.
+ *
+ * @return
+ * True if @p what is found in @p bag, false otherwise.
+ */
+static bool
+mlx5_nl_flow_encap_ip_search(const struct mlx5_nl_flow_encap_ip *bag,
+ uint32_t bag_mask,
+ const struct mlx5_nl_flow_encap_ip *what,
+ uint32_t what_mask)
+{
+ if ((what_mask & BIT_ENCAP(IPV4_SRC) &&
+ (!(bag_mask & BIT_ENCAP(IPV4_SRC)) ||
+ bag->src.v4.s_addr != what->src.v4.s_addr)) ||
+ (what_mask & BIT_ENCAP(IPV4_DST) &&
+ (!(bag_mask & BIT_ENCAP(IPV4_DST)) ||
+ bag->dst.v4.s_addr != what->dst.v4.s_addr)) ||
+ (what_mask & BIT_ENCAP(IPV6_SRC) &&
+ (!(bag_mask & BIT_ENCAP(IPV6_SRC)) ||
+ memcmp(&bag->src.v6, &what->src.v6, sizeof(bag->src.v6)))) ||
+ (what_mask & BIT_ENCAP(IPV6_DST) &&
+ (!(bag_mask & BIT_ENCAP(IPV6_DST)) ||
+ memcmp(&bag->dst.v6, &what->dst.v6, sizeof(bag->dst.v6)))))
+ return false;
+ return true;
+}
+
+/**
+ * Interface resources list common to all driver instances of a given
+ * process. It is protected by a standard mutex because resource allocation
+ * is slow and involves system calls.
+ */
+static LIST_HEAD(, mlx5_nl_flow_encap_ifindex) mlx5_nl_flow_encap_ifindex_list =
+ LIST_HEAD_INITIALIZER();
+static pthread_mutex_t mlx5_nl_flow_encap_ifindex_list_lock =
+ PTHREAD_MUTEX_INITIALIZER;
+
+/**
+ * Retrieve target interface index for encapsulation.
+ *
+ * Resources are automatically allocated and released as necessary.
+ *
+ * @param ctx
+ * Context object initialized by mlx5_nl_flow_ctx_create().
+ * @param[in] encap
+ * Encapsulation properties.
+ * @param ifindex
+ * Outer network interface.
+ * @param enable
+ * Toggle whether resources are allocated or released.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * Interface index on success, zero otherwise and rte_errno is set.
+ *
+ * If @p enable is set, the returned ifindex is that of the inner
+ * interface, otherwise @p ifindex is simply returned as is.
+ */
+static unsigned int
+mlx5_nl_flow_encap_ifindex(struct mlx5_nl_flow_ctx *ctx,
+ const struct mlx5_nl_flow_encap *encap,
+ unsigned int ifindex,
+ bool enable,
+ struct rte_flow_error *error)
+{
+ struct mlx5_nl_flow_encap_ifindex *encap_ifindex = NULL;
+ struct mlx5_nl_flow_encap_vxlan *encap_vxlan = NULL;
+ struct mlx5_nl_flow_encap_addr *encap_local = NULL;
+ struct mlx5_nl_flow_encap_addr *encap_neigh = NULL;
+ unsigned int ifindex_inner = ifindex;
+ int ret;
+
+ pthread_mutex_lock(&mlx5_nl_flow_encap_ifindex_list_lock);
+ /* Interface descriptor. */
+ LIST_FOREACH(encap_ifindex, &mlx5_nl_flow_encap_ifindex_list, next) {
+ if (encap_ifindex->outer != ifindex)
+ continue;
+ if (enable)
+ ++encap_ifindex->refcnt;
+ break;
+ }
+ if (enable && !encap_ifindex) {
+ encap_ifindex =
+ rte_zmalloc_socket(__func__, sizeof(*encap_ifindex),
+ 0, ctx->socket);
+ if (!encap_ifindex) {
+ rte_flow_error_set
+ (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "missing ifindex encap data");
+ goto release;
+ }
+ *encap_ifindex = (struct mlx5_nl_flow_encap_ifindex){
+ .refcnt = 1,
+ .outer = ifindex,
+ .vxlan = LIST_HEAD_INITIALIZER(),
+ .local = LIST_HEAD_INITIALIZER(),
+ .neigh = LIST_HEAD_INITIALIZER(),
+ };
+ LIST_INSERT_HEAD(&mlx5_nl_flow_encap_ifindex_list,
+ encap_ifindex, next);
+ }
+ if (!encap_ifindex) {
+ if (!enable)
+ goto release;
+ rte_flow_error_set
+ (error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "nonexistent interface");
+ goto release;
+ }
+ /* VXLAN descriptor. */
+ if (!(encap->mask & BIT_ENCAP(VXLAN_VNI)) ||
+	    !(encap->mask & BIT_ENCAP(UDP_DST)))
+ goto skip_vxlan;
+ LIST_FOREACH(encap_vxlan, &encap_ifindex->vxlan, next) {
+		if (encap->udp.dst != encap_vxlan->port)
+ continue;
+ if (enable)
+ ++encap_vxlan->refcnt;
+ break;
+ }
+ if (enable && !encap_vxlan) {
+ encap_vxlan =
+ rte_zmalloc_socket(__func__, sizeof(*encap_vxlan),
+ 0, ctx->socket);
+ if (!encap_vxlan) {
+ rte_flow_error_set
+ (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "missing VXLAN encap data");
+ goto release;
+ }
+ *encap_vxlan = (struct mlx5_nl_flow_encap_vxlan){
+ .refcnt = 1,
+			.port = encap->udp.dst,
+ .inner = mlx5_nl_flow_ifindex_vxlan
+				(ctx, ifindex, encap->udp.dst, true, error),
+ };
+ if (!encap_vxlan->inner) {
+ rte_free(encap_vxlan);
+ encap_vxlan = NULL;
+ goto release;
+ }
+ LIST_INSERT_HEAD(&encap_ifindex->vxlan, encap_vxlan, next);
+ }
+ ifindex_inner = encap_vxlan->inner;
+skip_vxlan:
+ /* Local address descriptor (source). */
+ LIST_FOREACH(encap_local, &encap_ifindex->local, next) {
+ if (!mlx5_nl_flow_encap_ip_search
+ (&encap->ip, encap->mask,
+ &encap_local->ip, encap_local->mask &
+ (BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV6_SRC))))
+ continue;
+ if (enable)
+ ++encap_local->refcnt;
+ break;
+ }
+ if (enable && !encap_local &&
+ encap->mask & (BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV6_SRC))) {
+ encap_local =
+ rte_zmalloc_socket(__func__, sizeof(*encap_local),
+ 0, ctx->socket);
+ if (!encap_local) {
+ rte_flow_error_set
+ (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "missing local encap data");
+ goto release;
+ }
+ encap_local->refcnt = 1;
+ encap_local->mask =
+ encap->mask &
+ (BIT_ENCAP(IPV4_SRC) | BIT_ENCAP(IPV6_SRC));
+ if (encap->mask & BIT_ENCAP(IPV4_SRC))
+ encap_local->ip.src.v4 = encap->ip.src.v4;
+ if (encap->mask & BIT_ENCAP(IPV6_SRC))
+ encap_local->ip.src.v6 = encap->ip.src.v6;
+ ret = mlx5_nl_flow_encap_local(ctx, encap, ifindex, true,
+ error);
+ if (ret) {
+ rte_free(encap_local);
+ encap_local = NULL;
+ goto release;
+ }
+ LIST_INSERT_HEAD(&encap_ifindex->local, encap_local, next);
+ }
+ /* Neighbor descriptor (destination). */
+ LIST_FOREACH(encap_neigh, &encap_ifindex->neigh, next) {
+ if (!mlx5_nl_flow_encap_ip_search
+ (&encap->ip, encap->mask,
+		     &encap_neigh->ip, encap_neigh->mask &
+ (BIT_ENCAP(IPV4_DST) | BIT_ENCAP(IPV6_DST))))
+ continue;
+ if (enable)
+ ++encap_neigh->refcnt;
+ break;
+ }
+ if (enable && !encap_neigh &&
+ encap->mask & (BIT_ENCAP(IPV4_DST) | BIT_ENCAP(IPV6_DST))) {
+ encap_neigh =
+ rte_zmalloc_socket(__func__, sizeof(*encap_neigh),
+ 0, ctx->socket);
+ if (!encap_neigh) {
+ rte_flow_error_set
+ (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "missing neigh encap data");
+ goto release;
+ }
+ encap_neigh->refcnt = 1;
+ encap_neigh->mask =
+ encap->mask &
+ (BIT_ENCAP(IPV4_DST) | BIT_ENCAP(IPV6_DST));
+ if (encap->mask & BIT_ENCAP(IPV4_DST))
+ encap_neigh->ip.dst.v4 = encap->ip.dst.v4;
+ if (encap->mask & BIT_ENCAP(IPV6_DST))
+ encap_neigh->ip.dst.v6 = encap->ip.dst.v6;
+ ret = mlx5_nl_flow_encap_neigh(ctx, encap, ifindex, true,
+ error);
+ if (ret) {
+ rte_free(encap_neigh);
+ encap_neigh = NULL;
+ goto release;
+ }
+ LIST_INSERT_HEAD(&encap_ifindex->neigh, encap_neigh, next);
+ }
+ if (!enable)
+ goto release;
+ pthread_mutex_unlock(&mlx5_nl_flow_encap_ifindex_list_lock);
+ return ifindex_inner;
+release:
+ ret = rte_errno;
+ if (encap_neigh && !--encap_neigh->refcnt) {
+ LIST_REMOVE(encap_neigh, next);
+ mlx5_nl_flow_encap_neigh(ctx, encap, ifindex, false, NULL);
+ rte_free(encap_neigh);
+ }
+ if (encap_local && !--encap_local->refcnt) {
+ LIST_REMOVE(encap_local, next);
+ mlx5_nl_flow_encap_local(ctx, encap, ifindex, false, NULL);
+ rte_free(encap_local);
+ }
+ if (encap_vxlan && !--encap_vxlan->refcnt) {
+ LIST_REMOVE(encap_vxlan, next);
+ mlx5_nl_flow_ifindex_vxlan
+ (ctx, ifindex, encap_vxlan->port, false, NULL);
+ rte_free(encap_vxlan);
+ }
+ if (encap_ifindex && !--encap_ifindex->refcnt) {
+ LIST_REMOVE(encap_ifindex, next);
+ rte_free(encap_ifindex);
+ }
+ pthread_mutex_unlock(&mlx5_nl_flow_encap_ifindex_list_lock);
+ if (!enable)
+ return ifindex;
+ rte_errno = ret;
+ return 0;
+}
+
/**
* Create a Netlink flow rule.
*
@@ -1169,17 +2324,35 @@ mlx5_nl_flow_create(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
struct rte_flow_error *error)
{
struct nlmsghdr *nlh = (void *)nl_flow->msg;
+ struct mlx5_nl_flow_encap *encap =
+ nl_flow->encap && nl_flow->ifindex_dst ?
+ nl_flow->encap : NULL;
+ unsigned int ifindex = encap ? *nl_flow->ifindex_dst : 0;
+ int ret;
if (nl_flow->applied)
return 0;
nlh->nlmsg_type = RTM_NEWTFILTER;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
- if (!mlx5_nl_flow_chat(ctx, nlh, NULL, NULL)) {
+ if (encap) {
+ nl_flow->encap_ifindex = mlx5_nl_flow_encap_ifindex
+ (ctx, encap, ifindex, true, error);
+ if (!nl_flow->encap_ifindex)
+ return -rte_errno;
+ *nl_flow->ifindex_dst = nl_flow->encap_ifindex;
+ }
+ ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+ if (encap)
+ *nl_flow->ifindex_dst = ifindex;
+ if (!ret) {
nl_flow->applied = 1;
return 0;
}
+ ret = rte_errno;
+ if (nl_flow->encap_ifindex)
+ mlx5_nl_flow_encap_ifindex(ctx, encap, ifindex, false, NULL);
return rte_flow_error_set
- (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ (error, ret, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
"netlink: failed to create TC flow rule");
}
@@ -1204,14 +2377,31 @@ mlx5_nl_flow_destroy(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
struct rte_flow_error *error)
{
struct nlmsghdr *nlh = (void *)nl_flow->msg;
+ struct mlx5_nl_flow_encap *encap =
+ nl_flow->encap && nl_flow->ifindex_dst ?
+ nl_flow->encap : NULL;
+ unsigned int ifindex = encap ? *nl_flow->ifindex_dst : 0;
+ int err = 0;
int ret;
if (!nl_flow->applied)
return 0;
nlh->nlmsg_type = RTM_DELTFILTER;
nlh->nlmsg_flags = NLM_F_REQUEST;
+ if (encap) {
+ if (!mlx5_nl_flow_encap_ifindex
+ (ctx, encap, ifindex, false, error))
+ err = rte_errno;
+ *nl_flow->ifindex_dst = nl_flow->encap_ifindex;
+ }
ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
+ if (encap)
+ *nl_flow->ifindex_dst = ifindex;
nl_flow->applied = 0;
+ if (err) {
+ rte_errno = err;
+ return -rte_errno;
+ }
if (!ret)
return 0;
return rte_flow_error_set