net/mlx5: fix ESXi VLAN in virtual machine

Message ID 1563198320-29068-1-git-send-email-viacheslavo@mellanox.com
State Superseded, archived
Delegated to: Raslan Darawsheh
Headers show
Series
  • net/mlx5: fix ESXi VLAN in virtual machine
Related show

Checks

Context Check Description
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Viacheslav Ovsiienko July 15, 2019, 1:45 p.m.
On ESXi setups with SR-IOV and E-Switch enabled there is a problem
receiving VLAN traffic on VF interfaces. The NIC driver in the ESXi
hypervisor does not set up the E-Switch vport settings correctly,
so VLAN traffic targeted to a VF is dropped.

The patch provides a temporary workaround - when a rule containing
a VLAN pattern is installed for a VF, a VLAN network interface is
created over the VF, as the following command would do:

  ip link add link vf.if name mlx5.wa.1.100 type vlan id 100

The PMD in DPDK maintains a database of the VLAN interfaces created
for each existing VF and requested VLAN tag. When all of the RTE
Flows using a given VLAN tag are removed, the VLAN interface created
for this VLAN tag is deleted.

The name of created VLAN interface follows the format:

  evmlx.d1.d2, where d1 is the VF interface ifindex and d2 is the VLAN tag

Implementation limitations:

- VLAN tag masks in rules are not supported: a rule must specify the
  VLAN tag exactly, wildcards (which are implemented by masks) are
  not allowed

- the virtual environment is detected via the rte_hypervisor_get()
  call, which currently checks the RTE_CPUFLAG_HYPERVISOR flag on the
  x86 platform. On other architectures the workaround is always
  applied for flows over a PCI VF

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c            |   6 +
 drivers/net/mlx5/mlx5.h            |  30 ++++
 drivers/net/mlx5/mlx5_flow.c       |  22 +++
 drivers/net/mlx5/mlx5_flow.h       |   5 +
 drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
 drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
 drivers/net/mlx5/mlx5_nl.c         | 279 +++++++++++++++++++++++++++++++++++++
 7 files changed, 396 insertions(+), 4 deletions(-)

Comments

Matan Azrad July 29, 2019, 3:14 p.m. | #1
From: Viacheslav Ovsiienko
> On ESXi setups when we have SR-IOV and E-Switch enabled there is the
> problem to receive VLAN traffic on VF interfaces. The NIC driver in ESXi
> hypervisor does not setup E-Switch vport setting correctly and VLAN traffic
> targeted to VF is dropped.
> 
> The patch provides the temporary workaround - if the rule containing the
> VLAN pattern is being installed for VF the VLAN network interface over VF is
> created, like the command does:
> 
>   ip link add link vf.if name mlx5.wa.1.100 type vlan id 100
> 
> The PMD in DPDK maintains the database of created VLAN interfaces for
> each existing VF and requested VLAN tags. When all of the RTE Flows using
> the given VLAN tag are removed the created VLAN interface with this VLAN
> tag is deleted.
> 
> The name of created VLAN interface follows the format:
> 
>   evmlx.d1.d2, where d1 is VF interface ifindex, d2 - VLAN ifindex
> 
> Implementation limitations:
> 
> - mask in rules is ignored, rule must specify VLAN tags exactly,
>   no wildcards (which are implemented by the masks) are allowed
> 
> - virtual environment is detected via rte_hypervisor() call,
>   currently it checks the RTE_CPUFLAG_HYPERVISOR flag for x86
>   platform. For other architectures workaround always
>   applied for the Flow over PCI VF
> 
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>

After rebase, 
Acked-by: Matan Azrad <matan@mellanox.com>

> ---
>  drivers/net/mlx5/mlx5.c            |   6 +
>  drivers/net/mlx5/mlx5.h            |  30 ++++
>  drivers/net/mlx5/mlx5_flow.c       |  22 +++
>  drivers/net/mlx5/mlx5_flow.h       |   5 +
>  drivers/net/mlx5/mlx5_flow_dv.c    |  33 ++++-
>  drivers/net/mlx5/mlx5_flow_verbs.c |  25 +++-
>  drivers/net/mlx5/mlx5_nl.c         | 279
> +++++++++++++++++++++++++++++++++++++
>  7 files changed, 396 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> d93f92d..8549167 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -690,6 +690,8 @@ struct mlx5_dev_spawn_data {
>  		close(priv->nl_socket_route);
>  	if (priv->nl_socket_rdma >= 0)
>  		close(priv->nl_socket_rdma);
> +	if (priv->esxi_context)
> +		mlx5_vlan_esxi_exit(priv->esxi_context);
>  	if (priv->sh) {
>  		/*
>  		 * Free the shared context in last turn, because the cleanup
> @@ -1546,6 +1548,8 @@ struct mlx5_dev_spawn_data {  #endif
>  	/* Store device configuration on private structure. */
>  	priv->config = config;
> +	/* Create context for virtual machine VLAN workaround. */
> +	priv->esxi_context = mlx5_vlan_esxi_init(eth_dev, spawn->ifindex);
>  	if (config.dv_flow_en) {
>  		err = mlx5_alloc_shared_dr(priv);
>  		if (err)
> @@ -1572,6 +1576,8 @@ struct mlx5_dev_spawn_data {
>  			close(priv->nl_socket_route);
>  		if (priv->nl_socket_rdma >= 0)
>  			close(priv->nl_socket_rdma);
> +		if (priv->esxi_context)
> +			mlx5_vlan_esxi_exit(priv->esxi_context);
>  		if (own_domain_id)
>  			claim_zero(rte_eth_switch_domain_free(priv-
> >domain_id));
>  		rte_free(priv);
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> 5af3f41..87afa7a 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -231,6 +231,27 @@ enum mlx5_verbs_alloc_type {
>  	MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
>  };
> 
> +/* VLAN netdev for ESXi VLAN workaround. */ struct mlx5_vlan_dev {
> +	uint32_t refcnt;
> +	uint32_t ifindex; /**< Own interface index. */ };
> +
> +/* Structure for VF ESXi VLAN workaround. */ struct mlx5_vf_vlan {
> +	uint32_t tag:12;
> +	uint32_t created:1;
> +};
> +
> +/* Array of VLAN devices created on the base of VF */ struct
> +mlx5_vlan_esxi_context {
> +	int nl_socket;
> +	uint32_t nl_sn;
> +	uint32_t vf_ifindex;
> +	struct rte_eth_dev *dev;
> +	struct mlx5_vlan_dev vlan_dev[4096];
> +};
> +
>  /**
>   * Verbs allocator needs a context to know in the callback which kind of
>   * resources it is allocating.
> @@ -386,6 +407,7 @@ struct mlx5_priv {
>  	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
>  	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
>  	uint32_t nl_sn; /* Netlink message sequence number. */
> +	struct mlx5_vlan_esxi_context *esxi_context; /* ESXi VLAN context.
> */
>  #ifndef RTE_ARCH_64
>  	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
>  	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; @@ -582,6
> +604,14 @@ int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct
> rte_ether_addr *mac,  int mlx5_nl_switch_info(int nl, unsigned int ifindex,
>  			struct mlx5_switch_info *info);
> 
> +struct mlx5_vlan_esxi_context *mlx5_vlan_esxi_init(struct rte_eth_dev
> *dev,
> +						   uint32_t ifindex);
> +void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *ctx); void
> +mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vf_vlan);
> +void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vf_vlan);
> +
>  /* mlx5_devx_cmds.c */
> 
>  int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx, diff --git
> a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index
> 4ba34db..42743d2 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -1200,6 +1200,8 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,
>   *   Item specification.
>   * @param[in] item_flags
>   *   Bit-fields that holds the items detected until now.
> + * @param[in] dev
> + *   Ethernet device flow is being created on.
>   * @param[out] error
>   *   Pointer to error structure.
>   *
> @@ -1209,6 +1211,7 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,  int  mlx5_flow_validate_item_vlan(const
> struct rte_flow_item *item,
>  			     uint64_t item_flags,
> +			     struct rte_eth_dev *dev,
>  			     struct rte_flow_error *error)
>  {
>  	const struct rte_flow_item_vlan *spec = item->spec; @@ -1243,6
> +1246,25 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev,
> int32_t priority,
>  					error);
>  	if (ret)
>  		return ret;
> +	if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
> +		struct mlx5_priv *priv = dev->data->dev_private;
> +
> +		if (priv->esxi_context) {
> +			/*
> +			 * Non-NULL context means we have a virtual
> machine
> +			 * and SR-IOV enabled, we have to create VLAN
> interface
> +			 * to make hypervisor (ESXi) to setup E-Switch vport
> +			 * context correctly. We avoid creating the multiple
> +			 * VLAN interfaces, so we cannot support VLAN tag
> mask.
> +			 */
> +			return rte_flow_error_set(error, EINVAL,
> +
> RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "VLAN tag mask is not"
> +						  " supported in virtual"
> +						  " environment");
> +		}
> +	}
>  	if (spec) {
>  		vlan_tag = spec->tci;
>  		vlan_tag &= mask->tci;
> diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
> index 72b339e..ac20572 100644
> --- a/drivers/net/mlx5/mlx5_flow.h
> +++ b/drivers/net/mlx5/mlx5_flow.h
> @@ -318,6 +318,8 @@ struct mlx5_flow_dv {
>  	/**< Pointer to the jump action resource. */
>  	struct mlx5_flow_dv_port_id_action_resource *port_id_action;
>  	/**< Pointer to port ID action resource. */
> +	struct mlx5_vf_vlan vf_vlan;
> +	/**< Structure for VF ESXi VLAN workaround. */
>  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
>  	void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
>  	/**< Action list. */
> @@ -343,6 +345,8 @@ struct mlx5_flow_verbs {
>  	struct ibv_flow *flow; /**< Verbs flow pointer. */
>  	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
>  	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
> +	struct mlx5_vf_vlan vf_vlan;
> +	/**< Structure for VF ESXi VLAN workaround. */
>  };
> 
>  /** Device flow structure. */
> @@ -507,6 +511,7 @@ int mlx5_flow_validate_item_udp(const struct
> rte_flow_item *item,
>  				struct rte_flow_error *error);
>  int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
>  				 uint64_t item_flags,
> +				 struct rte_eth_dev *dev,
>  				 struct rte_flow_error *error);
>  int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
>  				  uint64_t item_flags,
> diff --git a/drivers/net/mlx5/mlx5_flow_dv.c
> b/drivers/net/mlx5/mlx5_flow_dv.c index 3fa624b..63183b5 100644
> --- a/drivers/net/mlx5/mlx5_flow_dv.c
> +++ b/drivers/net/mlx5/mlx5_flow_dv.c
> @@ -2363,7 +2363,7 @@ struct field_modify_info modify_tcp[] = {
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
>  			ret = mlx5_flow_validate_item_vlan(items,
> item_flags,
> -							   error);
> +							   dev, error);
>  			if (ret < 0)
>  				return ret;
>  			last_item = tunnel ?
> MLX5_FLOW_LAYER_INNER_VLAN :
> @@ -2914,6 +2914,8 @@ struct field_modify_info modify_tcp[] = {
>  /**
>   * Add VLAN item to matcher and to the value.
>   *
> + * @param[in, out] dev_flow
> + *   Flow descriptor.
>   * @param[in, out] matcher
>   *   Flow matcher.
>   * @param[in, out] key
> @@ -2924,7 +2926,8 @@ struct field_modify_info modify_tcp[] = {
>   *   Item is inner pattern.
>   */
>  static void
> -flow_dv_translate_item_vlan(void *matcher, void *key,
> +flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
> +			    void *matcher, void *key,
>  			    const struct rte_flow_item *item,
>  			    int inner)
>  {
> @@ -2951,6 +2954,12 @@ struct field_modify_info modify_tcp[] = {
>  		headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
>  					 outer_headers);
>  		headers_v = MLX5_ADDR_OF(fte_match_param, key,
> outer_headers);
> +		/*
> +		 * This is workaround, masks are not supported,
> +		 * and pre-validated.
> +		 */
> +		dev_flow->dv.vf_vlan.tag =
> +			rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
>  	}
>  	tci_m = rte_be_to_cpu_16(vlan_m->tci);
>  	tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); @@ -4443,7
> +4452,8 @@ struct field_modify_info modify_tcp[] = {
>  					     MLX5_FLOW_LAYER_OUTER_L2;
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
> -			flow_dv_translate_item_vlan(match_mask,
> match_value,
> +			flow_dv_translate_item_vlan(dev_flow,
> +						    match_mask, match_value,
>  						    items, tunnel);
>  			matcher.priority = MLX5_PRIORITY_MAP_L2;
>  			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2
> | @@ -4658,6 +4668,17 @@ struct field_modify_info modify_tcp[] = {
>  					   "hardware refuses to create flow");
>  			goto error;
>  		}
> +		if (priv->esxi_context &&
> +		    dev_flow->dv.vf_vlan.tag &&
> +		    !dev_flow->dv.vf_vlan.created) {
> +			/*
> +			 * The rule contains the VLAN pattern.
> +			 * For VF we are going to create VLAN
> +			 * interface to make ESXi set correct
> +			 * e-Switch vport context.
> +			 */
> +			mlx5_vlan_esxi_acquire(dev, &dev_flow-
> >dv.vf_vlan);
> +		}
>  	}
>  	return 0;
>  error:
> @@ -4671,6 +4692,9 @@ struct field_modify_info modify_tcp[] = {
>  				mlx5_hrxq_release(dev, dv->hrxq);
>  			dv->hrxq = NULL;
>  		}
> +		if (dev_flow->dv.vf_vlan.tag &&
> +		    dev_flow->dv.vf_vlan.created)
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >dv.vf_vlan);
>  	}
>  	rte_errno = err; /* Restore rte_errno. */
>  	return -rte_errno;
> @@ -4871,6 +4895,9 @@ struct field_modify_info modify_tcp[] = {
>  				mlx5_hrxq_release(dev, dv->hrxq);
>  			dv->hrxq = NULL;
>  		}
> +		if (dev_flow->dv.vf_vlan.tag &&
> +		    dev_flow->dv.vf_vlan.created)
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >dv.vf_vlan);
>  	}
>  }
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c
> b/drivers/net/mlx5/mlx5_flow_verbs.c
> index 2f4c80c..5909488 100644
> --- a/drivers/net/mlx5/mlx5_flow_verbs.c
> +++ b/drivers/net/mlx5/mlx5_flow_verbs.c
> @@ -386,6 +386,9 @@
>  		flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
>  	else
>  		flow_verbs_item_vlan_update(dev_flow->verbs.attr, &eth);
> +	if (!tunnel)
> +		dev_flow->verbs.vf_vlan.tag =
> +			rte_be_to_cpu_16(spec->tci) & 0x0fff;
>  }
> 
>  /**
> @@ -1049,7 +1052,7 @@
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
>  			ret = mlx5_flow_validate_item_vlan(items,
> item_flags,
> -							   error);
> +							   dev, error);
>  			if (ret < 0)
>  				return ret;
>  			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2
> | @@ -1587,6 +1590,10 @@
>  				mlx5_hrxq_release(dev, verbs->hrxq);
>  			verbs->hrxq = NULL;
>  		}
> +		if (dev_flow->verbs.vf_vlan.tag &&
> +		    dev_flow->verbs.vf_vlan.created) {
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >verbs.vf_vlan);
> +		}
>  	}
>  }
> 
> @@ -1634,6 +1641,7 @@
>  flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
>  		 struct rte_flow_error *error)
>  {
> +	struct mlx5_priv *priv = dev->data->dev_private;
>  	struct mlx5_flow_verbs *verbs;
>  	struct mlx5_flow *dev_flow;
>  	int err;
> @@ -1683,6 +1691,17 @@
>  					   "hardware refuses to create flow");
>  			goto error;
>  		}
> +		if (priv->esxi_context &&
> +		    dev_flow->verbs.vf_vlan.tag &&
> +		    !dev_flow->verbs.vf_vlan.created) {
> +			/*
> +			 * The rule contains the VLAN pattern.
> +			 * For VF we are going to create VLAN
> +			 * interface to make ESXi set correct
> +			 * e-Switch vport context.
> +			 */
> +			mlx5_vlan_esxi_acquire(dev, &dev_flow-
> >verbs.vf_vlan);
> +		}
>  	}
>  	return 0;
>  error:
> @@ -1696,6 +1715,10 @@
>  				mlx5_hrxq_release(dev, verbs->hrxq);
>  			verbs->hrxq = NULL;
>  		}
> +		if (dev_flow->verbs.vf_vlan.tag &&
> +		    dev_flow->verbs.vf_vlan.created) {
> +			mlx5_vlan_esxi_release(dev, &dev_flow-
> >verbs.vf_vlan);
> +		}
>  	}
>  	rte_errno = err; /* Restore rte_errno. */
>  	return -rte_errno;
> diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c index
> 5773fa7..8516442 100644
> --- a/drivers/net/mlx5/mlx5_nl.c
> +++ b/drivers/net/mlx5/mlx5_nl.c
> @@ -12,11 +12,14 @@
>  #include <stdbool.h>
>  #include <stdint.h>
>  #include <stdlib.h>
> +#include <stdalign.h>
>  #include <string.h>
>  #include <sys/socket.h>
>  #include <unistd.h>
> 
>  #include <rte_errno.h>
> +#include <rte_malloc.h>
> +#include <rte_hypervisor.h>
> 
>  #include "mlx5.h"
>  #include "mlx5_utils.h"
> @@ -28,6 +31,8 @@
>  /* Receive buffer size for the Netlink socket */  #define
> MLX5_RECV_BUF_SIZE 32768
> 
> +/** Parameters of VLAN devices created by driver. */ #define
> +MLX5_ESXI_VLAN_DEVICE_PFX "evmlx"
>  /*
>   * Define NDA_RTA as defined in iproute2 sources.
>   *
> @@ -987,3 +992,277 @@ struct mlx5_nl_ifindex_data {
>  	}
>  	return ret;
>  }
> +
> +/*
> + * Delete VLAN network device by ifindex.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_vlan_esxi_init().
> + * @param[in] ifindex
> + *   Interface index of network device to delete.
> + */
> +static void
> +mlx5_vlan_esxi_delete(struct mlx5_vlan_esxi_context *esxi,
> +		      uint32_t ifindex)
> +{
> +	int ret;
> +	struct {
> +		struct nlmsghdr nh;
> +		struct ifinfomsg info;
> +	} req = {
> +		.nh = {
> +			.nlmsg_len = NLMSG_LENGTH(sizeof(struct
> ifinfomsg)),
> +			.nlmsg_type = RTM_DELLINK,
> +			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
> +		},
> +		.info = {
> +			.ifi_family = AF_UNSPEC,
> +			.ifi_index = ifindex,
> +		},
> +	};
> +
> +	if (ifindex) {
> +		++esxi->nl_sn;
> +		if (!esxi->nl_sn)
> +			++esxi->nl_sn;
> +		ret = mlx5_nl_send(esxi->nl_socket, &req.nh, esxi->nl_sn);
> +		if (ret >= 0)
> +			ret = mlx5_nl_recv(esxi->nl_socket,
> +					   esxi->nl_sn,
> +					   NULL, NULL);
> +		if (ret < 0)
> +			DRV_LOG(WARNING, "netlink: error deleting"
> +					 " VLAN ESXi ifindex %u, %d",
> +					 ifindex, ret);
> +	}
> +}
> +
> +/* Set of subroutines to build Netlink message. */ static struct nlattr
> +* nl_msg_tail(struct nlmsghdr *nlh) {
> +	return (struct nlattr *)
> +		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len)); }
> +
> +static void
> +nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
> +{
> +	struct nlattr *nla = nl_msg_tail(nlh);
> +
> +	nla->nla_type = type;
> +	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
> +	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
> +
> +	if (alen)
> +		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen); }
> +
> +static struct nlattr *
> +nl_attr_nest_start(struct nlmsghdr *nlh, int type) {
> +	struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
> +
> +	nl_attr_put(nlh, type, NULL, 0);
> +	return nest;
> +}
> +
> +static void
> +nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest) {
> +	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest; }
> +
> +/*
> + * Create network VLAN device with specified VLAN tag.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_vlan_esxi_init().
> + * @param[in] ifindex
> + *   Base network interface index.
> + * @param[in] tag
> + *   VLAN tag for VLAN network device to create.
> + */
> +static uint32_t
> +mlx5_vlan_esxi_create(struct mlx5_vlan_esxi_context *esxi,
> +		      uint32_t ifindex,
> +		      uint16_t tag)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifinfomsg *ifm;
> +	char name[sizeof(MLX5_ESXI_VLAN_DEVICE_PFX) + 32];
> +
> +	alignas(RTE_CACHE_LINE_SIZE)
> +	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
> +		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
> +		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
> +		    NLMSG_ALIGN(sizeof(uint32_t)) +
> +		    NLMSG_ALIGN(sizeof(name)) +
> +		    NLMSG_ALIGN(sizeof("vlan")) +
> +		    NLMSG_ALIGN(sizeof(uint32_t)) +
> +		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
> +	struct nlattr *na_info;
> +	struct nlattr *na_vlan;
> +	int ret;
> +
> +	memset(buf, 0, sizeof(buf));
> +	++esxi->nl_sn;
> +	if (!esxi->nl_sn)
> +		++esxi->nl_sn;
> +	nlh = (struct nlmsghdr *)buf;
> +	nlh->nlmsg_len = sizeof(struct nlmsghdr);
> +	nlh->nlmsg_type = RTM_NEWLINK;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
> +			   NLM_F_EXCL | NLM_F_ACK;
> +	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
> +	nlh->nlmsg_len += sizeof(struct ifinfomsg);
> +	ifm->ifi_family = AF_UNSPEC;
> +	ifm->ifi_type = 0;
> +	ifm->ifi_index = 0;
> +	ifm->ifi_flags = IFF_UP;
> +	ifm->ifi_change = 0xffffffff;
> +	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
> +	ret = snprintf(name, sizeof(name), "%s.%u.%u",
> +		       MLX5_ESXI_VLAN_DEVICE_PFX, ifindex, tag);
> +	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
> +	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
> +	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
> +	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
> +	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
> +	nl_attr_nest_end(nlh, na_vlan);
> +	nl_attr_nest_end(nlh, na_info);
> +	assert(sizeof(buf) >= nlh->nlmsg_len);
> +	ret = mlx5_nl_send(esxi->nl_socket, nlh, esxi->nl_sn);
> +	if (ret >= 0)
> +		ret = mlx5_nl_recv(esxi->nl_socket, esxi->nl_sn, NULL,
> NULL);
> +	if (ret < 0) {
> +		DRV_LOG(WARNING,
> +			"netlink: VLAN %s create failure (%d)",
> +			name, ret);
> +	}
> +	// Try to get ifindex of created or pre-existing device.
> +	ret = if_nametoindex(name);
> +	if (!ret) {
> +		DRV_LOG(WARNING,
> +			"VLAN %s failed to get index (%d)",
> +			name, errno);
> +		return 0;
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Release VLAN network device, created for ESXi workaround.
> + *
> + * @param[in] dev
> + *   Ethernet device object, Netlink context provider.
> + * @param[in] vlan
> + *   Object representing the network device to release.
> + */
> +void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vlan)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
> +	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
> +
> +	assert(vlan->created);
> +	assert(priv->esxi_context);
> +	if (!vlan->created || !esxi)
> +		return;
> +	vlan->created = 0;
> +	assert(vlan_dev[vlan->tag].refcnt);
> +	if (--vlan_dev[vlan->tag].refcnt == 0 &&
> +	    vlan_dev[vlan->tag].ifindex) {
> +		mlx5_vlan_esxi_delete(esxi, vlan_dev[vlan->tag].ifindex);
> +		vlan_dev[vlan->tag].ifindex = 0;
> +	}
> +}
> +
> +/**
> + * Acquire VLAN interface with specified tag for ESXi workaround.
> + *
> + * @param[in] dev
> + *   Ethernet device object, Netlink context provider.
> + * @param[in] vlan
> + *   Object representing the network device to acquire.
> + */
> +void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
> +			    struct mlx5_vf_vlan *vlan)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
> +	struct mlx5_vlan_dev *vlan_dev = &esxi->vlan_dev[0];
> +
> +	assert(!vlan->created);
> +	assert(priv->esxi_context);
> +	if (vlan->created || !esxi)
> +		return;
> +	if (vlan_dev[vlan->tag].refcnt == 0) {
> +		assert(!vlan_dev[vlan->tag].ifindex);
> +		vlan_dev[vlan->tag].ifindex =
> +			mlx5_vlan_esxi_create(esxi,
> +					      esxi->vf_ifindex,
> +					      vlan->tag);
> +	}
> +	if (vlan_dev[vlan->tag].ifindex) {
> +		vlan_dev[vlan->tag].refcnt++;
> +		vlan->created = 1;
> +	}
> +}
> +
> +/*
> + * Create per ethernet device VLAN ESXi workaround context  */ struct
> +mlx5_vlan_esxi_context * mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
> +		    uint32_t ifindex)
> +{
> +	struct mlx5_priv *priv = dev->data->dev_private;
> +	struct mlx5_dev_config *config = &priv->config;
> +	struct mlx5_vlan_esxi_context *esxi;
> +
> +	/* Do not engage workaround over PF. */
> +	if (!config->vf)
> +		return NULL;
> +	/* Check whether there is virtual environment */
> +	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
> +		return NULL;
> +	esxi = rte_zmalloc(__func__, sizeof(*esxi), sizeof(uint32_t));
> +	if (!esxi) {
> +		DRV_LOG(WARNING,
> +			"Can not allocate memory"
> +			" for ESXi VLAN context");
> +		return NULL;
> +	}
> +	esxi->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
> +	if (esxi->nl_socket < 0) {
> +		DRV_LOG(WARNING,
> +			"Can not create Netlink socket"
> +			" for ESXi VLAN context");
> +		rte_free(esxi);
> +		return NULL;
> +	}
> +	esxi->nl_sn = random();
> +	esxi->vf_ifindex = ifindex;
> +	esxi->dev = dev;
> +	/* Cleanup for existing VLAN devices. */
> +	return esxi;
> +}
> +
> +/*
> + * Destroy per ethernet device VLAN ESXi workaround context  */ void
> +mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *esxi) {
> +	unsigned int i;
> +
> +	/* Delete all remaining VLAN devices. */
> +	for (i = 0; i < RTE_DIM(esxi->vlan_dev); i++) {
> +		if (esxi->vlan_dev[i].ifindex)
> +			mlx5_vlan_esxi_delete(esxi, esxi-
> >vlan_dev[i].ifindex);
> +	}
> +	if (esxi->nl_socket >= 0)
> +		close(esxi->nl_socket);
> +	rte_free(esxi);
> +}
> --
> 1.8.3.1

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d93f92d..8549167 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -690,6 +690,8 @@  struct mlx5_dev_spawn_data {
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->esxi_context)
+		mlx5_vlan_esxi_exit(priv->esxi_context);
 	if (priv->sh) {
 		/*
 		 * Free the shared context in last turn, because the cleanup
@@ -1546,6 +1548,8 @@  struct mlx5_dev_spawn_data {
 #endif
 	/* Store device configuration on private structure. */
 	priv->config = config;
+	/* Create context for virtual machine VLAN workaround. */
+	priv->esxi_context = mlx5_vlan_esxi_init(eth_dev, spawn->ifindex);
 	if (config.dv_flow_en) {
 		err = mlx5_alloc_shared_dr(priv);
 		if (err)
@@ -1572,6 +1576,8 @@  struct mlx5_dev_spawn_data {
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (priv->esxi_context)
+			mlx5_vlan_esxi_exit(priv->esxi_context);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 5af3f41..87afa7a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -231,6 +231,27 @@  enum mlx5_verbs_alloc_type {
 	MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
 };
 
+/* VLAN netdev for ESXi VLAN workaround. */
+struct mlx5_vlan_dev {
+	uint32_t refcnt;
+	uint32_t ifindex; /**< Own interface index. */
+};
+
+/* Structure for VF ESXi VLAN workaround. */
+struct mlx5_vf_vlan {
+	uint32_t tag:12;
+	uint32_t created:1;
+};
+
+/* Array of VLAN devices created on the base of VF */
+struct mlx5_vlan_esxi_context {
+	int nl_socket;
+	uint32_t nl_sn;
+	uint32_t vf_ifindex;
+	struct rte_eth_dev *dev;
+	struct mlx5_vlan_dev vlan_dev[4096];
+};
+
 /**
  * Verbs allocator needs a context to know in the callback which kind of
  * resources it is allocating.
@@ -386,6 +407,7 @@  struct mlx5_priv {
 	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
 	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
+	struct mlx5_vlan_esxi_context *esxi_context; /* ESXi VLAN context. */
 #ifndef RTE_ARCH_64
 	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
 	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
@@ -582,6 +604,14 @@  int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
 int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 			struct mlx5_switch_info *info);
 
+struct mlx5_vlan_esxi_context *mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+						   uint32_t ifindex);
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *ctx);
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vf_vlan);
+
 /* mlx5_devx_cmds.c */
 
 int mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 4ba34db..42743d2 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -1200,6 +1200,8 @@  uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
  *   Item specification.
  * @param[in] item_flags
  *   Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ *   Ethernet device flow is being created on.
  * @param[out] error
  *   Pointer to error structure.
  *
@@ -1209,6 +1211,7 @@  uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 int
 mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 			     uint64_t item_flags,
+			     struct rte_eth_dev *dev,
 			     struct rte_flow_error *error)
 {
 	const struct rte_flow_item_vlan *spec = item->spec;
@@ -1243,6 +1246,25 @@  uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
 					error);
 	if (ret)
 		return ret;
+	if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+		struct mlx5_priv *priv = dev->data->dev_private;
+
+		if (priv->esxi_context) {
+			/*
+			 * Non-NULL context means we have a virtual machine
+			 * and SR-IOV enabled, we have to create VLAN interface
+			 * to make hypervisor (ESXi) to setup E-Switch vport
+			 * context correctly. We avoid creating the multiple
+			 * VLAN interfaces, so we cannot support VLAN tag mask.
+			 */
+			return rte_flow_error_set(error, EINVAL,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "VLAN tag mask is not"
+						  " supported in virtual"
+						  " environment");
+		}
+	}
 	if (spec) {
 		vlan_tag = spec->tci;
 		vlan_tag &= mask->tci;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 72b339e..ac20572 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -318,6 +318,8 @@  struct mlx5_flow_dv {
 	/**< Pointer to the jump action resource. */
 	struct mlx5_flow_dv_port_id_action_resource *port_id_action;
 	/**< Pointer to port ID action resource. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF ESXi VLAN workaround. */
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
 	/**< Action list. */
@@ -343,6 +345,8 @@  struct mlx5_flow_verbs {
 	struct ibv_flow *flow; /**< Verbs flow pointer. */
 	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
 	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
+	struct mlx5_vf_vlan vf_vlan;
+	/**< Structure for VF ESXi VLAN workaround. */
 };
 
 /** Device flow structure. */
@@ -507,6 +511,7 @@  int mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
 				struct rte_flow_error *error);
 int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
 				 uint64_t item_flags,
+				 struct rte_eth_dev *dev,
 				 struct rte_flow_error *error);
 int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
 				  uint64_t item_flags,
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 3fa624b..63183b5 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -2363,7 +2363,7 @@  struct field_modify_info modify_tcp[] = {
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
@@ -2914,6 +2914,8 @@  struct field_modify_info modify_tcp[] = {
 /**
  * Add VLAN item to matcher and to the value.
  *
+ * @param[in, out] dev_flow
+ *   Flow descriptor.
  * @param[in, out] matcher
  *   Flow matcher.
  * @param[in, out] key
@@ -2924,7 +2926,8 @@  struct field_modify_info modify_tcp[] = {
  *   Item is inner pattern.
  */
 static void
-flow_dv_translate_item_vlan(void *matcher, void *key,
+flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
+			    void *matcher, void *key,
 			    const struct rte_flow_item *item,
 			    int inner)
 {
@@ -2951,6 +2954,12 @@  struct field_modify_info modify_tcp[] = {
 		headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
 					 outer_headers);
 		headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+		/*
+		 * This is workaround, masks are not supported,
+		 * and pre-validated.
+		 */
+		dev_flow->dv.vf_vlan.tag =
+			rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
 	}
 	tci_m = rte_be_to_cpu_16(vlan_m->tci);
 	tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci);
@@ -4443,7 +4452,8 @@  struct field_modify_info modify_tcp[] = {
 					     MLX5_FLOW_LAYER_OUTER_L2;
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
-			flow_dv_translate_item_vlan(match_mask, match_value,
+			flow_dv_translate_item_vlan(dev_flow,
+						    match_mask, match_value,
 						    items, tunnel);
 			matcher.priority = MLX5_PRIORITY_MAP_L2;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -4658,6 +4668,17 @@  struct field_modify_info modify_tcp[] = {
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->esxi_context &&
+		    dev_flow->dv.vf_vlan.tag &&
+		    !dev_flow->dv.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make ESXi set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_esxi_acquire(dev, &dev_flow->dv.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -4671,6 +4692,9 @@  struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
@@ -4871,6 +4895,9 @@  struct field_modify_info modify_tcp[] = {
 				mlx5_hrxq_release(dev, dv->hrxq);
 			dv->hrxq = NULL;
 		}
+		if (dev_flow->dv.vf_vlan.tag &&
+		    dev_flow->dv.vf_vlan.created)
+			mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
 	}
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 2f4c80c..5909488 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -386,6 +386,9 @@ 
 		flow_verbs_spec_add(&dev_flow->verbs, &eth, size);
 	else
 		flow_verbs_item_vlan_update(dev_flow->verbs.attr, &eth);
+	if (!tunnel)
+		dev_flow->verbs.vf_vlan.tag =
+			rte_be_to_cpu_16(spec->tci) & 0x0fff;
 }
 
 /**
@@ -1049,7 +1052,7 @@ 
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
 			ret = mlx5_flow_validate_item_vlan(items, item_flags,
-							   error);
+							   dev, error);
 			if (ret < 0)
 				return ret;
 			last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -1587,6 +1590,10 @@ 
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 }
 
@@ -1634,6 +1641,7 @@ 
 flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 		 struct rte_flow_error *error)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_verbs *verbs;
 	struct mlx5_flow *dev_flow;
 	int err;
@@ -1683,6 +1691,17 @@ 
 					   "hardware refuses to create flow");
 			goto error;
 		}
+		if (priv->esxi_context &&
+		    dev_flow->verbs.vf_vlan.tag &&
+		    !dev_flow->verbs.vf_vlan.created) {
+			/*
+			 * The rule contains the VLAN pattern.
+			 * For VF we are going to create VLAN
+			 * interface to make ESXi set correct
+			 * e-Switch vport context.
+			 */
+			mlx5_vlan_esxi_acquire(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	return 0;
 error:
@@ -1696,6 +1715,10 @@ 
 				mlx5_hrxq_release(dev, verbs->hrxq);
 			verbs->hrxq = NULL;
 		}
+		if (dev_flow->verbs.vf_vlan.tag &&
+		    dev_flow->verbs.vf_vlan.created) {
+			mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+		}
 	}
 	rte_errno = err; /* Restore rte_errno. */
 	return -rte_errno;
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index 5773fa7..8516442 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -12,11 +12,14 @@ 
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdalign.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <unistd.h>
 
 #include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -28,6 +31,8 @@ 
 /* Receive buffer size for the Netlink socket */
 #define MLX5_RECV_BUF_SIZE 32768
 
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_ESXI_VLAN_DEVICE_PFX "evmlx"
 /*
  * Define NDA_RTA as defined in iproute2 sources.
  *
@@ -987,3 +992,277 @@  struct mlx5_nl_ifindex_data {
 	}
 	return ret;
 }
+
+/*
+ * Delete VLAN network device by ifindex, a failure is only logged.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete, zero is ignored.
+ */
+static void
+mlx5_vlan_esxi_delete(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex)
+{
+	int ret;
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+			.nlmsg_type = RTM_DELLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+
+	if (ifindex) {
+		++esxi->nl_sn;
+		if (!esxi->nl_sn) /* Zero sequence number is never used. */
+			++esxi->nl_sn;
+		ret = mlx5_nl_send(esxi->nl_socket, &req.nh, esxi->nl_sn);
+		if (ret >= 0)
+			ret = mlx5_nl_recv(esxi->nl_socket,
+					   esxi->nl_sn,
+					   NULL, NULL);
+		if (ret < 0)
+			DRV_LOG(WARNING, "netlink: error deleting"
+					 " VLAN ESXi ifindex %u, %d",
+					 ifindex, ret);
+	}
+}
+
+/* Netlink message build helpers; this one returns the aligned message end. */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+	return (struct nlattr *)
+		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
+}
+
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+	struct nlattr *nla = nl_msg_tail(nlh);
+
+	nla->nla_type = type;
+	/* Append attribute: nla_len excludes padding, nlmsg_len counts it. */
+	nla->nla_len = NLA_HDRLEN + alen;
+	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + NLA_ALIGN(nla->nla_len);
+	if (alen)
+		memcpy((uint8_t *)nla + NLA_HDRLEN, data, alen);
+}
+
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+	struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
+
+	nl_attr_put(nlh, type, NULL, 0); /* Header only, no payload yet. */
+	return nest; /* Caller hands it to nl_attr_nest_end(). */
+}
+
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{ /* Nest length spans from its header up to the message tail. */
+	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ * Returns ifindex of the device named evmlx.<ifindex>.<tag>, 0 if not found.
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ */
+static uint32_t
+mlx5_vlan_esxi_create(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex,
+		      uint16_t tag)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	char name[sizeof(MLX5_ESXI_VLAN_DEVICE_PFX) + 32];
+
+	alignas(RTE_CACHE_LINE_SIZE)
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(name)) +
+		    NLMSG_ALIGN(sizeof("vlan")) +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+	struct nlattr *na_info;
+	struct nlattr *na_vlan;
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	++esxi->nl_sn;
+	if (!esxi->nl_sn) /* Zero sequence number is never used. */
+		++esxi->nl_sn;
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_EXCL | NLM_F_ACK;
+	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct ifinfomsg);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_type = 0;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+	ret = snprintf(name, sizeof(name), "%s.%u.%u",
+		       MLX5_ESXI_VLAN_DEVICE_PFX, ifindex, tag);
+	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+	nl_attr_nest_end(nlh, na_vlan);
+	nl_attr_nest_end(nlh, na_info);
+	assert(sizeof(buf) >= nlh->nlmsg_len);
+	ret = mlx5_nl_send(esxi->nl_socket, nlh, esxi->nl_sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(esxi->nl_socket, esxi->nl_sn, NULL, NULL);
+	if (ret < 0) {
+		DRV_LOG(WARNING,
+			"netlink: VLAN %s create failure (%d)",
+			name, ret);
+	}
+	/* Try to get ifindex of created or pre-existing device. */
+	ret = if_nametoindex(name);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"VLAN %s failed to get index (%d)",
+			name, errno);
+		return 0;
+	}
+	return ret;
+}
+
+/*
+ * Release VLAN network device, created for ESXi workaround; the device
+ * is deleted as soon as its reference counter drops to zero.
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in, out] vlan
+ *   Object representing the network device to release.
+ */
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev;
+
+	assert(vlan->created);
+	assert(esxi);
+	if (!vlan->created || !esxi)
+		return;
+	vlan_dev = &esxi->vlan_dev[vlan->tag]; /* esxi is non-NULL here. */
+	vlan->created = 0;
+	assert(vlan_dev->refcnt);
+	if (--vlan_dev->refcnt == 0 && vlan_dev->ifindex) {
+		mlx5_vlan_esxi_delete(esxi, vlan_dev->ifindex);
+		vlan_dev->ifindex = 0;
+	}
+}
+
+/**
+ * Acquire VLAN interface with specified tag for ESXi workaround; on
+ * success vlan->created is set, on failure it is left cleared.
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in, out] vlan
+ *   Object representing the network device to acquire.
+ */
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev;
+
+	assert(!vlan->created);
+	assert(esxi);
+	if (vlan->created || !esxi)
+		return;
+	vlan_dev = &esxi->vlan_dev[vlan->tag]; /* esxi is non-NULL here. */
+	if (vlan_dev->refcnt == 0) {
+		assert(!vlan_dev->ifindex);
+		vlan_dev->ifindex = mlx5_vlan_esxi_create(esxi,
+						  esxi->vf_ifindex, vlan->tag);
+	}
+	/* Count the reference only if the VLAN device is really there. */
+	if (vlan_dev->ifindex) {
+		vlan_dev->refcnt++;
+		vlan->created = 1;
+	}
+}
+
+/*
+ * Create per Ethernet device ESXi VLAN workaround context, or NULL if unused.
+ */
+struct mlx5_vlan_esxi_context *
+mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+		    uint32_t ifindex)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	struct mlx5_vlan_esxi_context *esxi;
+
+	/* Do not engage workaround over PF. */
+	if (!config->vf)
+		return NULL;
+	/* No hypervisor is reported, the workaround is not needed. */
+	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
+		return NULL;
+	esxi = rte_zmalloc(__func__, sizeof(*esxi), sizeof(uint32_t));
+	if (!esxi) {
+		DRV_LOG(WARNING,
+			"Can not allocate memory"
+			" for ESXi VLAN context");
+		return NULL;
+	}
+	esxi->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+	if (esxi->nl_socket < 0) {
+		DRV_LOG(WARNING,
+			"Can not create Netlink socket"
+			" for ESXi VLAN context");
+		rte_free(esxi);
+		return NULL;
+	}
+	esxi->nl_sn = random();
+	esxi->vf_ifindex = ifindex; /* Base link of the VLAN devices. */
+	esxi->dev = dev;
+	/* NOTE(review): no cleanup of pre-existing VLAN devices is done here. */
+	return esxi;
+}
+
+/*
+ * Destroy per Ethernet device ESXi VLAN workaround context (non-NULL esxi).
+ */
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *esxi)
+{
+	unsigned int i;
+
+	/* Delete all remaining VLAN devices regardless of their refcnt. */
+	for (i = 0; i < RTE_DIM(esxi->vlan_dev); i++) {
+		if (esxi->vlan_dev[i].ifindex)
+			mlx5_vlan_esxi_delete(esxi, esxi->vlan_dev[i].ifindex);
+	}
+	if (esxi->nl_socket >= 0)
+		close(esxi->nl_socket);
+	rte_free(esxi);
+}