[dpdk-dev,v4,1/3] net/mlx5: use Netlink to add/remove MAC addresses

Message ID da1ddd10d7a144897fe2dfc4c71e9c922e378769.1522940682.git.nelio.laranjeiro@6wind.com (mailing list archive)
State Accepted, archived
Delegated to: Shahaf Shuler
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues

Commit Message

Nélio Laranjeiro April 5, 2018, 3:07 p.m. UTC
  VF devices are not able to receive traffic unless they explicitly request it
through Netlink.  This causes the request to be processed by the PF, which
adds/removes the MAC address to/from the VF table if the VF is trusted.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 doc/guides/nics/mlx5.rst       |   9 +
 drivers/net/mlx5/Makefile      |   1 +
 drivers/net/mlx5/mlx5.c        |  23 ++
 drivers/net/mlx5/mlx5.h        |  16 +
 drivers/net/mlx5/mlx5_ethdev.c |  27 ++
 drivers/net/mlx5/mlx5_mac.c    |  20 +-
 drivers/net/mlx5/mlx5_nl.c     | 533 +++++++++++++++++++++++++++++++++
 7 files changed, 626 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/mlx5/mlx5_nl.c
  

Patch

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 46d26e4c8..c812f0b4f 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -135,6 +135,15 @@  Limitations
 - Flows with a VXLAN Network Identifier equal (or ends to be equal)
   to 0 are not supported.
 - VXLAN TSO and checksum offloads are not supported on VM.
+- VF: flow rules created on VF devices can only match traffic targeted at the
+  configured MAC addresses (see ``rte_eth_dev_mac_addr_add()``).
+
+.. note::
+
+   MAC addresses not already present in the bridge table of the associated
+   kernel network device will be added and cleaned up by the PMD when closing
+   the device. In case of ungraceful program termination, some entries may
+   remain present and should be removed manually by other means.
 
 Statistics
 ----------
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 201f6f06a..ae118ad33 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -59,6 +59,7 @@  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
 INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 7d58d66bb..e52c60fb3 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -13,6 +13,7 @@ 
 #include <errno.h>
 #include <net/if.h>
 #include <sys/mman.h>
+#include <linux/rtnetlink.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -205,6 +206,10 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 		rte_free(priv->reta_idx);
 	if (priv->primary_socket)
 		mlx5_socket_uninit(dev);
+	if (priv->config.vf)
+		mlx5_nl_mac_addr_flush(dev);
+	if (priv->nl_socket >= 0)
+		close(priv->nl_socket);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -597,6 +602,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	int err = 0;
 	struct ibv_context *attr_ctx = NULL;
 	struct ibv_device_attr_ex device_attr;
+	unsigned int vf;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -646,6 +652,14 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			continue;
 		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
 			list[i]->name);
+		vf = ((pci_dev->id.device_id ==
+		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
+		      (pci_dev->id.device_id ==
+		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
+		      (pci_dev->id.device_id ==
+		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
+		      (pci_dev->id.device_id ==
+		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
 		attr_ctx = mlx5_glue->open_device(list[i]);
 		rte_errno = errno;
 		err = rte_errno;
@@ -869,6 +883,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
+		config.vf = vf;
 		config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
 			      (device_attr_ex.tso_caps.supported_qpts &
 			      (1 << IBV_QPT_RAW_PACKET)));
@@ -946,6 +961,14 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->dev_ops = &mlx5_dev_ops;
 		/* Register MAC address. */
 		claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+		priv->nl_socket = -1;
+		priv->nl_sn = 0;
+		if (vf) {
+			priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
+			if (priv->nl_socket < 0)
+				priv->nl_socket = -1;
+			mlx5_nl_mac_addr_sync(eth_dev);
+		}
 		TAILQ_INIT(&priv->flows);
 		TAILQ_INIT(&priv->ctrl_flows);
 		/* Hint libmlx5 to use PMD allocator for data plane resources */
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index faacfd9d6..683026b0f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -78,6 +78,7 @@  struct mlx5_dev_config {
 	unsigned int hw_vlan_strip:1; /* VLAN stripping is supported. */
 	unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
 	unsigned int hw_padding:1; /* End alignment padding is supported. */
+	unsigned int vf:1; /* This is a VF. */
 	unsigned int mps:2; /* Multi-packet send supported mode. */
 	unsigned int tunnel_en:1;
 	/* Whether tunnel stateless offloads are supported. */
@@ -119,6 +120,8 @@  struct priv {
 	struct ibv_pd *pd; /* Protection Domain. */
 	char ibdev_path[IBV_SYSFS_PATH_MAX]; /* IB device path for secondary */
 	struct ether_addr mac[MLX5_MAX_MAC_ADDRESSES]; /* MAC addresses. */
+	BITFIELD_DECLARE(mac_own, uint64_t, MLX5_MAX_MAC_ADDRESSES);
+	/* Bit-field of MAC addresses owned by the PMD. */
 	uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */
 	unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
 	/* Device properties. */
@@ -154,6 +157,8 @@  struct priv {
 	struct mlx5_dev_config config; /* Device configuration. */
 	struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
 	/* Context for Verbs allocator. */
+	int nl_socket; /* Netlink socket. */
+	uint32_t nl_sn; /* Netlink message sequence number. */
 };
 
 /* mlx5.c */
@@ -163,6 +168,7 @@  int mlx5_getenv_int(const char *);
 /* mlx5_ethdev.c */
 
 int mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE]);
+int mlx5_ifindex(const struct rte_eth_dev *dev);
 int mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr);
 int mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu);
 int mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep,
@@ -297,4 +303,14 @@  struct mlx5_mr *mlx5_mr_get(struct rte_eth_dev *dev, struct rte_mempool *mp);
 int mlx5_mr_release(struct mlx5_mr *mr);
 int mlx5_mr_verify(struct rte_eth_dev *dev);
 
+/* mlx5_nl.c */
+
+int mlx5_nl_init(uint32_t nlgroups);
+int mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
+			 uint32_t index);
+int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
+			    uint32_t index);
+void mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev);
+void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index b6f5101cf..bdd03c3d7 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -176,6 +176,33 @@  mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 	return 0;
 }
 
+/**
+ * Get the kernel interface index matching the device's interface name.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   Interface index (> 0) on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int
+mlx5_ifindex(const struct rte_eth_dev *dev)
+{
+	char ifname[IF_NAMESIZE];
+	int ret;
+
+	ret = mlx5_get_ifname(dev, &ifname);
+	if (ret)
+		return ret;
+	ret = if_nametoindex(ifname);
+	if (ret == 0) { /* if_nametoindex() reports failure with 0, not -1. */
+		rte_errno = errno ? errno : ENODEV;
+		return -rte_errno;
+	}
+	return ret;
+}
+
 /**
  * Perform ifreq ioctl() on associated Ethernet device.
  *
diff --git a/drivers/net/mlx5/mlx5_mac.c b/drivers/net/mlx5/mlx5_mac.c
index 01c7ba17a..e859fca6a 100644
--- a/drivers/net/mlx5/mlx5_mac.c
+++ b/drivers/net/mlx5/mlx5_mac.c
@@ -67,13 +67,19 @@  mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[ETHER_ADDR_LEN])
 void
 mlx5_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
 {
+	struct priv *priv = dev->data->dev_private;
+	const int vf = priv->config.vf;
+	int ret;
+
 	assert(index < MLX5_MAX_MAC_ADDRESSES);
+	if (vf)
+		mlx5_nl_mac_addr_remove(dev, &dev->data->mac_addrs[index],
+					index);
 	memset(&dev->data->mac_addrs[index], 0, sizeof(struct ether_addr));
 	if (!dev->data->promiscuous) {
-		int ret = mlx5_traffic_restart(dev);
-
+		ret = mlx5_traffic_restart(dev);
 		if (ret)
-			DRV_LOG(ERR, "port %u cannot remove mac address: %s",
+			DRV_LOG(ERR, "port %u cannot restart traffic: %s",
 				dev->data->port_id, strerror(rte_errno));
 	}
 }
@@ -97,6 +103,8 @@  int
 mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
 		  uint32_t index, uint32_t vmdq __rte_unused)
 {
+	struct priv *priv = dev->data->dev_private;
+	const int vf = priv->config.vf;
 	unsigned int i;
 
 	assert(index < MLX5_MAX_MAC_ADDRESSES);
@@ -111,6 +119,12 @@  mlx5_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
 		rte_errno = EADDRINUSE;
 		return -rte_errno;
 	}
+	if (vf) {
+		int ret = mlx5_nl_mac_addr_add(dev, mac, index);
+
+		if (ret)
+			return ret;
+	}
 	dev->data->mac_addrs[index] = *mac;
 	if (!dev->data->promiscuous)
 		return mlx5_traffic_restart(dev);
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
new file mode 100644
index 000000000..2f238beb3
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -0,0 +1,533 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <unistd.h>
+
+#include "mlx5.h"
+#include "mlx5_utils.h"
+
+/* Kernel message buffer size. NOTE(review): appears unused in this file. */
+#define MLX5_NL_BUF_SIZE (32 * 1024)
+/* Size in bytes requested for the Netlink socket send buffer (SO_SNDBUF). */
+#define MLX5_SEND_BUF_SIZE 32768
+/* Size in bytes of the socket receive buffer and of the reply buffer. */
+#define MLX5_RECV_BUF_SIZE 32768
+
+/*
+ * Local equivalent of the NDA_RTA() macro found in iproute2 sources: yields
+ * a pointer to the first attribute located after an aligned struct ndmsg.
+ * See include/libnetlink.h in iproute2 sources.
+ */
+#ifndef MLX5_NDA_RTA
+#define MLX5_NDA_RTA(r) \
+	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
+#endif
+
+/* Context handed to mlx5_nl_mac_addr_cb() while listing MAC addresses. */
+struct mlx5_nl_mac_addr {
+	struct ether_addr (*mac)[];
+	/**< Caller-provided array receiving the MAC addresses found. */
+	int mac_n; /**< Number of addresses stored in the array so far. */
+};
+
+/**
+ * Open a NETLINK_ROUTE socket.
+ *
+ * The socket is created close-on-exec, its send and receive buffer sizes are
+ * set to MLX5_SEND_BUF_SIZE and MLX5_RECV_BUF_SIZE, and it is bound to an
+ * address carrying the requested groups. The socket is closed again if any
+ * of these steps fails.
+ *
+ * @param nl_groups
+ *   Netlink group value (e.g. RTMGRP_LINK).
+ *
+ * @return
+ *   A file descriptor on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int
+mlx5_nl_init(uint32_t nl_groups)
+{
+	struct sockaddr_nl addr = {
+		.nl_family = AF_NETLINK,
+		.nl_groups = nl_groups,
+	};
+	int snd_sz = MLX5_SEND_BUF_SIZE;
+	int rcv_sz = MLX5_RECV_BUF_SIZE;
+	int sock;
+
+	sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (sock == -1) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/* Size both socket buffers, then bind; the first failure aborts. */
+	if (setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+		       &snd_sz, sizeof(snd_sz)) == -1)
+		goto fail;
+	if (setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+		       &rcv_sz, sizeof(rcv_sz)) == -1)
+		goto fail;
+	if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1)
+		goto fail;
+	return sock;
+fail:
+	/* Save errno before close() gets a chance to overwrite it. */
+	rte_errno = errno;
+	close(sock);
+	return -rte_errno;
+}
+
+/**
+ * Send a request to the kernel as a header followed by a separate payload.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] nh
+ *   Netlink message header; its pid and sequence number are overwritten.
+ * @param[in] sn
+ *   Sequence number.
+ * @param[in] req
+ *   Pointer to the request payload sent right after the header.
+ * @param[in] len
+ *   Length of the request payload in bytes.
+ *
+ * @return
+ *   The number of sent bytes on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+static int
+mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
+		int len)
+{
+	struct sockaddr_nl kernel = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec parts[2] = {
+		{ .iov_base = nh, .iov_len = sizeof(*nh), },
+		{ .iov_base = req, .iov_len = len, },
+	};
+	struct msghdr mh = {
+		.msg_name = &kernel,
+		.msg_namelen = sizeof(kernel),
+		.msg_iov = parts,
+		.msg_iovlen = 2,
+	};
+	int sent;
+
+	/* Communication with the kernel uses pid 0. */
+	nh->nlmsg_pid = 0;
+	nh->nlmsg_seq = sn;
+	sent = sendmsg(nlsk_fd, &mh, 0);
+	if (sent >= 0)
+		return sent;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Send a complete, contiguous Netlink message to the kernel.
+ *
+ * @param[in] nlsk_fd
+ *   The Netlink socket file descriptor used for communication.
+ * @param[in] nh
+ *   Netlink message to send; nh->nlmsg_len bytes are transmitted.
+ * @param[in] sn
+ *   Sequence number.
+ *
+ * @return
+ *   The number of sent bytes on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+static int
+mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
+{
+	struct sockaddr_nl kernel = {
+		.nl_family = AF_NETLINK,
+	};
+	struct iovec whole = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr mh = {
+		.msg_name = &kernel,
+		.msg_namelen = sizeof(kernel),
+		.msg_iov = &whole,
+		.msg_iovlen = 1,
+	};
+	int sent;
+
+	/* Communication with the kernel uses pid 0. */
+	nh->nlmsg_pid = 0;
+	nh->nlmsg_seq = sn;
+	sent = sendmsg(nlsk_fd, &mh, 0);
+	if (sent >= 0)
+		return sent;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Receive and process the kernel messages answering a previous request
+ * (see mlx5_nl_send() and mlx5_nl_request()).
+ *
+ * @param[in] nlsk_fd
+ *   The Netlink socket file descriptor used for communication.
+ * @param[in] sn
+ *   Sequence number expected; datagrams starting with another one are skipped.
+ * @param[in] cb
+ *   Callback invoked for each data message received, may be NULL.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
+	     void *arg)
+{
+	struct sockaddr_nl sa;
+	char buf[MLX5_RECV_BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		/* A single buffer receives each datagram. */
+		.msg_iovlen = 1,
+	};
+	int multipart = 0;
+	int ret = 0;
+
+	do {
+		struct nlmsghdr *nh;
+		int recv_bytes = 0;
+
+		do {
+			recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+			if (recv_bytes == -1) {
+				rte_errno = errno;
+				return -rte_errno;
+			}
+			nh = (struct nlmsghdr *)buf;
+		} while (nh->nlmsg_seq != sn);
+		for (;
+		     NLMSG_OK(nh, (unsigned int)recv_bytes);
+		     nh = NLMSG_NEXT(nh, recv_bytes)) {
+			if (nh->nlmsg_type == NLMSG_ERROR) {
+				struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+				if (err_data->error < 0) {
+					rte_errno = -err_data->error;
+					return -rte_errno;
+				}
+				/* Non-negative error code: acknowledgment. */
+				return 0;
+			}
+			/* Keep receiving multi-part replies until NLMSG_DONE. */
+			if (nh->nlmsg_flags & NLM_F_MULTI) {
+				if (nh->nlmsg_type == NLMSG_DONE)
+					return 0;
+				multipart = 1;
+			}
+			if (cb) {
+				ret = cb(nh, arg);
+				if (ret < 0)
+					return ret;
+			}
+		}
+	} while (multipart);
+	return ret;
+}
+
+/**
+ * Netlink callback collecting the link-layer addresses found in a message.
+ *
+ * @param nh
+ *   Pointer to Netlink Message Header.
+ * @param arg
+ *   Pointer to the struct mlx5_nl_mac_addr context to fill.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct mlx5_nl_mac_addr *ctx = arg;
+	struct ndmsg *ndm = NLMSG_DATA(nh);
+	struct rtattr *rta = MLX5_NDA_RTA(ndm);
+	int left = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*ndm));
+
+	for (; RTA_OK(rta, left); rta = RTA_NEXT(rta, left)) {
+		struct ether_addr *slot;
+
+		/* Only link-layer address attributes are of interest. */
+		if (rta->rta_type != NDA_LLADDR)
+			continue;
+		if (ctx->mac_n == MLX5_MAX_MAC_ADDRESSES) {
+			DRV_LOG(WARNING,
+				"not enough room to finalise the"
+				" request");
+			rte_errno = ENOMEM;
+			return -rte_errno;
+		}
+#ifndef NDEBUG
+		char m[18];
+
+		ether_format_addr(m, 18, RTA_DATA(rta));
+		DRV_LOG(DEBUG, "brige MAC address %s", m);
+#endif
+		slot = &(*ctx->mac)[ctx->mac_n++];
+		memcpy(slot, RTA_DATA(rta), ETHER_ADDR_LEN);
+	}
+	return 0;
+}
+
+/**
+ * Get the bridge MAC addresses registered on the device's interface.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mac[out]
+ *   Pointer to an array of MLX5_MAX_MAC_ADDRESSES MAC addresses to fill.
+ * @param mac_n[out]
+ *   Number of entries filled in MAC array. Left untouched on error and
+ *   when the device has no Netlink socket.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
+		      int *mac_n)
+{
+	struct priv *priv = dev->data->dev_private;
+	int iface_idx = mlx5_ifindex(dev);
+	struct {
+		struct nlmsghdr hdr;
+		struct ifinfomsg ifm;
+	} req = {
+		.hdr = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+			.nlmsg_type = RTM_GETNEIGH,
+			.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		},
+		.ifm = {
+			.ifi_family = PF_BRIDGE,
+			.ifi_index = iface_idx,
+		},
+	};
+	struct mlx5_nl_mac_addr data = { .mac = mac, .mac_n = 0 };
+	int ret;
+	uint32_t sn = priv->nl_sn++;
+
+	if (priv->nl_socket == -1)
+		return 0;
+	if (iface_idx <= 0) {
+		/* Never query with an index that is not this device's. */
+		rte_errno = iface_idx ? -iface_idx : ENODEV;
+		goto error;
+	}
+	ret = mlx5_nl_request(priv->nl_socket, &req.hdr, sn, &req.ifm,
+			      sizeof(req.ifm));
+	if (ret < 0)
+		goto error;
+	ret = mlx5_nl_recv(priv->nl_socket, sn, mlx5_nl_mac_addr_cb, &data);
+	if (ret < 0)
+		goto error;
+	*mac_n = data.mac_n;
+	return 0;
+error:
+	DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
+		dev->data->port_id, strerror(rte_errno));
+	return -rte_errno;
+}
+
+/**
+ * Add or remove a MAC address in the bridge neighbour table of the
+ * interface associated with the device, using Netlink.
+ *
+ * The request is sent with NLM_F_ACK and the kernel's answer is awaited;
+ * nothing is done and 0 is returned when the device has no Netlink
+ * socket.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mac
+ *   MAC address to consider.
+ * @param add
+ *   Non-zero to add the MAC address, 0 to remove it.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
+			int add)
+{
+	struct priv *priv = dev->data->dev_private;
+	int iface_idx = mlx5_ifindex(dev);
+	struct {
+		struct nlmsghdr hdr;
+		struct ndmsg ndm;
+		struct rtattr rta;
+		uint8_t buffer[ETHER_ADDR_LEN];
+	} msg = {
+		.hdr = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
+			.nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+				NLM_F_EXCL | NLM_F_ACK,
+		},
+		.ndm = {
+			.ndm_family = PF_BRIDGE,
+			.ndm_ifindex = iface_idx,
+			.ndm_state = NUD_NOARP | NUD_PERMANENT,
+			.ndm_flags = NTF_SELF,
+		},
+		.rta = {
+			.rta_len = RTA_LENGTH(ETHER_ADDR_LEN),
+			.rta_type = NDA_LLADDR,
+		},
+	};
+	const uint32_t sn = priv->nl_sn++;
+	const int sock = priv->nl_socket;
+
+	if (sock == -1)
+		return 0;
+	/* Store the address as attribute payload and account for it. */
+	memcpy(RTA_DATA(&msg.rta), mac, ETHER_ADDR_LEN);
+	msg.hdr.nlmsg_len = NLMSG_ALIGN(msg.hdr.nlmsg_len) +
+		RTA_ALIGN(msg.rta.rta_len);
+	/* Send, then wait for the answer requested through NLM_F_ACK. */
+	if (mlx5_nl_send(sock, &msg.hdr, sn) >= 0 &&
+	    mlx5_nl_recv(sock, sn, NULL, NULL) >= 0)
+		return 0;
+	DRV_LOG(DEBUG,
+		"port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
+		" %s",
+		dev->data->port_id,
+		add ? "add" : "remove",
+		mac->addr_bytes[0], mac->addr_bytes[1],
+		mac->addr_bytes[2], mac->addr_bytes[3],
+		mac->addr_bytes[4], mac->addr_bytes[5],
+		strerror(rte_errno));
+	return -rte_errno;
+}
+
+/**
+ * Add a MAC address through Netlink and record its ownership.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mac
+ *   MAC address to register.
+ * @param index
+ *   MAC address index, flagged in priv->mac_own when the address is added.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
+		     uint32_t index)
+{
+	struct priv *priv = dev->data->dev_private;
+	int err = mlx5_nl_mac_addr_modify(dev, mac, 1);
+
+	/* An address that was already present is not flagged as owned. */
+	if (err == -EEXIST)
+		return 0;
+	if (err == 0)
+		BITFIELD_SET(priv->mac_own, index);
+	return err;
+}
+
+/**
+ * Remove a MAC address through Netlink and clear its ownership flag.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mac
+ *   MAC address to remove.
+ * @param index
+ *   MAC address index, cleared from priv->mac_own before the request.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
+			uint32_t index)
+{
+	struct priv *const priv = dev->data->dev_private;
+
+	BITFIELD_RESET(priv->mac_own, index); /* Flag dropped in all cases. */
+	return mlx5_nl_mac_addr_modify(dev, mac, 0);
+}
+
+/**
+ * Synchronise Netlink bridge table to the internal table.
+ *
+ * Each address reported for the interface that is not yet present in
+ * dev->data->mac_addrs is stored in the first all-zero entry; addresses
+ * are dropped silently when no such entry is left.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
+{
+	struct ether_addr found[MLX5_MAX_MAC_ADDRESSES];
+	struct ether_addr *table = dev->data->mac_addrs;
+	int found_n = 0;
+	int i;
+
+	if (mlx5_nl_mac_addr_list(dev, &found, &found_n))
+		return;
+	for (i = 0; i != found_n; ++i) {
+		int spot = -1;
+		int known = 0;
+		int j;
+
+		/* One pass: look for a duplicate and the first free entry. */
+		for (j = 0; j != MLX5_MAX_MAC_ADDRESSES && !known; ++j) {
+			if (is_same_ether_addr(&found[i], &table[j]))
+				known = 1;
+			else if (spot < 0 && is_zero_ether_addr(&table[j]))
+				spot = j;
+		}
+		if (!known && spot >= 0)
+			table[spot] = found[i];
+	}
+}
+
+/**
+ * Remove every MAC address flagged as owned by the PMD.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
+{
+	struct priv *priv = dev->data->dev_private;
+	int idx = MLX5_MAX_MAC_ADDRESSES;
+
+	/* Walk the table from its last entry down to the first one. */
+	while (idx-- > 0) {
+		if (!BITFIELD_ISSET(priv->mac_own, idx))
+			continue;
+		mlx5_nl_mac_addr_remove(dev, &dev->data->mac_addrs[idx], idx);
+	}
+}