[v2,1/6] net/mlx5: lay groundwork for switch offloads

Message ID 20180713092910.26276-2-adrien.mazarguil@6wind.com (mailing list archive)
State Accepted, archived
Delegated to: Shahaf Shuler
Headers
Series net/mlx5: add support for switch flow rules |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply issues

Commit Message

Adrien Mazarguil July 13, 2018, 9:40 a.m. UTC
  With mlx5, unlike normal flow rules implemented through Verbs for traffic
emitted and received by the application, those targeting different logical
ports of the device (VF representors for instance) are offloaded at the
switch level and must be configured through Netlink (TC interface).

This patch adds preliminary support to manage such flow rules through the
flow API (rte_flow).

Instead of rewriting tons of Netlink helpers and as previously suggested by
Stephen [1], this patch introduces a new dependency to libmnl [2]
(LGPL-2.1) when compiling mlx5.

[1] https://mails.dpdk.org/archives/dev/2018-March/092676.html
[2] https://netfilter.org/projects/libmnl/

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
--
v2 changes:

- Added NETLINK_CAP_ACK definition if missing from the host system. This
  parameter is also not mandatory anymore and won't prevent creation of
  NL sockets when not supported.
- Modified mlx5_nl_flow_nl_ack() and mlx5_nl_flow_init() to consume the
  least amount of stack space based on message size, instead of the fixed
  MNL_SOCKET_BUFFER_SIZE which is quite large.
---
 drivers/net/mlx5/Makefile       |   2 +
 drivers/net/mlx5/mlx5.c         |  32 ++++++++
 drivers/net/mlx5/mlx5.h         |  10 +++
 drivers/net/mlx5/mlx5_nl_flow.c | 147 +++++++++++++++++++++++++++++++++++
 mk/rte.app.mk                   |   2 +-
 5 files changed, 192 insertions(+), 1 deletion(-)
  

Comments

Yongseok Koh July 14, 2018, 1:29 a.m. UTC | #1
On Jul 13, 2018, at 6:27 PM, Adrien Mazarguil <adrien.mazarguil@6wind.com> wrote:

With mlx5, unlike normal flow rules implemented through Verbs for traffic
emitted and received by the application, those targeting different logical
ports of the device (VF representors for instance) are offloaded at the
switch level and must be configured through Netlink (TC interface).

This patch adds preliminary support to manage such flow rules through the
flow API (rte_flow).

Instead of rewriting tons of Netlink helpers and as previously suggested by
Stephen [1], this patch introduces a new dependency to libmnl [2]
(LGPL-2.1) when compiling mlx5.

[1] https://mails.dpdk.org/archives/dev/2018-March/092676.html
[2] https://netfilter.org/projects/libmnl/

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Cc: Yongseok Koh <yskoh@mellanox.com>
--
Acked-by: Yongseok Koh <yskoh@mellanox.com>

Thanks

v2 changes:

- Added NETLINK_CAP_ACK definition if missing from the host system. This
 parameter is also not mandatory anymore and won't prevent creation of
 NL sockets when not supported.
- Modified mlx5_nl_flow_nl_ack() and mlx5_nl_flow_init() to consume the
 least amount of stack space based on message size, instead of the fixed
 MNL_SOCKET_BUFFER_SIZE which is quite large.
---
drivers/net/mlx5/Makefile       |   2 +
drivers/net/mlx5/mlx5.c         |  32 ++++++++
drivers/net/mlx5/mlx5.h         |  10 +++
drivers/net/mlx5/mlx5_nl_flow.c | 147 +++++++++++++++++++++++++++++++++++
mk/rte.app.mk                   |   2 +-
5 files changed, 192 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 9e274964b..8d3cb219b 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -33,6 +33,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c

ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
@@ -56,6 +57,7 @@ LDLIBS += -ldl
else
LDLIBS += -libverbs -lmlx5
endif
+LDLIBS += -lmnl
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
LDLIBS += -lrte_bus_pci
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6d3421fae..8fb8c91eb 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -282,6 +282,8 @@ mlx5_dev_close(struct rte_eth_dev *dev)
       close(priv->nl_socket_route);
   if (priv->nl_socket_rdma >= 0)
       close(priv->nl_socket_rdma);
+    if (priv->mnl_socket)
+        mlx5_nl_flow_socket_destroy(priv->mnl_socket);
   ret = mlx5_hrxq_ibv_verify(dev);
   if (ret)
       DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -1116,6 +1118,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
   claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
   if (vf && config.vf_nl_en)
       mlx5_nl_mac_addr_sync(eth_dev);
+    priv->mnl_socket = mlx5_nl_flow_socket_create();
+    if (!priv->mnl_socket) {
+        err = -rte_errno;
+        DRV_LOG(WARNING,
+            "flow rules relying on switch offloads will not be"
+            " supported: cannot open libmnl socket: %s",
+            strerror(rte_errno));
+    } else {
+        struct rte_flow_error error;
+        unsigned int ifindex = mlx5_ifindex(eth_dev);
+
+        if (!ifindex) {
+            err = -rte_errno;
+            error.message =
+                "cannot retrieve network interface index";
+        } else {
+            err = mlx5_nl_flow_init(priv->mnl_socket, ifindex,
+                        &error);
+        }
+        if (err) {
+            DRV_LOG(WARNING,
+                "flow rules relying on switch offloads will"
+                " not be supported: %s: %s",
+                error.message, strerror(rte_errno));
+            mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+            priv->mnl_socket = NULL;
+        }
+    }
   TAILQ_INIT(&priv->flows);
   TAILQ_INIT(&priv->ctrl_flows);
   /* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1168,6 +1198,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
           close(priv->nl_socket_route);
       if (priv->nl_socket_rdma >= 0)
           close(priv->nl_socket_rdma);
+        if (priv->mnl_socket)
+            mlx5_nl_flow_socket_destroy(priv->mnl_socket);
       if (own_domain_id)
           claim_zero(rte_eth_switch_domain_free(priv->domain_id));
       rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 131be334c..98b6ec07d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -156,6 +156,8 @@ struct mlx5_drop {
   struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
};

+struct mnl_socket;
+
struct priv {
   LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
   struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
@@ -215,6 +217,7 @@ struct priv {
   int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
   int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
   uint32_t nl_sn; /* Netlink message sequence number. */
+    struct mnl_socket *mnl_socket; /* Libmnl socket. */
};

#define PORT_ID(priv) ((priv)->dev_data->port_id)
@@ -380,4 +383,11 @@ unsigned int mlx5_nl_ifindex(int nl, const char *name);
int mlx5_nl_switch_info(int nl, unsigned int ifindex,
           struct mlx5_switch_info *info);

+/* mlx5_nl_flow.c */
+
+int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+              struct rte_flow_error *error);
+struct mnl_socket *mlx5_nl_flow_socket_create(void);
+void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl);
+
#endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
new file mode 100644
index 000000000..60a4493e5
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <errno.h>
+#include <libmnl/libmnl.h>
+#include <linux/netlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+
+#include <rte_errno.h>
+#include <rte_flow.h>
+
+#include "mlx5.h"
+
+/* Normally found in linux/netlink.h. */
+#ifndef NETLINK_CAP_ACK
+#define NETLINK_CAP_ACK 10
+#endif
+
+/**
+ * Send Netlink message with acknowledgment.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param nlh
+ *   Message to send. This function always raises the NLM_F_ACK flag before
+ *   sending.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
+{
+    alignas(struct nlmsghdr)
+    uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
+            nlh->nlmsg_len - sizeof(*nlh)];
+    uint32_t seq = random();
+    int ret;
+
+    nlh->nlmsg_flags |= NLM_F_ACK;
+    nlh->nlmsg_seq = seq;
+    ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
+    if (ret != -1)
+        ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
+    if (ret != -1)
+        ret = mnl_cb_run
+            (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
+    if (!ret)
+        return 0;
+    rte_errno = errno;
+    return -rte_errno;
+}
+
+/**
+ * Initialize ingress qdisc of a given network interface.
+ *
+ * @param nl
+ *   Libmnl socket of the @p NETLINK_ROUTE kind.
+ * @param ifindex
+ *   Index of network interface to initialize.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+          struct rte_flow_error *error)
+{
+    struct nlmsghdr *nlh;
+    struct tcmsg *tcm;
+    alignas(struct nlmsghdr)
+    uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
+
+    /* Destroy existing ingress qdisc and everything attached to it. */
+    nlh = mnl_nlmsg_put_header(buf);
+    nlh->nlmsg_type = RTM_DELQDISC;
+    nlh->nlmsg_flags = NLM_F_REQUEST;
+    tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+    tcm->tcm_family = AF_UNSPEC;
+    tcm->tcm_ifindex = ifindex;
+    tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+    tcm->tcm_parent = TC_H_INGRESS;
+    /* Ignore errors when qdisc is already absent. */
+    if (mlx5_nl_flow_nl_ack(nl, nlh) &&
+        rte_errno != EINVAL && rte_errno != ENOENT)
+        return rte_flow_error_set
+            (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+             NULL, "netlink: failed to remove ingress qdisc");
+    /* Create fresh ingress qdisc. */
+    nlh = mnl_nlmsg_put_header(buf);
+    nlh->nlmsg_type = RTM_NEWQDISC;
+    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+    tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+    tcm->tcm_family = AF_UNSPEC;
+    tcm->tcm_ifindex = ifindex;
+    tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+    tcm->tcm_parent = TC_H_INGRESS;
+    mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
+    if (mlx5_nl_flow_nl_ack(nl, nlh))
+        return rte_flow_error_set
+            (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+             NULL, "netlink: failed to create ingress qdisc");
+    return 0;
+}
+
+/**
+ * Create and configure a libmnl socket for Netlink flow rules.
+ *
+ * @return
+ *   A valid libmnl socket object pointer on success, NULL otherwise and
+ *   rte_errno is set.
+ */
+struct mnl_socket *
+mlx5_nl_flow_socket_create(void)
+{
+    struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
+
+    if (nl) {
+        mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
+                      sizeof(int));
+        if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
+            return nl;
+    }
+    rte_errno = errno;
+    if (nl)
+        mnl_socket_close(nl);
+    return NULL;
+}
+
+/**
+ * Destroy a libmnl socket.
+ */
+void
+mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
+{
+    mnl_socket_close(nl);
+}
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 7bcf6308d..414f1b967 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -145,7 +145,7 @@ endif
ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -ldl
else
-_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5
+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5 -lmnl
endif
_LDLIBS-$(CONFIG_RTE_LIBRTE_MVPP2_PMD)      += -lrte_pmd_mvpp2 -L$(LIBMUSDK_PATH)/lib -lmusdk
_LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD)        += -lrte_pmd_nfp
--
2.11.0
  
Ferruh Yigit July 23, 2018, 9:40 p.m. UTC | #2
On 7/13/2018 10:40 AM, Adrien Mazarguil wrote:
> With mlx5, unlike normal flow rules implemented through Verbs for traffic
> emitted and received by the application, those targeting different logical
> ports of the device (VF representors for instance) are offloaded at the
> switch level and must be configured through Netlink (TC interface).
> 
> This patch adds preliminary support to manage such flow rules through the
> flow API (rte_flow).
> 
> Instead of rewriting tons of Netlink helpers and as previously suggested by
> Stephen [1], this patch introduces a new dependency to libmnl [2]
> (LGPL-2.1) when compiling mlx5.
> 
> [1] https://mails.dpdk.org/archives/dev/2018-March/092676.html
> [2] https://netfilter.org/projects/libmnl/

Just to highlight this new PMD level dependency to libmnl.

tap pmd also uses netlink and vdev_netvsc also does nl communication, perhaps we
can discuss unifying netlink usage around this new library.

> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> Cc: Yongseok Koh <yskoh@mellanox.com>
> --
> v2 changes:
> 
> - Added NETLINK_CAP_ACK definition if missing from the host system. This
>   parameter is also not mandatory anymore and won't prevent creation of
>   NL sockets when not supported.
> - Modified mlx5_nl_flow_nl_ack() and mlx5_nl_flow_init() to consume the
>   least amount of stack space based on message size, instead of the fixed
>   MNL_SOCKET_BUFFER_SIZE which is quite large.

<...>
  
Stephen Hemminger July 24, 2018, 12:50 a.m. UTC | #3
On Mon, 23 Jul 2018 22:40:47 +0100
Ferruh Yigit <ferruh.yigit@intel.com> wrote:

> On 7/13/2018 10:40 AM, Adrien Mazarguil wrote:
> > With mlx5, unlike normal flow rules implemented through Verbs for traffic
> > emitted and received by the application, those targeting different logical
> > ports of the device (VF representors for instance) are offloaded at the
> > switch level and must be configured through Netlink (TC interface).
> > 
> > This patch adds preliminary support to manage such flow rules through the
> > flow API (rte_flow).
> > 
> > Instead of rewriting tons of Netlink helpers and as previously suggested by
> > Stephen [1], this patch introduces a new dependency to libmnl [2]
> > (LGPL-2.1) when compiling mlx5.
> > 
> > [1] https://mails.dpdk.org/archives/dev/2018-March/092676.html
> > [2] https://netfilter.org/projects/libmnl/  
> 
> Just to highlight this new PMD level dependency to libmnl.
> 
> tap pmd also uses netlink and vdev_netvsc also does nl communication, perhaps we
> can discuss unifying netlink usage around this new library.
> 
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > Cc: Yongseok Koh <yskoh@mellanox.com>
> > --
> > v2 changes:
> > 
> > - Added NETLINK_CAP_ACK definition if missing from the host system. This
> >   parameter is also not mandatory anymore and won't prevent creation of
> >   NL sockets when not supported.
> > - Modified mlx5_nl_flow_nl_ack() and mlx5_nl_flow_init() to consume the
> >   least amount of stack space based on message size, instead of the fixed
> >   MNL_SOCKET_BUFFER_SIZE which is quite large.  
> 
> <...>
> 

I am concerned that this won't work on FreeBSD and it will end up
farther behind.
  
Shahaf Shuler July 24, 2018, 4:35 a.m. UTC | #4
Stephen,

Tuesday, July 24, 2018 3:51 AM, Stephen Hemminger:
> Subject: Re: [dpdk-dev] [PATCH v2 1/6] net/mlx5: lay groundwork for switch
> offloads
> 
> On Mon, 23 Jul 2018 22:40:47 +0100
> Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> >
> > Just to highlight this new PMD level dependency to libmnl.
> >
> > tap pmd also uses netlink and vdev_netvsc also does nl communication,
> > perhaps we can discuss unifying netlink usage around this new library.
> >
> >
> 
> I am concerned that this won't work on FreeBSD and it will end up farther
> behind.

Can you elaborate? What is the reason it will not work?
  
Stephen Hemminger July 24, 2018, 7:33 p.m. UTC | #5
On Tue, 24 Jul 2018 04:35:05 +0000
Shahaf Shuler <shahafs@mellanox.com> wrote:

> Stephen,
> 
> Tuesday, July 24, 2018 3:51 AM, Stephen Hemminger:
> > Subject: Re: [dpdk-dev] [PATCH v2 1/6] net/mlx5: lay groundwork for switch
> > offloads
> > 
> > On Mon, 23 Jul 2018 22:40:47 +0100
> > Ferruh Yigit <ferruh.yigit@intel.com> wrote:  
> > >
> > > Just to highlight this new PMD level dependency to libmnl.
> > >
> > > tap pmd also uses netlink and vdev_netvsc also does nl communication,
> > > perhaps we can discuss unifying netlink usage around this new library.
> > >
> > >  
> > 
> > I am concerned that this won't work on FreeBSD and it will end up farther
> > behind.  
> 
> Can you elaborate? What is the reason it will not work?
>  
> 

There is no working netlink on FreeBSD.
There is no eBPF on FreeBSD.
  

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 9e274964b..8d3cb219b 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -33,6 +33,7 @@  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
 INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
@@ -56,6 +57,7 @@  LDLIBS += -ldl
 else
 LDLIBS += -libverbs -lmlx5
 endif
+LDLIBS += -lmnl
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
 LDLIBS += -lrte_bus_pci
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6d3421fae..8fb8c91eb 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -282,6 +282,8 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->mnl_socket)
+		mlx5_nl_flow_socket_destroy(priv->mnl_socket);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -1116,6 +1118,34 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
 	if (vf && config.vf_nl_en)
 		mlx5_nl_mac_addr_sync(eth_dev);
+	priv->mnl_socket = mlx5_nl_flow_socket_create();
+	if (!priv->mnl_socket) {
+		err = -rte_errno;
+		DRV_LOG(WARNING,
+			"flow rules relying on switch offloads will not be"
+			" supported: cannot open libmnl socket: %s",
+			strerror(rte_errno));
+	} else {
+		struct rte_flow_error error;
+		unsigned int ifindex = mlx5_ifindex(eth_dev);
+
+		if (!ifindex) {
+			err = -rte_errno;
+			error.message =
+				"cannot retrieve network interface index";
+		} else {
+			err = mlx5_nl_flow_init(priv->mnl_socket, ifindex,
+						&error);
+		}
+		if (err) {
+			DRV_LOG(WARNING,
+				"flow rules relying on switch offloads will"
+				" not be supported: %s: %s",
+				error.message, strerror(rte_errno));
+			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+			priv->mnl_socket = NULL;
+		}
+	}
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1168,6 +1198,8 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (priv->mnl_socket)
+			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 131be334c..98b6ec07d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -156,6 +156,8 @@  struct mlx5_drop {
 	struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
 };
 
+struct mnl_socket;
+
 struct priv {
 	LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
@@ -215,6 +217,7 @@  struct priv {
 	int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
 	int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
 	uint32_t nl_sn; /* Netlink message sequence number. */
+	struct mnl_socket *mnl_socket; /* Libmnl socket. */
 };
 
 #define PORT_ID(priv) ((priv)->dev_data->port_id)
@@ -380,4 +383,11 @@  unsigned int mlx5_nl_ifindex(int nl, const char *name);
 int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 			struct mlx5_switch_info *info);
 
+/* mlx5_nl_flow.c */
+
+int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+		      struct rte_flow_error *error);
+struct mnl_socket *mlx5_nl_flow_socket_create(void);
+void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
new file mode 100644
index 000000000..60a4493e5
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -0,0 +1,147 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <errno.h>
+#include <libmnl/libmnl.h>
+#include <linux/netlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+
+#include <rte_errno.h>
+#include <rte_flow.h>
+
+#include "mlx5.h"
+
+/* Normally found in linux/netlink.h. */
+#ifndef NETLINK_CAP_ACK
+#define NETLINK_CAP_ACK 10
+#endif
+
+/**
+ * Send Netlink message with acknowledgment.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param nlh
+ *   Message to send. This function always raises the NLM_F_ACK flag before
+ *   sending.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
+{
+	alignas(struct nlmsghdr)
+	uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
+		    nlh->nlmsg_len - sizeof(*nlh)];
+	uint32_t seq = random();
+	int ret;
+
+	nlh->nlmsg_flags |= NLM_F_ACK;
+	nlh->nlmsg_seq = seq;
+	ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
+	if (ret != -1)
+		ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
+	if (ret != -1)
+		ret = mnl_cb_run
+			(ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
+	if (!ret)
+		return 0;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Initialize ingress qdisc of a given network interface.
+ *
+ * @param nl
+ *   Libmnl socket of the @p NETLINK_ROUTE kind.
+ * @param ifindex
+ *   Index of network interface to initialize.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+		  struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct tcmsg *tcm;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
+
+	/* Destroy existing ingress qdisc and everything attached to it. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_DELQDISC;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ifindex;
+	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	tcm->tcm_parent = TC_H_INGRESS;
+	/* Ignore errors when qdisc is already absent. */
+	if (mlx5_nl_flow_nl_ack(nl, nlh) &&
+	    rte_errno != EINVAL && rte_errno != ENOENT)
+		return rte_flow_error_set
+			(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "netlink: failed to remove ingress qdisc");
+	/* Create fresh ingress qdisc. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_NEWQDISC;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ifindex;
+	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	tcm->tcm_parent = TC_H_INGRESS;
+	mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
+	if (mlx5_nl_flow_nl_ack(nl, nlh))
+		return rte_flow_error_set
+			(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "netlink: failed to create ingress qdisc");
+	return 0;
+}
+
+/**
+ * Create and configure a libmnl socket for Netlink flow rules.
+ *
+ * @return
+ *   A valid libmnl socket object pointer on success, NULL otherwise and
+ *   rte_errno is set.
+ */
+struct mnl_socket *
+mlx5_nl_flow_socket_create(void)
+{
+	struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
+
+	if (nl) {
+		mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
+				      sizeof(int));
+		if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
+			return nl;
+	}
+	rte_errno = errno;
+	if (nl)
+		mnl_socket_close(nl);
+	return NULL;
+}
+
+/**
+ * Destroy a libmnl socket.
+ */
+void
+mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
+{
+	mnl_socket_close(nl);
+}
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 7bcf6308d..414f1b967 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -145,7 +145,7 @@  endif
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -ldl
 else
-_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5
+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5 -lmnl
 endif
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MVPP2_PMD)      += -lrte_pmd_mvpp2 -L$(LIBMUSDK_PATH)/lib -lmusdk
 _LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD)        += -lrte_pmd_nfp