[1/6] net/mlx5: lay groundwork for switch offloads

Message ID 20180627173355.4718-2-adrien.mazarguil@6wind.com (mailing list archive)
State Superseded, archived
Headers
Series net/mlx5: add support for switch flow rules |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply issues

Commit Message

Adrien Mazarguil June 27, 2018, 6:08 p.m. UTC
  With mlx5, unlike normal flow rules implemented through Verbs for traffic
emitted and received by the application, those targeting different logical
ports of the device (VF representors for instance) are offloaded at the
switch level and must be configured through Netlink (TC interface).

This patch adds preliminary support to manage such flow rules through the
flow API (rte_flow).

Instead of rewriting tons of Netlink helpers and as previously suggested by
Stephen [1], this patch introduces a new dependency to libmnl [2]
(LGPL-2.1) when compiling mlx5.

[1] https://mails.dpdk.org/archives/dev/2018-March/092676.html
[2] https://netfilter.org/projects/libmnl/

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/Makefile       |   2 +
 drivers/net/mlx5/mlx5.c         |  32 ++++++++
 drivers/net/mlx5/mlx5.h         |  10 +++
 drivers/net/mlx5/mlx5_nl_flow.c | 139 +++++++++++++++++++++++++++++++++++
 mk/rte.app.mk                   |   2 +-
 5 files changed, 184 insertions(+), 1 deletion(-)
  

Comments

Yongseok Koh July 12, 2018, 12:17 a.m. UTC | #1
On Wed, Jun 27, 2018 at 08:08:10PM +0200, Adrien Mazarguil wrote:
> With mlx5, unlike normal flow rules implemented through Verbs for traffic
> emitted and received by the application, those targeting different logical
> ports of the device (VF representors for instance) are offloaded at the
> switch level and must be configured through Netlink (TC interface).
> 
> This patch adds preliminary support to manage such flow rules through the
> flow API (rte_flow).
> 
> Instead of rewriting tons of Netlink helpers and as previously suggested by
> Stephen [1], this patch introduces a new dependency to libmnl [2]
> (LGPL-2.1) when compiling mlx5.
> 
> [1] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-March%2F092676.html&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=JrAyzK1s3JG5CnuquNcA7XRN4d2WYtHUi1KXyloGdvA%3D&reserved=0
> [2] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fnetfilter.org%2Fprojects%2Flibmnl%2F&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=yLYa0NzsTyE62BHDCZDoDah31snt6w4Coq47pY913Oo%3D&reserved=0
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/Makefile       |   2 +
>  drivers/net/mlx5/mlx5.c         |  32 ++++++++
>  drivers/net/mlx5/mlx5.h         |  10 +++
>  drivers/net/mlx5/mlx5_nl_flow.c | 139 +++++++++++++++++++++++++++++++++++
>  mk/rte.app.mk                   |   2 +-
>  5 files changed, 184 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
> index 8a5229e61..3325eed06 100644
> --- a/drivers/net/mlx5/Makefile
> +++ b/drivers/net/mlx5/Makefile
> @@ -33,6 +33,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
> +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c
>  
>  ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
>  INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
> @@ -56,6 +57,7 @@ LDLIBS += -ldl
>  else
>  LDLIBS += -libverbs -lmlx5
>  endif
> +LDLIBS += -lmnl
>  LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
>  LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
>  LDLIBS += -lrte_bus_pci
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index 665a3c31f..d9b9097b1 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -279,6 +279,8 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  		mlx5_nl_mac_addr_flush(dev);
>  	if (priv->nl_socket >= 0)
>  		close(priv->nl_socket);
> +	if (priv->mnl_socket)
> +		mlx5_nl_flow_socket_destroy(priv->mnl_socket);
>  	ret = mlx5_hrxq_ibv_verify(dev);
>  	if (ret)
>  		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
> @@ -1077,6 +1079,34 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  			priv->nl_socket = -1;
>  		mlx5_nl_mac_addr_sync(eth_dev);
>  	}
> +	priv->mnl_socket = mlx5_nl_flow_socket_create();
> +	if (!priv->mnl_socket) {
> +		err = -rte_errno;
> +		DRV_LOG(WARNING,
> +			"flow rules relying on switch offloads will not be"
> +			" supported: cannot open libmnl socket: %s",
> +			strerror(rte_errno));
> +	} else {
> +		struct rte_flow_error error;
> +		unsigned int ifindex = mlx5_ifindex(eth_dev);
> +
> +		if (!ifindex) {
> +			err = -rte_errno;
> +			error.message =
> +				"cannot retrieve network interface index";
> +		} else {
> +			err = mlx5_nl_flow_init(priv->mnl_socket, ifindex,
> +						&error);
> +		}
> +		if (err) {
> +			DRV_LOG(WARNING,
> +				"flow rules relying on switch offloads will"
> +				" not be supported: %s: %s",
> +				error.message, strerror(rte_errno));
> +			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
> +			priv->mnl_socket = NULL;
> +		}
> +	}
>  	TAILQ_INIT(&priv->flows);
>  	TAILQ_INIT(&priv->ctrl_flows);
>  	/* Hint libmlx5 to use PMD allocator for data plane resources */
> @@ -1127,6 +1157,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
>  	if (priv) {
>  		unsigned int i;
>  
> +		if (priv->mnl_socket)
> +			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
>  		i = mlx5_domain_to_port_id(priv->domain_id, NULL, 0);
>  		if (i == 1)
>  			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 1d8e156c8..390249adb 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -148,6 +148,8 @@ struct mlx5_drop {
>  	struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
>  };
>  
> +struct mnl_socket;
> +
>  struct priv {
>  	LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
>  	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
> @@ -207,6 +209,7 @@ struct priv {
>  	/* Context for Verbs allocator. */
>  	int nl_socket; /* Netlink socket. */
>  	uint32_t nl_sn; /* Netlink message sequence number. */
> +	struct mnl_socket *mnl_socket; /* Libmnl socket. */
>  };
>  
>  #define PORT_ID(priv) ((priv)->dev_data->port_id)
> @@ -369,4 +372,11 @@ void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev);
>  int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable);
>  int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable);
>  
> +/* mlx5_nl_flow.c */
> +
> +int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
> +		      struct rte_flow_error *error);
> +struct mnl_socket *mlx5_nl_flow_socket_create(void);
> +void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl);
> +
>  #endif /* RTE_PMD_MLX5_H_ */
> diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
> new file mode 100644
> index 000000000..7a8683b03
> --- /dev/null
> +++ b/drivers/net/mlx5/mlx5_nl_flow.c
> @@ -0,0 +1,139 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2018 6WIND S.A.
> + * Copyright 2018 Mellanox Technologies, Ltd
> + */
> +
> +#include <errno.h>
> +#include <libmnl/libmnl.h>
> +#include <linux/netlink.h>
> +#include <linux/pkt_sched.h>
> +#include <linux/rtnetlink.h>
> +#include <stdalign.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <sys/socket.h>
> +
> +#include <rte_errno.h>
> +#include <rte_flow.h>
> +
> +#include "mlx5.h"
> +
> +/**
> + * Send Netlink message with acknowledgment.
> + *
> + * @param nl
> + *   Libmnl socket to use.
> + * @param nlh
> + *   Message to send. This function always raises the NLM_F_ACK flag before
> + *   sending.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
> +{
> +	alignas(struct nlmsghdr)
> +	uint8_t ans[MNL_SOCKET_BUFFER_SIZE];

There are total 3 of this buffer. On a certain host having large pagesize, this
can be 8kB * 3 = 24kB. This is not a gigantic buffer but as all the functions
here are sequentially accessed, how about having just one global buffer instead?

> +	uint32_t seq = random();
> +	int ret;
> +
> +	nlh->nlmsg_flags |= NLM_F_ACK;
> +	nlh->nlmsg_seq = seq;
> +	ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
> +	if (ret != -1)
> +		ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
> +	if (ret != -1)
> +		ret = mnl_cb_run
> +			(ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
> +	if (!ret)
> +		return 0;
> +	rte_errno = errno;
> +	return -rte_errno;
> +}
> +
> +/**
> + * Initialize ingress qdisc of a given network interface.
> + *
> + * @param nl
> + *   Libmnl socket of the @p NETLINK_ROUTE kind.
> + * @param ifindex
> + *   Index of network interface to initialize.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +int
> +mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
> +		  struct rte_flow_error *error)
> +{
> +	uint8_t buf[MNL_SOCKET_BUFFER_SIZE];
> +	struct nlmsghdr *nlh;
> +	struct tcmsg *tcm;
> +
> +	/* Destroy existing ingress qdisc and everything attached to it. */
> +	nlh = mnl_nlmsg_put_header(buf);
> +	nlh->nlmsg_type = RTM_DELQDISC;
> +	nlh->nlmsg_flags = NLM_F_REQUEST;
> +	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
> +	tcm->tcm_family = AF_UNSPEC;
> +	tcm->tcm_ifindex = ifindex;
> +	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
> +	tcm->tcm_parent = TC_H_INGRESS;
> +	/* Ignore errors when qdisc is already absent. */
> +	if (mlx5_nl_flow_nl_ack(nl, nlh) &&
> +	    rte_errno != EINVAL && rte_errno != ENOENT)
> +		return rte_flow_error_set
> +			(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "netlink: failed to remove ingress qdisc");
> +	/* Create fresh ingress qdisc. */
> +	nlh = mnl_nlmsg_put_header(buf);
> +	nlh->nlmsg_type = RTM_NEWQDISC;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
> +	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
> +	tcm->tcm_family = AF_UNSPEC;
> +	tcm->tcm_ifindex = ifindex;
> +	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
> +	tcm->tcm_parent = TC_H_INGRESS;
> +	mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
> +	if (mlx5_nl_flow_nl_ack(nl, nlh))
> +		return rte_flow_error_set
> +			(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "netlink: failed to create ingress qdisc");
> +	return 0;
> +}
> +
> +/**
> + * Create and configure a libmnl socket for Netlink flow rules.
> + *
> + * @return
> + *   A valid libmnl socket object pointer on success, NULL otherwise and
> + *   rte_errno is set.
> + */
> +struct mnl_socket *
> +mlx5_nl_flow_socket_create(void)
> +{
> +	struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
> +
> +	if (nl &&
> +	    !mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
> +				   sizeof(int)) &&
> +	    !mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
> +		return nl;
> +	rte_errno = errno;
> +	if (nl)
> +		mnl_socket_close(nl);
> +	return NULL;
> +}
> +
> +/**
> + * Destroy a libmnl socket.
> + */
> +void
> +mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
> +{
> +	mnl_socket_close(nl);
> +}
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 7bcf6308d..414f1b967 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -145,7 +145,7 @@ endif
>  ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -ldl
>  else
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5 -lmnl
>  endif
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_MVPP2_PMD)      += -lrte_pmd_mvpp2 -L$(LIBMUSDK_PATH)/lib -lmusdk
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD)        += -lrte_pmd_nfp
> -- 
> 2.11.0
  
Adrien Mazarguil July 12, 2018, 10:46 a.m. UTC | #2
On Wed, Jul 11, 2018 at 05:17:09PM -0700, Yongseok Koh wrote:
> On Wed, Jun 27, 2018 at 08:08:10PM +0200, Adrien Mazarguil wrote:
> > With mlx5, unlike normal flow rules implemented through Verbs for traffic
> > emitted and received by the application, those targeting different logical
> > ports of the device (VF representors for instance) are offloaded at the
> > switch level and must be configured through Netlink (TC interface).
> > 
> > This patch adds preliminary support to manage such flow rules through the
> > flow API (rte_flow).
> > 
> > Instead of rewriting tons of Netlink helpers and as previously suggested by
> > Stephen [1], this patch introduces a new dependency to libmnl [2]
> > (LGPL-2.1) when compiling mlx5.
> > 
> > [1] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-March%2F092676.html&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=JrAyzK1s3JG5CnuquNcA7XRN4d2WYtHUi1KXyloGdvA%3D&reserved=0
> > [2] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fnetfilter.org%2Fprojects%2Flibmnl%2F&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=yLYa0NzsTyE62BHDCZDoDah31snt6w4Coq47pY913Oo%3D&reserved=0
> > 
> > Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
<snip>
> > diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
> > new file mode 100644
> > index 000000000..7a8683b03
> > --- /dev/null
> > +++ b/drivers/net/mlx5/mlx5_nl_flow.c
> > @@ -0,0 +1,139 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright 2018 6WIND S.A.
> > + * Copyright 2018 Mellanox Technologies, Ltd
> > + */
> > +
> > +#include <errno.h>
> > +#include <libmnl/libmnl.h>
> > +#include <linux/netlink.h>
> > +#include <linux/pkt_sched.h>
> > +#include <linux/rtnetlink.h>
> > +#include <stdalign.h>
> > +#include <stddef.h>
> > +#include <stdint.h>
> > +#include <stdlib.h>
> > +#include <sys/socket.h>
> > +
> > +#include <rte_errno.h>
> > +#include <rte_flow.h>
> > +
> > +#include "mlx5.h"
> > +
> > +/**
> > + * Send Netlink message with acknowledgment.
> > + *
> > + * @param nl
> > + *   Libmnl socket to use.
> > + * @param nlh
> > + *   Message to send. This function always raises the NLM_F_ACK flag before
> > + *   sending.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +static int
> > +mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
> > +{
> > +	alignas(struct nlmsghdr)
> > +	uint8_t ans[MNL_SOCKET_BUFFER_SIZE];
> 
> There are total 3 of this buffer. On a certain host having large pagesize, this
> can be 8kB * 3 = 24kB. This is not a gigantic buffer but as all the functions
> here are sequentially accessed, how about having just one global buffer instead?

All right it's not ideal, I opted for simplicity though. This is a generic
ack function. When NETLINK_CAP_ACK is not supported (note: this was made
optional for v2, some systems do not support it), an ack consumes a bit more
space than the original message, which may itself be huge, and failure to
receive acks is deemed fatal.

Its callers are mlx5_nl_flow_init(), called once per device during
initialization, and mlx5_nl_flow_create/destroy(), called for each
created/removed flow rule.

These last two are called often but do not put their own buffer on the
stack, they reuse previously generated messages from the heap.

So to improve stack consumption a bit, what I can do is size this buffer
according to nlh->nlmsg_len + extra room for ack header, yet still allocate
it locally since it would be a pain otherwise. Callers may not want their
own buffers to be overwritten with useless acks.
  
Yongseok Koh July 12, 2018, 5:33 p.m. UTC | #3
> On Jul 12, 2018, at 3:46 AM, Adrien Mazarguil <adrien.mazarguil@6wind.com> wrote:
> 
> On Wed, Jul 11, 2018 at 05:17:09PM -0700, Yongseok Koh wrote:
>> On Wed, Jun 27, 2018 at 08:08:10PM +0200, Adrien Mazarguil wrote:
>>> With mlx5, unlike normal flow rules implemented through Verbs for traffic
>>> emitted and received by the application, those targeting different logical
>>> ports of the device (VF representors for instance) are offloaded at the
>>> switch level and must be configured through Netlink (TC interface).
>>> 
>>> This patch adds preliminary support to manage such flow rules through the
>>> flow API (rte_flow).
>>> 
>>> Instead of rewriting tons of Netlink helpers and as previously suggested by
>>> Stephen [1], this patch introduces a new dependency to libmnl [2]
>>> (LGPL-2.1) when compiling mlx5.
>>> 
>>> [1] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-March%2F092676.html&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=JrAyzK1s3JG5CnuquNcA7XRN4d2WYtHUi1KXyloGdvA%3D&reserved=0
>>> [2] https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fnetfilter.org%2Fprojects%2Flibmnl%2F&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=yLYa0NzsTyE62BHDCZDoDah31snt6w4Coq47pY913Oo%3D&reserved=0
>>> 
>>> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> <snip>
>>> diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
>>> new file mode 100644
>>> index 000000000..7a8683b03
>>> --- /dev/null
>>> +++ b/drivers/net/mlx5/mlx5_nl_flow.c
>>> @@ -0,0 +1,139 @@
>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>> + * Copyright 2018 6WIND S.A.
>>> + * Copyright 2018 Mellanox Technologies, Ltd
>>> + */
>>> +
>>> +#include <errno.h>
>>> +#include <libmnl/libmnl.h>
>>> +#include <linux/netlink.h>
>>> +#include <linux/pkt_sched.h>
>>> +#include <linux/rtnetlink.h>
>>> +#include <stdalign.h>
>>> +#include <stddef.h>
>>> +#include <stdint.h>
>>> +#include <stdlib.h>
>>> +#include <sys/socket.h>
>>> +
>>> +#include <rte_errno.h>
>>> +#include <rte_flow.h>
>>> +
>>> +#include "mlx5.h"
>>> +
>>> +/**
>>> + * Send Netlink message with acknowledgment.
>>> + *
>>> + * @param nl
>>> + *   Libmnl socket to use.
>>> + * @param nlh
>>> + *   Message to send. This function always raises the NLM_F_ACK flag before
>>> + *   sending.
>>> + *
>>> + * @return
>>> + *   0 on success, a negative errno value otherwise and rte_errno is set.
>>> + */
>>> +static int
>>> +mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
>>> +{
>>> +	alignas(struct nlmsghdr)
>>> +	uint8_t ans[MNL_SOCKET_BUFFER_SIZE];
>> 
>> There are total 3 of this buffer. On a certain host having large pagesize, this
>> can be 8kB * 3 = 24kB. This is not a gigantic buffer but as all the functions
>> here are sequentially accessed, how about having just one global buffer instead?
> 
> All right it's not ideal, I opted for simplicity though. This is a generic
> ack function. When NETLINK_CAP_ACK is not supported (note: this was made
> optional for v2, some systems do not support it), an ack consumes a bit more
> space than the original message, which may itself be huge, and failure to
> receive acks is deemed fatal.
> 
> Its callers are mlx5_nl_flow_init(), called once per device during
> initialization, and mlx5_nl_flow_create/destroy(), called for each
> created/removed flow rule.
> 
> These last two are called often but do not put their own buffer on the
> stack, they reuse previously generated messages from the heap.
> 
> So to improve stack consumption a bit, what I can do is size this buffer
> according to nlh->nlmsg_len + extra room for ack header, yet still allocate
> it locally since it would be a pain otherwise. Callers may not want their
> own buffers to be overwritten with useless acks.

I like this approach.

Thanks,
Yongseok
  

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 8a5229e61..3325eed06 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -33,6 +33,7 @@  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
 INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
@@ -56,6 +57,7 @@  LDLIBS += -ldl
 else
 LDLIBS += -libverbs -lmlx5
 endif
+LDLIBS += -lmnl
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
 LDLIBS += -lrte_bus_pci
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 665a3c31f..d9b9097b1 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -279,6 +279,8 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 		mlx5_nl_mac_addr_flush(dev);
 	if (priv->nl_socket >= 0)
 		close(priv->nl_socket);
+	if (priv->mnl_socket)
+		mlx5_nl_flow_socket_destroy(priv->mnl_socket);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -1077,6 +1079,34 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 			priv->nl_socket = -1;
 		mlx5_nl_mac_addr_sync(eth_dev);
 	}
+	priv->mnl_socket = mlx5_nl_flow_socket_create();
+	if (!priv->mnl_socket) {
+		err = -rte_errno;
+		DRV_LOG(WARNING,
+			"flow rules relying on switch offloads will not be"
+			" supported: cannot open libmnl socket: %s",
+			strerror(rte_errno));
+	} else {
+		struct rte_flow_error error;
+		unsigned int ifindex = mlx5_ifindex(eth_dev);
+
+		if (!ifindex) {
+			err = -rte_errno;
+			error.message =
+				"cannot retrieve network interface index";
+		} else {
+			err = mlx5_nl_flow_init(priv->mnl_socket, ifindex,
+						&error);
+		}
+		if (err) {
+			DRV_LOG(WARNING,
+				"flow rules relying on switch offloads will"
+				" not be supported: %s: %s",
+				error.message, strerror(rte_errno));
+			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+			priv->mnl_socket = NULL;
+		}
+	}
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1127,6 +1157,8 @@  mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
 	if (priv) {
 		unsigned int i;
 
+		if (priv->mnl_socket)
+			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
 		i = mlx5_domain_to_port_id(priv->domain_id, NULL, 0);
 		if (i == 1)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 1d8e156c8..390249adb 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -148,6 +148,8 @@  struct mlx5_drop {
 	struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
 };
 
+struct mnl_socket;
+
 struct priv {
 	LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
@@ -207,6 +209,7 @@  struct priv {
 	/* Context for Verbs allocator. */
 	int nl_socket; /* Netlink socket. */
 	uint32_t nl_sn; /* Netlink message sequence number. */
+	struct mnl_socket *mnl_socket; /* Libmnl socket. */
 };
 
 #define PORT_ID(priv) ((priv)->dev_data->port_id)
@@ -369,4 +372,11 @@  void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev);
 int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable);
 int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable);
 
+/* mlx5_nl_flow.c */
+
+int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+		      struct rte_flow_error *error);
+struct mnl_socket *mlx5_nl_flow_socket_create(void);
+void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
new file mode 100644
index 000000000..7a8683b03
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -0,0 +1,139 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <errno.h>
+#include <libmnl/libmnl.h>
+#include <linux/netlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+
+#include <rte_errno.h>
+#include <rte_flow.h>
+
+#include "mlx5.h"
+
+/**
+ * Send Netlink message with acknowledgment.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param nlh
+ *   Message to send. This function always raises the NLM_F_ACK flag before
+ *   sending.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
+{
+	alignas(struct nlmsghdr)
+	uint8_t ans[MNL_SOCKET_BUFFER_SIZE];
+	uint32_t seq = random();
+	int ret;
+
+	nlh->nlmsg_flags |= NLM_F_ACK;
+	nlh->nlmsg_seq = seq;
+	ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
+	if (ret != -1)
+		ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
+	if (ret != -1)
+		ret = mnl_cb_run
+			(ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
+	if (!ret)
+		return 0;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Initialize ingress qdisc of a given network interface.
+ *
+ * @param nl
+ *   Libmnl socket of the @p NETLINK_ROUTE kind.
+ * @param ifindex
+ *   Index of network interface to initialize.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+		  struct rte_flow_error *error)
+{
+	uint8_t buf[MNL_SOCKET_BUFFER_SIZE];
+	struct nlmsghdr *nlh;
+	struct tcmsg *tcm;
+
+	/* Destroy existing ingress qdisc and everything attached to it. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_DELQDISC;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ifindex;
+	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	tcm->tcm_parent = TC_H_INGRESS;
+	/* Ignore errors when qdisc is already absent. */
+	if (mlx5_nl_flow_nl_ack(nl, nlh) &&
+	    rte_errno != EINVAL && rte_errno != ENOENT)
+		return rte_flow_error_set
+			(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "netlink: failed to remove ingress qdisc");
+	/* Create fresh ingress qdisc. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_NEWQDISC;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ifindex;
+	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	tcm->tcm_parent = TC_H_INGRESS;
+	mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
+	if (mlx5_nl_flow_nl_ack(nl, nlh))
+		return rte_flow_error_set
+			(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "netlink: failed to create ingress qdisc");
+	return 0;
+}
+
+/**
+ * Create and configure a libmnl socket for Netlink flow rules.
+ *
+ * @return
+ *   A valid libmnl socket object pointer on success, NULL otherwise and
+ *   rte_errno is set.
+ */
+struct mnl_socket *
+mlx5_nl_flow_socket_create(void)
+{
+	struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
+
+	if (nl &&
+	    !mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
+				   sizeof(int)) &&
+	    !mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
+		return nl;
+	rte_errno = errno;
+	if (nl)
+		mnl_socket_close(nl);
+	return NULL;
+}
+
+/**
+ * Destroy a libmnl socket.
+ */
+void
+mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
+{
+	mnl_socket_close(nl);
+}
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 7bcf6308d..414f1b967 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -145,7 +145,7 @@  endif
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -ldl
 else
-_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5
+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5 -lmnl
 endif
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MVPP2_PMD)      += -lrte_pmd_mvpp2 -L$(LIBMUSDK_PATH)/lib -lmusdk
 _LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD)        += -lrte_pmd_nfp