[v1,1/6] net/af_xdp: introduce AF_XDP PMD driver

Message ID 20190301080947.91086-2-xiaolong.ye@intel.com
State Changes Requested, archived
Delegated to: Ferruh Yigit
Headers show
Series
  • Introduce AF_XDP PMD
Related show

Checks

Context Check Description
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/Intel-compilation fail Compilation issues
ci/checkpatch success coding style OK

Commit Message

Ye Xiaolong March 1, 2019, 8:09 a.m.
Add a new PMD driver for AF_XDP which is a proposed faster version of
AF_PACKET interface in Linux. More info about AF_XDP, please refer to [1]
[2].

This is the vanilla version PMD which just uses a raw buffer registered as
the umem.

[1] https://fosdem.org/2018/schedule/event/af_xdp/
[2] https://lwn.net/Articles/745934/

Signed-off-by: Xiaolong Ye <xiaolong.ye@intel.com>
---
 MAINTAINERS                                   |   6 +
 config/common_base                            |   5 +
 doc/guides/nics/af_xdp.rst                    |  43 +
 doc/guides/rel_notes/release_18_11.rst        |   7 +
 drivers/net/Makefile                          |   1 +
 drivers/net/af_xdp/Makefile                   |  31 +
 drivers/net/af_xdp/meson.build                |   7 +
 drivers/net/af_xdp/rte_eth_af_xdp.c           | 903 ++++++++++++++++++
 drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
 mk/rte.app.mk                                 |   1 +
 10 files changed, 1008 insertions(+)
 create mode 100644 doc/guides/nics/af_xdp.rst
 create mode 100644 drivers/net/af_xdp/Makefile
 create mode 100644 drivers/net/af_xdp/meson.build
 create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
 create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map

Comments

Luca Boccassi March 1, 2019, 3:38 p.m. | #1
On Fri, 2019-03-01 at 16:09 +0800, Xiaolong Ye wrote:
> Add a new PMD driver for AF_XDP which is a proposed faster version of
> AF_PACKET interface in Linux. More info about AF_XDP, please refer to
> [1]
> [2].
> 
> This is the vanilla version PMD which just uses a raw buffer
> registered as
> the umem.
> 
> [1] https://fosdem.org/2018/schedule/event/af_xdp/
> [2] https://lwn.net/Articles/745934/
> 
> Signed-off-by: Xiaolong Ye <xiaolong.ye@intel.com>
> ---
>  MAINTAINERS                                   |   6 +
>  config/common_base                            |   5 +
>  doc/guides/nics/af_xdp.rst                    |  43 +
>  doc/guides/rel_notes/release_18_11.rst        |   7 +
>  drivers/net/Makefile                          |   1 +
>  drivers/net/af_xdp/Makefile                   |  31 +
>  drivers/net/af_xdp/meson.build                |   7 +
>  drivers/net/af_xdp/rte_eth_af_xdp.c           | 903
> ++++++++++++++++++
>  drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
>  mk/rte.app.mk                                 |   1 +
>  10 files changed, 1008 insertions(+)
>  create mode 100644 doc/guides/nics/af_xdp.rst
>  create mode 100644 drivers/net/af_xdp/Makefile
>  create mode 100644 drivers/net/af_xdp/meson.build
>  create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
>  create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> 
<..>

> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
>  endif
>  
>  DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
>  DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
>  DIRS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD) += atlantic
>  DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf

You need a similar change in drivers/net/meson.build

> diff --git a/drivers/net/af_xdp/Makefile
> b/drivers/net/af_xdp/Makefile
> new file mode 100644
> index 000000000..e3755fff2
> --- /dev/null
> +++ b/drivers/net/af_xdp/Makefile
> @@ -0,0 +1,31 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2018 Intel Corporation
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +#
> +# library name
> +#
> +LIB = librte_pmd_af_xdp.a
> +
> +EXPORT_MAP := rte_pmd_af_xdp_version.map
> +
> +LIBABIVER := 1
> +
> +
> +CFLAGS += -O3
> +# below line should be removed
> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/lib/bpf

Leftovers?

> +CFLAGS += $(WERROR_FLAGS)
> +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
> +LDLIBS += -lrte_bus_vdev
> +
> +#
> +# all source are stored in SRCS-y
> +#
> +SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
> +
> +include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/drivers/net/af_xdp/meson.build
> b/drivers/net/af_xdp/meson.build
> new file mode 100644
> index 000000000..4b6652685
> --- /dev/null
> +++ b/drivers/net/af_xdp/meson.build
> @@ -0,0 +1,7 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2018 Intel Corporation
> +
> +if host_machine.system() != 'linux'
> +	build = false
> +endif
> +sources = files('rte_eth_af_xdp.c')

You need to add a dependency() on libbpf, and given upstream doesn't
ship a pkg-config file yet a fallback to find_library, so that the pmd
can be linked against it.
Check other PMDs to see how dependencies are handled, like
drivers/net/pcap/meson.build

<..>

> diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> new file mode 100644
> index 000000000..ef3539840
> --- /dev/null
> +++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> @@ -0,0 +1,4 @@
> +DPDK_2.0 {

2.0 is a bit old :-)

> +
> +	local: *;
> +};
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index d0ab942d5..db3271c7b 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -143,6 +143,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL)  +=
> -lrte_mempool_dpaa2
>  endif
>  
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp
> -lelf -lbpf

Are symbols from libelf being used by the PMD?

>  _LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD)        += -lrte_pmd_ark
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD)   += -lrte_pmd_atlantic
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD)        += -lrte_pmd_avf
Stephen Hemminger March 1, 2019, 6:31 p.m. | #2
On Fri,  1 Mar 2019 16:09:42 +0800
Xiaolong Ye <xiaolong.ye@intel.com> wrote:

> +	if (umem->buffer)
> +		free(umem->buffer);

Minor nit: you don't need to check for NULL free() already handles this.
Stephen Hemminger March 1, 2019, 6:32 p.m. | #3
On Fri,  1 Mar 2019 16:09:42 +0800
Xiaolong Ye <xiaolong.ye@intel.com> wrote:

> +
> +static int
> +rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
> +{
> +	struct rte_kvargs *kvlist;
> +	char *if_name = NULL;
> +	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
> +	struct rte_eth_dev *eth_dev;
> +	const char *name;
> +	int ret;
> +
> +	RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
> +		rte_vdev_device_name(dev));

The PMD log type is being phased out. I plan to mark it as deprecated.
All new drivers must use their own log types (see other every other device driver
for how to do this).
Ye Xiaolong March 2, 2019, 8:07 a.m. | #4
Hi,

On 03/01, Stephen Hemminger wrote:
>On Fri,  1 Mar 2019 16:09:42 +0800
>Xiaolong Ye <xiaolong.ye@intel.com> wrote:
>
>> +
>> +static int
>> +rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
>> +{
>> +	struct rte_kvargs *kvlist;
>> +	char *if_name = NULL;
>> +	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
>> +	struct rte_eth_dev *eth_dev;
>> +	const char *name;
>> +	int ret;
>> +
>> +	RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
>> +		rte_vdev_device_name(dev));
>
>The PMD log type is being phased out. I plan to mark it as deprecated.
>All new drivers must use their own log types (see other every other device driver
>for how to do this).

Thanks for the feedback, will do.
Ye Xiaolong March 2, 2019, 8:08 a.m. | #5
Hi,

On 03/01, Stephen Hemminger wrote:
>On Fri,  1 Mar 2019 16:09:42 +0800
>Xiaolong Ye <xiaolong.ye@intel.com> wrote:
>
>> +	if (umem->buffer)
>> +		free(umem->buffer);
>
>Minor nit: you don't need to check for NULL free() already handles this.

Thanks for the suggestion, will change accordingly.
Ye Xiaolong March 2, 2019, 8:14 a.m. | #6
Hi, Luca

Thanks for your review.

On 03/01, Luca Boccassi wrote:
>On Fri, 2019-03-01 at 16:09 +0800, Xiaolong Ye wrote:
>> Add a new PMD driver for AF_XDP which is a proposed faster version of
>> AF_PACKET interface in Linux. More info about AF_XDP, please refer to
>> [1]
>> [2].
>> 
>> This is the vanilla version PMD which just uses a raw buffer
>> registered as
>> the umem.
>> 
>> [1] https://fosdem.org/2018/schedule/event/af_xdp/
>> [2] https://lwn.net/Articles/745934/
>> 
>> Signed-off-by: Xiaolong Ye <xiaolong.ye@intel.com>
>> ---
>>  MAINTAINERS                                   |   6 +
>>  config/common_base                            |   5 +
>>  doc/guides/nics/af_xdp.rst                    |  43 +
>>  doc/guides/rel_notes/release_18_11.rst        |   7 +
>>  drivers/net/Makefile                          |   1 +
>>  drivers/net/af_xdp/Makefile                   |  31 +
>>  drivers/net/af_xdp/meson.build                |   7 +
>>  drivers/net/af_xdp/rte_eth_af_xdp.c           | 903
>> ++++++++++++++++++
>>  drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
>>  mk/rte.app.mk                                 |   1 +
>>  10 files changed, 1008 insertions(+)
>>  create mode 100644 doc/guides/nics/af_xdp.rst
>>  create mode 100644 drivers/net/af_xdp/Makefile
>>  create mode 100644 drivers/net/af_xdp/meson.build
>>  create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
>>  create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>> 
><..>
>
>> --- a/drivers/net/Makefile
>> +++ b/drivers/net/Makefile
>> @@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
>>  endif
>>  
>>  DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
>> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
>>  DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
>>  DIRS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD) += atlantic
>>  DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
>
>You need a similar change in drivers/net/meson.build

will do.

>
>> diff --git a/drivers/net/af_xdp/Makefile
>> b/drivers/net/af_xdp/Makefile
>> new file mode 100644
>> index 000000000..e3755fff2
>> --- /dev/null
>> +++ b/drivers/net/af_xdp/Makefile
>> @@ -0,0 +1,31 @@
>> +# SPDX-License-Identifier: BSD-3-Clause
>> +# Copyright(c) 2018 Intel Corporation
>> +
>> +include $(RTE_SDK)/mk/rte.vars.mk
>> +
>> +#
>> +# library name
>> +#
>> +LIB = librte_pmd_af_xdp.a
>> +
>> +EXPORT_MAP := rte_pmd_af_xdp_version.map
>> +
>> +LIBABIVER := 1
>> +
>> +
>> +CFLAGS += -O3
>> +# below line should be removed
>> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
>> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/lib/bpf
>
>Leftovers?
>

Yes, will remove in next version.

>> +CFLAGS += $(WERROR_FLAGS)
>> +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
>> +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
>> +LDLIBS += -lrte_bus_vdev
>> +
>> +#
>> +# all source are stored in SRCS-y
>> +#
>> +SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
>> +
>> +include $(RTE_SDK)/mk/rte.lib.mk
>> diff --git a/drivers/net/af_xdp/meson.build
>> b/drivers/net/af_xdp/meson.build
>> new file mode 100644
>> index 000000000..4b6652685
>> --- /dev/null
>> +++ b/drivers/net/af_xdp/meson.build
>> @@ -0,0 +1,7 @@
>> +# SPDX-License-Identifier: BSD-3-Clause
>> +# Copyright(c) 2018 Intel Corporation
>> +
>> +if host_machine.system() != 'linux'
>> +	build = false
>> +endif
>> +sources = files('rte_eth_af_xdp.c')
>
>You need to add a dependency() on libbpf, and given upstream doesn't
>ship a pkg-config file yet a fallback to find_library, so that the pmd
>can be linked against it.
>Check other PMDs to see how dependencies are handled, like
>drivers/net/pcap/meson.build
>

will check the example to see how to handle it correctly.

><..>
>
>> diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>> b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>> new file mode 100644
>> index 000000000..ef3539840
>> --- /dev/null
>> +++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>> @@ -0,0 +1,4 @@
>> +DPDK_2.0 {
>
>2.0 is a bit old :-)

will change it.

>
>> +
>> +	local: *;
>> +};
>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>> index d0ab942d5..db3271c7b 100644
>> --- a/mk/rte.app.mk
>> +++ b/mk/rte.app.mk
>> @@ -143,6 +143,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL)  +=
>> -lrte_mempool_dpaa2
>>  endif
>>  
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp
>> -lelf -lbpf
>
>Are symbols from libelf being used by the PMD?

Hmm, it is a leftover of RFC, libelf is no longer needed in this version, will
remove it in next version.

Thanks,
Xiaolong
>
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD)        += -lrte_pmd_ark
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD)   += -lrte_pmd_atlantic
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD)        += -lrte_pmd_avf
>
>-- 
>Kind regards,
>Luca Boccassi
David Marchand March 5, 2019, 8:25 a.m. | #7
On Fri, Mar 1, 2019 at 9:13 AM Xiaolong Ye <xiaolong.ye@intel.com> wrote:

> diff --git a/doc/guides/rel_notes/release_18_11.rst
> b/doc/guides/rel_notes/release_18_11.rst
> index 65bab557d..e0918441a 100644
> --- a/doc/guides/rel_notes/release_18_11.rst
> +++ b/doc/guides/rel_notes/release_18_11.rst
> @@ -229,6 +229,13 @@ New Features
>    The AESNI MB PMD has been updated with additional support for the
> AES-GCM
>    algorithm.
>
> +* **Added the AF_XDP PMD.**
> +
> +  Added a Linux-specific PMD driver for AF_XDP, it can create the AF_XDP
> socket
> +  and bind it to a specific netdev queue, it allows a DPDK application to
> send
> +  and receive raw packets through the socket which would bypass the kernel
> +  network stack to achieve high performance packet processing.
> +
>

Should be in 19.05 release notes.


diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> b/drivers/net/af_xdp/rte_eth_af_xdp.c
> new file mode 100644
> index 000000000..6de769650
> --- /dev/null
> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
>
> [snip]

+struct pkt_rx_queue {
> +       struct xsk_ring_cons rx;
> +       struct xsk_umem_info *umem;
> +       struct xsk_socket *xsk;
> +       struct rte_mempool *mb_pool;
> +
> +       unsigned long rx_pkts;
> +       unsigned long rx_bytes;
> +       unsigned long rx_dropped;
>

ethdev stats are uint64_t, why declare those internal stats as unsigned
long ?

+
> +       struct pkt_tx_queue *pair;
> +       uint16_t queue_idx;
> +};
> +
> +struct pkt_tx_queue {
> +       struct xsk_ring_prod tx;
> +
> +       unsigned long tx_pkts;
> +       unsigned long err_pkts;
> +       unsigned long tx_bytes;
>

Idem.

+
> +       struct pkt_rx_queue *pair;
> +       uint16_t queue_idx;
> +};
>

[snip]

+static int
> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
> +{
> +       struct pmd_internals *internals = dev->data->dev_private;
> +       struct xdp_statistics xdp_stats;
> +       struct pkt_rx_queue *rxq;
> +       socklen_t optlen;
> +       int i;
> +
> +       optlen = sizeof(struct xdp_statistics);
> +       for (i = 0; i < dev->data->nb_rx_queues; i++) {
> +               rxq = &internals->rx_queues[i];
> +               stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
> +               stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
> +
> +               stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
> +               stats->q_errors[i] = internals->tx_queues[i].err_pkts;
>

q_errors[] are for reception errors, see the patchset I sent:
http://mails.dpdk.org/archives/dev/2019-March/125703.html

If you want per queue stats, use xstats.
You can still account those errors in the global stats->oerrors below.

+               stats->q_obytes[i] = internals->tx_queues[i].tx_bytes;
> +
> +               stats->ipackets += stats->q_ipackets[i];
> +               stats->ibytes += stats->q_ibytes[i];
> +               stats->imissed += internals->rx_queues[i].rx_dropped;
> +               getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
> XDP_STATISTICS,
> +                               &xdp_stats, &optlen);
> +               stats->imissed += xdp_stats.rx_dropped;
> +
> +               stats->opackets += stats->q_opackets[i];
> +               stats->oerrors += stats->q_errors[i];
>

-               stats->oerrors += stats->q_errors[i];
+               stats->oerrors += internals->tx_queues[i].err_pkts;

>
> +               stats->obytes += stats->q_obytes[i];
> +       }
> +
> +       return 0;
> +}
>
>
Ye Xiaolong March 7, 2019, 3:19 a.m. | #8
Hi, David

Thanks for your review.

On 03/05, David Marchand wrote:
>On Fri, Mar 1, 2019 at 9:13 AM Xiaolong Ye <xiaolong.ye@intel.com> wrote:
>
>> diff --git a/doc/guides/rel_notes/release_18_11.rst
>> b/doc/guides/rel_notes/release_18_11.rst
>> index 65bab557d..e0918441a 100644
>> --- a/doc/guides/rel_notes/release_18_11.rst
>> +++ b/doc/guides/rel_notes/release_18_11.rst
>> @@ -229,6 +229,13 @@ New Features
>>    The AESNI MB PMD has been updated with additional support for the
>> AES-GCM
>>    algorithm.
>>
>> +* **Added the AF_XDP PMD.**
>> +
>> +  Added a Linux-specific PMD driver for AF_XDP, it can create the AF_XDP
>> socket
>> +  and bind it to a specific netdev queue, it allows a DPDK application to
>> send
>> +  and receive raw packets through the socket which would bypass the kernel
>> +  network stack to achieve high performance packet processing.
>> +
>>
>
>Should be in 19.05 release notes.

Ooops, my bad, will swith to 19.05 in next version.

>
>
>diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
>> b/drivers/net/af_xdp/rte_eth_af_xdp.c
>> new file mode 100644
>> index 000000000..6de769650
>> --- /dev/null
>> +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
>>
>> [snip]
>
>+struct pkt_rx_queue {
>> +       struct xsk_ring_cons rx;
>> +       struct xsk_umem_info *umem;
>> +       struct xsk_socket *xsk;
>> +       struct rte_mempool *mb_pool;
>> +
>> +       unsigned long rx_pkts;
>> +       unsigned long rx_bytes;
>> +       unsigned long rx_dropped;
>>
>
>ethdev stats are uint64_t, why declare those internal stats as unsigned
>long ?

You are right, should use uint64_t instead.

>
>+
>> +       struct pkt_tx_queue *pair;
>> +       uint16_t queue_idx;
>> +};
>> +
>> +struct pkt_tx_queue {
>> +       struct xsk_ring_prod tx;
>> +
>> +       unsigned long tx_pkts;
>> +       unsigned long err_pkts;
>> +       unsigned long tx_bytes;
>>
>
>Idem.

Will change to uint64_t.

>
>+
>> +       struct pkt_rx_queue *pair;
>> +       uint16_t queue_idx;
>> +};
>>
>
>[snip]
>
>+static int
>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>> +{
>> +       struct pmd_internals *internals = dev->data->dev_private;
>> +       struct xdp_statistics xdp_stats;
>> +       struct pkt_rx_queue *rxq;
>> +       socklen_t optlen;
>> +       int i;
>> +
>> +       optlen = sizeof(struct xdp_statistics);
>> +       for (i = 0; i < dev->data->nb_rx_queues; i++) {
>> +               rxq = &internals->rx_queues[i];
>> +               stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
>> +               stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
>> +
>> +               stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
>> +               stats->q_errors[i] = internals->tx_queues[i].err_pkts;
>>
>
>q_errors[] are for reception errors, see the patchset I sent:
>http://mails.dpdk.org/archives/dev/2019-March/125703.html
>
>If you want per queue stats, use xstats.
>You can still account those errors in the global stats->oerrors below.
>
>+               stats->q_obytes[i] = internals->tx_queues[i].tx_bytes;
>> +
>> +               stats->ipackets += stats->q_ipackets[i];
>> +               stats->ibytes += stats->q_ibytes[i];
>> +               stats->imissed += internals->rx_queues[i].rx_dropped;
>> +               getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
>> XDP_STATISTICS,
>> +                               &xdp_stats, &optlen);
>> +               stats->imissed += xdp_stats.rx_dropped;
>> +
>> +               stats->opackets += stats->q_opackets[i];
>> +               stats->oerrors += stats->q_errors[i];
>>
>
>-               stats->oerrors += stats->q_errors[i];
>+               stats->oerrors += internals->tx_queues[i].err_pkts;

will correct according to your patch.

Thanks,
Xiaolong

>
>>
>> +               stats->obytes += stats->q_obytes[i];
>> +       }
>> +
>> +       return 0;
>> +}
>>
>>
>
>-- 
>David Marchand
Ferruh Yigit March 11, 2019, 4:20 p.m. | #9
On 3/1/2019 8:09 AM, Xiaolong Ye wrote:
> Add a new PMD driver for AF_XDP which is a proposed faster version of
> AF_PACKET interface in Linux. More info about AF_XDP, please refer to [1]
> [2].
> 
> This is the vanilla version PMD which just uses a raw buffer registered as
> the umem.
> 
> [1] https://fosdem.org/2018/schedule/event/af_xdp/
> [2] https://lwn.net/Articles/745934/
> 
> Signed-off-by: Xiaolong Ye <xiaolong.ye@intel.com>
> ---
>  MAINTAINERS                                   |   6 +
>  config/common_base                            |   5 +
>  doc/guides/nics/af_xdp.rst                    |  43 +

Can you please add new .rst file to index file, doc/guides/nics/index.rst ?

>  doc/guides/rel_notes/release_18_11.rst        |   7 +

Please switch to latest release notes.

>  drivers/net/Makefile                          |   1 +
>  drivers/net/af_xdp/Makefile                   |  31 +
>  drivers/net/af_xdp/meson.build                |   7 +
>  drivers/net/af_xdp/rte_eth_af_xdp.c           | 903 ++++++++++++++++++
>  drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
>  mk/rte.app.mk                                 |   1 +

Can you please add .ini file too?

<...>

> @@ -416,6 +416,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
>  #
>  CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
>  
> +#
> +# Compile software PMD backed by AF_XDP sockets (Linux only)
> +#
> +CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
> +

Why it is not enabled in linux config (config/common_linuxapp)? Is it because of
the external library dependencies?
I guess there is a requirement to the specific Linux kernel version, can it be
possible to detect it in Makefile and enable/disable according this information?

<...>

> +Prerequisites
> +-------------
> +
> +This is a Linux-specific PMD, thus the following prerequisites apply:
> +
> +*  A Linux Kernel with XDP sockets configuration enabled;

Can you please give more details of what exact vanilla kernel version?

> +*  libbpf with latest af_xdp support installed

Is there a specific version of libbpf for this?
I can see in makefile, libelf is also linked, is it a dependency?

<...>

> @@ -0,0 +1,31 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2018 Intel Corporation
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +#
> +# library name
> +#
> +LIB = librte_pmd_af_xdp.a
> +
> +EXPORT_MAP := rte_pmd_af_xdp_version.map
> +
> +LIBABIVER := 1
> +
> +
> +CFLAGS += -O3
> +# below line should be removed

+1

> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/lib/bpf
> +
> +CFLAGS += $(WERROR_FLAGS)
> +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
> +LDLIBS += -lrte_bus_vdev

Dependent libraries should be linked here.

<...>

> +
> +#include <linux/if_ether.h>
> +#include <linux/if_xdp.h>
> +#include <linux/if_link.h>
> +#include <asm/barrier.h>

Getting an build error for this [1], can there be any include path param missing?

[1]
drivers/net/af_xdp/rte_eth_af_xdp.c:15:10: fatal error: asm/barrier.h: No such
file or directory

<...>

> +static void
> +eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
> +{
> +	struct pmd_internals *internals = dev->data->dev_private;
> +
> +	dev_info->if_index = internals->if_index;
> +	dev_info->max_mac_addrs = 1;
> +	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
> +	dev_info->max_rx_queues = 1;
> +	dev_info->max_tx_queues = 1;

'ETH_AF_XDP_MAX_QUEUE_PAIRS' is '16' but you are forcing the max Rx/Tx queue
number to be '1', intentional?

> +	dev_info->min_rx_bufsize = 0;
> +
> +	dev_info->default_rxportconf.nb_queues = 1;
> +	dev_info->default_txportconf.nb_queues = 1;
> +	dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
> +	dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
> +}
> +
> +static int
> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
> +{
> +	struct pmd_internals *internals = dev->data->dev_private;
> +	struct xdp_statistics xdp_stats;
> +	struct pkt_rx_queue *rxq;
> +	socklen_t optlen;
> +	int i;
> +
> +	optlen = sizeof(struct xdp_statistics);
> +	for (i = 0; i < dev->data->nb_rx_queues; i++) {
> +		rxq = &internals->rx_queues[i];
> +		stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
> +		stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
> +
> +		stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
> +		stats->q_errors[i] = internals->tx_queues[i].err_pkts;

There is a patch from David, which points the 'q_errors' is for Rx only:
https://patches.dpdk.org/cover/50783/

<...>

> +static void xdp_umem_destroy(struct xsk_umem_info *umem)
> +{
> +	if (umem->buffer)
> +		free(umem->buffer);
> +
> +	free(umem);

Should we set freed pointers to 'null'?

Should free 'umem->buf_ring' before freeing 'umem'?

<...>

> +static int
> +eth_rx_queue_setup(struct rte_eth_dev *dev,
> +		   uint16_t rx_queue_id,
> +		   uint16_t nb_rx_desc,
> +		   unsigned int socket_id __rte_unused,
> +		   const struct rte_eth_rxconf *rx_conf __rte_unused,
> +		   struct rte_mempool *mb_pool)
> +{
> +	struct pmd_internals *internals = dev->data->dev_private;
> +	unsigned int buf_size, data_size;
> +	struct pkt_rx_queue *rxq;
> +	int ret = 0;
> +
> +	if (mb_pool == NULL) {
> +		RTE_LOG(ERR, PMD,
> +			"Invalid mb_pool\n");
> +		ret = -EINVAL;
> +		goto err;
> +	}

if 'mb_pool' is 'null', it will crash in 'rte_eth_rx_queue_setup()' before
coming here, I think we can drop this check.

> +
> +	if (dev->data->nb_rx_queues <= rx_queue_id) {
> +		RTE_LOG(ERR, PMD,
> +			"Invalid rx queue id: %d\n", rx_queue_id);
> +		ret = -EINVAL;
> +		goto err;
> +	}

This check already done in 'rte_eth_rx_queue_setup()' shouldn't need to be done
here.
<...>

> +static int
> +eth_tx_queue_setup(struct rte_eth_dev *dev,
> +		   uint16_t tx_queue_id,
> +		   uint16_t nb_tx_desc,
> +		   unsigned int socket_id __rte_unused,
> +		   const struct rte_eth_txconf *tx_conf __rte_unused)
> +{
> +	struct pmd_internals *internals = dev->data->dev_private;
> +	struct pkt_tx_queue *txq;
> +
> +	if (dev->data->nb_tx_queues <= tx_queue_id) {
> +		RTE_LOG(ERR, PMD, "Invalid tx queue id: %d\n", tx_queue_id);
> +		return -EINVAL;
> +	}

Can skip the check, same as above.

> +
> +	RTE_LOG(WARNING, PMD, "tx queue setup size=%d will be skipped\n",
> +		nb_tx_desc);

Why setup will be skipped?

> +	txq = &internals->tx_queues[tx_queue_id];
> +
> +	dev->data->tx_queues[tx_queue_id] = txq;
> +	return 0;
> +}
> +
> +static int
> +eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
> +{
> +	struct pmd_internals *internals = dev->data->dev_private;
> +	struct ifreq ifr = { .ifr_mtu = mtu };
> +	int ret;
> +	int s;
> +
> +	s = socket(PF_INET, SOCK_DGRAM, 0);
> +	if (s < 0)
> +		return -EINVAL;
> +
> +	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", internals->if_name);

Can you please prefer strlcpy?

> +	ret = ioctl(s, SIOCSIFMTU, &ifr);
> +	close(s);
> +
> +	if (ret < 0)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static void
> +eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
> +{
> +	struct ifreq ifr;
> +	int s;
> +
> +	s = socket(PF_INET, SOCK_DGRAM, 0);
> +	if (s < 0)
> +		return;
> +
> +	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);

Can you please prefer strlcpy?

<...>

> +
> +static struct rte_vdev_driver pmd_af_xdp_drv;

Do we need this forward decleration?

> +
> +static void
> +parse_parameters(struct rte_kvargs *kvlist,
> +		 char **if_name,
> +		 int *queue_idx)
> +{
> +	struct rte_kvargs_pair *pair = NULL;
> +	unsigned int k_idx;
> +
> +	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
> +		pair = &kvlist->pairs[k_idx];
> +		if (strstr(pair->key, ETH_AF_XDP_IFACE_ARG))

It is better to use 'rte_kvargs_process()' instead of accessing the 'kvargs'
internals.

> +			*if_name = pair->value;
> +		else if (strstr(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG))
> +			*queue_idx = atoi(pair->value);
> +	}
> +}
> +
> +static int
> +get_iface_info(const char *if_name,
> +	       struct ether_addr *eth_addr,
> +	       int *if_index)
> +{
> +	struct ifreq ifr;
> +	int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
> +
> +	if (sock < 0)
> +		return -1;
> +
> +	strcpy(ifr.ifr_name, if_name);

Please prefer strlcpy.

> +	if (ioctl(sock, SIOCGIFINDEX, &ifr))
> +		goto error;
> +
> +	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
> +		goto error;
> +
> +	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, 6);

Can use 'ether_addr_copy()'

> +
> +	close(sock);
> +	*if_index = if_nametoindex(if_name);
> +	return 0;
> +
> +error:
> +	close(sock);
> +	return -1;
> +}
> +
> +static int
> +init_internals(struct rte_vdev_device *dev,
> +	       const char *if_name,
> +	       int queue_idx)
> +{
> +	const char *name = rte_vdev_device_name(dev);
> +	struct rte_eth_dev *eth_dev = NULL;
> +	const unsigned int numa_node = dev->device.numa_node;
> +	struct pmd_internals *internals = NULL;
> +	int ret;
> +	int i;
> +
> +	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
> +	if (!internals)
> +		return -ENOMEM;
> +
> +	internals->queue_idx = queue_idx;
> +	strcpy(internals->if_name, if_name);

prefer 'strlcpy' please

> +
> +	for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
> +		internals->tx_queues[i].pair = &internals->rx_queues[i];
> +		internals->rx_queues[i].pair = &internals->tx_queues[i];
> +	}
> +
> +	ret = get_iface_info(if_name, &internals->eth_addr,
> +			     &internals->if_index);
> +	if (ret)
> +		goto err;
> +
> +	eth_dev = rte_eth_vdev_allocate(dev, 0);
> +	if (!eth_dev)
> +		goto err;
> +
> +	eth_dev->data->dev_private = internals;
> +	eth_dev->data->dev_link = pmd_link;
> +	eth_dev->data->mac_addrs = &internals->eth_addr;
> +	eth_dev->dev_ops = &ops;
> +	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
> +	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
> +
> +	rte_eth_dev_probing_finish(eth_dev);

What do you think moving this call into 'rte_pmd_af_xdp_probe' function if
'init_internals' returns sucess instead of setting here?

> +	return 0;
> +
> +err:
> +	rte_free(internals);
> +	return -1;
> +}
> +
> +static int
> +rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
> +{
> +	struct rte_kvargs *kvlist;
> +	char *if_name = NULL;
> +	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;

This 'queue_idx' is for interface queue to pass to xsk_* API, also we have same
variable name 'queue_idx' that we use for DPDK queue index, they get confused
easily, what do you think rename this one something like 'xsk_queue_idx'?

> +	struct rte_eth_dev *eth_dev;
> +	const char *name;
> +	int ret;
> +
> +	RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
> +		rte_vdev_device_name(dev));
> +
> +	name = rte_vdev_device_name(dev);
> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
> +		strlen(rte_vdev_device_args(dev)) == 0) {
> +		eth_dev = rte_eth_dev_attach_secondary(name);
> +		if (!eth_dev) {
> +			RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
> +			return -EINVAL;
> +		}
> +		eth_dev->dev_ops = &ops;
> +		rte_eth_dev_probing_finish(eth_dev);
> +	}
> +
> +	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
> +	if (!kvlist) {
> +		RTE_LOG(ERR, PMD,
> +			"Invalid kvargs\n");

No need to break the line.

> +		return -EINVAL;
> +	}
> +
> +	if (dev->device.numa_node == SOCKET_ID_ANY)
> +		dev->device.numa_node = rte_socket_id();
> +
> +	parse_parameters(kvlist, &if_name,
> +			 &queue_idx);

Same, no need to break the line.

> +
> +	ret = init_internals(dev, if_name, queue_idx);
> +
> +	rte_kvargs_free(kvlist);
> +
> +	return ret;
> +}
> +
> +static int
> +rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
> +{
> +	struct rte_eth_dev *eth_dev = NULL;
> +	struct pmd_internals *internals;
> +
> +	RTE_LOG(INFO, PMD, "Removing AF_XDP ethdev on numa socket %u\n",
> +		rte_socket_id());
> +
> +	if (!dev)
> +		return -1;
> +
> +	/* find the ethdev entry */
> +	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
> +	if (!eth_dev)
> +		return -1;
> +
> +	internals = eth_dev->data->dev_private;
> +
> +	rte_ring_free(internals->umem->buf_ring);
> +	rte_free(internals->umem->buffer);
> +	rte_free(internals->umem);
> +	rte_free(internals);
> +
> +	rte_eth_dev_release_port(eth_dev);

This frees the 'eth_dev->data->dev_private', (internals), it can be problem to
try to free same pointer twice.

> +
> +
> +	return 0;
> +}
> +
> +static struct rte_vdev_driver pmd_af_xdp_drv = {
> +	.probe = rte_pmd_af_xdp_probe,
> +	.remove = rte_pmd_af_xdp_remove,
> +};
> +
> +RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
> +RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);

No need to create the alias, it is for backward compatibility.

> +RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
> +			      "iface=<string> "
> +			      "queue=<int> ");
> diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> new file mode 100644
> index 000000000..ef3539840
> --- /dev/null
> +++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
> @@ -0,0 +1,4 @@
> +DPDK_2.0 {

Use release version please.

> +
> +	local: *;
> +};
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index d0ab942d5..db3271c7b 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -143,6 +143,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL)  += -lrte_mempool_dpaa2
>  endif
>  
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp -lelf -lbpf

For which API libelf is required?
Ye Xiaolong March 12, 2019, 3:54 p.m. | #10
Hi, Ferruh

Thanks for your review.

On 03/11, Ferruh Yigit wrote:
>On 3/1/2019 8:09 AM, Xiaolong Ye wrote:
>> Add a new PMD driver for AF_XDP which is a proposed faster version of
>> AF_PACKET interface in Linux. More info about AF_XDP, please refer to [1]
>> [2].
>> 
>> This is the vanilla version PMD which just uses a raw buffer registered as
>> the umem.
>> 
>> [1] https://fosdem.org/2018/schedule/event/af_xdp/
>> [2] https://lwn.net/Articles/745934/
>> 
>> Signed-off-by: Xiaolong Ye <xiaolong.ye@intel.com>
>> ---
>>  MAINTAINERS                                   |   6 +
>>  config/common_base                            |   5 +
>>  doc/guides/nics/af_xdp.rst                    |  43 +
>
>Can you please add new .rst file to index file, doc/guides/nics/index.rst ?

Got it, will do in next version.

>
>>  doc/guides/rel_notes/release_18_11.rst        |   7 +
>
>Please switch to latest release notes.

My bad, will switch to 19.05..

>
>>  drivers/net/Makefile                          |   1 +
>>  drivers/net/af_xdp/Makefile                   |  31 +
>>  drivers/net/af_xdp/meson.build                |   7 +
>>  drivers/net/af_xdp/rte_eth_af_xdp.c           | 903 ++++++++++++++++++
>>  drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
>>  mk/rte.app.mk                                 |   1 +
>
>Can you please add .ini file too?

will do.

>
><...>
>
>> @@ -416,6 +416,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
>>  #
>>  CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
>>  
>> +#
>> +# Compile software PMD backed by AF_XDP sockets (Linux only)
>> +#
>> +CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
>> +
>
>Why it is not enabled in linux config (config/common_linuxapp)? Is it because of
>the external library dependencies?

Yes, af_xdp pmd is dependent on libbpf which is not included in any linux distribution yet.

>I guess there is a requirement to the specific Linux kernel version, can it be

libbpf should be included in kernel 5.1 release.

>possible to detect it in Makefile and enable/disable according this information?
>

Ok, I'll investigate how to do it.

><...>
>
>> +Prerequisites
>> +-------------
>> +
>> +This is a Linux-specific PMD, thus the following prerequisites apply:
>> +
>> +*  A Linux Kernel with XDP sockets configuration enabled;
>
>Can you please give more details of what exact vanilla kernel version?

Do you mean I should write more details about AF_XDP in kernel in this introduction 
document?

>
>> +*  libbpf with latest af_xdp support installed
>
>Is there a specific version of libbpf for this?

I'm not aware that there is specific version number for libbpf, it's part of linux
kernel src code.

>I can see in makefile, libelf is also linked, is it a dependency?

libelf is a leftover of RFC, will delete it in next version.

>
><...>
>
>> @@ -0,0 +1,31 @@
>> +# SPDX-License-Identifier: BSD-3-Clause
>> +# Copyright(c) 2018 Intel Corporation
>> +
>> +include $(RTE_SDK)/mk/rte.vars.mk
>> +
>> +#
>> +# library name
>> +#
>> +LIB = librte_pmd_af_xdp.a
>> +
>> +EXPORT_MAP := rte_pmd_af_xdp_version.map
>> +
>> +LIBABIVER := 1
>> +
>> +
>> +CFLAGS += -O3
>> +# below line should be removed
>
>+1
>
>> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
>> +CFLAGS += -I/root/yexl/shared_mks0/linux/tools/lib/bpf
>> +
>> +CFLAGS += $(WERROR_FLAGS)
>> +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
>> +LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
>> +LDLIBS += -lrte_bus_vdev
>
>Dependent libraries should be linked here.

Ok, I'll add libbpf here.

>
><...>
>
>> +
>> +#include <linux/if_ether.h>
>> +#include <linux/if_xdp.h>
>> +#include <linux/if_link.h>
>> +#include <asm/barrier.h>
>
>Getting an build error for this [1], can there be any include path param missing?
>
>[1]
>drivers/net/af_xdp/rte_eth_af_xdp.c:15:10: fatal error: asm/barrier.h: No such
>file or directory

Yes, it need something like 

CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include

as in above Makefile currently.

>
><...>
>
>> +static void
>> +eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
>> +{
>> +	struct pmd_internals *internals = dev->data->dev_private;
>> +
>> +	dev_info->if_index = internals->if_index;
>> +	dev_info->max_mac_addrs = 1;
>> +	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
>> +	dev_info->max_rx_queues = 1;
>> +	dev_info->max_tx_queues = 1;
>
>'ETH_AF_XDP_MAX_QUEUE_PAIRS' is '16' but you are forcing the max Rx/Tx queue
>number to be '1', intentional?

Yes, current implementation is single queue only, we plan to support muli-queues
in futher.

>
>> +	dev_info->min_rx_bufsize = 0;
>> +
>> +	dev_info->default_rxportconf.nb_queues = 1;
>> +	dev_info->default_txportconf.nb_queues = 1;
>> +	dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
>> +	dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
>> +}
>> +
>> +static int
>> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
>> +{
>> +	struct pmd_internals *internals = dev->data->dev_private;
>> +	struct xdp_statistics xdp_stats;
>> +	struct pkt_rx_queue *rxq;
>> +	socklen_t optlen;
>> +	int i;
>> +
>> +	optlen = sizeof(struct xdp_statistics);
>> +	for (i = 0; i < dev->data->nb_rx_queues; i++) {
>> +		rxq = &internals->rx_queues[i];
>> +		stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
>> +		stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
>> +
>> +		stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
>> +		stats->q_errors[i] = internals->tx_queues[i].err_pkts;
>
>There is a patch from David, which points the 'q_errors' is for Rx only:
>https://patches.dpdk.org/cover/50783/

Yes, I got the same comment from David, will change it accordingly.

>
><...>
>
>> +static void xdp_umem_destroy(struct xsk_umem_info *umem)
>> +{
>> +	if (umem->buffer)
>> +		free(umem->buffer);
>> +
>> +	free(umem);
>
>Should we set freed pointers to 'null'?

will do.

>
>Should free 'umem->buf_ring' before freeing 'umem'?

Good catch, will add free buf_ring.

>
><...>
>
>> +static int
>> +eth_rx_queue_setup(struct rte_eth_dev *dev,
>> +		   uint16_t rx_queue_id,
>> +		   uint16_t nb_rx_desc,
>> +		   unsigned int socket_id __rte_unused,
>> +		   const struct rte_eth_rxconf *rx_conf __rte_unused,
>> +		   struct rte_mempool *mb_pool)
>> +{
>> +	struct pmd_internals *internals = dev->data->dev_private;
>> +	unsigned int buf_size, data_size;
>> +	struct pkt_rx_queue *rxq;
>> +	int ret = 0;
>> +
>> +	if (mb_pool == NULL) {
>> +		RTE_LOG(ERR, PMD,
>> +			"Invalid mb_pool\n");
>> +		ret = -EINVAL;
>> +		goto err;
>> +	}
>
>if 'mb_pool' is 'null', it will crash in 'rte_eth_rx_queue_setup()' before
>coming here, I think we can drop this check.

Agree, it's a redundant check, will remove.

>
>> +
>> +	if (dev->data->nb_rx_queues <= rx_queue_id) {
>> +		RTE_LOG(ERR, PMD,
>> +			"Invalid rx queue id: %d\n", rx_queue_id);
>> +		ret = -EINVAL;
>> +		goto err;
>> +	}
>
>This check already done in 'rte_eth_rx_queue_setup()' shouldn't need to be done
>here.

will remove.

><...>
>
>> +static int
>> +eth_tx_queue_setup(struct rte_eth_dev *dev,
>> +		   uint16_t tx_queue_id,
>> +		   uint16_t nb_tx_desc,
>> +		   unsigned int socket_id __rte_unused,
>> +		   const struct rte_eth_txconf *tx_conf __rte_unused)
>> +{
>> +	struct pmd_internals *internals = dev->data->dev_private;
>> +	struct pkt_tx_queue *txq;
>> +
>> +	if (dev->data->nb_tx_queues <= tx_queue_id) {
>> +		RTE_LOG(ERR, PMD, "Invalid tx queue id: %d\n", tx_queue_id);
>> +		return -EINVAL;
>> +	}
>
>Can skip the check, same as above.

Got it.

>
>> +
>> +	RTE_LOG(WARNING, PMD, "tx queue setup size=%d will be skipped\n",
>> +		nb_tx_desc);
>
>Why setup will be skipped?

leftover of RFC, will remove.

>
>> +	txq = &internals->tx_queues[tx_queue_id];
>> +
>> +	dev->data->tx_queues[tx_queue_id] = txq;
>> +	return 0;
>> +}
>> +
>> +static int
>> +eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
>> +{
>> +	struct pmd_internals *internals = dev->data->dev_private;
>> +	struct ifreq ifr = { .ifr_mtu = mtu };
>> +	int ret;
>> +	int s;
>> +
>> +	s = socket(PF_INET, SOCK_DGRAM, 0);
>> +	if (s < 0)
>> +		return -EINVAL;
>> +
>> +	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", internals->if_name);
>
>Can you please prefer strlcpy?

Sure.

>
>> +	ret = ioctl(s, SIOCSIFMTU, &ifr);
>> +	close(s);
>> +
>> +	if (ret < 0)
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>> +static void
>> +eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
>> +{
>> +	struct ifreq ifr;
>> +	int s;
>> +
>> +	s = socket(PF_INET, SOCK_DGRAM, 0);
>> +	if (s < 0)
>> +		return;
>> +
>> +	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
>
>Can you please prefer strlcpy?

Sure.

>
><...>
>
>> +
>> +static struct rte_vdev_driver pmd_af_xdp_drv;
>
>Do we need this forward decleration?

Part of af_xdp pmd is refering to af_packet pmd, this is a simple copy from that
driver, and as you point out, it should be unnecessary, will remove it.

>
>> +
>> +static void
>> +parse_parameters(struct rte_kvargs *kvlist,
>> +		 char **if_name,
>> +		 int *queue_idx)
>> +{
>> +	struct rte_kvargs_pair *pair = NULL;
>> +	unsigned int k_idx;
>> +
>> +	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
>> +		pair = &kvlist->pairs[k_idx];
>> +		if (strstr(pair->key, ETH_AF_XDP_IFACE_ARG))
>
>It is better to use 'rte_kvargs_process()' instead of accessing the 'kvargs'
>internals.

Will do.

>
>> +			*if_name = pair->value;
>> +		else if (strstr(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG))
>> +			*queue_idx = atoi(pair->value);
>> +	}
>> +}
>> +
>> +static int
>> +get_iface_info(const char *if_name,
>> +	       struct ether_addr *eth_addr,
>> +	       int *if_index)
>> +{
>> +	struct ifreq ifr;
>> +	int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
>> +
>> +	if (sock < 0)
>> +		return -1;
>> +
>> +	strcpy(ifr.ifr_name, if_name);
>
>Please prefer strlcpy.

Sure.

>
>> +	if (ioctl(sock, SIOCGIFINDEX, &ifr))
>> +		goto error;
>> +
>> +	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
>> +		goto error;
>> +
>> +	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, 6);
>
>Can use 'ether_addr_copy()'

Got it.

>
>> +
>> +	close(sock);
>> +	*if_index = if_nametoindex(if_name);
>> +	return 0;
>> +
>> +error:
>> +	close(sock);
>> +	return -1;
>> +}
>> +
>> +static int
>> +init_internals(struct rte_vdev_device *dev,
>> +	       const char *if_name,
>> +	       int queue_idx)
>> +{
>> +	const char *name = rte_vdev_device_name(dev);
>> +	struct rte_eth_dev *eth_dev = NULL;
>> +	const unsigned int numa_node = dev->device.numa_node;
>> +	struct pmd_internals *internals = NULL;
>> +	int ret;
>> +	int i;
>> +
>> +	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
>> +	if (!internals)
>> +		return -ENOMEM;
>> +
>> +	internals->queue_idx = queue_idx;
>> +	strcpy(internals->if_name, if_name);
>
>prefer 'strlcpy' please

Got it.

>
>> +
>> +	for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
>> +		internals->tx_queues[i].pair = &internals->rx_queues[i];
>> +		internals->rx_queues[i].pair = &internals->tx_queues[i];
>> +	}
>> +
>> +	ret = get_iface_info(if_name, &internals->eth_addr,
>> +			     &internals->if_index);
>> +	if (ret)
>> +		goto err;
>> +
>> +	eth_dev = rte_eth_vdev_allocate(dev, 0);
>> +	if (!eth_dev)
>> +		goto err;
>> +
>> +	eth_dev->data->dev_private = internals;
>> +	eth_dev->data->dev_link = pmd_link;
>> +	eth_dev->data->mac_addrs = &internals->eth_addr;
>> +	eth_dev->dev_ops = &ops;
>> +	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
>> +	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
>> +
>> +	rte_eth_dev_probing_finish(eth_dev);
>
>What do you think moving this call into 'rte_pmd_af_xdp_probe' function if
>'init_internals' returns sucess instead of setting here?

Sounds better, will do.

>
>> +	return 0;
>> +
>> +err:
>> +	rte_free(internals);
>> +	return -1;
>> +}
>> +
>> +static int
>> +rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
>> +{
>> +	struct rte_kvargs *kvlist;
>> +	char *if_name = NULL;
>> +	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
>
>This 'queue_idx' is for interface queue to pass to xsk_* API, also we have same
>variable name 'queue_idx' that we use for DPDK queue index, they get confused
>easily, what do you think rename this one something like 'xsk_queue_idx'?

Agree, xsk_queue_idx is a better name, will adopt.

>
>> +	struct rte_eth_dev *eth_dev;
>> +	const char *name;
>> +	int ret;
>> +
>> +	RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
>> +		rte_vdev_device_name(dev));
>> +
>> +	name = rte_vdev_device_name(dev);
>> +	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
>> +		strlen(rte_vdev_device_args(dev)) == 0) {
>> +		eth_dev = rte_eth_dev_attach_secondary(name);
>> +		if (!eth_dev) {
>> +			RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
>> +			return -EINVAL;
>> +		}
>> +		eth_dev->dev_ops = &ops;
>> +		rte_eth_dev_probing_finish(eth_dev);
>> +	}
>> +
>> +	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
>> +	if (!kvlist) {
>> +		RTE_LOG(ERR, PMD,
>> +			"Invalid kvargs\n");
>
>No need to break the line.

Got it.

>
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (dev->device.numa_node == SOCKET_ID_ANY)
>> +		dev->device.numa_node = rte_socket_id();
>> +
>> +	parse_parameters(kvlist, &if_name,
>> +			 &queue_idx);
>
>Same, no need to break the line.

Got it.

>
>> +
>> +	ret = init_internals(dev, if_name, queue_idx);
>> +
>> +	rte_kvargs_free(kvlist);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
>> +{
>> +	struct rte_eth_dev *eth_dev = NULL;
>> +	struct pmd_internals *internals;
>> +
>> +	RTE_LOG(INFO, PMD, "Removing AF_XDP ethdev on numa socket %u\n",
>> +		rte_socket_id());
>> +
>> +	if (!dev)
>> +		return -1;
>> +
>> +	/* find the ethdev entry */
>> +	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
>> +	if (!eth_dev)
>> +		return -1;
>> +
>> +	internals = eth_dev->data->dev_private;
>> +
>> +	rte_ring_free(internals->umem->buf_ring);
>> +	rte_free(internals->umem->buffer);
>> +	rte_free(internals->umem);
>> +	rte_free(internals);
>> +
>> +	rte_eth_dev_release_port(eth_dev);
>
>This frees the 'eth_dev->data->dev_private', (internals), it can be problem to
>try to free same pointer twice.

Thanks for pointing out, will remove the `rte_free(internals)` to avoid a double free.

>
>> +
>> +
>> +	return 0;
>> +}
>> +
>> +static struct rte_vdev_driver pmd_af_xdp_drv = {
>> +	.probe = rte_pmd_af_xdp_probe,
>> +	.remove = rte_pmd_af_xdp_remove,
>> +};
>> +
>> +RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
>> +RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
>
>No need to create the alias, it is for backward compatibility.

Got it.

>
>> +RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
>> +			      "iface=<string> "
>> +			      "queue=<int> ");
>> diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>> new file mode 100644
>> index 000000000..ef3539840
>> --- /dev/null
>> +++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
>> @@ -0,0 +1,4 @@
>> +DPDK_2.0 {
>
>Use release version please.

Got it.

>
>> +
>> +	local: *;
>> +};
>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>> index d0ab942d5..db3271c7b 100644
>> --- a/mk/rte.app.mk
>> +++ b/mk/rte.app.mk
>> @@ -143,6 +143,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL)  += -lrte_mempool_dpaa2
>>  endif
>>  
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp -lelf -lbpf
>
>For which API libelf is required?

It is a leftover and will be removed.

Thanks,
Xiaolong
>
Ferruh Yigit March 13, 2019, 10:54 a.m. | #11
>> <...>
>>
>>> +Prerequisites
>>> +-------------
>>> +
>>> +This is a Linux-specific PMD, thus the following prerequisites apply:
>>> +
>>> +*  A Linux Kernel with XDP sockets configuration enabled;
>>
>> Can you please give more details of what exact vanilla kernel version?
> 
> Do you mean I should write more details about AF_XDP in kernel in this introduction 
> document?

I think it is good to document the exact version information instead of saying
"Linux Kernel with af_xdp".

>>> +*  libbpf with latest af_xdp support installed
>>
>> Is there a specific version of libbpf for this?
> 
> I'm not aware that there is specific version number for libbpf, it's part of linux
> kernel src code.

If it is coming with Linux kernel, which version of Linux kernel?

>> <...>
>>
>>> +
>>> +#include <linux/if_ether.h>
>>> +#include <linux/if_xdp.h>
>>> +#include <linux/if_link.h>
>>> +#include <asm/barrier.h>
>>
>> Getting an build error for this [1], can there be any include path param missing?
>>
>> [1]
>> drivers/net/af_xdp/rte_eth_af_xdp.c:15:10: fatal error: asm/barrier.h: No such
>> file or directory
> 
> Yes, it need something like 
> 
> CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
> 
> as in above Makefile currently.

I see, assuming you will be booting up with that kernel, can something like
below work:

CFLAGS += -I/lib/modules/$(shell uname -r)/build/tools/include/

>> <...>
>>
>>> +static void
>>> +eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
>>> +{
>>> +	struct pmd_internals *internals = dev->data->dev_private;
>>> +
>>> +	dev_info->if_index = internals->if_index;
>>> +	dev_info->max_mac_addrs = 1;
>>> +	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
>>> +	dev_info->max_rx_queues = 1;
>>> +	dev_info->max_tx_queues = 1;
>>
>> 'ETH_AF_XDP_MAX_QUEUE_PAIRS' is '16' but you are forcing the max Rx/Tx queue
>> number to be '1', intentional?
> 
> Yes, current implementation is single queue only, we plan to support muli-queues
> in futher.

OK, Can you please document this information?
Ye Xiaolong March 13, 2019, 11:12 a.m. | #12
On 03/13, Ferruh Yigit wrote:
>
>>> <...>
>>>
>>>> +Prerequisites
>>>> +-------------
>>>> +
>>>> +This is a Linux-specific PMD, thus the following prerequisites apply:
>>>> +
>>>> +*  A Linux Kernel with XDP sockets configuration enabled;
>>>
>>> Can you please give more details of what exact vanilla kernel version?
>> 
>> Do you mean I should write more details about AF_XDP in kernel in this introduction 
>> document?
>
>I think it is good to document the exact version information instead of saying
>"Linux Kernel with af_xdp".

Get your point now, will add the exact kernel info.

>
>>>> +*  libbpf with latest af_xdp support installed
>>>
>>> Is there a specific version of libbpf for this?
>> 
>> I'm not aware that there is specific version number for libbpf, it's part of linux
>> kernel src code.
>
>If it is coming with Linux kernel, which version of Linux kernel?

Will add the kernel version info.

>
>>> <...>
>>>
>>>> +
>>>> +#include <linux/if_ether.h>
>>>> +#include <linux/if_xdp.h>
>>>> +#include <linux/if_link.h>
>>>> +#include <asm/barrier.h>
>>>
>>> Getting an build error for this [1], can there be any include path param missing?
>>>
>>> [1]
>>> drivers/net/af_xdp/rte_eth_af_xdp.c:15:10: fatal error: asm/barrier.h: No such
>>> file or directory
>> 
>> Yes, it need something like 
>> 
>> CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
>> 
>> as in above Makefile currently.
>
>I see, assuming you will be booting up with that kernel, can something like
>below work:
>
>CFLAGS += -I/lib/modules/$(shell uname -r)/build/tools/include/
>

I'll have a try.

>>> <...>
>>>
>>>> +static void
>>>> +eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
>>>> +{
>>>> +	struct pmd_internals *internals = dev->data->dev_private;
>>>> +
>>>> +	dev_info->if_index = internals->if_index;
>>>> +	dev_info->max_mac_addrs = 1;
>>>> +	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
>>>> +	dev_info->max_rx_queues = 1;
>>>> +	dev_info->max_tx_queues = 1;
>>>
>>> 'ETH_AF_XDP_MAX_QUEUE_PAIRS' is '16' but you are forcing the max Rx/Tx queue
>>> number to be '1', intentional?
>> 
>> Yes, current implementation is single queue only, we plan to support muli-queues
>> in futher.
>
>OK, Can you please document this information?

Sure.

Thanks,
Xiaolong
>
Ye Xiaolong March 17, 2019, 3:34 a.m. | #13
On 03/02, Ye Xiaolong wrote:
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
>>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp
>>> -lelf -lbpf
>>
>>Are symbols from libelf being used by the PMD?
>
>Hmm, it is a leftover of RFC, libelf is no longer needed in this version, will
>remove it in next version.
>

Correction, libelf is needed for libbpf, so we still need to keep it. 

Thanks,
Xiaolong
>Thanks,
>Xiaolong
>>
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD)        += -lrte_pmd_ark
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD)   += -lrte_pmd_atlantic
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD)        += -lrte_pmd_avf
>>
>>-- 
>>Kind regards,
>>Luca Boccassi
Ye Xiaolong March 17, 2019, 3:35 a.m. | #14
On 03/12, Ye Xiaolong wrote:
>>I can see in makefile, libelf is also linked, is it a dependency?
>
>libelf is a leftover of RFC, will delete it in next version.

Correction, libbpf depends on libelf, so I still need to keep it.

Thanks,
Xiaolong
Luca Boccassi March 24, 2019, 12:07 p.m. | #15
On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
> On 03/02, Ye Xiaolong wrote:
> > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
> > > > -lrte_pmd_af_packet
> > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
> > > > -lrte_pmd_af_xdp
> > > > -lelf -lbpf
> > > 
> > > Are symbols from libelf being used by the PMD?
> > 
> > Hmm, it is a leftover of RFC, libelf is no longer needed in this
> > version, will
> > remove it in next version.
> > 
> 
> Correction, libelf is needed for libbpf, so we still need to keep
> it. 

If libbpf needs libelf for its internal usage, it should be linked
against it itself. Unless symbols from libelf are used in static
functions defined in libbpf's public headers. Is this the case?
Ye Xiaolong March 25, 2019, 2:45 a.m. | #16
On 03/24, Luca Boccassi wrote:
>On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
>> On 03/02, Ye Xiaolong wrote:
>> > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
>> > > > -lrte_pmd_af_packet
>> > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
>> > > > -lrte_pmd_af_xdp
>> > > > -lelf -lbpf
>> > > 
>> > > Are symbols from libelf being used by the PMD?
>> > 
>> > Hmm, it is a leftover of RFC, libelf is no longer needed in this
>> > version, will
>> > remove it in next version.
>> > 
>> 
>> Correction, libelf is needed for libbpf, so we still need to keep
>> it. 
>
>If libbpf needs libelf for its internal usage, it should be linked
>against it itself. Unless symbols from libelf are used in static
>functions defined in libbpf's public headers. Is this the case?
>

Yes, that's the case. without the libelf, it would have build error as below,
and these symbols are used in static functions of libbpf.

/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_nextscn'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_rawdata'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_memory'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `gelf_getrel'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_strptr'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_end'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_getscn'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_begin'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `gelf_getsym'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_version'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `gelf_getehdr'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `elf_getdata'
/usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so: undefined reference to `gelf_getshdr'

Thanks,
Xiaolong
>-- 
>Kind regards,
>Luca Boccassi
Luca Boccassi March 25, 2019, 10:42 a.m. | #17
On Mon, 2019-03-25 at 10:45 +0800, Ye Xiaolong wrote:
> On 03/24, Luca Boccassi wrote:
> > On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
> > > On 03/02, Ye Xiaolong wrote:
> > > > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
> > > > > > -lrte_pmd_af_packet
> > > > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
> > > > > > -lrte_pmd_af_xdp
> > > > > > -lelf -lbpf
> > > > > 
> > > > > Are symbols from libelf being used by the PMD?
> > > > 
> > > > Hmm, it is a leftover of RFC, libelf is no longer needed in
> > > > this
> > > > version, will
> > > > remove it in next version.
> > > > 
> > > 
> > > Correction, libelf is needed for libbpf, so we still need to keep
> > > it. 
> > 
> > If libbpf needs libelf for its internal usage, it should be linked
> > against it itself. Unless symbols from libelf are used in static
> > functions defined in libbpf's public headers. Is this the case?
> > 
> 
> Yes, that's the case. without the libelf, it would have build error
> as below,
> and these symbols are used in static functions of libbpf.
> 
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_nextscn'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_rawdata'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_memory'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `gelf_getrel'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_strptr'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_end'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_getscn'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_begin'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `gelf_getsym'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_version'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `gelf_getehdr'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `elf_getdata'
> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
> undefined reference to `gelf_getshdr'
> 
> Thanks,
> Xiaolong

Looking at that list, at least the very first symbol is not used by a
public header, but internally in libbpf:

$ grep -r elf_nextscn
libbpf.c:	while ((scn = elf_nextscn(elf, scn)) != NULL) {

https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/libbpf.c#n793

None of those symbols are used from the public headers:

$ grep elf_ bpf.h libbpf.h btf.h
$

Therefore, this is an omission in libbpf's Makefile, as it must link
against libelf. Please raise a ticket or send a patch upstream, and
remove the -lelf from DPDK's makefiles.
Ye Xiaolong March 25, 2019, 12:22 p.m. | #18
On 03/25, Luca Boccassi wrote:
>On Mon, 2019-03-25 at 10:45 +0800, Ye Xiaolong wrote:
>> On 03/24, Luca Boccassi wrote:
>> > On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
>> > > On 03/02, Ye Xiaolong wrote:
>> > > > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
>> > > > > > -lrte_pmd_af_packet
>> > > > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
>> > > > > > -lrte_pmd_af_xdp
>> > > > > > -lelf -lbpf
>> > > > > 
>> > > > > Are symbols from libelf being used by the PMD?
>> > > > 
>> > > > Hmm, it is a leftover of RFC, libelf is no longer needed in
>> > > > this
>> > > > version, will
>> > > > remove it in next version.
>> > > > 
>> > > 
>> > > Correction, libelf is needed for libbpf, so we still need to keep
>> > > it. 
>> > 
>> > If libbpf needs libelf for its internal usage, it should be linked
>> > against it itself. Unless symbols from libelf are used in static
>> > functions defined in libbpf's public headers. Is this the case?
>> > 
>> 
>> Yes, that's the case. without the libelf, it would have build error
>> as below,
>> and these symbols are used in static functions of libbpf.
>> 
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_nextscn'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_rawdata'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_memory'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getrel'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_strptr'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_end'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_getscn'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_begin'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getsym'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_version'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getehdr'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_getdata'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getshdr'
>> 
>> Thanks,
>> Xiaolong
>
>Looking at that list, at least the very first symbol is not used by a
>public header, but internally in libbpf:
>
>$ grep -r elf_nextscn
>libbpf.c:	while ((scn = elf_nextscn(elf, scn)) != NULL) {
>
>https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/libbpf.c#n793
>
>None of those symbols are used from the public headers:
>
>$ grep elf_ bpf.h libbpf.h btf.h
>$
>
>Therefore, this is an omission in libbpf's Makefile, as it must link
>against libelf. Please raise a ticket or send a patch upstream, and
>remove the -lelf from DPDK's makefiles.

Hi, Magnus, could you help handle this?

Thanks,
Xiaolong
>
>-- 
>Kind regards,
>Luca Boccassi
Ye Xiaolong March 26, 2019, 2:18 a.m. | #19
On 03/25, Luca Boccassi wrote:
>On Mon, 2019-03-25 at 10:45 +0800, Ye Xiaolong wrote:
>> On 03/24, Luca Boccassi wrote:
>> > On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
>> > > On 03/02, Ye Xiaolong wrote:
>> > > > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
>> > > > > > -lrte_pmd_af_packet
>> > > > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
>> > > > > > -lrte_pmd_af_xdp
>> > > > > > -lelf -lbpf
>> > > > > 
>> > > > > Are symbols from libelf being used by the PMD?
>> > > > 
>> > > > Hmm, it is a leftover of RFC, libelf is no longer needed in
>> > > > this
>> > > > version, will
>> > > > remove it in next version.
>> > > > 
>> > > 
>> > > Correction, libelf is needed for libbpf, so we still need to keep
>> > > it. 
>> > 
>> > If libbpf needs libelf for its internal usage, it should be linked
>> > against it itself. Unless symbols from libelf are used in static
>> > functions defined in libbpf's public headers. Is this the case?
>> > 
>> 
>> Yes, that's the case. without the libelf, it would have build error
>> as below,
>> and these symbols are used in static functions of libbpf.
>> 
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_nextscn'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_rawdata'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_memory'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getrel'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_strptr'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_end'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_getscn'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_begin'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getsym'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_version'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getehdr'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `elf_getdata'
>> /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libbpf.so:
>> undefined reference to `gelf_getshdr'
>> 
>> Thanks,
>> Xiaolong
>
>Looking at that list, at least the very first symbol is not used by a
>public header, but internally in libbpf:
>
>$ grep -r elf_nextscn
>libbpf.c:	while ((scn = elf_nextscn(elf, scn)) != NULL) {
>
>https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/libbpf.c#n793
>
>None of those symbols are used from the public headers:
>
>$ grep elf_ bpf.h libbpf.h btf.h
>$
>
>Therefore, this is an omission in libbpf's Makefile, as it must link
>against libelf. Please raise a ticket or send a patch upstream, and
>remove the -lelf from DPDK's makefiles.

As it may need sometime for kernel community to handle the libbpf's Makefile, 
We may still need the -lelf for af_xdp pmd currently, I'll remove it later after
libbpf correct to link against libelf. Is it acceptable?

Thanks,
Xiaolong
>
>-- 
>Kind regards,
>Luca Boccassi
Luca Boccassi March 26, 2019, 10:14 a.m. | #20
On Tue, 2019-03-26 at 10:18 +0800, Ye Xiaolong wrote:
> On 03/25, Luca Boccassi wrote:
> > On Mon, 2019-03-25 at 10:45 +0800, Ye Xiaolong wrote:
> > > On 03/24, Luca Boccassi wrote:
> > > > On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
> > > > > On 03/02, Ye Xiaolong wrote:
> > > > > > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
> > > > > > > > -lrte_pmd_af_packet
> > > > > > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
> > > > > > > > -lrte_pmd_af_xdp
> > > > > > > > -lelf -lbpf
> > > > > > > 
> > > > > > > Are symbols from libelf being used by the PMD?
> > > > > > 
> > > > > > Hmm, it is a leftover of RFC, libelf is no longer needed in
> > > > > > this
> > > > > > version, will
> > > > > > remove it in next version.
> > > > > > 
> > > > > 
> > > > > Correction, libelf is needed for libbpf, so we still need to
> > > > > keep
> > > > > it. 
> > > > 
> > > > If libbpf needs libelf for its internal usage, it should be
> > > > linked
> > > > against it itself. Unless symbols from libelf are used in
> > > > static
> > > > functions defined in libbpf's public headers. Is this the case?
> > > > 
> > > 
> > > Yes, that's the case. without the libelf, it would have build
> > > error
> > > as below,
> > > and these symbols are used in static functions of libbpf.
> > > 
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_nextscn'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_rawdata'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_memory'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `gelf_getrel'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_strptr'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_end'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_getscn'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_begin'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `gelf_getsym'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_version'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `gelf_getehdr'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `elf_getdata'
> > > /usr/lib/gcc/x86_64-redhat-
> > > linux/4.8.5/../../../../lib64/libbpf.so:
> > > undefined reference to `gelf_getshdr'
> > > 
> > > Thanks,
> > > Xiaolong
> > 
> > Looking at that list, at least the very first symbol is not used by
> > a
> > public header, but internally in libbpf:
> > 
> > $ grep -r elf_nextscn
> > libbpf.c:	while ((scn = elf_nextscn(elf, scn)) != NULL) {
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/libbpf.c#n793
> > 
> > None of those symbols are used from the public headers:
> > 
> > $ grep elf_ bpf.h libbpf.h btf.h
> > $
> > 
> > Therefore, this is an omission in libbpf's Makefile, as it must
> > link
> > against libelf. Please raise a ticket or send a patch upstream, and
> > remove the -lelf from DPDK's makefiles.
> 
> As it may need sometime for kernel community to handle the libbpf's
> Makefile, 
> We may still need the -lelf for af_xdp pmd currently, I'll remove it
> later after
> libbpf correct to link against libelf. Is it acceptable?
> 
> Thanks,
> Xiaolong

Isn't the final version not out yet anyway till May? Can the fix be
included before the release?
Ye Xiaolong March 26, 2019, 12:12 p.m. | #21
On 03/26, Luca Boccassi wrote:
>On Tue, 2019-03-26 at 10:18 +0800, Ye Xiaolong wrote:
>> On 03/25, Luca Boccassi wrote:
>> > On Mon, 2019-03-25 at 10:45 +0800, Ye Xiaolong wrote:
>> > > On 03/24, Luca Boccassi wrote:
>> > > > On Sun, 2019-03-17 at 11:34 +0800, Ye Xiaolong wrote:
>> > > > > On 03/02, Ye Xiaolong wrote:
>> > > > > > > >  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  +=
>> > > > > > > > -lrte_pmd_af_packet
>> > > > > > > > +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     +=
>> > > > > > > > -lrte_pmd_af_xdp
>> > > > > > > > -lelf -lbpf
>> > > > > > > 
>> > > > > > > Are symbols from libelf being used by the PMD?
>> > > > > > 
>> > > > > > Hmm, it is a leftover of RFC, libelf is no longer needed in
>> > > > > > this
>> > > > > > version, will
>> > > > > > remove it in next version.
>> > > > > > 
>> > > > > 
>> > > > > Correction, libelf is needed for libbpf, so we still need to
>> > > > > keep
>> > > > > it. 
>> > > > 
>> > > > If libbpf needs libelf for its internal usage, it should be
>> > > > linked
>> > > > against it itself. Unless symbols from libelf are used in
>> > > > static
>> > > > functions defined in libbpf's public headers. Is this the case?
>> > > > 
>> > > 
>> > > Yes, that's the case. without the libelf, it would have build
>> > > error
>> > > as below,
>> > > and these symbols are used in static functions of libbpf.
>> > > 
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_nextscn'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_rawdata'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_memory'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `gelf_getrel'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_strptr'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_end'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_getscn'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_begin'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `gelf_getsym'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_version'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `gelf_getehdr'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `elf_getdata'
>> > > /usr/lib/gcc/x86_64-redhat-
>> > > linux/4.8.5/../../../../lib64/libbpf.so:
>> > > undefined reference to `gelf_getshdr'
>> > > 
>> > > Thanks,
>> > > Xiaolong
>> > 
>> > Looking at that list, at least the very first symbol is not used by
>> > a
>> > public header, but internally in libbpf:
>> > 
>> > $ grep -r elf_nextscn
>> > libbpf.c:	while ((scn = elf_nextscn(elf, scn)) != NULL) {
>> > 
>> > https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/libbpf.c#n793
>> > 
>> > None of those symbols are used from the public headers:
>> > 
>> > $ grep elf_ bpf.h libbpf.h btf.h
>> > $
>> > 
>> > Therefore, this is an omission in libbpf's Makefile, as it must
>> > link
>> > against libelf. Please raise a ticket or send a patch upstream, and
>> > remove the -lelf from DPDK's makefiles.
>> 
>> As it may need sometime for kernel community to handle the libbpf's
>> Makefile, 
>> We may still need the -lelf for af_xdp pmd currently, I'll remove it
>> later after
>> libbpf correct to link against libelf. Is it acceptable?
>> 
>> Thanks,
>> Xiaolong
>
>Isn't the final version not out yet anyway till May? Can the fix be
>included before the release?

I think so.

Thanks,
Xiaolong
>
>-- 
>Kind regards,
>Luca Boccassi

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 15c53888c..baa92a732 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -469,6 +469,12 @@  M: John W. Linville <linville@tuxdriver.com>
 F: drivers/net/af_packet/
 F: doc/guides/nics/features/afpacket.ini
 
+Linux AF_XDP
+M: Xiaolong Ye <xiaolong.ye@intel.com>
+M: Qi Zhang <qi.z.zhang@intel.com>
+F: drivers/net/af_xdp/
+F: doc/guides/nics/features/af_xdp.rst
+
 Amazon ENA
 M: Marcin Wojtas <mw@semihalf.com>
 M: Michal Krawczyk <mk@semihalf.com>
diff --git a/config/common_base b/config/common_base
index 7c6da5165..c45d2dad1 100644
--- a/config/common_base
+++ b/config/common_base
@@ -416,6 +416,11 @@  CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
 #
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
 
+#
+# Compile software PMD backed by AF_XDP sockets (Linux only)
+#
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
+
 #
 # Compile link bonding PMD library
 #
diff --git a/doc/guides/nics/af_xdp.rst b/doc/guides/nics/af_xdp.rst
new file mode 100644
index 000000000..126d9df3c
--- /dev/null
+++ b/doc/guides/nics/af_xdp.rst
@@ -0,0 +1,43 @@ 
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2018 Intel Corporation.
+
+AF_XDP Poll Mode Driver
+==========================
+
+AF_XDP is an address family that is optimized for high performance
+packet processing. AF_XDP sockets enable the possibility for XDP program to
+redirect packets to a memory buffer in userspace.
+
+For the full details behind AF_XDP socket, you can refer to
+`AF_XDP documentation in the Kernel
+<https://www.kernel.org/doc/Documentation/networking/af_xdp.rst>`_.
+
+This Linux-specific PMD driver creates the AF_XDP socket and binds it to a
+specific netdev queue, it allows a DPDK application to send and receive raw
+packets through the socket which would bypass the kernel network stack.
+
+Options
+-------
+
+The following options can be provided to set up an af_xdp port in DPDK.
+
+*   ``iface`` - name of the Kernel interface to attach to (required);
+*   ``queue`` - netdev queue id (optional, default 0);
+
+Prerequisites
+-------------
+
+This is a Linux-specific PMD, thus the following prerequisites apply:
+
+*  A Linux Kernel with XDP sockets configuration enabled;
+*  libbpf with latest af_xdp support installed
+*  A Kernel bound interface to attach to.
+
+Set up an af_xdp interface
+-----------------------------
+
+The following example will set up an af_xdp interface in DPDK:
+
+.. code-block:: console
+
+    --vdev eth_af_xdp,iface=ens786f1,queue=0
diff --git a/doc/guides/rel_notes/release_18_11.rst b/doc/guides/rel_notes/release_18_11.rst
index 65bab557d..e0918441a 100644
--- a/doc/guides/rel_notes/release_18_11.rst
+++ b/doc/guides/rel_notes/release_18_11.rst
@@ -229,6 +229,13 @@  New Features
   The AESNI MB PMD has been updated with additional support for the AES-GCM
   algorithm.
 
+* **Added the AF_XDP PMD.**
+
+  Added a Linux-specific PMD driver for AF_XDP, it can create the AF_XDP socket
+  and bind it to a specific netdev queue, it allows a DPDK application to send
+  and receive raw packets through the socket which would bypass the kernel
+  network stack to achieve high performance packet processing.
+
 * **Added NXP CAAM JR PMD.**
 
   Added the new caam job ring driver for NXP platforms. See the
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 670d7f75a..93cccd2a8 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -9,6 +9,7 @@  ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
 endif
 
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
 DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
 DIRS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD) += atlantic
 DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
diff --git a/drivers/net/af_xdp/Makefile b/drivers/net/af_xdp/Makefile
new file mode 100644
index 000000000..e3755fff2
--- /dev/null
+++ b/drivers/net/af_xdp/Makefile
@@ -0,0 +1,31 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_af_xdp.a
+
+EXPORT_MAP := rte_pmd_af_xdp_version.map
+
+LIBABIVER := 1
+
+
+CFLAGS += -O3
+# below line should be removed
+CFLAGS += -I/root/yexl/shared_mks0/linux/tools/include
+CFLAGS += -I/root/yexl/shared_mks0/linux/tools/lib/bpf
+
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_vdev
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/af_xdp/meson.build b/drivers/net/af_xdp/meson.build
new file mode 100644
index 000000000..4b6652685
--- /dev/null
+++ b/drivers/net/af_xdp/meson.build
@@ -0,0 +1,7 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+if host_machine.system() != 'linux'
+	build = false
+endif
+sources = files('rte_eth_af_xdp.c')
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
new file mode 100644
index 000000000..6de769650
--- /dev/null
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -0,0 +1,903 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
+#include <rte_bus_vdev.h>
+
+#include <linux/if_ether.h>
+#include <linux/if_xdp.h>
+#include <linux/if_link.h>
+#include <asm/barrier.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <poll.h>
+#include <bpf/bpf.h>
+#include <xsk.h>
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define ETH_AF_XDP_IFACE_ARG			"iface"
+#define ETH_AF_XDP_QUEUE_IDX_ARG		"queue"
+
+#define ETH_AF_XDP_FRAME_SIZE		XSK_UMEM__DEFAULT_FRAME_SIZE
+#define ETH_AF_XDP_NUM_BUFFERS		4096
+#define ETH_AF_XDP_DATA_HEADROOM	0
+#define ETH_AF_XDP_DFLT_NUM_DESCS	XSK_RING_CONS__DEFAULT_NUM_DESCS
+#define ETH_AF_XDP_DFLT_QUEUE_IDX	0
+
+#define ETH_AF_XDP_RX_BATCH_SIZE	32
+#define ETH_AF_XDP_TX_BATCH_SIZE	32
+
+#define ETH_AF_XDP_MAX_QUEUE_PAIRS     16
+
+struct xsk_umem_info {
+	struct xsk_ring_prod fq;
+	struct xsk_ring_cons cq;
+	struct xsk_umem *umem;
+	struct rte_ring *buf_ring;
+	void *buffer;
+};
+
+struct pkt_rx_queue {
+	struct xsk_ring_cons rx;
+	struct xsk_umem_info *umem;
+	struct xsk_socket *xsk;
+	struct rte_mempool *mb_pool;
+
+	unsigned long rx_pkts;
+	unsigned long rx_bytes;
+	unsigned long rx_dropped;
+
+	struct pkt_tx_queue *pair;
+	uint16_t queue_idx;
+};
+
+struct pkt_tx_queue {
+	struct xsk_ring_prod tx;
+
+	unsigned long tx_pkts;
+	unsigned long err_pkts;
+	unsigned long tx_bytes;
+
+	struct pkt_rx_queue *pair;
+	uint16_t queue_idx;
+};
+
+struct pmd_internals {
+	int if_index;
+	char if_name[IFNAMSIZ];
+	uint16_t queue_idx;
+	struct ether_addr eth_addr;
+	struct xsk_umem_info *umem;
+	struct rte_mempool *mb_pool_share;
+
+	struct pkt_rx_queue rx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS];
+	struct pkt_tx_queue tx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS];
+};
+
+static const char * const valid_arguments[] = {
+	ETH_AF_XDP_IFACE_ARG,
+	ETH_AF_XDP_QUEUE_IDX_ARG,
+	NULL
+};
+
+static struct rte_eth_link pmd_link = {
+	.link_speed = ETH_SPEED_NUM_10G,
+	.link_duplex = ETH_LINK_FULL_DUPLEX,
+	.link_status = ETH_LINK_DOWN,
+	.link_autoneg = ETH_LINK_AUTONEG
+};
+
+static inline int
+reserve_fill_queue(struct xsk_umem_info *umem, int reserve_size)
+{
+	struct xsk_ring_prod *fq = &umem->fq;
+	uint32_t idx;
+	void *addr = NULL;
+	int i, ret = 0;
+
+	ret = xsk_ring_prod__reserve(fq, reserve_size, &idx);
+	if (!ret) {
+		RTE_LOG(ERR, PMD, "Failed to reserve enough fq descs.\n");
+		return ret;
+	}
+
+	for (i = 0; i < reserve_size; i++) {
+		rte_ring_dequeue(umem->buf_ring, &addr);
+		*xsk_ring_prod__fill_addr(fq, idx++) = (uint64_t)addr;
+	}
+
+	xsk_ring_prod__submit(fq, reserve_size);
+
+	return 0;
+}
+
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pkt_rx_queue *rxq = queue;
+	struct xsk_ring_cons *rx = &rxq->rx;
+	struct xsk_umem_info *umem = rxq->umem;
+	struct xsk_ring_prod *fq = &umem->fq;
+	uint32_t idx_rx;
+	uint32_t free_thresh = fq->size >> 1;
+	struct rte_mbuf *mbuf;
+	unsigned long dropped = 0;
+	unsigned long rx_bytes = 0;
+	uint16_t count = 0;
+	int rcvd, i;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
+		nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
+
+	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
+	if (!rcvd)
+		return 0;
+
+	if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
+		(void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE);
+
+	for (i = 0; i < rcvd; i++) {
+		uint64_t addr = xsk_ring_cons__rx_desc(rx, idx_rx)->addr;
+		uint32_t len = xsk_ring_cons__rx_desc(rx, idx_rx++)->len;
+		char *pkt = xsk_umem__get_data(rxq->umem->buffer, addr);
+
+		mbuf = rte_pktmbuf_alloc(rxq->mb_pool);
+		if (mbuf) {
+			memcpy(rte_pktmbuf_mtod(mbuf, void*), pkt, len);
+			rte_pktmbuf_pkt_len(mbuf) =
+				rte_pktmbuf_data_len(mbuf) = len;
+			rx_bytes += len;
+			bufs[count++] = mbuf;
+		} else {
+			dropped++;
+		}
+		rte_ring_enqueue(umem->buf_ring, (void *)addr);
+	}
+
+	xsk_ring_cons__release(rx, rcvd);
+
+	/* statistics */
+	rxq->rx_pkts += (rcvd - dropped);
+	rxq->rx_bytes += rx_bytes;
+	rxq->rx_dropped += dropped;
+
+	return count;
+}
+
+static void pull_umem_cq(struct xsk_umem_info *umem, int size)
+{
+	struct xsk_ring_cons *cq = &umem->cq;
+	int i, n;
+	uint32_t idx_cq;
+	uint64_t addr;
+
+	n = xsk_ring_cons__peek(cq, size, &idx_cq);
+	if (n > 0) {
+		for (i = 0; i < n; i++) {
+			addr = *xsk_ring_cons__comp_addr(cq,
+							 idx_cq++);
+			rte_ring_enqueue(umem->buf_ring, (void *)addr);
+		}
+
+		xsk_ring_cons__release(cq, n);
+	}
+}
+
+static void kick_tx(struct pkt_tx_queue *txq)
+{
+	struct xsk_umem_info *umem = txq->pair->umem;
+	int ret;
+
+	while (1) {
+		ret = sendto(xsk_socket__fd(txq->pair->xsk), NULL, 0,
+			     MSG_DONTWAIT, NULL, 0);
+
+		/* everything is ok */
+		if (ret >= 0)
+			break;
+
+		/* some thing unexpected */
+		if (errno != EBUSY && errno != EAGAIN)
+			break;
+
+		/* pull from complete qeueu to leave more space */
+		if (errno == EAGAIN)
+			pull_umem_cq(umem, ETH_AF_XDP_TX_BATCH_SIZE);
+	}
+}
+
+static uint16_t
+eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pkt_tx_queue *txq = queue;
+	struct xsk_umem_info *umem = txq->pair->umem;
+	struct rte_mbuf *mbuf;
+	void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
+	unsigned long tx_bytes = 0;
+	int i, valid = 0;
+	uint32_t idx_tx;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
+		nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
+
+	pull_umem_cq(umem, nb_pkts);
+
+	nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
+					nb_pkts, NULL);
+	if (!nb_pkts)
+		return 0;
+
+	if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx)
+			!= nb_pkts)
+		return 0;
+
+	for (i = 0; i < nb_pkts; i++) {
+		struct xdp_desc *desc;
+		char *pkt;
+		unsigned int buf_len = ETH_AF_XDP_FRAME_SIZE
+					- ETH_AF_XDP_DATA_HEADROOM;
+		desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
+		mbuf = bufs[i];
+		if (mbuf->pkt_len <= buf_len) {
+			desc->addr = (uint64_t)addrs[valid];
+			desc->len = mbuf->pkt_len;
+			pkt = xsk_umem__get_data(umem->buffer,
+						 desc->addr);
+			memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
+			       desc->len);
+			valid++;
+			tx_bytes += mbuf->pkt_len;
+		}
+		rte_pktmbuf_free(mbuf);
+	}
+
+	xsk_ring_prod__submit(&txq->tx, nb_pkts);
+
+	kick_tx(txq);
+
+	if (valid < nb_pkts)
+		rte_ring_enqueue_bulk(umem->buf_ring, &addrs[valid],
+				 nb_pkts - valid, NULL);
+
+	txq->err_pkts += nb_pkts - valid;
+	txq->tx_pkts += valid;
+	txq->tx_bytes += tx_bytes;
+
+	return nb_pkts;
+}
+
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+	dev->data->dev_link.link_status = ETH_LINK_UP;
+
+	return 0;
+}
+
+/* This function gets called when the current port gets stopped. */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+	dev->data->dev_link.link_status = ETH_LINK_DOWN;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+	/* rx/tx must be paired */
+	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+
+	dev_info->if_index = internals->if_index;
+	dev_info->max_mac_addrs = 1;
+	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
+	dev_info->max_rx_queues = 1;
+	dev_info->max_tx_queues = 1;
+	dev_info->min_rx_bufsize = 0;
+
+	dev_info->default_rxportconf.nb_queues = 1;
+	dev_info->default_txportconf.nb_queues = 1;
+	dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
+	dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
+}
+
+static int
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct xdp_statistics xdp_stats;
+	struct pkt_rx_queue *rxq;
+	socklen_t optlen;
+	int i;
+
+	optlen = sizeof(struct xdp_statistics);
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		rxq = &internals->rx_queues[i];
+		stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
+		stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
+
+		stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
+		stats->q_errors[i] = internals->tx_queues[i].err_pkts;
+		stats->q_obytes[i] = internals->tx_queues[i].tx_bytes;
+
+		stats->ipackets += stats->q_ipackets[i];
+		stats->ibytes += stats->q_ibytes[i];
+		stats->imissed += internals->rx_queues[i].rx_dropped;
+		getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP, XDP_STATISTICS,
+				&xdp_stats, &optlen);
+		stats->imissed += xdp_stats.rx_dropped;
+
+		stats->opackets += stats->q_opackets[i];
+		stats->oerrors += stats->q_errors[i];
+		stats->obytes += stats->q_obytes[i];
+	}
+
+	return 0;
+}
+
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	int i;
+
+	for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
+		internals->rx_queues[i].rx_pkts = 0;
+		internals->rx_queues[i].rx_bytes = 0;
+		internals->rx_queues[i].rx_dropped = 0;
+
+		internals->tx_queues[i].tx_pkts = 0;
+		internals->tx_queues[i].err_pkts = 0;
+		internals->tx_queues[i].tx_bytes = 0;
+	}
+}
+
+static void remove_xdp_program(struct pmd_internals *internals)
+{
+	uint32_t curr_prog_id = 0;
+
+	if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
+				XDP_FLAGS_UPDATE_IF_NOEXIST)) {
+		RTE_LOG(ERR, PMD, "bpf_get_link_xdp_id failed\n");
+		return;
+	}
+	bpf_set_link_xdp_fd(internals->if_index, -1,
+			XDP_FLAGS_UPDATE_IF_NOEXIST);
+}
+
+static void
+eth_dev_close(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct pkt_rx_queue *rxq;
+	int i;
+
+	RTE_LOG(INFO, PMD, "Closing AF_XDP ethdev on numa socket %u\n",
+		rte_socket_id());
+
+	for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
+		rxq = &internals->rx_queues[i];
+		if (!rxq->umem)
+			break;
+		xsk_socket__delete(rxq->xsk);
+	}
+
+	(void)xsk_umem__delete(internals->umem->umem);
+	remove_xdp_program(internals);
+}
+
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+		int wait_to_complete __rte_unused)
+{
+	return 0;
+}
+
+static void xdp_umem_destroy(struct xsk_umem_info *umem)
+{
+	if (umem->buffer)
+		free(umem->buffer);
+
+	free(umem);
+}
+
+static struct xsk_umem_info *xdp_umem_configure(void)
+{
+	struct xsk_umem_info *umem;
+	struct xsk_umem_config usr_config = {
+		.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
+		.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
+		.frame_size = ETH_AF_XDP_FRAME_SIZE,
+		.frame_headroom = ETH_AF_XDP_DATA_HEADROOM };
+	void *bufs = NULL;
+	char ring_name[0x100];
+	int ret;
+	uint64_t i;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem) {
+		RTE_LOG(ERR, PMD, "Failed to allocate umem info");
+		return NULL;
+	}
+
+	snprintf(ring_name, 0x100, "af_xdp_ring");
+	umem->buf_ring = rte_ring_create(ring_name,
+					 ETH_AF_XDP_NUM_BUFFERS,
+					 SOCKET_ID_ANY,
+					 0x0);
+	if (!umem->buf_ring) {
+		RTE_LOG(ERR, PMD,
+			"Failed to create rte_ring\n");
+		goto err;
+	}
+
+	for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
+		rte_ring_enqueue(umem->buf_ring,
+				 (void *)(i * ETH_AF_XDP_FRAME_SIZE +
+					  ETH_AF_XDP_DATA_HEADROOM));
+
+	if (posix_memalign(&bufs, getpagesize(),
+			   ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE)) {
+		RTE_LOG(ERR, PMD, "Failed to allocate memory pool.\n");
+		goto err;
+	}
+	ret = xsk_umem__create(&umem->umem, bufs,
+			       ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
+			       &umem->fq, &umem->cq,
+			       &usr_config);
+
+	if (ret) {
+		RTE_LOG(ERR, PMD, "Failed to create umem");
+		goto err;
+	}
+	umem->buffer = bufs;
+
+	return umem;
+
+err:
+	xdp_umem_destroy(umem);
+	return NULL;
+}
+
+static int
+xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
+	      int ring_size)
+{
+	struct xsk_socket_config cfg;
+	struct pkt_tx_queue *txq = rxq->pair;
+	int ret = 0;
+	int reserve_size;
+
+	rxq->umem = xdp_umem_configure();
+	if (!rxq->umem) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	cfg.rx_size = ring_size;
+	cfg.tx_size = ring_size;
+	cfg.libbpf_flags = 0;
+	cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+	cfg.bind_flags = 0;
+	ret = xsk_socket__create(&rxq->xsk, internals->if_name,
+			internals->queue_idx, rxq->umem->umem, &rxq->rx,
+			&txq->tx, &cfg);
+	if (ret) {
+		RTE_LOG(ERR, PMD, "Failed to create xsk socket.\n");
+		goto err;
+	}
+
+	reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS / 2;
+	ret = reserve_fill_queue(rxq->umem, reserve_size);
+	if (ret) {
+		RTE_LOG(ERR, PMD, "Failed to reserve fill queue.\n");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	xdp_umem_destroy(rxq->umem);
+
+	return ret;
+}
+
+static void
+queue_reset(struct pmd_internals *internals, uint16_t queue_idx)
+{
+	struct pkt_rx_queue *rxq = &internals->rx_queues[queue_idx];
+	struct pkt_tx_queue *txq = rxq->pair;
+	int xsk_fd = xsk_socket__fd(rxq->xsk);
+
+	if (xsk_fd) {
+		close(xsk_fd);
+		if (internals->umem) {
+			xdp_umem_destroy(internals->umem);
+			internals->umem = NULL;
+		}
+	}
+	memset(rxq, 0, sizeof(*rxq));
+	memset(txq, 0, sizeof(*txq));
+	rxq->pair = txq;
+	txq->pair = rxq;
+	rxq->queue_idx = queue_idx;
+	txq->queue_idx = queue_idx;
+}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t rx_queue_id,
+		   uint16_t nb_rx_desc,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_rxconf *rx_conf __rte_unused,
+		   struct rte_mempool *mb_pool)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	unsigned int buf_size, data_size;
+	struct pkt_rx_queue *rxq;
+	int ret = 0;
+
+	if (mb_pool == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Invalid mb_pool\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (dev->data->nb_rx_queues <= rx_queue_id) {
+		RTE_LOG(ERR, PMD,
+			"Invalid rx queue id: %d\n", rx_queue_id);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	rxq = &internals->rx_queues[rx_queue_id];
+	queue_reset(internals, rx_queue_id);
+
+	/* Now get the space available for data in the mbuf */
+	buf_size = rte_pktmbuf_data_room_size(mb_pool) -
+		RTE_PKTMBUF_HEADROOM;
+	data_size = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
+
+	if (data_size > buf_size) {
+		RTE_LOG(ERR, PMD,
+			"%s: %d bytes will not fit in mbuf (%d bytes)\n",
+			dev->device->name, data_size, buf_size);
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	rxq->mb_pool = mb_pool;
+
+	if (xsk_configure(internals, rxq, nb_rx_desc)) {
+		RTE_LOG(ERR, PMD,
+			"Failed to configure xdp socket\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	internals->umem = rxq->umem;
+
+	dev->data->rx_queues[rx_queue_id] = rxq;
+	return 0;
+
+err:
+	queue_reset(internals, rx_queue_id);
+	return ret;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t tx_queue_id,
+		   uint16_t nb_tx_desc,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct pkt_tx_queue *txq;
+
+	if (dev->data->nb_tx_queues <= tx_queue_id) {
+		RTE_LOG(ERR, PMD, "Invalid tx queue id: %d\n", tx_queue_id);
+		return -EINVAL;
+	}
+
+	RTE_LOG(WARNING, PMD, "tx queue setup size=%d will be skipped\n",
+		nb_tx_desc);
+	txq = &internals->tx_queues[tx_queue_id];
+
+	dev->data->tx_queues[tx_queue_id] = txq;
+	return 0;
+}
+
+static int
+eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	struct ifreq ifr = { .ifr_mtu = mtu };
+	int ret;
+	int s;
+
+	s = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s < 0)
+		return -EINVAL;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", internals->if_name);
+	ret = ioctl(s, SIOCSIFMTU, &ifr);
+	close(s);
+
+	if (ret < 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void
+eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
+{
+	struct ifreq ifr;
+	int s;
+
+	s = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s < 0)
+		return;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0)
+		goto out;
+	ifr.ifr_flags &= mask;
+	ifr.ifr_flags |= flags;
+	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0)
+		goto out;
+out:
+	close(s);
+}
+
+static void
+eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+
+	eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
+}
+
+static void
+eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+
+	eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
+}
+
+static const struct eth_dev_ops ops = {
+	.dev_start = eth_dev_start,
+	.dev_stop = eth_dev_stop,
+	.dev_close = eth_dev_close,
+	.dev_configure = eth_dev_configure,
+	.dev_infos_get = eth_dev_info,
+	.mtu_set = eth_dev_mtu_set,
+	.promiscuous_enable = eth_dev_promiscuous_enable,
+	.promiscuous_disable = eth_dev_promiscuous_disable,
+	.rx_queue_setup = eth_rx_queue_setup,
+	.tx_queue_setup = eth_tx_queue_setup,
+	.rx_queue_release = eth_queue_release,
+	.tx_queue_release = eth_queue_release,
+	.link_update = eth_link_update,
+	.stats_get = eth_stats_get,
+	.stats_reset = eth_stats_reset,
+};
+
+static struct rte_vdev_driver pmd_af_xdp_drv;
+
+static void
+parse_parameters(struct rte_kvargs *kvlist,
+		 char **if_name,
+		 int *queue_idx)
+{
+	struct rte_kvargs_pair *pair = NULL;
+	unsigned int k_idx;
+
+	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
+		pair = &kvlist->pairs[k_idx];
+		if (strstr(pair->key, ETH_AF_XDP_IFACE_ARG))
+			*if_name = pair->value;
+		else if (strstr(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG))
+			*queue_idx = atoi(pair->value);
+	}
+}
+
+static int
+get_iface_info(const char *if_name,
+	       struct ether_addr *eth_addr,
+	       int *if_index)
+{
+	struct ifreq ifr;
+	int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+
+	if (sock < 0)
+		return -1;
+
+	strcpy(ifr.ifr_name, if_name);
+	if (ioctl(sock, SIOCGIFINDEX, &ifr))
+		goto error;
+
+	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
+		goto error;
+
+	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, 6);
+
+	close(sock);
+	*if_index = if_nametoindex(if_name);
+	return 0;
+
+error:
+	close(sock);
+	return -1;
+}
+
+static int
+init_internals(struct rte_vdev_device *dev,
+	       const char *if_name,
+	       int queue_idx)
+{
+	const char *name = rte_vdev_device_name(dev);
+	struct rte_eth_dev *eth_dev = NULL;
+	const unsigned int numa_node = dev->device.numa_node;
+	struct pmd_internals *internals = NULL;
+	int ret;
+	int i;
+
+	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+	if (!internals)
+		return -ENOMEM;
+
+	internals->queue_idx = queue_idx;
+	strcpy(internals->if_name, if_name);
+
+	for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
+		internals->tx_queues[i].pair = &internals->rx_queues[i];
+		internals->rx_queues[i].pair = &internals->tx_queues[i];
+	}
+
+	ret = get_iface_info(if_name, &internals->eth_addr,
+			     &internals->if_index);
+	if (ret)
+		goto err;
+
+	eth_dev = rte_eth_vdev_allocate(dev, 0);
+	if (!eth_dev)
+		goto err;
+
+	eth_dev->data->dev_private = internals;
+	eth_dev->data->dev_link = pmd_link;
+	eth_dev->data->mac_addrs = &internals->eth_addr;
+	eth_dev->dev_ops = &ops;
+	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
+	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
+
+	rte_eth_dev_probing_finish(eth_dev);
+	return 0;
+
+err:
+	rte_free(internals);
+	return -1;
+}
+
+static int
+rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
+{
+	struct rte_kvargs *kvlist;
+	char *if_name = NULL;
+	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
+	struct rte_eth_dev *eth_dev;
+	const char *name;
+	int ret;
+
+	RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
+		rte_vdev_device_name(dev));
+
+	name = rte_vdev_device_name(dev);
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
+		strlen(rte_vdev_device_args(dev)) == 0) {
+		eth_dev = rte_eth_dev_attach_secondary(name);
+		if (!eth_dev) {
+			RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
+			return -EINVAL;
+		}
+		eth_dev->dev_ops = &ops;
+		rte_eth_dev_probing_finish(eth_dev);
+	}
+
+	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+	if (!kvlist) {
+		RTE_LOG(ERR, PMD,
+			"Invalid kvargs\n");
+		return -EINVAL;
+	}
+
+	if (dev->device.numa_node == SOCKET_ID_ANY)
+		dev->device.numa_node = rte_socket_id();
+
+	parse_parameters(kvlist, &if_name,
+			 &queue_idx);
+
+	ret = init_internals(dev, if_name, queue_idx);
+
+	rte_kvargs_free(kvlist);
+
+	return ret;
+}
+
+static int
+rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct pmd_internals *internals;
+
+	RTE_LOG(INFO, PMD, "Removing AF_XDP ethdev on numa socket %u\n",
+		rte_socket_id());
+
+	if (!dev)
+		return -1;
+
+	/* find the ethdev entry */
+	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
+	if (!eth_dev)
+		return -1;
+
+	internals = eth_dev->data->dev_private;
+
+	rte_ring_free(internals->umem->buf_ring);
+	rte_free(internals->umem->buffer);
+	rte_free(internals->umem);
+	rte_free(internals);
+
+	rte_eth_dev_release_port(eth_dev);
+
+
+	return 0;
+}
+
+static struct rte_vdev_driver pmd_af_xdp_drv = {
+	.probe = rte_pmd_af_xdp_probe,
+	.remove = rte_pmd_af_xdp_remove,
+};
+
+RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
+RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
+RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
+			      "iface=<string> "
+			      "queue=<int> ");
diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
new file mode 100644
index 000000000..ef3539840
--- /dev/null
+++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
@@ -0,0 +1,4 @@ 
+DPDK_2.0 {
+
+	local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index d0ab942d5..db3271c7b 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -143,6 +143,7 @@  _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL)  += -lrte_mempool_dpaa2
 endif
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp -lelf -lbpf
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD)        += -lrte_pmd_ark
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ATLANTIC_PMD)   += -lrte_pmd_atlantic
 _LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD)        += -lrte_pmd_avf