[dpdk-dev,RFC,v2] vhost: Add VHOST PMD

Message ID 1440993326-21205-2-git-send-email-mukawa@igel.co.jp (mailing list archive)
State Changes Requested, archived
Headers

Commit Message

Tetsuya Mukawa Aug. 31, 2015, 3:55 a.m. UTC
  The patch introduces a new PMD. This PMD is implemented as thin wrapper
of librte_vhost. It means librte_vhost is also needed to compile the PMD.
The PMD can have 'iface' parameter like below to specify a path to connect
to a virtio-net device.

$ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i

To connect above testpmd, here is qemu command example.

$ qemu-system-x86_64 \
        <snip>
        -chardev socket,id=chr0,path=/tmp/sock0 \
        -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
        -device virtio-net-pci,netdev=net0

Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
---
 config/common_linuxapp                      |   6 +
 drivers/net/Makefile                        |   4 +
 drivers/net/vhost/Makefile                  |  61 +++
 drivers/net/vhost/rte_eth_vhost.c           | 640 ++++++++++++++++++++++++++++
 drivers/net/vhost/rte_pmd_vhost_version.map |   4 +
 mk/rte.app.mk                               |   8 +-
 6 files changed, 722 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/vhost/Makefile
 create mode 100644 drivers/net/vhost/rte_eth_vhost.c
 create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
  

Comments

Loftus, Ciara Sept. 23, 2015, 5:47 p.m. UTC | #1
> The patch introduces a new PMD. This PMD is implemented as thin wrapper
> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
> The PMD can have 'iface' parameter like below to specify a path to connect
> to a virtio-net device.
> 
> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
> 
> To connect above testpmd, here is qemu command example.
> 
> $ qemu-system-x86_64 \
>         <snip>
>         -chardev socket,id=chr0,path=/tmp/sock0 \
>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>         -device virtio-net-pci,netdev=net0
> 
> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> ---
>  config/common_linuxapp                      |   6 +
>  drivers/net/Makefile                        |   4 +
>  drivers/net/vhost/Makefile                  |  61 +++
>  drivers/net/vhost/rte_eth_vhost.c           | 640
> ++++++++++++++++++++++++++++
>  drivers/net/vhost/rte_pmd_vhost_version.map |   4 +
>  mk/rte.app.mk                               |   8 +-
>  6 files changed, 722 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/net/vhost/Makefile
>  create mode 100644 drivers/net/vhost/rte_eth_vhost.c
>  create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
> 
> diff --git a/config/common_linuxapp b/config/common_linuxapp
> index 0de43d5..7310240 100644
> --- a/config/common_linuxapp
> +++ b/config/common_linuxapp
> @@ -446,6 +446,12 @@ CONFIG_RTE_LIBRTE_VHOST_NUMA=n
>  CONFIG_RTE_LIBRTE_VHOST_DEBUG=n
> 
>  #
> +# Compile vhost PMD
> +# To compile, CONFIG_RTE_LIBRTE_VHOST should be enabled.
> +#
> +CONFIG_RTE_LIBRTE_PMD_VHOST=y
> +
> +#
>  #Compile Xen domain0 support
>  #
>  CONFIG_RTE_LIBRTE_XEN_DOM0=n
> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index 5ebf963..e46a38e 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -49,5 +49,9 @@ DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
>  DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
>  DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
> 
> +ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
> +endif # $(CONFIG_RTE_LIBRTE_VHOST)
> +
>  include $(RTE_SDK)/mk/rte.sharelib.mk
>  include $(RTE_SDK)/mk/rte.subdir.mk
> diff --git a/drivers/net/vhost/Makefile b/drivers/net/vhost/Makefile
> new file mode 100644
> index 0000000..018edde
> --- /dev/null
> +++ b/drivers/net/vhost/Makefile
> @@ -0,0 +1,61 @@
> +#   BSD LICENSE
> +#
> +#   Copyright (c) 2010-2015 Intel Corporation.
> +#   All rights reserved.
> +#
> +#   Redistribution and use in source and binary forms, with or without
> +#   modification, are permitted provided that the following conditions
> +#   are met:
> +#
> +#     * Redistributions of source code must retain the above copyright
> +#       notice, this list of conditions and the following disclaimer.
> +#     * Redistributions in binary form must reproduce the above copyright
> +#       notice, this list of conditions and the following disclaimer in
> +#       the documentation and/or other materials provided with the
> +#       distribution.
> +#     * Neither the name of Intel corporation nor the names of its
> +#       contributors may be used to endorse or promote products derived
> +#       from this software without specific prior written permission.
> +#
> +#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> +#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> +#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> +#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> +#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> +#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> +#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> +#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> +#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> +#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> +#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +#
> +# library name
> +#
> +LIB = librte_pmd_vhost.a
> +
> +CFLAGS += -O3
> +CFLAGS += $(WERROR_FLAGS)
> +
> +EXPORT_MAP := rte_pmd_vhost_version.map
> +
> +LIBABIVER := 1
> +
> +#
> +# all source are stored in SRCS-y
> +#
> +SRCS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += rte_eth_vhost.c
> +
> +#
> +# Export include files
> +#
> +SYMLINK-y-include +=
> +
> +# this lib depends upon:
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_mbuf
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_ether
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_kvargs
> +
> +include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/drivers/net/vhost/rte_eth_vhost.c
> b/drivers/net/vhost/rte_eth_vhost.c
> new file mode 100644
> index 0000000..679e893
> --- /dev/null
> +++ b/drivers/net/vhost/rte_eth_vhost.c
> @@ -0,0 +1,640 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright (c) 2010-2015 Intel Corporation.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +#include <unistd.h>
> +#include <pthread.h>
> +
> +#include <rte_mbuf.h>
> +#include <rte_ethdev.h>
> +#include <rte_malloc.h>
> +#include <rte_memcpy.h>
> +#include <rte_dev.h>
> +#include <rte_kvargs.h>
> +#include <rte_virtio_net.h>
> +
> +#define ETH_VHOST_IFACE_ARG		"iface"
> +
> +static const char *drivername = "VHOST PMD";
> +
> +static const char *valid_arguments[] = {
> +	ETH_VHOST_IFACE_ARG,
> +	NULL
> +};
> +
> +static struct ether_addr base_eth_addr = {
> +	.addr_bytes = {
> +		0x56 /* V */,
> +		0x48 /* H */,
> +		0x4F /* O */,
> +		0x53 /* S */,
> +		0x54 /* T */,
> +		0x00
> +	}
> +};
> +
> +struct vhost_queue {
> +	struct virtio_net *device;
> +	struct pmd_internal *internal;
> +	struct rte_mempool *mb_pool;
> +	rte_atomic64_t rx_pkts;
> +	rte_atomic64_t tx_pkts;
> +	rte_atomic64_t err_pkts;
> +	rte_atomic16_t rx_executing;
> +	rte_atomic16_t tx_executing;
> +};
> +
> +struct pmd_internal {
> +	TAILQ_ENTRY(pmd_internal) next;
> +	char *dev_name;
> +	char *iface_name;
> +	unsigned nb_rx_queues;
> +	unsigned nb_tx_queues;
> +	rte_atomic16_t xfer;
Is this flag just used to indicate the state of the virtio_net device?
I.e., if =0 then virtio_net == NULL, and if =1 then virtio_net != NULL and the VIRTIO_DEV_RUNNING flag is set?

> +
> +	struct vhost_queue
> rx_vhost_queues[RTE_PMD_RING_MAX_RX_RINGS];
> +	struct vhost_queue
> tx_vhost_queues[RTE_PMD_RING_MAX_TX_RINGS];
> +};
> +
> +TAILQ_HEAD(pmd_internal_head, pmd_internal);
> +static struct pmd_internal_head internals_list =
> +	TAILQ_HEAD_INITIALIZER(internals_list);
> +
> +static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
> +
> +static struct rte_eth_link pmd_link = {
> +		.link_speed = 10000,
> +		.link_duplex = ETH_LINK_FULL_DUPLEX,
> +		.link_status = 0
> +};
> +
> +static uint16_t
> +eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
> +{
> +	struct vhost_queue *r = q;
> +	uint16_t nb_rx = 0;
> +
> +	if (unlikely(r->internal == NULL))
> +		return 0;
> +
> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
> +		return 0;
> +
> +	rte_atomic16_set(&r->rx_executing, 1);
> +
> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
> +		goto out;
> +
> +	nb_rx = (uint16_t)rte_vhost_dequeue_burst(r->device,
> +			VIRTIO_TXQ, r->mb_pool, bufs, nb_bufs);
> +
> +	rte_atomic64_add(&(r->rx_pkts), nb_rx);
> +
> +out:
> +	rte_atomic16_set(&r->rx_executing, 0);
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
> +{
> +	struct vhost_queue *r = q;
> +	uint16_t i, nb_tx = 0;
> +
> +	if (unlikely(r->internal == NULL))
> +		return 0;
> +
> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
> +		return 0;
> +
> +	rte_atomic16_set(&r->tx_executing, 1);
> +
> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
> +		goto out;
> +
> +	nb_tx = (uint16_t)rte_vhost_enqueue_burst(r->device,
> +			VIRTIO_RXQ, bufs, nb_bufs);
> +
> +	rte_atomic64_add(&(r->tx_pkts), nb_tx);
> +	rte_atomic64_add(&(r->err_pkts), nb_bufs - nb_tx);
> +
> +	for (i = 0; likely(i < nb_tx); i++)
> +		rte_pktmbuf_free(bufs[i]);

We may not always want to free these mbufs. For example, if a call is made to rte_eth_tx_burst with buffers from another (non-DPDK) source, they may not be ours to free.

> +
> +out:
> +	rte_atomic16_set(&r->tx_executing, 0);
> +
> +	return nb_tx;
> +}
> +
> +static int
> +eth_dev_configure(struct rte_eth_dev *dev __rte_unused) { return 0; }
> +
> +static int
> +eth_dev_start(struct rte_eth_dev *dev)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	return rte_vhost_driver_register(internal->iface_name);
> +}
> +
> +static void
> +eth_dev_stop(struct rte_eth_dev *dev)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	rte_vhost_driver_unregister(internal->iface_name);
> +}
> +
> +static int
> +eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
> +		   uint16_t nb_rx_desc __rte_unused,
> +		   unsigned int socket_id __rte_unused,
> +		   const struct rte_eth_rxconf *rx_conf __rte_unused,
> +		   struct rte_mempool *mb_pool)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	internal->rx_vhost_queues[rx_queue_id].mb_pool = mb_pool;
> +	dev->data->rx_queues[rx_queue_id] = &internal-
> >rx_vhost_queues[rx_queue_id];
> +	return 0;
> +}
> +
> +static int
> +eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
> +		   uint16_t nb_tx_desc __rte_unused,
> +		   unsigned int socket_id __rte_unused,
> +		   const struct rte_eth_txconf *tx_conf __rte_unused)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	dev->data->tx_queues[tx_queue_id] = &internal-
> >tx_vhost_queues[tx_queue_id];
> +	return 0;
> +}
> +
> +
> +static void
> +eth_dev_info(struct rte_eth_dev *dev,
> +	     struct rte_eth_dev_info *dev_info)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	dev_info->driver_name = drivername;
> +	dev_info->max_mac_addrs = 1;
> +	dev_info->max_rx_pktlen = (uint32_t)-1;
> +	dev_info->max_rx_queues = (uint16_t)internal->nb_rx_queues;
> +	dev_info->max_tx_queues = (uint16_t)internal->nb_tx_queues;
> +	dev_info->min_rx_bufsize = 0;
> +	dev_info->pci_dev = dev->pci_dev;
> +}
> +
> +static void
> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
> +{
> +	unsigned i;
> +	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
> +	const struct pmd_internal *internal = dev->data->dev_private;
> +
> +	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
> +	     i < internal->nb_rx_queues; i++) {
> +		igb_stats->q_ipackets[i] = internal-
> >rx_vhost_queues[i].rx_pkts.cnt;
> +		rx_total += igb_stats->q_ipackets[i];
> +	}
> +
> +	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
> +	     i < internal->nb_tx_queues; i++) {
> +		igb_stats->q_opackets[i] = internal-
> >tx_vhost_queues[i].tx_pkts.cnt;
> +		igb_stats->q_errors[i] = internal-
> >tx_vhost_queues[i].err_pkts.cnt;
> +		tx_total += igb_stats->q_opackets[i];
> +		tx_err_total += igb_stats->q_errors[i];
> +	}
> +
> +	igb_stats->ipackets = rx_total;
> +	igb_stats->opackets = tx_total;
> +	igb_stats->oerrors = tx_err_total;
> +}
> +
> +static void
> +eth_stats_reset(struct rte_eth_dev *dev)
> +{
> +	unsigned i;
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	for (i = 0; i < internal->nb_rx_queues; i++)
> +		internal->rx_vhost_queues[i].rx_pkts.cnt = 0;
> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> +		internal->tx_vhost_queues[i].tx_pkts.cnt = 0;
> +		internal->tx_vhost_queues[i].err_pkts.cnt = 0;
> +	}
> +}
> +
> +static void
> +eth_queue_release(void *q __rte_unused) { ; }
> +static int
> +eth_link_update(struct rte_eth_dev *dev __rte_unused,
> +		int wait_to_complete __rte_unused) { return 0; }
> +
> +static const struct eth_dev_ops ops = {
> +	.dev_start = eth_dev_start,
> +	.dev_stop = eth_dev_stop,
> +	.dev_configure = eth_dev_configure,
> +	.dev_infos_get = eth_dev_info,
> +	.rx_queue_setup = eth_rx_queue_setup,
> +	.tx_queue_setup = eth_tx_queue_setup,
> +	.rx_queue_release = eth_queue_release,
> +	.tx_queue_release = eth_queue_release,
> +	.link_update = eth_link_update,
> +	.stats_get = eth_stats_get,
> +	.stats_reset = eth_stats_reset,
> +};
> +
> +static struct eth_driver rte_vhost_pmd = {
> +	.pci_drv = {
> +		.name = "rte_vhost_pmd",
> +		.drv_flags = RTE_PCI_DRV_DETACHABLE,
> +	},
> +};
> +
> +static struct rte_pci_id id_table;
> +
> +static inline struct pmd_internal *
> +find_internal_resource(char *ifname)
> +{
> +	int found = 0;
> +	struct pmd_internal *internal;
> +
> +	if (ifname == NULL)
> +		return NULL;
> +
> +	pthread_mutex_lock(&internal_list_lock);
> +
> +	TAILQ_FOREACH(internal, &internals_list, next) {
> +		if (!strcmp(internal->iface_name, ifname)) {
> +			found = 1;
> +			break;
> +		}
> +	}
> +
> +	pthread_mutex_unlock(&internal_list_lock);
> +
> +	if (!found)
> +		return NULL;
> +
> +	return internal;
> +}
> +
> +static int
> +new_device(struct virtio_net *dev)
> +{
> +	struct rte_eth_dev *eth_dev;
> +	struct pmd_internal *internal;
> +	struct vhost_queue *vq;
> +	unsigned i;
> +
> +	if (dev == NULL) {
> +		RTE_LOG(INFO, PMD, "invalid argument\n");
> +		return -1;
> +	}
> +
> +	internal = find_internal_resource(dev->ifname);
> +	if (internal == NULL) {
> +		RTE_LOG(INFO, PMD, "invalid device name\n");
> +		return -1;
> +	}
> +
> +	eth_dev = rte_eth_dev_allocated(internal->dev_name);
> +	if (eth_dev == NULL) {
> +		RTE_LOG(INFO, PMD, "failuer to find ethdev\n");
Typo: Failure. Same for the destroy_device function

> +		return -1;
> +	}
> +
> +	internal = eth_dev->data->dev_private;
> +
> +	for (i = 0; i < internal->nb_rx_queues; i++) {
> +		vq = &internal->rx_vhost_queues[i];
> +		vq->device = dev;
> +		vq->internal = internal;
> +	}
> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> +		vq = &internal->tx_vhost_queues[i];
> +		vq->device = dev;
> +		vq->internal = internal;
> +	}
> +
> +	dev->flags |= VIRTIO_DEV_RUNNING;
> +	dev->priv = eth_dev;
> +
> +	eth_dev->data->dev_link.link_status = 1;
> +	rte_atomic16_set(&internal->xfer, 1);
> +
> +	RTE_LOG(INFO, PMD, "New connection established\n");
> +
> +	return 0;

Some freedom is taken away if the new_device and destroy_device callbacks are implemented in the driver.
For example, if one wishes to call the rte_vhost_enable_guest_notification function when a new device is brought up, they cannot now, as there is no scope to modify these callbacks, as is done in, for example, the vHost sample app. Is this correct?

> +}
> +
> +static void
> +destroy_device(volatile struct virtio_net *dev)
> +{
> +	struct rte_eth_dev *eth_dev;
> +	struct pmd_internal *internal;
> +	struct vhost_queue *vq;
> +	unsigned i;
> +
> +	if (dev == NULL) {
> +		RTE_LOG(INFO, PMD, "invalid argument\n");
> +		return;
> +	}
> +
> +	eth_dev = (struct rte_eth_dev *)dev->priv;
> +	if (eth_dev == NULL) {
> +		RTE_LOG(INFO, PMD, "failuer to find a ethdev\n");
> +		return;
> +	}
> +
> +	internal = eth_dev->data->dev_private;
> +
> +	/* Wait until rx/tx_pkt_burst stops accesing vhost device */
> +	rte_atomic16_set(&internal->xfer, 0);
> +	for (i = 0; i < internal->nb_rx_queues; i++) {
> +		vq = &internal->rx_vhost_queues[i];
> +		while (rte_atomic16_read(&vq->rx_executing))
> +			rte_pause();
> +	}
> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> +		vq = &internal->tx_vhost_queues[i];
> +		while (rte_atomic16_read(&vq->tx_executing))
> +			rte_pause();
> +	}
> +
> +	eth_dev->data->dev_link.link_status = 0;
> +
> +	dev->priv = NULL;
> +	dev->flags &= ~VIRTIO_DEV_RUNNING;
> +
> +	for (i = 0; i < internal->nb_rx_queues; i++) {
> +		vq = &internal->rx_vhost_queues[i];
> +		vq->device = NULL;
> +	}
> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> +		vq = &internal->tx_vhost_queues[i];
> +		vq->device = NULL;
> +	}
> +
> +	RTE_LOG(INFO, PMD, "Connection closed\n");
> +}
> +
> +static void *vhost_driver_session(void *param __rte_unused)
> +{
> +	static struct virtio_net_device_ops *vhost_ops;
> +
> +	vhost_ops = rte_zmalloc(NULL, sizeof(*vhost_ops), 0);
> +	if (vhost_ops == NULL)
> +		rte_panic("Can't allocate memory\n");
> +
> +	/* set vhost arguments */
> +	vhost_ops->new_device = new_device;
> +	vhost_ops->destroy_device = destroy_device;
> +	if (rte_vhost_driver_callback_register(vhost_ops) < 0)
> +		rte_panic("Can't register callbacks\n");
> +
> +	/* start event handling */
> +	rte_vhost_driver_session_start();
> +
> +	rte_free(vhost_ops);
> +	pthread_exit(0);
> +}
> +
> +static pthread_once_t once_cont = PTHREAD_ONCE_INIT;
> +static pthread_t session_th;
> +
> +static void vhost_driver_session_start(void)
> +{
> +	int ret;
> +
> +	ret = pthread_create(&session_th, NULL, vhost_driver_session,
> NULL);
> +	if (ret)
> +		rte_panic("Can't create a thread\n");
> +}
> +
> +static int
> +eth_dev_vhost_create(const char *name, int index,
> +		     char *iface_name,
> +		     const unsigned numa_node)
> +{
> +	struct rte_eth_dev_data *data = NULL;
> +	struct rte_pci_device *pci_dev = NULL;
> +	struct pmd_internal *internal = NULL;
> +	struct rte_eth_dev *eth_dev = NULL;
> +	struct ether_addr *eth_addr = NULL;
> +	uint16_t nb_rx_queues = 1;
> +	uint16_t nb_tx_queues = 1;
> +
> +	RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa
> socket %u\n",
> +		numa_node);
> +
> +	/* now do all data allocation - for eth_dev structure, dummy pci
> driver
> +	 * and internal (private) data
> +	 */
> +	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
> +	if (data == NULL)
> +		goto error;
> +
> +	pci_dev = rte_zmalloc_socket(name, sizeof(*pci_dev), 0,
> numa_node);
> +	if (pci_dev == NULL)
> +		goto error;
> +
> +	internal = rte_zmalloc_socket(name, sizeof(*internal), 0,
> numa_node);
> +	if (internal == NULL)
> +		goto error;
> +
> +	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0,
> numa_node);
> +	if (eth_addr == NULL)
> +		goto error;
> +	*eth_addr = base_eth_addr;
> +	eth_addr->addr_bytes[5] = index;
> +
> +	/* reserve an ethdev entry */
> +	eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL);
> +	if (eth_dev == NULL)
> +		goto error;
> +
> +	/* now put it all together
> +	 * - store queue data in internal,
> +	 * - store numa_node info in pci_driver
> +	 * - point eth_dev_data to internal and pci_driver
> +	 * - and point eth_dev structure to new eth_dev_data structure
> +	 */
> +	internal->nb_rx_queues = nb_rx_queues;
> +	internal->nb_tx_queues = nb_tx_queues;
> +	internal->dev_name = strdup(name);
> +	if (internal->dev_name == NULL)
> +		goto error;
> +	internal->iface_name = strdup(iface_name);
> +	if (internal->iface_name == NULL)
> +		goto error;
> +
> +	pthread_mutex_lock(&internal_list_lock);
> +	TAILQ_INSERT_TAIL(&internals_list, internal, next);
> +	pthread_mutex_unlock(&internal_list_lock);
> +
> +	rte_vhost_pmd.pci_drv.name = drivername;
> +	rte_vhost_pmd.pci_drv.id_table = &id_table;
> +
> +	pci_dev->numa_node = numa_node;
> +	pci_dev->driver = &rte_vhost_pmd.pci_drv;
> +
> +	data->dev_private = internal;
> +	data->port_id = eth_dev->data->port_id;
> +	memmove(data->name, eth_dev->data->name, sizeof(data-
> >name));
> +	data->nb_rx_queues = (uint16_t)nb_rx_queues;
> +	data->nb_tx_queues = (uint16_t)nb_tx_queues;
> +	data->dev_link = pmd_link;
> +	data->mac_addrs = eth_addr;
> +
> +	/* We'll replace the 'data' originally allocated by eth_dev. So the
> +	 * vhost PMD resources won't be shared between multi processes.
> +	 */
> +	eth_dev->data = data;
> +	eth_dev->driver = &rte_vhost_pmd;
> +	eth_dev->dev_ops = &ops;
> +	eth_dev->pci_dev = pci_dev;
> +
> +	/* finally assign rx and tx ops */
> +	eth_dev->rx_pkt_burst = eth_vhost_rx;
> +	eth_dev->tx_pkt_burst = eth_vhost_tx;
> +
> +	/* start vhost driver session. It should be called only once */
> +	pthread_once(&once_cont, vhost_driver_session_start);
> +
> +	return data->port_id;
> +
> +error:
> +	rte_free(data);
> +	rte_free(pci_dev);
> +	rte_free(internal);
> +	rte_free(eth_addr);
> +
> +	return -1;
> +}
> +
> +static inline int
> +open_iface(const char *key __rte_unused, const char *value, void
> *extra_args)
> +{
> +	const char **iface_name = extra_args;
> +
> +	if (value == NULL)
> +		return -1;
> +
> +	*iface_name = value;
> +
> +	return 0;
> +}
> +
> +static int
> +rte_pmd_vhost_devinit(const char *name, const char *params)
> +{
> +	struct rte_kvargs *kvlist = NULL;
> +	int ret = 0;
> +	int index;
> +	char *iface_name;
> +
> +	RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name);
> +
> +	kvlist = rte_kvargs_parse(params, valid_arguments);
> +	if (kvlist == NULL)
> +		return -1;
> +
> +	if (strlen(name) < strlen("eth_vhost"))
> +		return -1;
> +
> +	index = strtol(name + strlen("eth_vhost"), NULL, 0);
> +	if (errno == ERANGE)
> +		return -1;
> +
> +	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
> +		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
> +					 &open_iface, &iface_name);
> +		if (ret < 0)
> +			goto out_free;
> +
> +		eth_dev_vhost_create(name, index, iface_name,
> rte_socket_id());
> +	}
> +
> +out_free:
> +	rte_kvargs_free(kvlist);
> +	return ret;
> +}
> +
> +static int
> +rte_pmd_vhost_devuninit(const char *name)
> +{
> +	struct rte_eth_dev *eth_dev = NULL;
> +	struct pmd_internal *internal;
> +
> +	RTE_LOG(INFO, PMD, "Un-Initializing pmd_vhost for %s\n", name);
> +
> +	if (name == NULL)
> +		return -EINVAL;
> +
> +	/* find an ethdev entry */
> +	eth_dev = rte_eth_dev_allocated(name);
> +	if (eth_dev == NULL)
> +		return -ENODEV;
> +
> +	internal = eth_dev->data->dev_private;
> +
> +	pthread_mutex_lock(&internal_list_lock);
> +	TAILQ_REMOVE(&internals_list, internal, next);
> +	pthread_mutex_unlock(&internal_list_lock);
> +
> +	eth_dev_stop(eth_dev);
> +
> +	if ((internal) && (internal->dev_name))
> +		free(internal->dev_name);
> +	if ((internal) && (internal->iface_name))
> +		free(internal->iface_name);
> +	rte_free(eth_dev->data->dev_private);
> +	rte_free(eth_dev->data);
> +	rte_free(eth_dev->pci_dev);
> +
> +	rte_eth_dev_release_port(eth_dev);
> +	return 0;
> +}
> +
> +static struct rte_driver pmd_vhost_drv = {
> +	.name = "eth_vhost",
> +	.type = PMD_VDEV,
> +	.init = rte_pmd_vhost_devinit,
> +	.uninit = rte_pmd_vhost_devuninit,
> +};
> +
> +PMD_REGISTER_DRIVER(pmd_vhost_drv);
> diff --git a/drivers/net/vhost/rte_pmd_vhost_version.map
> b/drivers/net/vhost/rte_pmd_vhost_version.map
> new file mode 100644
> index 0000000..5151684
> --- /dev/null
> +++ b/drivers/net/vhost/rte_pmd_vhost_version.map
> @@ -0,0 +1,4 @@
> +DPDK_2.2 {
> +
> +	local: *;
> +};
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 3871205..1c42fb1 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -144,7 +144,13 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_PCAP)       += -
> lrte_pmd_pcap
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -
> lrte_pmd_af_packet
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL)       += -lrte_pmd_null
> 
> -endif # ! $(CONFIG_RTE_BUILD_SHARED_LIB)
> +ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
> +
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST)      += -lrte_pmd_vhost
> +
> +endif # ! $(CONFIG_RTE_LIBRTE_VHOST)
> +
> +endif # $(CONFIG_RTE_BUILD_SHARED_LIB)
> 
>  endif # ! CONFIG_RTE_BUILD_COMBINE_LIBS
> 
> --
> 2.1.4
  
Tetsuya Mukawa Oct. 16, 2015, 8:40 a.m. UTC | #2
On 2015/09/24 2:47, Loftus, Ciara wrote:
>> The patch introduces a new PMD. This PMD is implemented as thin wrapper
>> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
>> The PMD can have 'iface' parameter like below to specify a path to connect
>> to a virtio-net device.
>>
>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>
>> To connect above testpmd, here is qemu command example.
>>
>> $ qemu-system-x86_64 \
>>         <snip>
>>         -chardev socket,id=chr0,path=/tmp/sock0 \
>>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>         -device virtio-net-pci,netdev=net0
>>
>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>> ---
>>  config/common_linuxapp                      |   6 +
>>  drivers/net/Makefile                        |   4 +
>>  drivers/net/vhost/Makefile                  |  61 +++
>>  drivers/net/vhost/rte_eth_vhost.c           | 640
>> ++++++++++++++++++++++++++++
>>  drivers/net/vhost/rte_pmd_vhost_version.map |   4 +
>>  mk/rte.app.mk                               |   8 +-
>>  6 files changed, 722 insertions(+), 1 deletion(-)
>>  create mode 100644 drivers/net/vhost/Makefile
>>  create mode 100644 drivers/net/vhost/rte_eth_vhost.c
>>  create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
>>
>> +struct pmd_internal {
>> +	TAILQ_ENTRY(pmd_internal) next;
>> +	char *dev_name;
>> +	char *iface_name;
>> +	unsigned nb_rx_queues;
>> +	unsigned nb_tx_queues;
>> +	rte_atomic16_t xfer;
> Is this flag just used to indicate the state of the virtio_net device?
> Ie. if =0 then virtio_dev=NULL and if =1 then virtio_net !=NULL & the VIRTIO_DEV_RUNNING flag is set?

Hi Ciara,

I am sorry for the very late reply.

Yes, it is. Probably we can optimize it more.
I will change this implementation a bit in next patches.
Could you please check it?

>> +
>> +static uint16_t
>> +eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
>> +{
>> +	struct vhost_queue *r = q;
>> +	uint16_t i, nb_tx = 0;
>> +
>> +	if (unlikely(r->internal == NULL))
>> +		return 0;
>> +
>> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
>> +		return 0;
>> +
>> +	rte_atomic16_set(&r->tx_executing, 1);
>> +
>> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
>> +		goto out;
>> +
>> +	nb_tx = (uint16_t)rte_vhost_enqueue_burst(r->device,
>> +			VIRTIO_RXQ, bufs, nb_bufs);
>> +
>> +	rte_atomic64_add(&(r->tx_pkts), nb_tx);
>> +	rte_atomic64_add(&(r->err_pkts), nb_bufs - nb_tx);
>> +
>> +	for (i = 0; likely(i < nb_tx); i++)
>> +		rte_pktmbuf_free(bufs[i]);
> We may not always want to free these mbufs. For example, if a call is made to rte_eth_tx_burst with buffers from another (non DPDK) source, they may not be ours to free.

Sorry, I am not sure what type of buffer you want to transfer.

This is a PMD that wraps librte_vhost.
And I guess other PMDs cannot handle buffers from another non DPDK source.
Should we take care such buffers?

I have also checked af_packet PMD.
It seems the tx function of af_packet PMD just frees mbuf.

>> +
>> +
>> +	eth_dev = rte_eth_dev_allocated(internal->dev_name);
>> +	if (eth_dev == NULL) {
>> +		RTE_LOG(INFO, PMD, "failuer to find ethdev\n");
> Typo: Failure. Same for the destroy_device function

Thanks, I will fix it in next patches.

>> +		return -1;
>> +	}
>> +
>> +	internal = eth_dev->data->dev_private;
>> +
>> +	for (i = 0; i < internal->nb_rx_queues; i++) {
>> +		vq = &internal->rx_vhost_queues[i];
>> +		vq->device = dev;
>> +		vq->internal = internal;
>> +	}
>> +	for (i = 0; i < internal->nb_tx_queues; i++) {
>> +		vq = &internal->tx_vhost_queues[i];
>> +		vq->device = dev;
>> +		vq->internal = internal;
>> +	}
>> +
>> +	dev->flags |= VIRTIO_DEV_RUNNING;
>> +	dev->priv = eth_dev;
>> +
>> +	eth_dev->data->dev_link.link_status = 1;
>> +	rte_atomic16_set(&internal->xfer, 1);
>> +
>> +	RTE_LOG(INFO, PMD, "New connection established\n");
>> +
>> +	return 0;
> Some freedom is taken away if the new_device and destroy_device callbacks are implemented in the driver.
> For example if one wishes to  call the rte_vhost_enable_guest_notification function when a new device is brought up. They cannot now as there is no scope to modify these callbacks, as is done in for example the vHost sample app. Is this correct?

So how about adding one more parameter to be able to choose guest
notification behavior?

ex)
./testpmd --vdev 'eth_vhost0,iface=/tmp/sock0,guest_notification=0'

In above case, all queues in this device will have VRING_USED_F_NO_NOTIFY.

Thanks,
Tetsuya
  
Bruce Richardson Oct. 16, 2015, 12:52 p.m. UTC | #3
On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
> The patch introduces a new PMD. This PMD is implemented as thin wrapper
> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
> The PMD can have 'iface' parameter like below to specify a path to connect
> to a virtio-net device.
> 
> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
> 
> To connect above testpmd, here is qemu command example.
> 
> $ qemu-system-x86_64 \
>         <snip>
>         -chardev socket,id=chr0,path=/tmp/sock0 \
>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>         -device virtio-net-pci,netdev=net0
> 
> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>

With this PMD in place, is there any need to keep the existing vhost library
around as a separate entity? Can the existing library be subsumed/converted into
a standard PMD?

/Bruce
  
Tetsuya Mukawa Oct. 19, 2015, 1:51 a.m. UTC | #4
On 2015/10/16 21:52, Bruce Richardson wrote:
> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
>> The patch introduces a new PMD. This PMD is implemented as thin wrapper
>> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
>> The PMD can have 'iface' parameter like below to specify a path to connect
>> to a virtio-net device.
>>
>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>
>> To connect above testpmd, here is qemu command example.
>>
>> $ qemu-system-x86_64 \
>>         <snip>
>>         -chardev socket,id=chr0,path=/tmp/sock0 \
>>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>         -device virtio-net-pci,netdev=net0
>>
>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> With this PMD in place, is there any need to keep the existing vhost library
> around as a separate entity? Can the existing library be subsumed/converted into
> a standard PMD?
>
> /Bruce

Hi Bruce,

I concern about whether the PMD has all features of librte_vhost,
because librte_vhost provides more features and freedom than ethdev API
provides.
In some cases, user needs to choose limited implementation without
librte_vhost.
I am going to eliminate such cases while implementing the PMD.
But I don't have strong belief that we can remove librte_vhost now.

So how about keeping current separation in next DPDK?
I guess people will try to replace librte_vhost to vhost PMD, because
apparently using ethdev APIs will be useful in many cases.
And we will get feedbacks like "vhost PMD needs to support like this usage".
(Or we will not have feedbacks, but it's also OK.)
Then, we will be able to merge librte_vhost and vhost PMD.

Thanks,
Tetsuya
  
Loftus, Ciara Oct. 19, 2015, 9:32 a.m. UTC | #5
> On 2015/10/16 21:52, Bruce Richardson wrote:
> > On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
> >> The patch introduces a new PMD. This PMD is implemented as thin
> wrapper
> >> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
> >> The PMD can have 'iface' parameter like below to specify a path to
> connect
> >> to a virtio-net device.
> >>
> >> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
> >>
> >> To connect above testpmd, here is qemu command example.
> >>
> >> $ qemu-system-x86_64 \
> >>         <snip>
> >>         -chardev socket,id=chr0,path=/tmp/sock0 \
> >>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
> >>         -device virtio-net-pci,netdev=net0
> >>
> >> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> > With this PMD in place, is there any need to keep the existing vhost library
> > around as a separate entity? Can the existing library be
> subsumed/converted into
> > a standard PMD?
> >
> > /Bruce
> 
> Hi Bruce,
> 
> I concern about whether the PMD has all features of librte_vhost,
> because librte_vhost provides more features and freedom than ethdev API
> provides.
> In some cases, user needs to choose limited implementation without
> librte_vhost.
> I am going to eliminate such cases while implementing the PMD.
> But I don't have strong belief that we can remove librte_vhost now.
> 
> So how about keeping current separation in next DPDK?
> I guess people will try to replace librte_vhost to vhost PMD, because
> apparently using ethdev APIs will be useful in many cases.
> And we will get feedbacks like "vhost PMD needs to support like this usage".
> (Or we will not have feedbacks, but it's also OK.)
> Then, we will be able to merge librte_vhost and vhost PMD.

I agree with the above. One of the concerns I had when reviewing the patch was that the PMD removes some freedom that is available with the library. Eg. Ability to implement the new_device and destroy_device callbacks. If using the PMD you are constrained to the implementations of these in the PMD driver, but if using librte_vhost, you can implement your own with whatever functionality you like - a good example of this can be seen in the vhost sample app.
On the other hand, the PMD is useful in that it removes a lot of complexity for the user and may work for some more general use cases. So I would be in favour of having both options available too.

Ciara

> 
> Thanks,
> Tetsuya
  
Bruce Richardson Oct. 19, 2015, 9:45 a.m. UTC | #6
On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
> > On 2015/10/16 21:52, Bruce Richardson wrote:
> > > On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
> > >> The patch introduces a new PMD. This PMD is implemented as thin
> > wrapper
> > >> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
> > >> The PMD can have 'iface' parameter like below to specify a path to
> > connect
> > >> to a virtio-net device.
> > >>
> > >> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
> > >>
> > >> To connect above testpmd, here is qemu command example.
> > >>
> > >> $ qemu-system-x86_64 \
> > >>         <snip>
> > >>         -chardev socket,id=chr0,path=/tmp/sock0 \
> > >>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
> > >>         -device virtio-net-pci,netdev=net0
> > >>
> > >> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> > > With this PMD in place, is there any need to keep the existing vhost library
> > > around as a separate entity? Can the existing library be
> > subsumed/converted into
> > > a standard PMD?
> > >
> > > /Bruce
> > 
> > Hi Bruce,
> > 
> > I concern about whether the PMD has all features of librte_vhost,
> > because librte_vhost provides more features and freedom than ethdev API
> > provides.
> > In some cases, user needs to choose limited implementation without
> > librte_vhost.
> > I am going to eliminate such cases while implementing the PMD.
> > But I don't have strong belief that we can remove librte_vhost now.
> > 
> > So how about keeping current separation in next DPDK?
> > I guess people will try to replace librte_vhost to vhost PMD, because
> > apparently using ethdev APIs will be useful in many cases.
> > And we will get feedbacks like "vhost PMD needs to support like this usage".
> > (Or we will not have feedbacks, but it's also OK.)
> > Then, we will be able to merge librte_vhost and vhost PMD.
> 
> I agree with the above. One the concerns I had when reviewing the patch was that the PMD removes some freedom that is available with the library. Eg. Ability to implement the new_device and destroy_device callbacks. If using the PMD you are constrained to the implementations of these in the PMD driver, but if using librte_vhost, you can implement your own with whatever functionality you like - a good example of this can be seen in the vhost sample app.
> On the other hand, the PMD is useful in that it removes a lot of complexity for the user and may work for some more general use cases. So I would be in favour of having both options available too.
> 
> Ciara
>

Thanks.
However, just because the libraries are merged does not mean that you need
be limited by PMD functionality. Many PMDs provide additional library-specific
functions over and above their PMD capabilities. The bonded PMD is a good example
here, as it has a whole set of extra functions to create and manipulate bonded
devices - things that are obviously not part of the general ethdev API. Other
vPMDs similarly include functions to allow them to be created on the fly too.

regards,
/Bruce
  
Tetsuya Mukawa Oct. 19, 2015, 10:50 a.m. UTC | #7
On 2015/10/19 18:45, Bruce Richardson wrote:
> On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
>>> On 2015/10/16 21:52, Bruce Richardson wrote:
>>>> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
>>>>> The patch introduces a new PMD. This PMD is implemented as thin
>>> wrapper
>>>>> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
>>>>> The PMD can have 'iface' parameter like below to specify a path to
>>> connect
>>>>> to a virtio-net device.
>>>>>
>>>>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>>>>
>>>>> To connect above testpmd, here is qemu command example.
>>>>>
>>>>> $ qemu-system-x86_64 \
>>>>>         <snip>
>>>>>         -chardev socket,id=chr0,path=/tmp/sock0 \
>>>>>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>>>>         -device virtio-net-pci,netdev=net0
>>>>>
>>>>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>>>> With this PMD in place, is there any need to keep the existing vhost library
>>>> around as a separate entity? Can the existing library be
>>> subsumed/converted into
>>>> a standard PMD?
>>>>
>>>> /Bruce
>>> Hi Bruce,
>>>
>>> I concern about whether the PMD has all features of librte_vhost,
>>> because librte_vhost provides more features and freedom than ethdev API
>>> provides.
>>> In some cases, user needs to choose limited implementation without
>>> librte_vhost.
>>> I am going to eliminate such cases while implementing the PMD.
>>> But I don't have strong belief that we can remove librte_vhost now.
>>>
>>> So how about keeping current separation in next DPDK?
>>> I guess people will try to replace librte_vhost to vhost PMD, because
>>> apparently using ethdev APIs will be useful in many cases.
>>> And we will get feedbacks like "vhost PMD needs to support like this usage".
>>> (Or we will not have feedbacks, but it's also OK.)
>>> Then, we will be able to merge librte_vhost and vhost PMD.
>> I agree with the above. One the concerns I had when reviewing the patch was that the PMD removes some freedom that is available with the library. Eg. Ability to implement the new_device and destroy_device callbacks. If using the PMD you are constrained to the implementations of these in the PMD driver, but if using librte_vhost, you can implement your own with whatever functionality you like - a good example of this can be seen in the vhost sample app.
>> On the other hand, the PMD is useful in that it removes a lot of complexity for the user and may work for some more general use cases. So I would be in favour of having both options available too.
>>
>> Ciara
>>
> Thanks.
> However, just because the libraries are merged does not mean that you need
> be limited by PMD functionality. Many PMDs provide additional library-specific
> functions over and above their PMD capabilities. The bonded PMD is a good example
> here, as it has a whole set of extra functions to create and manipulate bonded
> devices - things that are obviously not part of the general ethdev API. Other
> vPMDs similarly include functions to allow them to be created on the fly too.
>
> regards,
> /Bruce

Hi Bruce,

I appreciate for showing a good example. I haven't noticed the PMD.
I will check the bonding PMD, and try to remove librte_vhost without
losing freedom and features of the library.

Regards,
Tetsuya
  
Panu Matilainen Oct. 19, 2015, 1:26 p.m. UTC | #8
On 10/19/2015 01:50 PM, Tetsuya Mukawa wrote:
> On 2015/10/19 18:45, Bruce Richardson wrote:
>> On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
>>>> On 2015/10/16 21:52, Bruce Richardson wrote:
>>>>> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
>>>>>> The patch introduces a new PMD. This PMD is implemented as thin
>>>> wrapper
>>>>>> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
>>>>>> The PMD can have 'iface' parameter like below to specify a path to
>>>> connect
>>>>>> to a virtio-net device.
>>>>>>
>>>>>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>>>>>
>>>>>> To connect above testpmd, here is qemu command example.
>>>>>>
>>>>>> $ qemu-system-x86_64 \
>>>>>>          <snip>
>>>>>>          -chardev socket,id=chr0,path=/tmp/sock0 \
>>>>>>          -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>>>>>          -device virtio-net-pci,netdev=net0
>>>>>>
>>>>>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>>>>> With this PMD in place, is there any need to keep the existing vhost library
>>>>> around as a separate entity? Can the existing library be
>>>> subsumed/converted into
>>>>> a standard PMD?
>>>>>
>>>>> /Bruce
>>>> Hi Bruce,
>>>>
>>>> I concern about whether the PMD has all features of librte_vhost,
>>>> because librte_vhost provides more features and freedom than ethdev API
>>>> provides.
>>>> In some cases, user needs to choose limited implementation without
>>>> librte_vhost.
>>>> I am going to eliminate such cases while implementing the PMD.
>>>> But I don't have strong belief that we can remove librte_vhost now.
>>>>
>>>> So how about keeping current separation in next DPDK?
>>>> I guess people will try to replace librte_vhost to vhost PMD, because
>>>> apparently using ethdev APIs will be useful in many cases.
>>>> And we will get feedbacks like "vhost PMD needs to support like this usage".
>>>> (Or we will not have feedbacks, but it's also OK.)
>>>> Then, we will be able to merge librte_vhost and vhost PMD.
>>> I agree with the above. One the concerns I had when reviewing the patch was that the PMD removes some freedom that is available with the library. Eg. Ability to implement the new_device and destroy_device callbacks. If using the PMD you are constrained to the implementations of these in the PMD driver, but if using librte_vhost, you can implement your own with whatever functionality you like - a good example of this can be seen in the vhost sample app.
>>> On the other hand, the PMD is useful in that it removes a lot of complexity for the user and may work for some more general use cases. So I would be in favour of having both options available too.
>>>
>>> Ciara
>>>
>> Thanks.
>> However, just because the libraries are merged does not mean that you need
>> be limited by PMD functionality. Many PMDs provide additional library-specific
>> functions over and above their PMD capabilities. The bonded PMD is a good example
>> here, as it has a whole set of extra functions to create and manipulate bonded
>> devices - things that are obviously not part of the general ethdev API. Other
>> vPMDs similarly include functions to allow them to be created on the fly too.
>>
>> regards,
>> /Bruce
>
> Hi Bruce,
>
> I appreciate for showing a good example. I haven't noticed the PMD.
> I will check the bonding PMD, and try to remove librte_vhost without
> losing freedom and features of the library.

Hi,

Just a gentle reminder - if you consider removing (even if by just 
replacing/renaming) an entire library, it needs to go through the ABI 
deprecation process.

It seems obvious enough. But for all the ABI policing here, somehow we 
all failed to notice the two compatibility breaking rename-elephants in 
the room during 2.1 development:
- libintel_dpdk was renamed to libdpdk
- librte_pmd_virtio_uio was renamed to librte_pmd_virtio

Of course these cases are easy to work around with symlinks, and are 
unrelated to the matter at hand. Just wanting to make sure such things 
don't happen again.

	- Panu -
  
Bruce Richardson Oct. 19, 2015, 1:27 p.m. UTC | #9
> -----Original Message-----

> From: Panu Matilainen [mailto:pmatilai@redhat.com]

> Sent: Monday, October 19, 2015 2:26 PM

> To: Tetsuya Mukawa <mukawa@igel.co.jp>; Richardson, Bruce

> <bruce.richardson@intel.com>; Loftus, Ciara <ciara.loftus@intel.com>

> Cc: dev@dpdk.org; ann.zhuangyanying@huawei.com

> Subject: Re: [dpdk-dev] [RFC PATCH v2] vhost: Add VHOST PMD

> 

> On 10/19/2015 01:50 PM, Tetsuya Mukawa wrote:

> > On 2015/10/19 18:45, Bruce Richardson wrote:

> >> On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:

> >>>> On 2015/10/16 21:52, Bruce Richardson wrote:

> >>>>> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:

> >>>>>> The patch introduces a new PMD. This PMD is implemented as thin

> >>>> wrapper

> >>>>>> of librte_vhost. It means librte_vhost is also needed to compile

> the PMD.

> >>>>>> The PMD can have 'iface' parameter like below to specify a path

> >>>>>> to

> >>>> connect

> >>>>>> to a virtio-net device.

> >>>>>>

> >>>>>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i

> >>>>>>

> >>>>>> To connect above testpmd, here is qemu command example.

> >>>>>>

> >>>>>> $ qemu-system-x86_64 \

> >>>>>>          <snip>

> >>>>>>          -chardev socket,id=chr0,path=/tmp/sock0 \

> >>>>>>          -netdev vhost-user,id=net0,chardev=chr0,vhostforce \

> >>>>>>          -device virtio-net-pci,netdev=net0

> >>>>>>

> >>>>>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>

> >>>>> With this PMD in place, is there any need to keep the existing

> >>>>> vhost library around as a separate entity? Can the existing

> >>>>> library be

> >>>> subsumed/converted into

> >>>>> a standard PMD?

> >>>>>

> >>>>> /Bruce

> >>>> Hi Bruce,

> >>>>

> >>>> I concern about whether the PMD has all features of librte_vhost,

> >>>> because librte_vhost provides more features and freedom than ethdev

> >>>> API provides.

> >>>> In some cases, user needs to choose limited implementation without

> >>>> librte_vhost.

> >>>> I am going to eliminate such cases while implementing the PMD.

> >>>> But I don't have strong belief that we can remove librte_vhost now.

> >>>>

> >>>> So how about keeping current separation in next DPDK?

> >>>> I guess people will try to replace librte_vhost to vhost PMD,

> >>>> because apparently using ethdev APIs will be useful in many cases.

> >>>> And we will get feedbacks like "vhost PMD needs to support like this

> usage".

> >>>> (Or we will not have feedbacks, but it's also OK.) Then, we will be

> >>>> able to merge librte_vhost and vhost PMD.

> >>> I agree with the above. One the concerns I had when reviewing the

> patch was that the PMD removes some freedom that is available with the

> library. Eg. Ability to implement the new_device and destroy_device

> callbacks. If using the PMD you are constrained to the implementations of

> these in the PMD driver, but if using librte_vhost, you can implement your

> own with whatever functionality you like - a good example of this can be

> seen in the vhost sample app.

> >>> On the other hand, the PMD is useful in that it removes a lot of

> complexity for the user and may work for some more general use cases. So I

> would be in favour of having both options available too.

> >>>

> >>> Ciara

> >>>

> >> Thanks.

> >> However, just because the libraries are merged does not mean that you

> >> need be limited by PMD functionality. Many PMDs provide additional

> >> library-specific functions over and above their PMD capabilities. The

> >> bonded PMD is a good example here, as it has a whole set of extra

> >> functions to create and manipulate bonded devices - things that are

> >> obviously not part of the general ethdev API. Other vPMDs similarly

> include functions to allow them to be created on the fly too.

> >>

> >> regards,

> >> /Bruce

> >

> > Hi Bruce,

> >

> > I appreciate for showing a good example. I haven't noticed the PMD.

> > I will check the bonding PMD, and try to remove librte_vhost without

> > losing freedom and features of the library.

> 

> Hi,

> 

> Just a gentle reminder - if you consider removing (even if by just

> replacing/renaming) an entire library, it needs to happen the ABI

> deprecation process.

> 

> It seems obvious enough. But for all the ABI policing here, somehow we all

> failed to notice the two compatibility breaking rename-elephants in the

> room during 2.1 development:

> - libintel_dpdk was renamed to libdpdk

> - librte_pmd_virtio_uio was renamed to librte_pmd_virtio

> 

> Of course these cases are easy to work around with symlinks, and are

> unrelated to the matter at hand. Just wanting to make sure such things

> dont happen again.

> 

> 	- Panu -


Still doesn't hurt to remind us, Panu! Thanks. :-)
  
Loftus, Ciara Oct. 20, 2015, 2:13 p.m. UTC | #10
> 
> On 2015/09/24 2:47, Loftus, Ciara wrote:
> >> The patch introduces a new PMD. This PMD is implemented as thin
> wrapper
> >> of librte_vhost. It means librte_vhost is also needed to compile the PMD.
> >> The PMD can have 'iface' parameter like below to specify a path to
> connect
> >> to a virtio-net device.
> >>
> >> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
> >>
> >> To connect above testpmd, here is qemu command example.
> >>
> >> $ qemu-system-x86_64 \
> >>         <snip>
> >>         -chardev socket,id=chr0,path=/tmp/sock0 \
> >>         -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
> >>         -device virtio-net-pci,netdev=net0
> >>
> >> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> >> ---
> >>  config/common_linuxapp                      |   6 +
> >>  drivers/net/Makefile                        |   4 +
> >>  drivers/net/vhost/Makefile                  |  61 +++
> >>  drivers/net/vhost/rte_eth_vhost.c           | 640
> >> ++++++++++++++++++++++++++++
> >>  drivers/net/vhost/rte_pmd_vhost_version.map |   4 +
> >>  mk/rte.app.mk                               |   8 +-
> >>  6 files changed, 722 insertions(+), 1 deletion(-)
> >>  create mode 100644 drivers/net/vhost/Makefile
> >>  create mode 100644 drivers/net/vhost/rte_eth_vhost.c
> >>  create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
> >>
> >> +struct pmd_internal {
> >> +	TAILQ_ENTRY(pmd_internal) next;
> >> +	char *dev_name;
> >> +	char *iface_name;
> >> +	unsigned nb_rx_queues;
> >> +	unsigned nb_tx_queues;
> >> +	rte_atomic16_t xfer;
> > Is this flag just used to indicate the state of the virtio_net device?
> > Ie. if =0 then virtio_dev=NULL and if =1 then virtio_net !=NULL & the
> VIRTIO_DEV_RUNNING flag is set?
> 
> Hi Clara,
> 
> I am sorry for very late reply.
> 
> Yes, it is. Probably we can optimize it more.
> I will change this implementation a bit in next patches.
> Could you please check it?
Of course, thanks.

> 
> >> +
> >> +static uint16_t
> >> +eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
> >> +{
> >> +	struct vhost_queue *r = q;
> >> +	uint16_t i, nb_tx = 0;
> >> +
> >> +	if (unlikely(r->internal == NULL))
> >> +		return 0;
> >> +
> >> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
> >> +		return 0;
> >> +
> >> +	rte_atomic16_set(&r->tx_executing, 1);
> >> +
> >> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
> >> +		goto out;
> >> +
> >> +	nb_tx = (uint16_t)rte_vhost_enqueue_burst(r->device,
> >> +			VIRTIO_RXQ, bufs, nb_bufs);
> >> +
> >> +	rte_atomic64_add(&(r->tx_pkts), nb_tx);
> >> +	rte_atomic64_add(&(r->err_pkts), nb_bufs - nb_tx);
> >> +
> >> +	for (i = 0; likely(i < nb_tx); i++)
> >> +		rte_pktmbuf_free(bufs[i]);
> > We may not always want to free these mbufs. For example, if a call is made
> to rte_eth_tx_burst with buffers from another (non DPDK) source, they may
> not be ours to free.
> 
> Sorry, I am not sure what type of buffer you want to transfer.
> 
> This is a PMD that wraps librte_vhost.
> And I guess other PMDs cannot handle buffers from another non DPDK
> source.
> Should we take care such buffers?
> 
> I have also checked af_packet PMD.
> It seems the tx function of af_packet PMD just frees mbuf.

For example if using the PMD with an application that receives buffers from another source. Eg. a virtual switch receiving packets from an interface using the kernel driver.
I see that af_packet also frees the mbuf. I've checked the ixgbe and ring pmds though and they don't seem to free the buffers, although I may have missed something, the code for these is rather large and I am unfamiliar with most of it. If I am correct though, should this behaviour vary from PMD to PMD I wonder?
> 
> >> +
> >> +
> >> +	eth_dev = rte_eth_dev_allocated(internal->dev_name);
> >> +	if (eth_dev == NULL) {
> >> +		RTE_LOG(INFO, PMD, "failuer to find ethdev\n");
> > Typo: Failure. Same for the destroy_device function
> 
> Thanks, I will fix it in next patches.
> 
> >> +		return -1;
> >> +	}
> >> +
> >> +	internal = eth_dev->data->dev_private;
> >> +
> >> +	for (i = 0; i < internal->nb_rx_queues; i++) {
> >> +		vq = &internal->rx_vhost_queues[i];
> >> +		vq->device = dev;
> >> +		vq->internal = internal;
> >> +	}
> >> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> >> +		vq = &internal->tx_vhost_queues[i];
> >> +		vq->device = dev;
> >> +		vq->internal = internal;
> >> +	}
> >> +
> >> +	dev->flags |= VIRTIO_DEV_RUNNING;
> >> +	dev->priv = eth_dev;
> >> +
> >> +	eth_dev->data->dev_link.link_status = 1;
> >> +	rte_atomic16_set(&internal->xfer, 1);
> >> +
> >> +	RTE_LOG(INFO, PMD, "New connection established\n");
> >> +
> >> +	return 0;
> > Some freedom is taken away if the new_device and destroy_device
> callbacks are implemented in the driver.
> > For example if one wishes to  call the rte_vhost_enable_guest_notification
> function when a new device is brought up. They cannot now as there is no
> scope to modify these callbacks, as is done in for example the vHost sample
> app. Is this correct?
> 
> So how about adding one more parameter to be able to choose guest
> notification behavior?
> 
> ex)
> ./testpmd --vdev 'eth_vhost0,iface=/tmp/sock0,guest_notification=0'
> 
> In above case, all queues in this device will have
> VRING_USED_F_NO_NOTIFY.

I'm not too concerned about this particular function, I was just making an example. The main concern I was expressing here and in the other thread with Bruce, is the risk that we will lose some functionality available in the library but not in the PMD. This function is an example of that. If we could find some way to retain the functionality available in the library, it would be ideal.

Thanks for the response! I will review and test further patches if they become available.

Ciara

> 
> Thanks,
> Tetsuya
  
Tetsuya Mukawa Oct. 21, 2015, 4:30 a.m. UTC | #11
On 2015/10/20 23:13, Loftus, Ciara wrote:
>
>>>> +
>>>> +static uint16_t
>>>> +eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
>>>> +{
>>>> +	struct vhost_queue *r = q;
>>>> +	uint16_t i, nb_tx = 0;
>>>> +
>>>> +	if (unlikely(r->internal == NULL))
>>>> +		return 0;
>>>> +
>>>> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
>>>> +		return 0;
>>>> +
>>>> +	rte_atomic16_set(&r->tx_executing, 1);
>>>> +
>>>> +	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
>>>> +		goto out;
>>>> +
>>>> +	nb_tx = (uint16_t)rte_vhost_enqueue_burst(r->device,
>>>> +			VIRTIO_RXQ, bufs, nb_bufs);
>>>> +
>>>> +	rte_atomic64_add(&(r->tx_pkts), nb_tx);
>>>> +	rte_atomic64_add(&(r->err_pkts), nb_bufs - nb_tx);
>>>> +
>>>> +	for (i = 0; likely(i < nb_tx); i++)
>>>> +		rte_pktmbuf_free(bufs[i]);
>>> We may not always want to free these mbufs. For example, if a call is made
>> to rte_eth_tx_burst with buffers from another (non DPDK) source, they may
>> not be ours to free.
>>
>> Sorry, I am not sure what type of buffer you want to transfer.
>>
>> This is a PMD that wraps librte_vhost.
>> And I guess other PMDs cannot handle buffers from another non DPDK
>> source.
>> Should we take care such buffers?
>>
>> I have also checked af_packet PMD.
>> It seems the tx function of af_packet PMD just frees mbuf.
> For example if using the PMD with an application that receives buffers from another source. Eg. a virtual switch receiving packets from an interface using the kernel driver.

For example, if a software switch on host tries to send data to a DPDK
application on guest using the vhost PMD and the virtio-net PMD.
Also let's assume the data the software switch transfers comes from the
kernel driver.
In this case, the data on the software switch will be copied and
transferred to the virtio-net PMD through the virtqueue.
Because of this, we can free the data after sending.
Could you please also check API documentation rte_eth_tx_burst?
(Freeing buffer is default behavior)

> I see that af_packet also frees the mbuf. I've checked the ixgbe and ring pmds though and they don't seem to free the buffers, although I may have missed something, the code for these is rather large and I am unfamiliar with most of it. If I am correct though, should this behaviour vary from PMD to PMD I wonder?

I guess ring PMD is something special.
Because we don't want to copy data with this PMD, RX function doesn't
allocate buffers, also TX function doesn't free buffers.
But other normal PMD will allocate buffers when RX is called, and free
buffers when TX is called.

>>>> +
>>>> +
>>>> +	eth_dev = rte_eth_dev_allocated(internal->dev_name);
>>>> +	if (eth_dev == NULL) {
>>>> +		RTE_LOG(INFO, PMD, "failuer to find ethdev\n");
>>> Typo: Failure. Same for the destroy_device function
>> Thanks, I will fix it in next patches.
>>
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	internal = eth_dev->data->dev_private;
>>>> +
>>>> +	for (i = 0; i < internal->nb_rx_queues; i++) {
>>>> +		vq = &internal->rx_vhost_queues[i];
>>>> +		vq->device = dev;
>>>> +		vq->internal = internal;
>>>> +	}
>>>> +	for (i = 0; i < internal->nb_tx_queues; i++) {
>>>> +		vq = &internal->tx_vhost_queues[i];
>>>> +		vq->device = dev;
>>>> +		vq->internal = internal;
>>>> +	}
>>>> +
>>>> +	dev->flags |= VIRTIO_DEV_RUNNING;
>>>> +	dev->priv = eth_dev;
>>>> +
>>>> +	eth_dev->data->dev_link.link_status = 1;
>>>> +	rte_atomic16_set(&internal->xfer, 1);
>>>> +
>>>> +	RTE_LOG(INFO, PMD, "New connection established\n");
>>>> +
>>>> +	return 0;
>>> Some freedom is taken away if the new_device and destroy_device
>> callbacks are implemented in the driver.
>>> For example if one wishes to  call the rte_vhost_enable_guest_notification
>> function when a new device is brought up. They cannot now as there is no
>> scope to modify these callbacks, as is done in for example the vHost sample
>> app. Is this correct?
>>
>> So how about adding one more parameter to be able to choose guest
>> notification behavior?
>>
>> ex)
>> ./testpmd --vdev 'eth_vhost0,iface=/tmp/sock0,guest_notification=0'
>>
>> In above case, all queues in this device will have
>> VRING_USED_F_NO_NOTIFY.
> I'm not too concerned about this particular function, I was just making an example. The main concern I was expressing here and in the other thread with Bruce, is the risk that we will lose some functionality available in the library but not in the PMD. This function is an example of that. If we could find some way to retain the functionality available in the library, it would be ideal.

I will reply to an other thread.
Anyway, I am going to keep current vhost library APIs.

Thanks,
Tetsuya
  
Tetsuya Mukawa Oct. 21, 2015, 4:35 a.m. UTC | #12
On 2015/10/19 22:27, Richardson, Bruce wrote:
>> -----Original Message-----
>> From: Panu Matilainen [mailto:pmatilai@redhat.com]
>> Sent: Monday, October 19, 2015 2:26 PM
>> To: Tetsuya Mukawa <mukawa@igel.co.jp>; Richardson, Bruce
>> <bruce.richardson@intel.com>; Loftus, Ciara <ciara.loftus@intel.com>
>> Cc: dev@dpdk.org; ann.zhuangyanying@huawei.com
>> Subject: Re: [dpdk-dev] [RFC PATCH v2] vhost: Add VHOST PMD
>>
>> On 10/19/2015 01:50 PM, Tetsuya Mukawa wrote:
>>> On 2015/10/19 18:45, Bruce Richardson wrote:
>>>> On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
>>>>>> On 2015/10/16 21:52, Bruce Richardson wrote:
>>>>>>> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
>>>>>>>> The patch introduces a new PMD. This PMD is implemented as thin
>>>>>> wrapper
>>>>>>>> of librte_vhost. It means librte_vhost is also needed to compile
>> the PMD.
>>>>>>>> The PMD can have 'iface' parameter like below to specify a path
>>>>>>>> to
>>>>>> connect
>>>>>>>> to a virtio-net device.
>>>>>>>>
>>>>>>>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>>>>>>>
>>>>>>>> To connect above testpmd, here is qemu command example.
>>>>>>>>
>>>>>>>> $ qemu-system-x86_64 \
>>>>>>>>          <snip>
>>>>>>>>          -chardev socket,id=chr0,path=/tmp/sock0 \
>>>>>>>>          -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>>>>>>>          -device virtio-net-pci,netdev=net0
>>>>>>>>
>>>>>>>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>>>>>>> With this PMD in place, is there any need to keep the existing
>>>>>>> vhost library around as a separate entity? Can the existing
>>>>>>> library be
>>>>>> subsumed/converted into
>>>>>>> a standard PMD?
>>>>>>>
>>>>>>> /Bruce
>>>>>> Hi Bruce,
>>>>>>
>>>>>> I concern about whether the PMD has all features of librte_vhost,
>>>>>> because librte_vhost provides more features and freedom than ethdev
>>>>>> API provides.
>>>>>> In some cases, user needs to choose limited implementation without
>>>>>> librte_vhost.
>>>>>> I am going to eliminate such cases while implementing the PMD.
>>>>>> But I don't have strong belief that we can remove librte_vhost now.
>>>>>>
>>>>>> So how about keeping current separation in next DPDK?
>>>>>> I guess people will try to replace librte_vhost to vhost PMD,
>>>>>> because apparently using ethdev APIs will be useful in many cases.
>>>>>> And we will get feedbacks like "vhost PMD needs to support like this
>> usage".
>>>>>> (Or we will not have feedbacks, but it's also OK.) Then, we will be
>>>>>> able to merge librte_vhost and vhost PMD.
>>>>> I agree with the above. One the concerns I had when reviewing the
>> patch was that the PMD removes some freedom that is available with the
>> library. Eg. Ability to implement the new_device and destroy_device
>> callbacks. If using the PMD you are constrained to the implementations of
>> these in the PMD driver, but if using librte_vhost, you can implement your
>> own with whatever functionality you like - a good example of this can be
>> seen in the vhost sample app.
>>>>> On the other hand, the PMD is useful in that it removes a lot of
>> complexity for the user and may work for some more general use cases. So I
>> would be in favour of having both options available too.
>>>>> Ciara
>>>>>
>>>> Thanks.
>>>> However, just because the libraries are merged does not mean that you
>>>> need be limited by PMD functionality. Many PMDs provide additional
>>>> library-specific functions over and above their PMD capabilities. The
>>>> bonded PMD is a good example here, as it has a whole set of extra
>>>> functions to create and manipulate bonded devices - things that are
>>>> obviously not part of the general ethdev API. Other vPMDs similarly
>> include functions to allow them to be created on the fly too.
>>>> regards,
>>>> /Bruce
>>> Hi Bruce,
>>>
>>> I appreciate for showing a good example. I haven't noticed the PMD.
>>> I will check the bonding PMD, and try to remove librte_vhost without
>>> losing freedom and features of the library.
>> Hi,
>>
>> Just a gentle reminder - if you consider removing (even if by just
>> replacing/renaming) an entire library, it needs to happen the ABI
>> deprecation process.
>>
>> It seems obvious enough. But for all the ABI policing here, somehow we all
>> failed to notice the two compatibility breaking rename-elephants in the
>> room during 2.1 development:
>> - libintel_dpdk was renamed to libdpdk
>> - librte_pmd_virtio_uio was renamed to librte_pmd_virtio
>>
>> Of course these cases are easy to work around with symlinks, and are
>> unrelated to the matter at hand. Just wanting to make sure such things
>> dont happen again.
>>
>> 	- Panu -
> Still doesn't hurt to remind us, Panu! Thanks. :-)

Hi,

Thanks for reminder. I've checked the DPDK documentation.
I will submit deprecation notice to follow DPDK deprecation process.
(Probably we will be able to remove vhost library in DPDK-2.3 or later.)

BTW, I will merge vhost library and PMD like below.
Step1. Move vhost library under vhost PMD.
Step2. Rename current APIs.
Step3. Add a function to get a pointer of "struct virtio_net device" by
a portno.

The last step allows us to convert a portno to the pointer of the
corresponding virtio_net device.
And we can still use the features and freedom the vhost library APIs provided.

Thanks,
Tetsuya
  
Panu Matilainen Oct. 21, 2015, 6:25 a.m. UTC | #13
On 10/21/2015 07:35 AM, Tetsuya Mukawa wrote:
> On 2015/10/19 22:27, Richardson, Bruce wrote:
>>> -----Original Message-----
>>> From: Panu Matilainen [mailto:pmatilai@redhat.com]
>>> Sent: Monday, October 19, 2015 2:26 PM
>>> To: Tetsuya Mukawa <mukawa@igel.co.jp>; Richardson, Bruce
>>> <bruce.richardson@intel.com>; Loftus, Ciara <ciara.loftus@intel.com>
>>> Cc: dev@dpdk.org; ann.zhuangyanying@huawei.com
>>> Subject: Re: [dpdk-dev] [RFC PATCH v2] vhost: Add VHOST PMD
>>>
>>> On 10/19/2015 01:50 PM, Tetsuya Mukawa wrote:
>>>> On 2015/10/19 18:45, Bruce Richardson wrote:
>>>>> On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
>>>>>>> On 2015/10/16 21:52, Bruce Richardson wrote:
>>>>>>>> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
>>>>>>>>> The patch introduces a new PMD. This PMD is implemented as thin
>>>>>>> wrapper
>>>>>>>>> of librte_vhost. It means librte_vhost is also needed to compile
>>> the PMD.
>>>>>>>>> The PMD can have 'iface' parameter like below to specify a path
>>>>>>>>> to
>>>>>>> connect
>>>>>>>>> to a virtio-net device.
>>>>>>>>>
>>>>>>>>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>>>>>>>>
>>>>>>>>> To connect above testpmd, here is qemu command example.
>>>>>>>>>
>>>>>>>>> $ qemu-system-x86_64 \
>>>>>>>>>           <snip>
>>>>>>>>>           -chardev socket,id=chr0,path=/tmp/sock0 \
>>>>>>>>>           -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>>>>>>>>           -device virtio-net-pci,netdev=net0
>>>>>>>>>
>>>>>>>>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>>>>>>>> With this PMD in place, is there any need to keep the existing
>>>>>>>> vhost library around as a separate entity? Can the existing
>>>>>>>> library be
>>>>>>> subsumed/converted into
>>>>>>>> a standard PMD?
>>>>>>>>
>>>>>>>> /Bruce
>>>>>>> Hi Bruce,
>>>>>>>
>>>>>>> I concern about whether the PMD has all features of librte_vhost,
>>>>>>> because librte_vhost provides more features and freedom than ethdev
>>>>>>> API provides.
>>>>>>> In some cases, user needs to choose limited implementation without
>>>>>>> librte_vhost.
>>>>>>> I am going to eliminate such cases while implementing the PMD.
>>>>>>> But I don't have strong belief that we can remove librte_vhost now.
>>>>>>>
>>>>>>> So how about keeping current separation in next DPDK?
>>>>>>> I guess people will try to replace librte_vhost to vhost PMD,
>>>>>>> because apparently using ethdev APIs will be useful in many cases.
>>>>>>> And we will get feedbacks like "vhost PMD needs to support like this
>>> usage".
>>>>>>> (Or we will not have feedbacks, but it's also OK.) Then, we will be
>>>>>>> able to merge librte_vhost and vhost PMD.
>>>>>> I agree with the above. One the concerns I had when reviewing the
>>> patch was that the PMD removes some freedom that is available with the
>>> library. Eg. Ability to implement the new_device and destroy_device
>>> callbacks. If using the PMD you are constrained to the implementations of
>>> these in the PMD driver, but if using librte_vhost, you can implement your
>>> own with whatever functionality you like - a good example of this can be
>>> seen in the vhost sample app.
>>>>>> On the other hand, the PMD is useful in that it removes a lot of
>>> complexity for the user and may work for some more general use cases. So I
>>> would be in favour of having both options available too.
>>>>>> Ciara
>>>>>>
>>>>> Thanks.
>>>>> However, just because the libraries are merged does not mean that you
>>>>> need be limited by PMD functionality. Many PMDs provide additional
>>>>> library-specific functions over and above their PMD capabilities. The
>>>>> bonded PMD is a good example here, as it has a whole set of extra
>>>>> functions to create and manipulate bonded devices - things that are
>>>>> obviously not part of the general ethdev API. Other vPMDs similarly
>>> include functions to allow them to be created on the fly too.
>>>>> regards,
>>>>> /Bruce
>>>> Hi Bruce,
>>>>
>>>> I appreciate for showing a good example. I haven't noticed the PMD.
>>>> I will check the bonding PMD, and try to remove librte_vhost without
>>>> losing freedom and features of the library.
>>> Hi,
>>>
>>> Just a gentle reminder - if you consider removing (even if by just
>>> replacing/renaming) an entire library, it needs to happen the ABI
>>> deprecation process.
>>>
>>> It seems obvious enough. But for all the ABI policing here, somehow we all
>>> failed to notice the two compatibility breaking rename-elephants in the
>>> room during 2.1 development:
>>> - libintel_dpdk was renamed to libdpdk
>>> - librte_pmd_virtio_uio was renamed to librte_pmd_virtio
>>>
>>> Of course these cases are easy to work around with symlinks, and are
>>> unrelated to the matter at hand. Just wanting to make sure such things
>>> dont happen again.
>>>
>>> 	- Panu -
>> Still doesn't hurt to remind us, Panu! Thanks. :-)
>
> Hi,
>
> Thanks for reminder. I've checked the DPDK documentation.
> I will submit deprecation notice to follow DPDK deprecation process.
> (Probably we will be able to remove vhost library in DPDK-2.3 or later.)
>
> BTW, I will merge vhost library and PMD like below.
> Step1. Move vhost library under vhost PMD.
> Step2. Rename current APIs.
> Step3. Add a function to get a pointer of "struct virtio_net device" by
> a portno.
>
> Last steps allows us to be able to convert a portno to the pointer of
> corresponding vrtio_net device.
> And we can still use features and freedom vhost library APIs provided.

Just wondering, is that *really* worth the price of breaking every 
single vhost library user out there?

I mean, this is not about removing some bitrotten function or two which 
nobody cares about anymore but removing (by renaming) one of the more 
widely (AFAICS) used libraries and its entire API.

If current APIs are kept then compatibility is largely a matter of 
planting a strategic symlink or two, but it might make the API look 
inconsistent.

But just wondering about the benefit of this merge thing, compared to 
just adding a vhost pmd and leaving the library be. The ABI process is 
not there to make life miserable for DPDK developers, its there to help 
make DPDK nicer for *other* developers. And the first and the foremost 
rule is simply: dont break backwards compatibility. Not unless there's a 
damn good reason to doing so, and I fail to see that reason here.

	- Panu -

> Thanks,
> Tetsuya
>
  
Bruce Richardson Oct. 21, 2015, 10:09 a.m. UTC | #14
On Wed, Oct 21, 2015 at 01:30:54PM +0900, Tetsuya Mukawa wrote:
> On 2015/10/20 23:13, Loftus, Ciara wrote:
> >
> > I see that af_packet also frees the mbuf. I've checked the ixgbe and ring pmds though and they don't seem to free the buffers, although I may have missed something, the code for these is rather large and I am unfamiliar with most of it. If I am correct though, should this behaviour vary from PMD to PMD I wonder?
> 
> I guess ring PMD is something special.
> Because we don't want to copy data with this PMD, RX function doesn't
> allocate buffers, also TX function doesn't free buffers.
> But other normal PMD will allocate buffers when RX is called, and free
> buffers when TX is called.
> 

Yes, this is correct. Ring pmd is the exception since it automatically recycles
buffers, and so does not need to alloc/free mbufs. (ixgbe frees the buffers 
post-TX as part of the TX ring cleanup)

/Bruce
  
Bruce Richardson Oct. 21, 2015, 10:22 a.m. UTC | #15
On Wed, Oct 21, 2015 at 09:25:12AM +0300, Panu Matilainen wrote:
> On 10/21/2015 07:35 AM, Tetsuya Mukawa wrote:
> >On 2015/10/19 22:27, Richardson, Bruce wrote:
> >>>-----Original Message-----
> >>>From: Panu Matilainen [mailto:pmatilai@redhat.com]
> >>>Sent: Monday, October 19, 2015 2:26 PM
> >>>To: Tetsuya Mukawa <mukawa@igel.co.jp>; Richardson, Bruce
> >>><bruce.richardson@intel.com>; Loftus, Ciara <ciara.loftus@intel.com>
> >>>Cc: dev@dpdk.org; ann.zhuangyanying@huawei.com
> >>>Subject: Re: [dpdk-dev] [RFC PATCH v2] vhost: Add VHOST PMD
> >>>
> >>>On 10/19/2015 01:50 PM, Tetsuya Mukawa wrote:
> >>>>On 2015/10/19 18:45, Bruce Richardson wrote:
> >>>>>On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
> >>>>>>>On 2015/10/16 21:52, Bruce Richardson wrote:
> >>>>>>>>On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
> >>>>>>>>>The patch introduces a new PMD. This PMD is implemented as thin
> >>>>>>>wrapper
> >>>>>>>>>of librte_vhost. It means librte_vhost is also needed to compile
> >>>the PMD.
> >>>>>>>>>The PMD can have 'iface' parameter like below to specify a path
> >>>>>>>>>to
> >>>>>>>connect
> >>>>>>>>>to a virtio-net device.
> >>>>>>>>>
> >>>>>>>>>$ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
> >>>>>>>>>
> >>>>>>>>>To connect above testpmd, here is qemu command example.
> >>>>>>>>>
> >>>>>>>>>$ qemu-system-x86_64 \
> >>>>>>>>>          <snip>
> >>>>>>>>>          -chardev socket,id=chr0,path=/tmp/sock0 \
> >>>>>>>>>          -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
> >>>>>>>>>          -device virtio-net-pci,netdev=net0
> >>>>>>>>>
> >>>>>>>>>Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
> >>>>>>>>With this PMD in place, is there any need to keep the existing
> >>>>>>>>vhost library around as a separate entity? Can the existing
> >>>>>>>>library be
> >>>>>>>subsumed/converted into
> >>>>>>>>a standard PMD?
> >>>>>>>>
> >>>>>>>>/Bruce
> >>>>>>>Hi Bruce,
> >>>>>>>
> >>>>>>>I concern about whether the PMD has all features of librte_vhost,
> >>>>>>>because librte_vhost provides more features and freedom than ethdev
> >>>>>>>API provides.
> >>>>>>>In some cases, user needs to choose limited implementation without
> >>>>>>>librte_vhost.
> >>>>>>>I am going to eliminate such cases while implementing the PMD.
> >>>>>>>But I don't have strong belief that we can remove librte_vhost now.
> >>>>>>>
> >>>>>>>So how about keeping current separation in next DPDK?
> >>>>>>>I guess people will try to replace librte_vhost to vhost PMD,
> >>>>>>>because apparently using ethdev APIs will be useful in many cases.
> >>>>>>>And we will get feedbacks like "vhost PMD needs to support like this
> >>>usage".
> >>>>>>>(Or we will not have feedbacks, but it's also OK.) Then, we will be
> >>>>>>>able to merge librte_vhost and vhost PMD.
> >>>>>>I agree with the above. One the concerns I had when reviewing the
> >>>patch was that the PMD removes some freedom that is available with the
> >>>library. Eg. Ability to implement the new_device and destroy_device
> >>>callbacks. If using the PMD you are constrained to the implementations of
> >>>these in the PMD driver, but if using librte_vhost, you can implement your
> >>>own with whatever functionality you like - a good example of this can be
> >>>seen in the vhost sample app.
> >>>>>>On the other hand, the PMD is useful in that it removes a lot of
> >>>complexity for the user and may work for some more general use cases. So I
> >>>would be in favour of having both options available too.
> >>>>>>Ciara
> >>>>>>
> >>>>>Thanks.
> >>>>>However, just because the libraries are merged does not mean that you
> >>>>>need be limited by PMD functionality. Many PMDs provide additional
> >>>>>library-specific functions over and above their PMD capabilities. The
> >>>>>bonded PMD is a good example here, as it has a whole set of extra
> >>>>>functions to create and manipulate bonded devices - things that are
> >>>>>obviously not part of the general ethdev API. Other vPMDs similarly
> >>>include functions to allow them to be created on the fly too.
> >>>>>regards,
> >>>>>/Bruce
> >>>>Hi Bruce,
> >>>>
> >>>>I appreciate for showing a good example. I haven't noticed the PMD.
> >>>>I will check the bonding PMD, and try to remove librte_vhost without
> >>>>losing freedom and features of the library.
> >>>Hi,
> >>>
> >>>Just a gentle reminder - if you consider removing (even if by just
> >>>replacing/renaming) an entire library, it needs to happen the ABI
> >>>deprecation process.
> >>>
> >>>It seems obvious enough. But for all the ABI policing here, somehow we all
> >>>failed to notice the two compatibility breaking rename-elephants in the
> >>>room during 2.1 development:
> >>>- libintel_dpdk was renamed to libdpdk
> >>>- librte_pmd_virtio_uio was renamed to librte_pmd_virtio
> >>>
> >>>Of course these cases are easy to work around with symlinks, and are
> >>>unrelated to the matter at hand. Just wanting to make sure such things
> >>>dont happen again.
> >>>
> >>>	- Panu -
> >>Still doesn't hurt to remind us, Panu! Thanks. :-)
> >
> >Hi,
> >
> >Thanks for reminder. I've checked the DPDK documentation.
> >I will submit deprecation notice to follow DPDK deprecation process.
> >(Probably we will be able to remove vhost library in DPDK-2.3 or later.)
> >
> >BTW, I will merge vhost library and PMD like below.
> >Step1. Move vhost library under vhost PMD.
> >Step2. Rename current APIs.
> >Step3. Add a function to get a pointer of "struct virtio_net device" by
> >a portno.
> >
> >Last steps allows us to be able to convert a portno to the pointer of
> >corresponding vrtio_net device.
> >And we can still use features and freedom vhost library APIs provided.
> 
> Just wondering, is that *really* worth the price of breaking every single
> vhost library user out there?
> 
> I mean, this is not about removing some bitrotten function or two which
> nobody cares about anymore but removing (by renaming) one of the more widely
> (AFAICS) used libraries and its entire API.
> 
> If current APIs are kept then compatibility is largely a matter of planting
> a strategic symlink or two, but it might make the API look inconsistent.
> 
> But just wondering about the benefit of this merge thing, compared to just
> adding a vhost pmd and leaving the library be. The ABI process is not there
> to make life miserable for DPDK developers, its there to help make DPDK
> nicer for *other* developers. And the first and the foremost rule is simply:
> dont break backwards compatibility. Not unless there's a damn good reason to
> doing so, and I fail to see that reason here.
> 
> 	- Panu -
>
Good question, and I'll accept that maybe it's not worth doing. I'm not that
much of an expert on the internals and APIs of vhost library.

However, the merge I was looking for was more from a code locality point
of view, to have all the vhost code in one directory (under drivers/net),
than spread across multiple ones. What API's need to be deprecated
or not as part of that work, is a separate question, and so in theory we could
create a combined vhost library that does not deprecate anything (though to
avoid a build-up of technical debt, we'll probably want to deprecate some 
functions).

I'll leave it up to the vhost experts to decide what's best, but for me, any
library that handles transmission and reception of packets outside of a DPDK
app should be a PMD library using ethdev rx/tx burst routines, and located
under drivers/net. (KNI is another obvious target for such a move and conversion).

Regards,
/Bruce
  
Tetsuya Mukawa Oct. 22, 2015, 9:45 a.m. UTC | #16
The patch introduces a new PMD. This PMD is implemented as thin wrapper
of librte_vhost.

I've submitted below patches in former patch sets. But it seems some issues
were fixed already.

 - [PATCH 1/3] vhost: Fix return value of GET_VRING_BASE message
 - [PATCH 2/3] vhost: Fix RESET_OWNER handling not to close callfd
 - [PATCH 3/3] vhost: Fix RESET_OWNER handling not to free virtqueue

I've still seen some resource leaks in the vhost library, but in this RFC,
I focused on the vhost PMD.
After I get agreement, I will submit a patch for the leak issue as a separate
patch. So please check the directionality of the vhost PMD.

PATCH v3 changes:
 - Optimize performance.
   In RX/TX functions, change code to access only per core data.
 - Add below API to allow user to use vhost library APIs for a port managed
   by vhost PMD. There are a bit of limitations. See "rte_eth_vhost.h".
    - rte_eth_vhost_portid2vdev()
   To support this functionality, vhost library is also changed.
   Anyway, if users don't use the vhost PMD, they can still fully use the vhost library APIs.
 - Add code to support vhost multiple queues.
   Actually, multiple queue functionality is not enabled yet.

Tetsuya Mukawa (2):
  vhost: Add callback and private data for vhost PMD
  vhost: Add VHOST PMD

 config/common_linuxapp                        |   6 +
 drivers/net/Makefile                          |   4 +
 drivers/net/vhost/Makefile                    |  62 +++
 drivers/net/vhost/rte_eth_vhost.c             | 735 ++++++++++++++++++++++++++
 drivers/net/vhost/rte_eth_vhost.h             |  65 +++
 drivers/net/vhost/rte_pmd_vhost_version.map   |   8 +
 lib/librte_vhost/rte_virtio_net.h             |   3 +
 lib/librte_vhost/vhost_user/virtio-net-user.c |   8 +-
 lib/librte_vhost/virtio-net.c                 |  40 +-
 lib/librte_vhost/virtio-net.h                 |   3 +-
 mk/rte.app.mk                                 |   8 +-
 11 files changed, 934 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/vhost/Makefile
 create mode 100644 drivers/net/vhost/rte_eth_vhost.c
 create mode 100644 drivers/net/vhost/rte_eth_vhost.h
 create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
  
Tetsuya Mukawa Oct. 22, 2015, 9:50 a.m. UTC | #17
On 2015/10/21 19:22, Bruce Richardson wrote:
> On Wed, Oct 21, 2015 at 09:25:12AM +0300, Panu Matilainen wrote:
>> On 10/21/2015 07:35 AM, Tetsuya Mukawa wrote:
>>> On 2015/10/19 22:27, Richardson, Bruce wrote:
>>>>> -----Original Message-----
>>>>> From: Panu Matilainen [mailto:pmatilai@redhat.com]
>>>>> Sent: Monday, October 19, 2015 2:26 PM
>>>>> To: Tetsuya Mukawa <mukawa@igel.co.jp>; Richardson, Bruce
>>>>> <bruce.richardson@intel.com>; Loftus, Ciara <ciara.loftus@intel.com>
>>>>> Cc: dev@dpdk.org; ann.zhuangyanying@huawei.com
>>>>> Subject: Re: [dpdk-dev] [RFC PATCH v2] vhost: Add VHOST PMD
>>>>>
>>>>> On 10/19/2015 01:50 PM, Tetsuya Mukawa wrote:
>>>>>> On 2015/10/19 18:45, Bruce Richardson wrote:
>>>>>>> On Mon, Oct 19, 2015 at 10:32:50AM +0100, Loftus, Ciara wrote:
>>>>>>>>> On 2015/10/16 21:52, Bruce Richardson wrote:
>>>>>>>>>> On Mon, Aug 31, 2015 at 12:55:26PM +0900, Tetsuya Mukawa wrote:
>>>>>>>>>>> The patch introduces a new PMD. This PMD is implemented as thin
>>>>>>>>> wrapper
>>>>>>>>>>> of librte_vhost. It means librte_vhost is also needed to compile
>>>>> the PMD.
>>>>>>>>>>> The PMD can have 'iface' parameter like below to specify a path
>>>>>>>>>>> to
>>>>>>>>> connect
>>>>>>>>>>> to a virtio-net device.
>>>>>>>>>>>
>>>>>>>>>>> $ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0' -- -i
>>>>>>>>>>>
>>>>>>>>>>> To connect above testpmd, here is qemu command example.
>>>>>>>>>>>
>>>>>>>>>>> $ qemu-system-x86_64 \
>>>>>>>>>>>          <snip>
>>>>>>>>>>>          -chardev socket,id=chr0,path=/tmp/sock0 \
>>>>>>>>>>>          -netdev vhost-user,id=net0,chardev=chr0,vhostforce \
>>>>>>>>>>>          -device virtio-net-pci,netdev=net0
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
>>>>>>>>>> With this PMD in place, is there any need to keep the existing
>>>>>>>>>> vhost library around as a separate entity? Can the existing
>>>>>>>>>> library be
>>>>>>>>> subsumed/converted into
>>>>>>>>>> a standard PMD?
>>>>>>>>>>
>>>>>>>>>> /Bruce
>>>>>>>>> Hi Bruce,
>>>>>>>>>
>>>>>>>>> I concern about whether the PMD has all features of librte_vhost,
>>>>>>>>> because librte_vhost provides more features and freedom than ethdev
>>>>>>>>> API provides.
>>>>>>>>> In some cases, user needs to choose limited implementation without
>>>>>>>>> librte_vhost.
>>>>>>>>> I am going to eliminate such cases while implementing the PMD.
>>>>>>>>> But I don't have strong belief that we can remove librte_vhost now.
>>>>>>>>>
>>>>>>>>> So how about keeping current separation in next DPDK?
>>>>>>>>> I guess people will try to replace librte_vhost to vhost PMD,
>>>>>>>>> because apparently using ethdev APIs will be useful in many cases.
>>>>>>>>> And we will get feedbacks like "vhost PMD needs to support like this
>>>>> usage".
>>>>>>>>> (Or we will not have feedbacks, but it's also OK.) Then, we will be
>>>>>>>>> able to merge librte_vhost and vhost PMD.
>>>>>>>> I agree with the above. One the concerns I had when reviewing the
>>>>> patch was that the PMD removes some freedom that is available with the
>>>>> library. Eg. Ability to implement the new_device and destroy_device
>>>>> callbacks. If using the PMD you are constrained to the implementations of
>>>>> these in the PMD driver, but if using librte_vhost, you can implement your
>>>>> own with whatever functionality you like - a good example of this can be
>>>>> seen in the vhost sample app.
>>>>>>>> On the other hand, the PMD is useful in that it removes a lot of
>>>>> complexity for the user and may work for some more general use cases. So I
>>>>> would be in favour of having both options available too.
>>>>>>>> Ciara
>>>>>>>>
>>>>>>> Thanks.
>>>>>>> However, just because the libraries are merged does not mean that you
>>>>>>> need be limited by PMD functionality. Many PMDs provide additional
>>>>>>> library-specific functions over and above their PMD capabilities. The
>>>>>>> bonded PMD is a good example here, as it has a whole set of extra
>>>>>>> functions to create and manipulate bonded devices - things that are
>>>>>>> obviously not part of the general ethdev API. Other vPMDs similarly
>>>>> include functions to allow them to be created on the fly too.
>>>>>>> regards,
>>>>>>> /Bruce
>>>>>> Hi Bruce,
>>>>>>
>>>>>> I appreciate for showing a good example. I haven't noticed the PMD.
>>>>>> I will check the bonding PMD, and try to remove librte_vhost without
>>>>>> losing freedom and features of the library.
>>>>> Hi,
>>>>>
>>>>> Just a gentle reminder - if you consider removing (even if by just
>>>>> replacing/renaming) an entire library, it needs to happen the ABI
>>>>> deprecation process.
>>>>>
>>>>> It seems obvious enough. But for all the ABI policing here, somehow we all
>>>>> failed to notice the two compatibility breaking rename-elephants in the
>>>>> room during 2.1 development:
>>>>> - libintel_dpdk was renamed to libdpdk
>>>>> - librte_pmd_virtio_uio was renamed to librte_pmd_virtio
>>>>>
>>>>> Of course these cases are easy to work around with symlinks, and are
>>>>> unrelated to the matter at hand. Just wanting to make sure such things
>>>>> dont happen again.
>>>>>
>>>>> 	- Panu -
>>>> Still doesn't hurt to remind us, Panu! Thanks. :-)
>>> Hi,
>>>
>>> Thanks for reminder. I've checked the DPDK documentation.
>>> I will submit deprecation notice to follow DPDK deprecation process.
>>> (Probably we will be able to remove vhost library in DPDK-2.3 or later.)
>>>
>>> BTW, I will merge vhost library and PMD like below.
>>> Step1. Move vhost library under vhost PMD.
>>> Step2. Rename current APIs.
>>> Step3. Add a function to get a pointer of "struct virtio_net device" by
>>> a portno.
>>>
>>> Last steps allows us to be able to convert a portno to the pointer of
>>> corresponding vrtio_net device.
>>> And we can still use features and freedom vhost library APIs provided.
>> Just wondering, is that *really* worth the price of breaking every single
>> vhost library user out there?
>>
>> I mean, this is not about removing some bitrotten function or two which
>> nobody cares about anymore but removing (by renaming) one of the more widely
>> (AFAICS) used libraries and its entire API.
>>
>> If current APIs are kept then compatibility is largely a matter of planting
>> a strategic symlink or two, but it might make the API look inconsistent.
>>
>> But just wondering about the benefit of this merge thing, compared to just
>> adding a vhost pmd and leaving the library be. The ABI process is not there
>> to make life miserable for DPDK developers, its there to help make DPDK
>> nicer for *other* developers. And the first and the foremost rule is simply:
>> dont break backwards compatibility. Not unless there's a damn good reason to
>> doing so, and I fail to see that reason here.
>>
>> 	- Panu -
>>
> Good question, and I'll accept that maybe it's not worth doing. I'm not that
> much of an expert on the internals and APIs of vhost library.
>
> However, the merge I was looking for was more from a code locality point
> of view, to have all the vhost code in one directory (under drivers/net),
> than spread across multiple ones. What API's need to be deprecated
> or not as part of that work, is a separate question, and so in theory we could
> create a combined vhost library that does not deprecate anything (though to
> avoid a build-up of technical debt, we'll probably want to deprecate some 
> functions).
>
> I'll leave it up to the vhost experts do decide what's best, but for me, any
> library that handles transmission and reception of packets outside of a DPDK
> app should be a PMD library using ethdev rx/tx burst routines, and located
> under drivers/net. (KNI is another obvious target for such a move and conversion).
>
> Regards,
> /Bruce
>

Hi,

I have submitted latest patches.
I will keep vhost library until we will have agreement to merge it to
vhost PMD.

Regards,
Tetsuya
  
Traynor, Kevin Oct. 27, 2015, 1:44 p.m. UTC | #18
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tetsuya Mukawa

[snip]

> 
> Hi,
> 
> I have submitted latest patches.
> I will keep vhost library until we will have agreement to merge it to
> vhost PMD.

Longer term there are pros and cons to keeping the vhost library. Personally
I think it would make sense to remove sometime as trying to maintain two API's
has a cost, but I think adding a deprecation notice in DPDK 2.2 for removal in
DPDK 2.3 is very premature. Until it's proven *in the field* that the vhost PMD
is a suitable fully functioning replacement for the vhost library and users
have time to migrate, then please don't remove.

> 
> Regards,
> Testuya
  
Tetsuya Mukawa Oct. 28, 2015, 2:24 a.m. UTC | #19
On 2015/10/27 22:44, Traynor, Kevin wrote:
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tetsuya Mukawa
> [snip]
>
>> Hi,
>>
>> I have submitted latest patches.
>> I will keep vhost library until we will have agreement to merge it to
>> vhost PMD.
> Longer term there are pros and cons to keeping the vhost library. Personally
> I think it would make sense to remove sometime as trying to maintain two API's
> has a cost, but I think adding a deprecation notice in DPDK 2.2 for removal in
> DPDK 2.3 is very premature. Until it's proven *in the field* that the vhost PMD
> is a suitable fully functioning replacement for the vhost library and users
> have time to migrate, then please don't remove.

Hi Kevin,

Thanks for commenting. I agree it's not the time to add deprecation notice.
(I haven't included it in the vhost PMD patches)

Tetsuya
  

Patch

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 0de43d5..7310240 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -446,6 +446,12 @@  CONFIG_RTE_LIBRTE_VHOST_NUMA=n
 CONFIG_RTE_LIBRTE_VHOST_DEBUG=n
 
 #
+# Compile vhost PMD
+# To compile, CONFIG_RTE_LIBRTE_VHOST should be enabled.
+#
+CONFIG_RTE_LIBRTE_PMD_VHOST=y
+
+#
 #Compile Xen domain0 support
 #
 CONFIG_RTE_LIBRTE_XEN_DOM0=n
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 5ebf963..e46a38e 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -49,5 +49,9 @@  DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
 DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
 
+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
+endif # $(CONFIG_RTE_LIBRTE_VHOST)
+
 include $(RTE_SDK)/mk/rte.sharelib.mk
 include $(RTE_SDK)/mk/rte.subdir.mk
diff --git a/drivers/net/vhost/Makefile b/drivers/net/vhost/Makefile
new file mode 100644
index 0000000..018edde
--- /dev/null
+++ b/drivers/net/vhost/Makefile
@@ -0,0 +1,61 @@ 
+#   BSD LICENSE
+#
+#   Copyright (c) 2010-2015 Intel Corporation.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_vhost.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+
+EXPORT_MAP := rte_pmd_vhost_version.map
+
+LIBABIVER := 1
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += rte_eth_vhost.c
+
+#
+# Export include files
+#
+SYMLINK-y-include +=
+
+# this lib depends upon:
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_mbuf
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_ether
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_kvargs
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
new file mode 100644
index 0000000..679e893
--- /dev/null
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -0,0 +1,640 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) 2010-2015 Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <unistd.h>
+#include <pthread.h>
+
+#include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+#include <rte_dev.h>
+#include <rte_kvargs.h>
+#include <rte_virtio_net.h>
+
/* Devargs key used to pass the vhost-user socket path: iface=/path/to/sock */
#define ETH_VHOST_IFACE_ARG		"iface"

static const char *drivername = "VHOST PMD";

/* NULL-terminated list of devargs keys accepted by this PMD */
static const char *valid_arguments[] = {
	ETH_VHOST_IFACE_ARG,
	NULL
};

/* Base MAC address, "VHOST" in ASCII; the last byte is overwritten with
 * the device index in eth_dev_vhost_create(). */
static struct ether_addr base_eth_addr = {
	.addr_bytes = {
		0x56 /* V */,
		0x48 /* H */,
		0x4F /* O */,
		0x53 /* S */,
		0x54 /* T */,
		0x00
	}
};

/* Per-queue state shared between the ethdev fast path and librte_vhost. */
struct vhost_queue {
	struct virtio_net *device;	/* vhost device; NULL until new_device() */
	struct pmd_internal *internal;	/* back-pointer to the owning port */
	struct rte_mempool *mb_pool;	/* mbuf pool used for rx allocation */
	rte_atomic64_t rx_pkts;		/* packets received from the guest */
	rte_atomic64_t tx_pkts;		/* packets sent to the guest */
	rte_atomic64_t err_pkts;	/* tx packets that could not be enqueued */
	rte_atomic16_t rx_executing;	/* set while eth_vhost_rx() runs */
	rte_atomic16_t tx_executing;	/* set while eth_vhost_tx() runs */
};

/* Per-port private data, linked into internals_list for reverse lookup
 * from the vhost socket path back to the ethdev instance. */
struct pmd_internal {
	TAILQ_ENTRY(pmd_internal) next;
	char *dev_name;			/* ethdev name ("eth_vhostN"), strdup()ed */
	char *iface_name;		/* vhost-user socket path, strdup()ed */
	unsigned nb_rx_queues;
	unsigned nb_tx_queues;
	rte_atomic16_t xfer;		/* 1 while a guest is connected; gates rx/tx */

	struct vhost_queue rx_vhost_queues[RTE_PMD_RING_MAX_RX_RINGS];
	struct vhost_queue tx_vhost_queues[RTE_PMD_RING_MAX_TX_RINGS];
};

/* Global list of all vhost PMD instances, protected by internal_list_lock. */
TAILQ_HEAD(pmd_internal_head, pmd_internal);
static struct pmd_internal_head internals_list =
	TAILQ_HEAD_INITIALIZER(internals_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Initial link state; link comes up in new_device(), down in destroy_device(). */
static struct rte_eth_link pmd_link = {
		.link_speed = 10000,
		.link_duplex = ETH_LINK_FULL_DUPLEX,
		.link_status = 0
};
+
/*
 * rx burst: dequeue up to nb_bufs packets sent by the guest (its TX ring
 * is our RX source).  Returns the number of mbufs filled into bufs.
 *
 * Synchronization with destroy_device(): the xfer flag is checked, the
 * rx_executing marker is raised, and xfer is re-checked.  destroy_device()
 * clears xfer first and then spins until rx_executing drops, so the vhost
 * device cannot be torn down while a burst is in flight.
 */
static uint16_t
eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
	struct vhost_queue *r = q;
	uint16_t nb_rx = 0;

	/* queue not yet bound to a connected vhost device */
	if (unlikely(r->internal == NULL))
		return 0;

	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
		return 0;

	rte_atomic16_set(&r->rx_executing, 1);

	/* re-check after publishing rx_executing (see destroy_device()) */
	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
		goto out;

	nb_rx = (uint16_t)rte_vhost_dequeue_burst(r->device,
			VIRTIO_TXQ, r->mb_pool, bufs, nb_bufs);

	rte_atomic64_add(&(r->rx_pkts), nb_rx);

out:
	rte_atomic16_set(&r->rx_executing, 0);

	return nb_rx;
}
+
/*
 * tx burst: enqueue up to nb_bufs packets toward the guest (its RX ring).
 * Returns the number of packets accepted; per DPDK convention, mbufs that
 * were accepted are freed here and unsent mbufs remain owned by the caller.
 * Packets the guest ring could not take are counted in err_pkts.
 *
 * Uses the same xfer/tx_executing handshake with destroy_device() as
 * eth_vhost_rx().
 */
static uint16_t
eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
	struct vhost_queue *r = q;
	uint16_t i, nb_tx = 0;

	/* queue not yet bound to a connected vhost device */
	if (unlikely(r->internal == NULL))
		return 0;

	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
		return 0;

	rte_atomic16_set(&r->tx_executing, 1);

	/* re-check after publishing tx_executing (see destroy_device()) */
	if (unlikely(rte_atomic16_read(&r->internal->xfer) == 0))
		goto out;

	nb_tx = (uint16_t)rte_vhost_enqueue_burst(r->device,
			VIRTIO_RXQ, bufs, nb_bufs);

	rte_atomic64_add(&(r->tx_pkts), nb_tx);
	rte_atomic64_add(&(r->err_pkts), nb_bufs - nb_tx);

	/* enqueue copied the data into the guest; release the sent mbufs */
	for (i = 0; likely(i < nb_tx); i++)
		rte_pktmbuf_free(bufs[i]);

out:
	rte_atomic16_set(&r->tx_executing, 0);

	return nb_tx;
}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused) { return 0; }
+
/*
 * dev_start: register the vhost-user socket (iface_name) with librte_vhost
 * so a guest can connect.  Returns 0 on success, negative on failure.
 */
static int
eth_dev_start(struct rte_eth_dev *dev)
{
	struct pmd_internal *internal = dev->data->dev_private;

	return rte_vhost_driver_register(internal->iface_name);
}
+
/* dev_stop: unregister the vhost-user socket; new guests can no longer connect. */
static void
eth_dev_stop(struct rte_eth_dev *dev)
{
	struct pmd_internal *internal = dev->data->dev_private;

	rte_vhost_driver_unregister(internal->iface_name);
}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+		   uint16_t nb_rx_desc __rte_unused,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_rxconf *rx_conf __rte_unused,
+		   struct rte_mempool *mb_pool)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	internal->rx_vhost_queues[rx_queue_id].mb_pool = mb_pool;
+	dev->data->rx_queues[rx_queue_id] = &internal->rx_vhost_queues[rx_queue_id];
+	return 0;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+		   uint16_t nb_tx_desc __rte_unused,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	dev->data->tx_queues[tx_queue_id] = &internal->tx_vhost_queues[tx_queue_id];
+	return 0;
+}
+
+
+static void
+eth_dev_info(struct rte_eth_dev *dev,
+	     struct rte_eth_dev_info *dev_info)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	dev_info->driver_name = drivername;
+	dev_info->max_mac_addrs = 1;
+	dev_info->max_rx_pktlen = (uint32_t)-1;
+	dev_info->max_rx_queues = (uint16_t)internal->nb_rx_queues;
+	dev_info->max_tx_queues = (uint16_t)internal->nb_tx_queues;
+	dev_info->min_rx_bufsize = 0;
+	dev_info->pci_dev = dev->pci_dev;
+}
+
+static void
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
+{
+	unsigned i;
+	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
+	const struct pmd_internal *internal = dev->data->dev_private;
+
+	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
+	     i < internal->nb_rx_queues; i++) {
+		igb_stats->q_ipackets[i] = internal->rx_vhost_queues[i].rx_pkts.cnt;
+		rx_total += igb_stats->q_ipackets[i];
+	}
+
+	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
+	     i < internal->nb_tx_queues; i++) {
+		igb_stats->q_opackets[i] = internal->tx_vhost_queues[i].tx_pkts.cnt;
+		igb_stats->q_errors[i] = internal->tx_vhost_queues[i].err_pkts.cnt;
+		tx_total += igb_stats->q_opackets[i];
+		tx_err_total += igb_stats->q_errors[i];
+	}
+
+	igb_stats->ipackets = rx_total;
+	igb_stats->opackets = tx_total;
+	igb_stats->oerrors = tx_err_total;
+}
+
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+	unsigned i;
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	for (i = 0; i < internal->nb_rx_queues; i++)
+		internal->rx_vhost_queues[i].rx_pkts.cnt = 0;
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		internal->tx_vhost_queues[i].tx_pkts.cnt = 0;
+		internal->tx_vhost_queues[i].err_pkts.cnt = 0;
+	}
+}
+
+static void
+eth_queue_release(void *q __rte_unused) { ; }
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+		int wait_to_complete __rte_unused) { return 0; }
+
/* ethdev callback table wired into every vhost port. */
static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.rx_queue_release = eth_queue_release,
	.tx_queue_release = eth_queue_release,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};

/* Dummy eth_driver: vhost is a vdev, but the ethdev framework still
 * expects a pci_drv with a name and flags. */
static struct eth_driver rte_vhost_pmd = {
	.pci_drv = {
		.name = "rte_vhost_pmd",
		.drv_flags = RTE_PCI_DRV_DETACHABLE,
	},
};

/* Empty PCI id table referenced by the dummy pci_drv above. */
static struct rte_pci_id id_table;
+
+static inline struct pmd_internal *
+find_internal_resource(char *ifname)
+{
+	int found = 0;
+	struct pmd_internal *internal;
+
+	if (ifname == NULL)
+		return NULL;
+
+	pthread_mutex_lock(&internal_list_lock);
+
+	TAILQ_FOREACH(internal, &internals_list, next) {
+		if (!strcmp(internal->iface_name, ifname)) {
+			found = 1;
+			break;
+		}
+	}
+
+	pthread_mutex_unlock(&internal_list_lock);
+
+	if (!found)
+		return NULL;
+
+	return internal;
+}
+
+static int
+new_device(struct virtio_net *dev)
+{
+	struct rte_eth_dev *eth_dev;
+	struct pmd_internal *internal;
+	struct vhost_queue *vq;
+	unsigned i;
+
+	if (dev == NULL) {
+		RTE_LOG(INFO, PMD, "invalid argument\n");
+		return -1;
+	}
+
+	internal = find_internal_resource(dev->ifname);
+	if (internal == NULL) {
+		RTE_LOG(INFO, PMD, "invalid device name\n");
+		return -1;
+	}
+
+	eth_dev = rte_eth_dev_allocated(internal->dev_name);
+	if (eth_dev == NULL) {
+		RTE_LOG(INFO, PMD, "failuer to find ethdev\n");
+		return -1;
+	}
+
+	internal = eth_dev->data->dev_private;
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = &internal->rx_vhost_queues[i];
+		vq->device = dev;
+		vq->internal = internal;
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = &internal->tx_vhost_queues[i];
+		vq->device = dev;
+		vq->internal = internal;
+	}
+
+	dev->flags |= VIRTIO_DEV_RUNNING;
+	dev->priv = eth_dev;
+
+	eth_dev->data->dev_link.link_status = 1;
+	rte_atomic16_set(&internal->xfer, 1);
+
+	RTE_LOG(INFO, PMD, "New connection established\n");
+
+	return 0;
+}
+
+static void
+destroy_device(volatile struct virtio_net *dev)
+{
+	struct rte_eth_dev *eth_dev;
+	struct pmd_internal *internal;
+	struct vhost_queue *vq;
+	unsigned i;
+
+	if (dev == NULL) {
+		RTE_LOG(INFO, PMD, "invalid argument\n");
+		return;
+	}
+
+	eth_dev = (struct rte_eth_dev *)dev->priv;
+	if (eth_dev == NULL) {
+		RTE_LOG(INFO, PMD, "failuer to find a ethdev\n");
+		return;
+	}
+
+	internal = eth_dev->data->dev_private;
+
+	/* Wait until rx/tx_pkt_burst stops accesing vhost device */
+	rte_atomic16_set(&internal->xfer, 0);
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = &internal->rx_vhost_queues[i];
+		while (rte_atomic16_read(&vq->rx_executing))
+			rte_pause();
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = &internal->tx_vhost_queues[i];
+		while (rte_atomic16_read(&vq->tx_executing))
+			rte_pause();
+	}
+
+	eth_dev->data->dev_link.link_status = 0;
+
+	dev->priv = NULL;
+	dev->flags &= ~VIRTIO_DEV_RUNNING;
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = &internal->rx_vhost_queues[i];
+		vq->device = NULL;
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = &internal->tx_vhost_queues[i];
+		vq->device = NULL;
+	}
+
+	RTE_LOG(INFO, PMD, "Connection closed\n");
+}
+
+static void *vhost_driver_session(void *param __rte_unused)
+{
+	static struct virtio_net_device_ops *vhost_ops;
+
+	vhost_ops = rte_zmalloc(NULL, sizeof(*vhost_ops), 0);
+	if (vhost_ops == NULL)
+		rte_panic("Can't allocate memory\n");
+
+	/* set vhost arguments */
+	vhost_ops->new_device = new_device;
+	vhost_ops->destroy_device = destroy_device;
+	if (rte_vhost_driver_callback_register(vhost_ops) < 0)
+		rte_panic("Can't register callbacks\n");
+
+	/* start event handling */
+	rte_vhost_driver_session_start();
+
+	rte_free(vhost_ops);
+	pthread_exit(0);
+}
+
/* Guard ensuring the session thread is spawned at most once per process. */
static pthread_once_t once_cont = PTHREAD_ONCE_INIT;
static pthread_t session_th;

/* pthread_once() callback: spawn the thread running vhost_driver_session(). */
static void vhost_driver_session_start(void)
{
	int ret;

	ret = pthread_create(&session_th, NULL, vhost_driver_session, NULL);
	if (ret)
		rte_panic("Can't create a thread\n");
}
+
+static int
+eth_dev_vhost_create(const char *name, int index,
+		     char *iface_name,
+		     const unsigned numa_node)
+{
+	struct rte_eth_dev_data *data = NULL;
+	struct rte_pci_device *pci_dev = NULL;
+	struct pmd_internal *internal = NULL;
+	struct rte_eth_dev *eth_dev = NULL;
+	struct ether_addr *eth_addr = NULL;
+	uint16_t nb_rx_queues = 1;
+	uint16_t nb_tx_queues = 1;
+
+	RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n",
+		numa_node);
+
+	/* now do all data allocation - for eth_dev structure, dummy pci driver
+	 * and internal (private) data
+	 */
+	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+	if (data == NULL)
+		goto error;
+
+	pci_dev = rte_zmalloc_socket(name, sizeof(*pci_dev), 0, numa_node);
+	if (pci_dev == NULL)
+		goto error;
+
+	internal = rte_zmalloc_socket(name, sizeof(*internal), 0, numa_node);
+	if (internal == NULL)
+		goto error;
+
+	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
+	if (eth_addr == NULL)
+		goto error;
+	*eth_addr = base_eth_addr;
+	eth_addr->addr_bytes[5] = index;
+
+	/* reserve an ethdev entry */
+	eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL);
+	if (eth_dev == NULL)
+		goto error;
+
+	/* now put it all together
+	 * - store queue data in internal,
+	 * - store numa_node info in pci_driver
+	 * - point eth_dev_data to internal and pci_driver
+	 * - and point eth_dev structure to new eth_dev_data structure
+	 */
+	internal->nb_rx_queues = nb_rx_queues;
+	internal->nb_tx_queues = nb_tx_queues;
+	internal->dev_name = strdup(name);
+	if (internal->dev_name == NULL)
+		goto error;
+	internal->iface_name = strdup(iface_name);
+	if (internal->iface_name == NULL)
+		goto error;
+
+	pthread_mutex_lock(&internal_list_lock);
+	TAILQ_INSERT_TAIL(&internals_list, internal, next);
+	pthread_mutex_unlock(&internal_list_lock);
+
+	rte_vhost_pmd.pci_drv.name = drivername;
+	rte_vhost_pmd.pci_drv.id_table = &id_table;
+
+	pci_dev->numa_node = numa_node;
+	pci_dev->driver = &rte_vhost_pmd.pci_drv;
+
+	data->dev_private = internal;
+	data->port_id = eth_dev->data->port_id;
+	memmove(data->name, eth_dev->data->name, sizeof(data->name));
+	data->nb_rx_queues = (uint16_t)nb_rx_queues;
+	data->nb_tx_queues = (uint16_t)nb_tx_queues;
+	data->dev_link = pmd_link;
+	data->mac_addrs = eth_addr;
+
+	/* We'll replace the 'data' originally allocated by eth_dev. So the
+	 * vhost PMD resources won't be shared between multi processes.
+	 */
+	eth_dev->data = data;
+	eth_dev->driver = &rte_vhost_pmd;
+	eth_dev->dev_ops = &ops;
+	eth_dev->pci_dev = pci_dev;
+
+	/* finally assign rx and tx ops */
+	eth_dev->rx_pkt_burst = eth_vhost_rx;
+	eth_dev->tx_pkt_burst = eth_vhost_tx;
+
+	/* start vhost driver session. It should be called only once */
+	pthread_once(&once_cont, vhost_driver_session_start);
+
+	return data->port_id;
+
+error:
+	rte_free(data);
+	rte_free(pci_dev);
+	rte_free(internal);
+	rte_free(eth_addr);
+
+	return -1;
+}
+
/*
 * rte_kvargs_process() handler for the "iface" key: stores the option's
 * value string into *(const char **)extra_args.
 * Returns 0 on success, -1 if the option carried no value.
 */
static inline int
open_iface(const char *key, const char *value, void *extra_args)
{
	const char **iface_name = extra_args;

	(void)key; /* key already matched by rte_kvargs */

	if (value == NULL)
		return -1;

	*iface_name = value;

	return 0;
}
+
+static int
+rte_pmd_vhost_devinit(const char *name, const char *params)
+{
+	struct rte_kvargs *kvlist = NULL;
+	int ret = 0;
+	int index;
+	char *iface_name;
+
+	RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name);
+
+	kvlist = rte_kvargs_parse(params, valid_arguments);
+	if (kvlist == NULL)
+		return -1;
+
+	if (strlen(name) < strlen("eth_vhost"))
+		return -1;
+
+	index = strtol(name + strlen("eth_vhost"), NULL, 0);
+	if (errno == ERANGE)
+		return -1;
+
+	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
+		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
+					 &open_iface, &iface_name);
+		if (ret < 0)
+			goto out_free;
+
+		eth_dev_vhost_create(name, index, iface_name, rte_socket_id());
+	}
+
+out_free:
+	rte_kvargs_free(kvlist);
+	return ret;
+}
+
+static int
+rte_pmd_vhost_devuninit(const char *name)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct pmd_internal *internal;
+
+	RTE_LOG(INFO, PMD, "Un-Initializing pmd_vhost for %s\n", name);
+
+	if (name == NULL)
+		return -EINVAL;
+
+	/* find an ethdev entry */
+	eth_dev = rte_eth_dev_allocated(name);
+	if (eth_dev == NULL)
+		return -ENODEV;
+
+	internal = eth_dev->data->dev_private;
+
+	pthread_mutex_lock(&internal_list_lock);
+	TAILQ_REMOVE(&internals_list, internal, next);
+	pthread_mutex_unlock(&internal_list_lock);
+
+	eth_dev_stop(eth_dev);
+
+	if ((internal) && (internal->dev_name))
+		free(internal->dev_name);
+	if ((internal) && (internal->iface_name))
+		free(internal->iface_name);
+	rte_free(eth_dev->data->dev_private);
+	rte_free(eth_dev->data);
+	rte_free(eth_dev->pci_dev);
+
+	rte_eth_dev_release_port(eth_dev);
+	return 0;
+}
+
/* Virtual-device driver registration: makes
 * "--vdev eth_vhostN,iface=/path" available to EAL. */
static struct rte_driver pmd_vhost_drv = {
	.name = "eth_vhost",
	.type = PMD_VDEV,
	.init = rte_pmd_vhost_devinit,
	.uninit = rte_pmd_vhost_devuninit,
};

PMD_REGISTER_DRIVER(pmd_vhost_drv);
diff --git a/drivers/net/vhost/rte_pmd_vhost_version.map b/drivers/net/vhost/rte_pmd_vhost_version.map
new file mode 100644
index 0000000..5151684
--- /dev/null
+++ b/drivers/net/vhost/rte_pmd_vhost_version.map
@@ -0,0 +1,4 @@ 
+DPDK_2.2 {
+
+	local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 3871205..1c42fb1 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -144,7 +144,13 @@  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_PCAP)       += -lrte_pmd_pcap
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL)       += -lrte_pmd_null
 
-endif # ! $(CONFIG_RTE_BUILD_SHARED_LIB)
+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
+
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST)      += -lrte_pmd_vhost
+
+endif # ! $(CONFIG_RTE_LIBRTE_VHOST)
+
+endif # $(CONFIG_RTE_BUILD_SHARED_LIB)
 
 endif # ! CONFIG_RTE_BUILD_COMBINE_LIBS