[dpdk-dev,RFC,v2,08/12] lib/librte_vhost: vhost-user support

Message ID 1418247477-13920-9-git-send-email-huawei.xie@intel.com (mailing list archive)
State RFC, archived
Headers

Commit Message

Huawei Xie Dec. 10, 2014, 9:37 p.m. UTC
  vhost-user support


Signed-off-by: Huawei Xie <huawei.xie@intel.com>
---
 lib/librte_vhost/Makefile                     |   5 +-
 lib/librte_vhost/vhost-net.h                  |   4 +
 lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |   9 +
 lib/librte_vhost/vhost_user/vhost-net-user.c  | 422 ++++++++++++++++++++++++++
 lib/librte_vhost/vhost_user/vhost-net-user.h  | 108 +++++++
 lib/librte_vhost/vhost_user/virtio-net-user.c | 199 ++++++++++++
 lib/librte_vhost/vhost_user/virtio-net-user.h |  48 +++
 lib/librte_vhost/virtio-net.c                 |  16 +-
 lib/librte_vhost/virtio-net.h                 |  43 +++
 9 files changed, 842 insertions(+), 12 deletions(-)
 create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
 create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
 create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
 create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
 create mode 100644 lib/librte_vhost/virtio-net.h
  

Comments

Linhaifeng Dec. 11, 2014, 5:36 a.m. UTC | #1
On 2014/12/11 5:37, Huawei Xie wrote:
> vhost-user support
> 
> 
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  lib/librte_vhost/Makefile                     |   5 +-
>  lib/librte_vhost/vhost-net.h                  |   4 +
>  lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |   9 +
>  lib/librte_vhost/vhost_user/vhost-net-user.c  | 422 ++++++++++++++++++++++++++
>  lib/librte_vhost/vhost_user/vhost-net-user.h  | 108 +++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 199 ++++++++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.h |  48 +++
>  lib/librte_vhost/virtio-net.c                 |  16 +-
>  lib/librte_vhost/virtio-net.h                 |  43 +++
>  9 files changed, 842 insertions(+), 12 deletions(-)
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
>  create mode 100644 lib/librte_vhost/virtio-net.h
> 
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index e0d0ef6..b2f14a0 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -34,10 +34,11 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  # library name
>  LIB = librte_vhost.a
>  
> -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -I vhost_user -O3 -D_FILE_OFFSET_BITS=64 -lfuse
>  LDFLAGS += -lfuse
>  # all source are stored in SRCS-y
> -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c virtio-net.c vhost_rxtx.c
>  
>  # install includes
>  SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index f7e96fd..f9ec40b 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -41,8 +41,12 @@
>  
>  #include <rte_log.h>
>  
> +#include "rte_virtio_net.h"
> +
>  #define VHOST_MEMORY_MAX_NREGIONS 8
>  
> +extern struct vhost_net_device_ops const *ops;
> +
>  /* Macros for printing using RTE_LOG */
>  #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
>  #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
> diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> index edcbc10..8ac3360 100644
> --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> +++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> @@ -268,6 +268,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  	struct vhost_memory_region *mem_regions = (void *)(uintptr_t)
>  		((uint64_t)(uintptr_t)mem_regions_addr + size);
>  	uint64_t base_address = 0, mapped_address, mapped_size;
> +	struct virtio_dev *dev;
>  
>  	for (idx = 0; idx < nregions; idx++) {
>  		regions[idx].guest_phys_address =
> @@ -335,6 +336,14 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  			regions[idx].guest_phys_address;
>  	}
>  
> +	dev = get_device(ctx);
> +	if (dev && dev->mem && dev->mmaped_address) {
> +		munmap((void *)(uintptr_t)dev->mmaped_address,
> +			(size_t)dev->mmaped_size);
> +		free(dev->mem);
> +		dev->mem = NULL;
> +	}
> +
>  	ops->set_mem_table(ctx, &regions[0], valid_regions);
>  	return 0;
>  }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> new file mode 100644
> index 0000000..841d7e6
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -0,0 +1,422 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <limits.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +
> +#include <rte_log.h>
> +#include <rte_virtio_net.h>
> +
> +#include "fd_man.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +#include "virtio-net-user.h"
> +
> +static void vserver_new_vq_conn(int fd, uint64_t data);
> +static void vserver_message_handler(int fd, uint64_t dat);
> +struct vhost_net_device_ops const *ops;
> +
> +static struct vhost_server *g_vhost_server;
> +
> +static const char *vhost_message_str[VHOST_USER_MAX] = {
> +	[VHOST_USER_NONE] = "VHOST_USER_NONE",
> +	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
> +	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
> +	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
> +	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
> +	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
> +	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
> +	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
> +	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
> +	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
> +	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
> +	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
> +	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
> +	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
> +	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR"
> +};
> +
> +/**
> + * Create a unix domain socket, bind to path and listen for connection.
> + * @return
> + *  socket fd or -1 on failure
> + */
> +static int
> +uds_socket(const char *path)
> +{
> +	struct sockaddr_un un;
> +	int sockfd;
> +	int ret;
> +
> +	if (path == NULL)
> +		return -1;
> +
> +	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (sockfd < 0)
> +		return -1;
> +	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
> +	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
> +	if (ret == -1)
> +		goto err;
> +	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
> +
> +	ret = listen(sockfd, 1);
> +	if (ret == -1)
> +		goto err;
> +
> +	return sockfd;
> +
> +err:
> +	close(sockfd);
> +	return -1;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len  = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(sockfd, &msgh, 0);
> +	if (ret <= 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
> +		return ret;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "Truncted msg\n");
> +		return -1;
> +	}
> +
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
> +		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
> +	if (ret <= 0)
> +		return ret;
> +
> +	if (msg && msg->size) {
> +		if (msg->size > sizeof(msg->payload)) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"invalid msg size: %d\n", msg->size);
> +			return -1;
> +		}
> +		ret = read(sockfd, &msg->payload, msg->size);
> +		if (ret <= 0)
> +			return ret;
> +		if (ret != (int)msg->size) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"read control message failed\n");
> +			return -1;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +
> +	if (fds && fd_num > 0) {
> +		msgh.msg_control = control;
> +		msgh.msg_controllen = sizeof(control);
> +		cmsg = CMSG_FIRSTHDR(&msgh);
> +		cmsg->cmsg_len = CMSG_LEN(fdsize);
> +		cmsg->cmsg_level = SOL_SOCKET;
> +		cmsg->cmsg_type = SCM_RIGHTS;
> +		memcpy(CMSG_DATA(cmsg), fds, fdsize);
> +	} else {
> +		msgh.msg_control = NULL;
> +		msgh.msg_controllen = 0;
> +	}
> +
> +	do {
> +		ret = sendmsg(sockfd, &msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	if (!msg)
> +		return 0;
> +
> +	msg->flags &= ~VHOST_USER_VERSION_MASK;
> +	msg->flags |= VHOST_USER_VERSION;
> +	sg->flags |= VHOST_USER_REPLY_MASK;
> +
> +	ret = send_fd_message(sockfd, (char *)msg,
> +		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
> +
> +	return ret;
> +}
> +
> +/* call back when there is new virtio connection.  */
> +static void
> +vserver_new_vq_conn(int fd, uint64_t dat)
> +{
> +	struct vhost_server *vserver = (void *)(uintptr_t)dat;
> +	int conn_fd;
> +	uint32_t fh;
> +	struct vhost_device_ctx vdev_ctx = { 0 };
> +
> +	conn_fd = accept(fd, NULL, NULL);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"new virtio connection is %d\n", conn_fd);
> +	if (conn_fd < 0)
> +		return;
> +
> +	fh = ops->new_device(vdev_ctx);
> +	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
> +
> +	fdset_add(&vserver->fdset,
> +		conn_fd, vserver_message_handler, NULL, fh);
> +}
> +
> +/* callback when there is message on the connfd */
> +static void
> +vserver_message_handler(int connfd, uint64_t dat)
> +{
> +	struct vhost_device_ctx ctx;
> +	uint32_t fh = (uint32_t)dat;
> +	struct VhostUserMsg msg;
> +	uint64_t features;
> +	int ret;
> +
> +	ctx.fh = fh;
> +	ret = read_vhost_message(connfd, &msg);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read message failed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	} else if (ret == 0) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"vhost peer closed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	}
> +	if (msg.request > VHOST_USER_MAX) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read incorrect message\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +
> +		return;
> +	}
> +
> +	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
> +		vhost_message_str[msg.request]);
> +	switch (msg.request) {
> +	case VHOST_USER_GET_FEATURES:
> +		ret = ops->get_features(ctx, &features);
> +		msg.payload.u64 = ret;
> +		msg.size = sizeof(msg.payload.u64);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +	case VHOST_USER_SET_FEATURES:
> +		ops->set_features(ctx, &features);
> +		break;
> +
> +	case VHOST_USER_SET_OWNER:
> +		ops->set_owner(ctx);
> +		break;
> +	case VHOST_USER_RESET_OWNER:
> +		ops->reset_owner(ctx);
> +		break;
> +
> +	case VHOST_USER_SET_MEM_TABLE:
> +		user_set_mem_table(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_LOG_BASE:
> +	case VHOST_USER_SET_LOG_FD:

should close fd for fd leak when receive VHOST_USER_SET_LOG_FD msg?

> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> +		break;
> +
> +	case VHOST_USER_SET_VRING_NUM:
> +		ops->set_vring_num(ctx, &msg.payload.state);
> +		break;
> +	case VHOST_USER_SET_VRING_ADDR:
> +		ops->set_vring_addr(ctx, &msg.payload.addr);
> +		break;
> +	case VHOST_USER_SET_VRING_BASE:
> +		ops->set_vring_base(ctx, &msg.payload.state);
> +		break;
> +
> +	case VHOST_USER_GET_VRING_BASE:
> +		ret = user_get_vring_base(ctx, &msg.payload.state);
> +		msg.size = sizeof(msg.payload.state);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_KICK:
> +		user_set_vring_kick(ctx, &msg);
> +		break;
> +	case VHOST_USER_SET_VRING_CALL:
> +		user_set_vring_call(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_ERR:

should close fd for fd leak?

> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> +		break;
> +
> +	default:
> +		break;
> +
> +	}
> +}
> +
> +
> +/**
> + * Creates and initialise the vhost server.
> + */
> +int
> +rte_vhost_driver_register(const char *path)
> +{
> +
> +	struct vhost_server *vserver;
> +
> +	if (g_vhost_server != NULL)
> +		return -1;
> +
> +	vserver = calloc(sizeof(struct vhost_server), 1);
> +	if (vserver == NULL)
> +		return -1;
> +
> +	fdset_init(&vserver->fdset);
> +
> +	unlink(path);
> +
> +	vserver->listenfd = uds_socket(path);
> +	if (vserver->listenfd < 0) {
> +		free(vserver);
> +		return -1;
> +	}
> +	vserver->path = path;
> +
> +	fdset_add(&vserver->fdset, vserver->listenfd,
> +		vserver_new_vq_conn, NULL,
> +		(uint64_t)(uintptr_t)vserver);
> +
> +	ops = get_virtio_net_callbacks();
> +
> +	g_vhost_server = vserver;
> +
> +	return 0;
> +}
> +
> +
> +int
> +rte_vhost_driver_session_start(void)
> +{
> +	fdset_event_dispatch(&g_vhost_server->fdset);
> +	return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> new file mode 100644
> index 0000000..c138844
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -0,0 +1,108 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VHOST_NET_USER_H
> +#define _VHOST_NET_USER_H
> +
> +#include <stdint.h>
> +#include <linux/vhost.h>
> +
> +#include "fd_man.h"
> +
> +struct vhost_server {
> +	const char *path; /**< The path the uds is bind to. */
> +	int listenfd;     /**< The listener sockfd. */
> +	struct fdset fdset; /**< The fd list this vhost server manages. */
> +};
> +
> +/* refer to hw/virtio/vhost-user.c */
> +
> +#define VHOST_MEMORY_MAX_NREGIONS    8
> +
> +typedef enum VhostUserRequest {
> +	VHOST_USER_NONE = 0,
> +	VHOST_USER_GET_FEATURES = 1,
> +	VHOST_USER_SET_FEATURES = 2,
> +	VHOST_USER_SET_OWNER = 3,
> +	VHOST_USER_RESET_OWNER = 4,
> +	VHOST_USER_SET_MEM_TABLE = 5,
> +	VHOST_USER_SET_LOG_BASE = 6,
> +	VHOST_USER_SET_LOG_FD = 7,
> +	VHOST_USER_SET_VRING_NUM = 8,
> +	VHOST_USER_SET_VRING_ADDR = 9,
> +	VHOST_USER_SET_VRING_BASE = 10,
> +	VHOST_USER_GET_VRING_BASE = 11,
> +	VHOST_USER_SET_VRING_KICK = 12,
> +	VHOST_USER_SET_VRING_CALL = 13,
> +	VHOST_USER_SET_VRING_ERR = 14,
> +	VHOST_USER_MAX
> +} VhostUserRequest;
> +
> +typedef struct VhostUserMemoryRegion {
> +	uint64_t guest_phys_addr;
> +	uint64_t memory_size;
> +	uint64_t userspace_addr;
> +	uint64_t mmap_offset;
> +} VhostUserMemoryRegion;
> +
> +typedef struct VhostUserMemory {
> +	uint32_t nregions;
> +	uint32_t padding;
> +	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> +} VhostUserMemory;
> +
> +typedef struct VhostUserMsg {
> +	VhostUserRequest request;
> +
> +#define VHOST_USER_VERSION_MASK     (0x3)
> +#define VHOST_USER_REPLY_MASK       (0x1 << 2)
> +	uint32_t flags;
> +	uint32_t size; /* the following payload size */
> +	union {
> +#define VHOST_USER_VRING_IDX_MASK   (0xff)
> +#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
> +		uint64_t u64;
> +		struct vhost_vring_state state;
> +		struct vhost_vring_addr addr;
> +	VhostUserMemory memory;
> +	} payload;
> +	int fds[VHOST_MEMORY_MAX_NREGIONS];
> +} __attribute((packed)) VhostUserMsg;
> +
> +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION    (0x1)
> +
> +/*****************************************************************************/
> +#endif
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> new file mode 100644
> index 0000000..ad59fcc
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -0,0 +1,199 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +
> +#include <rte_log.h>
> +
> +#include "virtio-net.h"
> +#include "virtio-net-user.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +
> +int
> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	unsigned int idx;
> +	struct VhostUserMemory memory = pmsg->payload.memory;
> +	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> +	uint64_t mapped_address, base_address = 0;
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		if (memory.regions[idx].guest_phys_addr == 0)
> +			base_address = memory.regions[idx].userspace_addr;
> +	}
> +	if (base_address == 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"couldn't find the mem region whose GPA is 0.\n");
> +		return -1;
> +	}
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		regions[idx].guest_phys_address =
> +			memory.regions[idx].guest_phys_addr;
> +		regions[idx].guest_phys_address_end =
> +			memory.regions[idx].guest_phys_addr +
> +			memory.regions[idx].memory_size;
> +		regions[idx].memory_size = memory.regions[idx].memory_size;
> +		regions[idx].userspace_address =
> +			memory.regions[idx].userspace_addr;
> +
> +		/* This is ugly */
> +		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> +			regions[idx].memory_size +
> +				memory.regions[idx].mmap_offset,
> +			PROT_READ | PROT_WRITE, MAP_SHARED,
> +			pmsg->fds[idx],
> +			0);
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"mapped region %d to %p\n",
> +			idx, (void *)mapped_address);
> +
> +		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"mmap qemu guest failed.\n");
> +			return -1;
> +		}
> +
> +		mapped_address +=  memory.regions[idx].mmap_offset;
> +
> +		regions[idx].address_offset = mapped_address -
> +			regions[idx].guest_phys_address;
> +		LOG_DEBUG(VHOST_CONFIG,
> +			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
> +			idx,
> +			(void *)(uintptr_t)regions[idx].guest_phys_address,
> +			(void *)(uintptr_t)regions[idx].userspace_address,
> +			 regions[idx].memory_size);
> +	}
> +	ops->set_mem_table(ctx, regions, memory.nregions);
> +	return 0;
> +}
> +
> +
> +static int
> +virtio_is_ready(struct virtio_net *dev)
> +{
> +	struct vhost_virtqueue *rvq, *tvq;
> +
> +	/* mq support in future.*/
> +	rvq = dev->virtqueue[VIRTIO_RXQ];
> +	tvq = dev->virtqueue[VIRTIO_TXQ];
> +	if (rvq && tvq && rvq->desc && tvq->desc &&
> +		(rvq->kickfd != (eventfd_t)-1) &&
> +		(rvq->callfd != (eventfd_t)-1) &&
> +		(tvq->kickfd != (eventfd_t)-1) &&
> +		(tvq->callfd != (eventfd_t)-1)) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"virtio is now ready for processing.\n");
> +		return 1;
> +	}
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"virtio isn't ready for processing.\n");
> +	return 0;
> +}
> +
> +void
> +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring call idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_call(ctx, &file);
> +}
> +
> +
> +/*
> + *  In vhost-user, when we receive kick message, will test whether virtio
> + *  device is ready for packet processing.
> + */
> +void
> +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring kick idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_kick(ctx, &file);
> +
> +	if (virtio_is_ready(dev) &&
> +		!(dev->flags & VIRTIO_DEV_RUNNING))
> +			notify_ops->new_device(dev);
> +
> +}
> +
> +/*
> + * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
> + */
> +int
> +user_get_vring_base(struct vhost_device_ctx ctx,
> +	struct vhost_vring_state *state)
> +{
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	/* We have to stop the queue (virtio) if it is running. */
> +	if (dev->flags & VIRTIO_DEV_RUNNING)
> +		notify_ops->destroy_device(dev);
> +
> +	/* Here we are safe to get the last used index */
> +	ops->get_vring_base(ctx, state->index, state);
> +
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring base idx:%d file:%d\n", state->index, state->num);
> +	/*
> +	 * Based on current qemu vhost-user implementation, this message is
> +	 * sent and only sent in vhost_vring_stop.
> +	 * TODO: cleanup the vring, it isn't usable since here.
> +	 */
> +	if (dev->virtqueue[VIRTIO_RXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_RXQ]->callfd);
> +		dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
> +	}
> +	if (dev->virtqueue[VIRTIO_TXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_TXQ]->callfd);
> +		dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
> +	}
> +
> +	return 0;
> +
> +}
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> new file mode 100644
> index 0000000..0f6a75a
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -0,0 +1,48 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_USER_H
> +#define _VIRTIO_NET_USER_H
> +
> +#include "vhost-net.h"
> +#include "vhost-net-user.h"
> +
> +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
> +
> +#endif
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index f81e459..0b49f1b 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -46,6 +46,7 @@
>  #include <rte_virtio_net.h>
>  
>  #include "vhost-net.h"
> +#include "virtio-net.h"
>  
>  /*
>   * Device linked list structure for configuration.
> @@ -56,7 +57,7 @@ struct virtio_net_config_ll {
>  };
>  
>  /* device ops to add/remove device to/from data core. */
> -static struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net_device_ops const *notify_ops;
>  /* root address of the linked list of managed virtio devices */
>  static struct virtio_net_config_ll *ll_root;
>  
> @@ -83,8 +84,9 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
>  		if ((qemu_va >= region->userspace_address) &&
>  			(qemu_va <= region->userspace_address +
>  			region->memory_size)) {
> -			vhost_va = dev->mem->mapped_address + qemu_va -
> -					dev->mem->base_address;
> +			vhost_va = qemu_va + region->guest_phys_address +
> +				region->address_offset -
> +				region->userspace_address;
>  			break;
>  		}
>  	}
> @@ -114,7 +116,7 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
>   * Searches the configuration core linked list and
>   * retrieves the device if it exists.
>   */
> -static struct virtio_net *
> +struct virtio_net *
>  get_device(struct vhost_device_ctx ctx)
>  {
>  	struct virtio_net_config_ll *ll_dev;
> @@ -450,12 +452,6 @@ set_mem_table(struct vhost_device_ctx ctx,
>  	if (dev == NULL)
>  		return -1;
>  
> -	if (dev->mem) {
> -		munmap((void *)(uintptr_t)dev->mem->mapped_address,
> -			(size_t)dev->mem->mapped_size);
> -		free(dev->mem);
> -	}
> -
>  	/* Malloc the memory structure depending on the number of regions. */
>  	mem = calloc(1, sizeof(struct virtio_memory) +
>  		(sizeof(struct virtio_memory_regions) * nregions));
> diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_vhost/virtio-net.h
> new file mode 100644
> index 0000000..da4ade0
> --- /dev/null
> +++ b/lib/librte_vhost/virtio-net.h
> @@ -0,0 +1,43 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_H
> +#define _VIRTIO_NET_H
> +
> +#include "vhost-net.h"
> +#include "rte_virtio_net.h"
> +
> +struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net * get_device(struct vhost_device_ctx ctx);
> +
> +#endif
>
  
Linhaifeng Dec. 11, 2014, 6:04 a.m. UTC | #2
On 2014/12/11 5:37, Huawei Xie wrote:
> vhost-user support
> 
> 
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  lib/librte_vhost/Makefile                     |   5 +-
>  lib/librte_vhost/vhost-net.h                  |   4 +
>  lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |   9 +
>  lib/librte_vhost/vhost_user/vhost-net-user.c  | 422 ++++++++++++++++++++++++++
>  lib/librte_vhost/vhost_user/vhost-net-user.h  | 108 +++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 199 ++++++++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.h |  48 +++
>  lib/librte_vhost/virtio-net.c                 |  16 +-
>  lib/librte_vhost/virtio-net.h                 |  43 +++
>  9 files changed, 842 insertions(+), 12 deletions(-)
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
>  create mode 100644 lib/librte_vhost/virtio-net.h
> 
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index e0d0ef6..b2f14a0 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -34,10 +34,11 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  # library name
>  LIB = librte_vhost.a
>  
> -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -I vhost_user -O3 -D_FILE_OFFSET_BITS=64 -lfuse
>  LDFLAGS += -lfuse
>  # all source are stored in SRCS-y
> -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c virtio-net.c vhost_rxtx.c
>  
>  # install includes
>  SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index f7e96fd..f9ec40b 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -41,8 +41,12 @@
>  
>  #include <rte_log.h>
>  
> +#include "rte_virtio_net.h"
> +
>  #define VHOST_MEMORY_MAX_NREGIONS 8
>  
> +extern struct vhost_net_device_ops const *ops;
> +
>  /* Macros for printing using RTE_LOG */
>  #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
>  #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
> diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> index edcbc10..8ac3360 100644
> --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> +++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> @@ -268,6 +268,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  	struct vhost_memory_region *mem_regions = (void *)(uintptr_t)
>  		((uint64_t)(uintptr_t)mem_regions_addr + size);
>  	uint64_t base_address = 0, mapped_address, mapped_size;
> +	struct virtio_dev *dev;
>  
>  	for (idx = 0; idx < nregions; idx++) {
>  		regions[idx].guest_phys_address =
> @@ -335,6 +336,14 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  			regions[idx].guest_phys_address;
>  	}
>  
> +	dev = get_device(ctx);
> +	if (dev && dev->mem && dev->mmaped_address) {
> +		munmap((void *)(uintptr_t)dev->mmaped_address,
> +			(size_t)dev->mmaped_size);
> +		free(dev->mem);
> +		dev->mem = NULL;
> +	}
> +
>  	ops->set_mem_table(ctx, &regions[0], valid_regions);
>  	return 0;
>  }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> new file mode 100644
> index 0000000..841d7e6
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -0,0 +1,422 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <limits.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +
> +#include <rte_log.h>
> +#include <rte_virtio_net.h>
> +
> +#include "fd_man.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +#include "virtio-net-user.h"
> +
> +static void vserver_new_vq_conn(int fd, uint64_t data);
> +static void vserver_message_handler(int fd, uint64_t dat);
> +struct vhost_net_device_ops const *ops;
> +
> +static struct vhost_server *g_vhost_server;

Only support one vhost-user port ?

> +
> +static const char *vhost_message_str[VHOST_USER_MAX] = {
> +	[VHOST_USER_NONE] = "VHOST_USER_NONE",
> +	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
> +	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
> +	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
> +	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
> +	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
> +	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
> +	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
> +	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
> +	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
> +	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
> +	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
> +	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
> +	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
> +	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR"
> +};
> +
> +/**
> + * Create a unix domain socket, bind to path and listen for connection.
> + * @return
> + *  socket fd or -1 on failure
> + */
> +static int
> +uds_socket(const char *path)
> +{
> +	struct sockaddr_un un;
> +	int sockfd;
> +	int ret;
> +
> +	if (path == NULL)
> +		return -1;
> +
> +	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (sockfd < 0)
> +		return -1;
> +	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
> +	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
> +	if (ret == -1)
> +		goto err;
> +	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
> +
> +	ret = listen(sockfd, 1);
> +	if (ret == -1)
> +		goto err;
> +
> +	return sockfd;
> +
> +err:
> +	close(sockfd);
> +	return -1;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len  = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(sockfd, &msgh, 0);
> +	if (ret <= 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
> +		return ret;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "Truncted msg\n");
> +		return -1;
> +	}
> +
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
> +		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
> +	if (ret <= 0)
> +		return ret;
> +
> +	if (msg && msg->size) {
> +		if (msg->size > sizeof(msg->payload)) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"invalid msg size: %d\n", msg->size);
> +			return -1;
> +		}
> +		ret = read(sockfd, &msg->payload, msg->size);
> +		if (ret <= 0)
> +			return ret;
> +		if (ret != (int)msg->size) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"read control message failed\n");
> +			return -1;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +
> +	if (fds && fd_num > 0) {
> +		msgh.msg_control = control;
> +		msgh.msg_controllen = sizeof(control);
> +		cmsg = CMSG_FIRSTHDR(&msgh);
> +		cmsg->cmsg_len = CMSG_LEN(fdsize);
> +		cmsg->cmsg_level = SOL_SOCKET;
> +		cmsg->cmsg_type = SCM_RIGHTS;
> +		memcpy(CMSG_DATA(cmsg), fds, fdsize);
> +	} else {
> +		msgh.msg_control = NULL;
> +		msgh.msg_controllen = 0;
> +	}
> +
> +	do {
> +		ret = sendmsg(sockfd, &msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	if (!msg)
> +		return 0;
> +
> +	msg->flags &= ~VHOST_USER_VERSION_MASK;
> +	msg->flags |= VHOST_USER_VERSION;
> +	sg->flags |= VHOST_USER_REPLY_MASK;
> +
> +	ret = send_fd_message(sockfd, (char *)msg,
> +		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
> +
> +	return ret;
> +}
> +
> +/* call back when there is new virtio connection.  */
> +static void
> +vserver_new_vq_conn(int fd, uint64_t dat)

why not use vserver_new_vq_conn(int fd, void* dat) ?


> +{
> +	struct vhost_server *vserver = (void *)(uintptr_t)dat;
> +	int conn_fd;
> +	uint32_t fh;
> +	struct vhost_device_ctx vdev_ctx = { 0 };
> +
> +	conn_fd = accept(fd, NULL, NULL);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"new virtio connection is %d\n", conn_fd);
> +	if (conn_fd < 0)
> +		return;
> +
> +	fh = ops->new_device(vdev_ctx);
> +	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
> +
> +	fdset_add(&vserver->fdset,
> +		conn_fd, vserver_message_handler, NULL, fh);
> +}
> +
> +/* callback when there is message on the connfd */
> +static void
> +vserver_message_handler(int connfd, uint64_t dat)
> +{
> +	struct vhost_device_ctx ctx;
> +	uint32_t fh = (uint32_t)dat;
> +	struct VhostUserMsg msg;
> +	uint64_t features;
> +	int ret;
> +
> +	ctx.fh = fh;
> +	ret = read_vhost_message(connfd, &msg);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read message failed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	} else if (ret == 0) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"vhost peer closed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	}
> +	if (msg.request > VHOST_USER_MAX) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read incorrect message\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +
> +		return;
> +	}
> +
> +	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
> +		vhost_message_str[msg.request]);
> +	switch (msg.request) {
> +	case VHOST_USER_GET_FEATURES:
> +		ret = ops->get_features(ctx, &features);
> +		msg.payload.u64 = ret;
> +		msg.size = sizeof(msg.payload.u64);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +	case VHOST_USER_SET_FEATURES:
> +		ops->set_features(ctx, &features);
> +		break;
> +
> +	case VHOST_USER_SET_OWNER:
> +		ops->set_owner(ctx);
> +		break;
> +	case VHOST_USER_RESET_OWNER:
> +		ops->reset_owner(ctx);
> +		break;
> +
> +	case VHOST_USER_SET_MEM_TABLE:
> +		user_set_mem_table(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_LOG_BASE:
> +	case VHOST_USER_SET_LOG_FD:
> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> +		break;
> +
> +	case VHOST_USER_SET_VRING_NUM:
> +		ops->set_vring_num(ctx, &msg.payload.state);
> +		break;
> +	case VHOST_USER_SET_VRING_ADDR:
> +		ops->set_vring_addr(ctx, &msg.payload.addr);
> +		break;
> +	case VHOST_USER_SET_VRING_BASE:
> +		ops->set_vring_base(ctx, &msg.payload.state);
> +		break;
> +
> +	case VHOST_USER_GET_VRING_BASE:
> +		ret = user_get_vring_base(ctx, &msg.payload.state);
> +		msg.size = sizeof(msg.payload.state);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_KICK:
> +		user_set_vring_kick(ctx, &msg);
> +		break;
> +	case VHOST_USER_SET_VRING_CALL:
> +		user_set_vring_call(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_ERR:
> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> +		break;
> +
> +	default:
> +		break;
> +
> +	}
> +}
> +
> +
> +/**
> + * Creates and initialise the vhost server.
> + */
> +int
> +rte_vhost_driver_register(const char *path)
> +{
> +
> +	struct vhost_server *vserver;
> +
> +	if (g_vhost_server != NULL)
> +		return -1;
> +
> +	vserver = calloc(sizeof(struct vhost_server), 1);
> +	if (vserver == NULL)
> +		return -1;
> +
> +	fdset_init(&vserver->fdset);
> +
> +	unlink(path);
> +
> +	vserver->listenfd = uds_socket(path);
> +	if (vserver->listenfd < 0) {
> +		free(vserver);
> +		return -1;
> +	}
> +	vserver->path = path;
> +
> +	fdset_add(&vserver->fdset, vserver->listenfd,
> +		vserver_new_vq_conn, NULL,
> +		(uint64_t)(uintptr_t)vserver);
> +
> +	ops = get_virtio_net_callbacks();
> +
> +	g_vhost_server = vserver;
> +
> +	return 0;
> +}
> +
> +
> +int
> +rte_vhost_driver_session_start(void)
> +{
> +	fdset_event_dispatch(&g_vhost_server->fdset);
> +	return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> new file mode 100644
> index 0000000..c138844
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -0,0 +1,108 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VHOST_NET_USER_H
> +#define _VHOST_NET_USER_H
> +
> +#include <stdint.h>
> +#include <linux/vhost.h>
> +
> +#include "fd_man.h"
> +
> +struct vhost_server {
> +	const char *path; /**< The path the uds is bind to. */
> +	int listenfd;     /**< The listener sockfd. */
> +	struct fdset fdset; /**< The fd list this vhost server manages. */
> +};
> +
> +/* refer to hw/virtio/vhost-user.c */
> +
> +#define VHOST_MEMORY_MAX_NREGIONS    8
> +
> +typedef enum VhostUserRequest {
> +	VHOST_USER_NONE = 0,
> +	VHOST_USER_GET_FEATURES = 1,
> +	VHOST_USER_SET_FEATURES = 2,
> +	VHOST_USER_SET_OWNER = 3,
> +	VHOST_USER_RESET_OWNER = 4,
> +	VHOST_USER_SET_MEM_TABLE = 5,
> +	VHOST_USER_SET_LOG_BASE = 6,
> +	VHOST_USER_SET_LOG_FD = 7,
> +	VHOST_USER_SET_VRING_NUM = 8,
> +	VHOST_USER_SET_VRING_ADDR = 9,
> +	VHOST_USER_SET_VRING_BASE = 10,
> +	VHOST_USER_GET_VRING_BASE = 11,
> +	VHOST_USER_SET_VRING_KICK = 12,
> +	VHOST_USER_SET_VRING_CALL = 13,
> +	VHOST_USER_SET_VRING_ERR = 14,
> +	VHOST_USER_MAX
> +} VhostUserRequest;
> +
> +typedef struct VhostUserMemoryRegion {
> +	uint64_t guest_phys_addr;
> +	uint64_t memory_size;
> +	uint64_t userspace_addr;
> +	uint64_t mmap_offset;
> +} VhostUserMemoryRegion;
> +
> +typedef struct VhostUserMemory {
> +	uint32_t nregions;
> +	uint32_t padding;
> +	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> +} VhostUserMemory;
> +
> +typedef struct VhostUserMsg {
> +	VhostUserRequest request;
> +
> +#define VHOST_USER_VERSION_MASK     (0x3)
> +#define VHOST_USER_REPLY_MASK       (0x1 << 2)
> +	uint32_t flags;
> +	uint32_t size; /* the following payload size */
> +	union {
> +#define VHOST_USER_VRING_IDX_MASK   (0xff)
> +#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
> +		uint64_t u64;
> +		struct vhost_vring_state state;
> +		struct vhost_vring_addr addr;
> +	VhostUserMemory memory;
> +	} payload;
> +	int fds[VHOST_MEMORY_MAX_NREGIONS];
> +} __attribute((packed)) VhostUserMsg;
> +
> +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION    (0x1)
> +
> +/*****************************************************************************/
> +#endif
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> new file mode 100644
> index 0000000..ad59fcc
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -0,0 +1,199 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +
> +#include <rte_log.h>
> +
> +#include "virtio-net.h"
> +#include "virtio-net-user.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +
> +int
> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	unsigned int idx;
> +	struct VhostUserMemory memory = pmsg->payload.memory;
> +	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> +	uint64_t mapped_address, base_address = 0;
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		if (memory.regions[idx].guest_phys_addr == 0)
> +			base_address = memory.regions[idx].userspace_addr;
> +	}
> +	if (base_address == 0) {

when will base_address is 0? how to test to run this code?

> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"couldn't find the mem region whose GPA is 0.\n");
> +		return -1;
> +	}
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		regions[idx].guest_phys_address =
> +			memory.regions[idx].guest_phys_addr;
> +		regions[idx].guest_phys_address_end =
> +			memory.regions[idx].guest_phys_addr +
> +			memory.regions[idx].memory_size;
> +		regions[idx].memory_size = memory.regions[idx].memory_size;
> +		regions[idx].userspace_address =
> +			memory.regions[idx].userspace_addr;
> +
> +		/* This is ugly */
> +		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> +			regions[idx].memory_size +
> +				memory.regions[idx].mmap_offset,
> +			PROT_READ | PROT_WRITE, MAP_SHARED,
> +			pmsg->fds[idx],
> +			0);

Can you mmap the region if gpa is 0? When i run VM with two numa node (qemu will create two hugepage file) found that always failed to mmap with the region which gpa is 0.

BTW can we ensure the memory regions cover with all the memory of hugepage for VM?

> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"mapped region %d to %p\n",
> +			idx, (void *)mapped_address);
> +
> +		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"mmap qemu guest failed.\n");
> +			return -1;
> +		}
> +
> +		mapped_address +=  memory.regions[idx].mmap_offset;
> +
> +		regions[idx].address_offset = mapped_address -
> +			regions[idx].guest_phys_address;
> +		LOG_DEBUG(VHOST_CONFIG,
> +			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
> +			idx,
> +			(void *)(uintptr_t)regions[idx].guest_phys_address,
> +			(void *)(uintptr_t)regions[idx].userspace_address,
> +			 regions[idx].memory_size);
> +	}
> +	ops->set_mem_table(ctx, regions, memory.nregions);
> +	return 0;
> +}
> +
> +
> +static int
> +virtio_is_ready(struct virtio_net *dev)
> +{
> +	struct vhost_virtqueue *rvq, *tvq;
> +
> +	/* mq support in future.*/
> +	rvq = dev->virtqueue[VIRTIO_RXQ];
> +	tvq = dev->virtqueue[VIRTIO_TXQ];
> +	if (rvq && tvq && rvq->desc && tvq->desc &&
> +		(rvq->kickfd != (eventfd_t)-1) &&
> +		(rvq->callfd != (eventfd_t)-1) &&
> +		(tvq->kickfd != (eventfd_t)-1) &&
> +		(tvq->callfd != (eventfd_t)-1)) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"virtio is now ready for processing.\n");
> +		return 1;
> +	}
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"virtio isn't ready for processing.\n");
> +	return 0;
> +}
> +
> +void
> +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring call idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_call(ctx, &file);
> +}
> +
> +
> +/*
> + *  In vhost-user, when we receive kick message, will test whether virtio
> + *  device is ready for packet processing.
> + */
> +void
> +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring kick idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_kick(ctx, &file);
> +
> +	if (virtio_is_ready(dev) &&
> +		!(dev->flags & VIRTIO_DEV_RUNNING))
> +			notify_ops->new_device(dev);
> +
> +}
> +
> +/*
> + * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
> + */
> +int
> +user_get_vring_base(struct vhost_device_ctx ctx,
> +	struct vhost_vring_state *state)
> +{
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	/* We have to stop the queue (virtio) if it is running. */
> +	if (dev->flags & VIRTIO_DEV_RUNNING)
> +		notify_ops->destroy_device(dev);
> +
> +	/* Here we are safe to get the last used index */
> +	ops->get_vring_base(ctx, state->index, state);
> +
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring base idx:%d file:%d\n", state->index, state->num);
> +	/*
> +	 * Based on current qemu vhost-user implementation, this message is
> +	 * sent and only sent in vhost_vring_stop.
> +	 * TODO: cleanup the vring, it isn't usable since here.
> +	 */
> +	if (dev->virtqueue[VIRTIO_RXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_RXQ]->callfd);
> +		dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
> +	}
> +	if (dev->virtqueue[VIRTIO_TXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_TXQ]->callfd);
> +		dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
> +	}
> +
> +	return 0;
> +
> +}
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> new file mode 100644
> index 0000000..0f6a75a
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -0,0 +1,48 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_USER_H
> +#define _VIRTIO_NET_USER_H
> +
> +#include "vhost-net.h"
> +#include "vhost-net-user.h"
> +
> +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
> +
> +#endif
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index f81e459..0b49f1b 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -46,6 +46,7 @@
>  #include <rte_virtio_net.h>
>  
>  #include "vhost-net.h"
> +#include "virtio-net.h"
>  
>  /*
>   * Device linked list structure for configuration.
> @@ -56,7 +57,7 @@ struct virtio_net_config_ll {
>  };
>  
>  /* device ops to add/remove device to/from data core. */
> -static struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net_device_ops const *notify_ops;
>  /* root address of the linked list of managed virtio devices */
>  static struct virtio_net_config_ll *ll_root;
>  
> @@ -83,8 +84,9 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
>  		if ((qemu_va >= region->userspace_address) &&
>  			(qemu_va <= region->userspace_address +
>  			region->memory_size)) {
> -			vhost_va = dev->mem->mapped_address + qemu_va -
> -					dev->mem->base_address;
> +			vhost_va = qemu_va + region->guest_phys_address +
> +				region->address_offset -
> +				region->userspace_address;
>  			break;
>  		}
>  	}
> @@ -114,7 +116,7 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
>   * Searches the configuration core linked list and
>   * retrieves the device if it exists.
>   */
> -static struct virtio_net *
> +struct virtio_net *
>  get_device(struct vhost_device_ctx ctx)
>  {
>  	struct virtio_net_config_ll *ll_dev;
> @@ -450,12 +452,6 @@ set_mem_table(struct vhost_device_ctx ctx,
>  	if (dev == NULL)
>  		return -1;
>  
> -	if (dev->mem) {
> -		munmap((void *)(uintptr_t)dev->mem->mapped_address,
> -			(size_t)dev->mem->mapped_size);
> -		free(dev->mem);
> -	}
> -
>  	/* Malloc the memory structure depending on the number of regions. */
>  	mem = calloc(1, sizeof(struct virtio_memory) +
>  		(sizeof(struct virtio_memory_regions) * nregions));
> diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_vhost/virtio-net.h
> new file mode 100644
> index 0000000..da4ade0
> --- /dev/null
> +++ b/lib/librte_vhost/virtio-net.h
> @@ -0,0 +1,43 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_H
> +#define _VIRTIO_NET_H
> +
> +#include "vhost-net.h"
> +#include "rte_virtio_net.h"
> +
> +struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net * get_device(struct vhost_device_ctx ctx);
> +
> +#endif
>
  
Huawei Xie Dec. 11, 2014, 5:13 p.m. UTC | #3
> 
> Only support one vhost-user port ?

Do you mean vhost server by "port"?
If that is the case, yes, now only one vhost server is supported for multiple virtio devices.
As stated in the cover letter, we have requirement and plan for multiple server support,
though I am not sure if it is absolutely necessary.

> 
> Can you mmap the region if gpa is 0? When i run VM with two numa node (qemu
> will create two hugepage file) found that always failed to mmap with the region
> which gpa is 0.

Current implementation doesn't assume there is only one huge page file to back the guest memory.
It maps every region using the fd of that region. 
Could you please paste your guest VM command line here?

> 
> BTW can we ensure the memory regions cover with all the memory of hugepage
> for VM?

I think so, because virtio devices could use any normal guest memory, but we needn't ensure that.
We only need to map the region passed to us from qemu vhost, which should be enough to translate
the GPA in vring from virtio in guest, otherwise it is the bug of qemu vhost.
  
Huawei Xie Dec. 11, 2014, 8:16 p.m. UTC | #4
> -----Original Message-----
> From: Xie, Huawei
> Sent: Thursday, December 11, 2014 10:13 AM
> To: 'Linhaifeng'; dev@dpdk.org
> Cc: haifeng.lin@intel.com
> Subject: RE: [dpdk-dev] [PATCH RFC v2 08/12] lib/librte_vhost: vhost-user
> support
> 
> >
> > Only support one vhost-user port ?
> 
> Do you mean vhost server by "port"?
> If that is the case, yes, now only one vhost server is supported for multiple virtio
> devices.
> As stated in the cover letter, we have requirement and plan for multiple server
> support,
> though I am not sure if it is absolutely necessary.
> 
> >
> > Can you mmap the region if gpa is 0? When i run VM with two numa node
> (qemu
> > will create two hugepage file) found that always failed to mmap with the
> region
> > which gpa is 0.
> 
> Current implementation doesn't assume there is only one huge page file to back
> the guest memory.
> It maps every region using the fd of that region.
> Could you please paste your guest VM command line here?
> 
> >
> > BTW can we ensure the memory regions cover with all the memory of
> hugepage
> > for VM?
> 
> I think so, because virtio devices could use any normal guest memory, but we
> needn't ensure that.
> We only need to map the region passed to us from qemu vhost, which should be
> enough to translate
> the GPA in vring from virtio in guest, otherwise it is the bug of qemu vhost.

I see your post to qemu mailing list. Would you mind I paste here?
Qemu use two 1GB huge pages files to map the guest 2GB memory, and indeed we get "2GB" memory region.

The problem is the all the 2GB memory maps to the first huge page file node0.MvcPyi.
Seems a bug.


qemu command:
-m 2048 -smp 2,sockets=2,cores=1,threads=1
-object memory-backend-file,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu,share=on,size=1024M,id=ram-node0 -numa node,nodeid=0,cpus=0,memdev=ram-node0
-object memory-backend-file,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu,share=on,size=1024M,id=ram-node1 -numa node,nodeid=1,cpus=1,memdev=ram-node1


memory regions:
        gpa = 0xC0000
        size = 2146697216
        ua = 0x2aaaaacc0000
        offset = 786432

        gpa = 0x0
        size = 655360
        ua = 0x2aaaaac00000
        offset = 0

hugepage:
cat /proc/pidof qemu/maps
2aaaaac00000-2aaaeac00000 rw-s 00000000 00:18 10357788                   /dev/hugepages/libvirt/qemu/qemu_back_mem._objects_ram-node0.MvcPyi (deleted)
2aaaeac00000-2aab2ac00000 rw-s 00000000 00:18 10357789                   /dev/hugepages/libvirt/qemu/qemu_back_mem._objects_ram-node1.tjAVin (deleted)

The memory size of each region is not match to the size of each hugepage file,is this ok?How does vhost-user to mmap all the hugepage?
  
Linhaifeng Dec. 12, 2014, 2:25 a.m. UTC | #5
On 2014/12/12 1:13, Xie, Huawei wrote:
>>
>> Only support one vhost-user port ?
> 
> Do you mean vhost server by "port"?
> If that is the case, yes, now only one vhost server is supported for multiple virtio devices.
> As stated in the cover letter, we have requirement and plan for multiple server support,
> though I am not sure if it is absolutely necessary.
> 
I think if you also want to suport client mode (qemu as server) is necessary to implement multiple server.

>>
>> Can you mmap the region if gpa is 0? When i run VM with two numa node (qemu
>> will create two hugepage file) found that always failed to mmap with the region
>> which gpa is 0.
> 
> Current implementation doesn't assume there is only one huge page file to back the guest memory.
> It maps every region using the fd of that region. 
> Could you please paste your guest VM command line here?
> 
>>
>> BTW can we ensure the memory regions cover with all the memory of hugepage
>> for VM?
> 
> I think so, because virtio devices could use any normal guest memory, but we needn't ensure that.
> We only need to map the region passed to us from qemu vhost, which should be enough to translate
> the GPA in vring from virtio in guest, otherwise it is the bug of qemu vhost.
> 
>
  
Tetsuya Mukawa Dec. 16, 2014, 3:05 a.m. UTC | #6
(2014/12/11 6:37), Huawei Xie wrote:
> vhost-user support
>
>
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  lib/librte_vhost/Makefile                     |   5 +-
>  lib/librte_vhost/vhost-net.h                  |   4 +
>  lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |   9 +
>  lib/librte_vhost/vhost_user/vhost-net-user.c  | 422 ++++++++++++++++++++++++++
>  lib/librte_vhost/vhost_user/vhost-net-user.h  | 108 +++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 199 ++++++++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.h |  48 +++
>  lib/librte_vhost/virtio-net.c                 |  16 +-
>  lib/librte_vhost/virtio-net.h                 |  43 +++
>  9 files changed, 842 insertions(+), 12 deletions(-)
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
>  create mode 100644 lib/librte_vhost/virtio-net.h
>
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index e0d0ef6..b2f14a0 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -34,10 +34,11 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  # library name
>  LIB = librte_vhost.a
>  
> -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -I vhost_user -O3 -D_FILE_OFFSET_BITS=64 -lfuse
>  LDFLAGS += -lfuse
>  # all source are stored in SRCS-y
> -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c virtio-net.c vhost_rxtx.c
>  
>  # install includes
>  SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index f7e96fd..f9ec40b 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -41,8 +41,12 @@
>  
>  #include <rte_log.h>
>  
> +#include "rte_virtio_net.h"
> +
>  #define VHOST_MEMORY_MAX_NREGIONS 8
>  
> +extern struct vhost_net_device_ops const *ops;
> +
>  /* Macros for printing using RTE_LOG */
>  #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
>  #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
> diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> index edcbc10..8ac3360 100644
> --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> +++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> @@ -268,6 +268,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  	struct vhost_memory_region *mem_regions = (void *)(uintptr_t)
>  		((uint64_t)(uintptr_t)mem_regions_addr + size);
>  	uint64_t base_address = 0, mapped_address, mapped_size;
> +	struct virtio_dev *dev;
>  
>  	for (idx = 0; idx < nregions; idx++) {
>  		regions[idx].guest_phys_address =
> @@ -335,6 +336,14 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  			regions[idx].guest_phys_address;
>  	}
>  
> +	dev = get_device(ctx);
> +	if (dev && dev->mem && dev->mmaped_address) {
> +		munmap((void *)(uintptr_t)dev->mmaped_address,
> +			(size_t)dev->mmaped_size);
> +		free(dev->mem);
> +		dev->mem = NULL;
> +	}
> +
>  	ops->set_mem_table(ctx, &regions[0], valid_regions);
>  	return 0;
>  }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> new file mode 100644
> index 0000000..841d7e6
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -0,0 +1,422 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <limits.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +
> +#include <rte_log.h>
> +#include <rte_virtio_net.h>
> +
> +#include "fd_man.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +#include "virtio-net-user.h"
> +
> +static void vserver_new_vq_conn(int fd, uint64_t data);
> +static void vserver_message_handler(int fd, uint64_t dat);
> +struct vhost_net_device_ops const *ops;
> +
> +static struct vhost_server *g_vhost_server;
> +
> +static const char *vhost_message_str[VHOST_USER_MAX] = {
> +	[VHOST_USER_NONE] = "VHOST_USER_NONE",
> +	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
> +	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
> +	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
> +	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
> +	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
> +	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
> +	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
> +	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
> +	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
> +	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
> +	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
> +	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
> +	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
> +	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR"
> +};
> +
> +/**
> + * Create a unix domain socket, bind to path and listen for connection.
> + * @return
> + *  socket fd or -1 on failure
> + */
> +static int
> +uds_socket(const char *path)
> +{
> +	struct sockaddr_un un;
> +	int sockfd;
> +	int ret;
> +
> +	if (path == NULL)
> +		return -1;
> +
> +	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (sockfd < 0)
> +		return -1;
> +	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
> +	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
> +	if (ret == -1)
> +		goto err;
> +	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
> +
> +	ret = listen(sockfd, 1);
> +	if (ret == -1)
> +		goto err;
> +
> +	return sockfd;
> +
> +err:
> +	close(sockfd);
> +	return -1;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len  = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(sockfd, &msgh, 0);
> +	if (ret <= 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
> +		return ret;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "Truncted msg\n");
> +		return -1;
> +	}
> +
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
> +		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
> +	if (ret <= 0)
> +		return ret;
> +
> +	if (msg && msg->size) {
> +		if (msg->size > sizeof(msg->payload)) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"invalid msg size: %d\n", msg->size);
> +			return -1;
> +		}
> +		ret = read(sockfd, &msg->payload, msg->size);
> +		if (ret <= 0)
> +			return ret;
> +		if (ret != (int)msg->size) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"read control message failed\n");
> +			return -1;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +
> +	if (fds && fd_num > 0) {
> +		msgh.msg_control = control;
> +		msgh.msg_controllen = sizeof(control);
> +		cmsg = CMSG_FIRSTHDR(&msgh);
> +		cmsg->cmsg_len = CMSG_LEN(fdsize);
> +		cmsg->cmsg_level = SOL_SOCKET;
> +		cmsg->cmsg_type = SCM_RIGHTS;
> +		memcpy(CMSG_DATA(cmsg), fds, fdsize);
> +	} else {
> +		msgh.msg_control = NULL;
> +		msgh.msg_controllen = 0;
> +	}
> +
> +	do {
> +		ret = sendmsg(sockfd, &msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	if (!msg)
> +		return 0;
> +
> +	msg->flags &= ~VHOST_USER_VERSION_MASK;
> +	msg->flags |= VHOST_USER_VERSION;
> +	sg->flags |= VHOST_USER_REPLY_MASK;
> +
> +	ret = send_fd_message(sockfd, (char *)msg,
> +		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
> +
> +	return ret;
> +}
> +
> +/* call back when there is new virtio connection.  */
> +static void
> +vserver_new_vq_conn(int fd, uint64_t dat)
> +{
> +	struct vhost_server *vserver = (void *)(uintptr_t)dat;
> +	int conn_fd;
> +	uint32_t fh;
> +	struct vhost_device_ctx vdev_ctx = { 0 };
> +
> +	conn_fd = accept(fd, NULL, NULL);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"new virtio connection is %d\n", conn_fd);
> +	if (conn_fd < 0)
> +		return;
> +
> +	fh = ops->new_device(vdev_ctx);
> +	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
> +
> +	fdset_add(&vserver->fdset,
> +		conn_fd, vserver_message_handler, NULL, fh);
> +}
> +
> +/* callback when there is message on the connfd */
> +static void
> +vserver_message_handler(int connfd, uint64_t dat)
> +{
> +	struct vhost_device_ctx ctx;
> +	uint32_t fh = (uint32_t)dat;
> +	struct VhostUserMsg msg;
> +	uint64_t features;
> +	int ret;
> +
> +	ctx.fh = fh;
> +	ret = read_vhost_message(connfd, &msg);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read message failed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	} else if (ret == 0) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"vhost peer closed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	}
> +	if (msg.request > VHOST_USER_MAX) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read incorrect message\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +
> +		return;
> +	}
> +
> +	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
> +		vhost_message_str[msg.request]);
> +	switch (msg.request) {
> +	case VHOST_USER_GET_FEATURES:
> +		ret = ops->get_features(ctx, &features);
> +		msg.payload.u64 = ret;
> +		msg.size = sizeof(msg.payload.u64);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +	case VHOST_USER_SET_FEATURES:
> +		ops->set_features(ctx, &features);
> +		break;
> +
> +	case VHOST_USER_SET_OWNER:
> +		ops->set_owner(ctx);
> +		break;
> +	case VHOST_USER_RESET_OWNER:
> +		ops->reset_owner(ctx);
> +		break;
> +
> +	case VHOST_USER_SET_MEM_TABLE:
> +		user_set_mem_table(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_LOG_BASE:
> +	case VHOST_USER_SET_LOG_FD:
> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> +		break;
> +
> +	case VHOST_USER_SET_VRING_NUM:
> +		ops->set_vring_num(ctx, &msg.payload.state);
> +		break;
> +	case VHOST_USER_SET_VRING_ADDR:
> +		ops->set_vring_addr(ctx, &msg.payload.addr);
> +		break;
> +	case VHOST_USER_SET_VRING_BASE:
> +		ops->set_vring_base(ctx, &msg.payload.state);
> +		break;
> +
> +	case VHOST_USER_GET_VRING_BASE:
> +		ret = user_get_vring_base(ctx, &msg.payload.state);
> +		msg.size = sizeof(msg.payload.state);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_KICK:
> +		user_set_vring_kick(ctx, &msg);
> +		break;
> +	case VHOST_USER_SET_VRING_CALL:
> +		user_set_vring_call(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_ERR:
> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> +		break;
> +
> +	default:
> +		break;
> +
> +	}
> +}
> +
> +
> +/**
> + * Creates and initialise the vhost server.
> + */
> +int
> +rte_vhost_driver_register(const char *path)
> +{
> +
> +	struct vhost_server *vserver;
> +
> +	if (g_vhost_server != NULL)
> +		return -1;
> +
> +	vserver = calloc(sizeof(struct vhost_server), 1);
> +	if (vserver == NULL)
> +		return -1;
> +
> +	fdset_init(&vserver->fdset);
> +
> +	unlink(path);
> +
> +	vserver->listenfd = uds_socket(path);
> +	if (vserver->listenfd < 0) {
> +		free(vserver);
> +		return -1;
> +	}
> +	vserver->path = path;
> +
> +	fdset_add(&vserver->fdset, vserver->listenfd,
> +		vserver_new_vq_conn, NULL,
> +		(uint64_t)(uintptr_t)vserver);
> +
> +	ops = get_virtio_net_callbacks();
> +
> +	g_vhost_server = vserver;
> +
> +	return 0;
> +}
> +
> +
> +int
> +rte_vhost_driver_session_start(void)
> +{
> +	fdset_event_dispatch(&g_vhost_server->fdset);
> +	return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> new file mode 100644
> index 0000000..c138844
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -0,0 +1,108 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VHOST_NET_USER_H
> +#define _VHOST_NET_USER_H
> +
> +#include <stdint.h>
> +#include <linux/vhost.h>
> +
> +#include "fd_man.h"
> +
> +struct vhost_server {
> +	const char *path; /**< The path the uds is bind to. */
> +	int listenfd;     /**< The listener sockfd. */
> +	struct fdset fdset; /**< The fd list this vhost server manages. */
> +};
> +
> +/* refer to hw/virtio/vhost-user.c */
> +
> +#define VHOST_MEMORY_MAX_NREGIONS    8
> +
> +typedef enum VhostUserRequest {
> +	VHOST_USER_NONE = 0,
> +	VHOST_USER_GET_FEATURES = 1,
> +	VHOST_USER_SET_FEATURES = 2,
> +	VHOST_USER_SET_OWNER = 3,
> +	VHOST_USER_RESET_OWNER = 4,
> +	VHOST_USER_SET_MEM_TABLE = 5,
> +	VHOST_USER_SET_LOG_BASE = 6,
> +	VHOST_USER_SET_LOG_FD = 7,
> +	VHOST_USER_SET_VRING_NUM = 8,
> +	VHOST_USER_SET_VRING_ADDR = 9,
> +	VHOST_USER_SET_VRING_BASE = 10,
> +	VHOST_USER_GET_VRING_BASE = 11,
> +	VHOST_USER_SET_VRING_KICK = 12,
> +	VHOST_USER_SET_VRING_CALL = 13,
> +	VHOST_USER_SET_VRING_ERR = 14,
> +	VHOST_USER_MAX
> +} VhostUserRequest;
> +
> +typedef struct VhostUserMemoryRegion {
> +	uint64_t guest_phys_addr;
> +	uint64_t memory_size;
> +	uint64_t userspace_addr;
> +	uint64_t mmap_offset;
> +} VhostUserMemoryRegion;
> +
> +typedef struct VhostUserMemory {
> +	uint32_t nregions;
> +	uint32_t padding;
> +	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> +} VhostUserMemory;
> +
> +typedef struct VhostUserMsg {
> +	VhostUserRequest request;
> +
> +#define VHOST_USER_VERSION_MASK     (0x3)
> +#define VHOST_USER_REPLY_MASK       (0x1 << 2)
> +	uint32_t flags;
> +	uint32_t size; /* the following payload size */
> +	union {
> +#define VHOST_USER_VRING_IDX_MASK   (0xff)
> +#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
> +		uint64_t u64;
> +		struct vhost_vring_state state;
> +		struct vhost_vring_addr addr;
> +	VhostUserMemory memory;
> +	} payload;
> +	int fds[VHOST_MEMORY_MAX_NREGIONS];
> +} __attribute((packed)) VhostUserMsg;
> +
> +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION    (0x1)
> +
> +/*****************************************************************************/
> +#endif
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> new file mode 100644
> index 0000000..ad59fcc
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -0,0 +1,199 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +
> +#include <rte_log.h>
> +
> +#include "virtio-net.h"
> +#include "virtio-net-user.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +
> +int
> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	unsigned int idx;
> +	struct VhostUserMemory memory = pmsg->payload.memory;
> +	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> +	uint64_t mapped_address, base_address = 0;
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		if (memory.regions[idx].guest_phys_addr == 0)
> +			base_address = memory.regions[idx].userspace_addr;
> +	}
> +	if (base_address == 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"couldn't find the mem region whose GPA is 0.\n");
> +		return -1;
> +	}
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		regions[idx].guest_phys_address =
> +			memory.regions[idx].guest_phys_addr;
> +		regions[idx].guest_phys_address_end =
> +			memory.regions[idx].guest_phys_addr +
> +			memory.regions[idx].memory_size;
> +		regions[idx].memory_size = memory.regions[idx].memory_size;
> +		regions[idx].userspace_address =
> +			memory.regions[idx].userspace_addr;
> +
> +		/* This is ugly */
> +		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> +			regions[idx].memory_size +
> +				memory.regions[idx].mmap_offset,
> +			PROT_READ | PROT_WRITE, MAP_SHARED,
> +			pmsg->fds[idx],
> +			0);
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"mapped region %d to %p\n",
> +			idx, (void *)mapped_address);
> +
> +		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"mmap qemu guest failed.\n");
> +			return -1;
> +		}
> +
> +		mapped_address +=  memory.regions[idx].mmap_offset;
> +
> +		regions[idx].address_offset = mapped_address -
> +			regions[idx].guest_phys_address;
> +		LOG_DEBUG(VHOST_CONFIG,
> +			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
> +			idx,
> +			(void *)(uintptr_t)regions[idx].guest_phys_address,
> +			(void *)(uintptr_t)regions[idx].userspace_address,
> +			 regions[idx].memory_size);
> +	}
> +	ops->set_mem_table(ctx, regions, memory.nregions);
> +	return 0;
> +}
> +
> +
> +static int
> +virtio_is_ready(struct virtio_net *dev)
> +{
> +	struct vhost_virtqueue *rvq, *tvq;
> +
> +	/* mq support in future.*/
> +	rvq = dev->virtqueue[VIRTIO_RXQ];
> +	tvq = dev->virtqueue[VIRTIO_TXQ];
> +	if (rvq && tvq && rvq->desc && tvq->desc &&
> +		(rvq->kickfd != (eventfd_t)-1) &&
> +		(rvq->callfd != (eventfd_t)-1) &&
> +		(tvq->kickfd != (eventfd_t)-1) &&
> +		(tvq->callfd != (eventfd_t)-1)) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"virtio is now ready for processing.\n");
> +		return 1;
> +	}
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"virtio isn't ready for processing.\n");
> +	return 0;
> +}
> +
> +void
> +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring call idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_call(ctx, &file);
> +}
> +
> +
> +/*
> + *  In vhost-user, when we receive kick message, will test whether virtio
> + *  device is ready for packet processing.
> + */
> +void
> +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring kick idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_kick(ctx, &file);
> +
> +	if (virtio_is_ready(dev) &&
> +		!(dev->flags & VIRTIO_DEV_RUNNING))
> +			notify_ops->new_device(dev);
> +
> +}
> +
> +/*
> + * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
> + */
> +int
> +user_get_vring_base(struct vhost_device_ctx ctx,
> +	struct vhost_vring_state *state)
> +{
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	/* We have to stop the queue (virtio) if it is running. */
> +	if (dev->flags & VIRTIO_DEV_RUNNING)
> +		notify_ops->destroy_device(dev);

I have an one concern about finalization of vrings.
Can vhost-backend stop accessing RX/TX to the vring before replying to
this message?

QEMU sends this message when virtio-net device is finalized by
virtio-net driver on the guest.
After finalization, memories used by the vring will be freed by
virtio-net driver, because these memories are allocated by virtio-net
driver.
Because of this, I guess vhost-backend must stop accessing to vring
before replying to this message.

I am not sure what is a good way to stop accessing.
One idea is adding a condition checking when rte_vhost_dequeue_burst()
and rte_vhost_enqueue_burst() is called.
Anyway we probably need to wait for stopping access before replying.

Thanks,
Tetsuya

> +
> +	/* Here we are safe to get the last used index */
> +	ops->get_vring_base(ctx, state->index, state);
> +
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring base idx:%d file:%d\n", state->index, state->num);
> +	/*
> +	 * Based on current qemu vhost-user implementation, this message is
> +	 * sent and only sent in vhost_vring_stop.
> +	 * TODO: cleanup the vring, it isn't usable since here.
> +	 */
> +	if (dev->virtqueue[VIRTIO_RXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_RXQ]->callfd);
> +		dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
> +	}
> +	if (dev->virtqueue[VIRTIO_TXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_TXQ]->callfd);
> +		dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
> +	}
> +
> +	return 0;
> +
> +}
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> new file mode 100644
> index 0000000..0f6a75a
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -0,0 +1,48 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_USER_H
> +#define _VIRTIO_NET_USER_H
> +
> +#include "vhost-net.h"
> +#include "vhost-net-user.h"
> +
> +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
> +
> +#endif
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index f81e459..0b49f1b 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -46,6 +46,7 @@
>  #include <rte_virtio_net.h>
>  
>  #include "vhost-net.h"
> +#include "virtio-net.h"
>  
>  /*
>   * Device linked list structure for configuration.
> @@ -56,7 +57,7 @@ struct virtio_net_config_ll {
>  };
>  
>  /* device ops to add/remove device to/from data core. */
> -static struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net_device_ops const *notify_ops;
>  /* root address of the linked list of managed virtio devices */
>  static struct virtio_net_config_ll *ll_root;
>  
> @@ -83,8 +84,9 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
>  		if ((qemu_va >= region->userspace_address) &&
>  			(qemu_va <= region->userspace_address +
>  			region->memory_size)) {
> -			vhost_va = dev->mem->mapped_address + qemu_va -
> -					dev->mem->base_address;
> +			vhost_va = qemu_va + region->guest_phys_address +
> +				region->address_offset -
> +				region->userspace_address;
>  			break;
>  		}
>  	}
> @@ -114,7 +116,7 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
>   * Searches the configuration core linked list and
>   * retrieves the device if it exists.
>   */
> -static struct virtio_net *
> +struct virtio_net *
>  get_device(struct vhost_device_ctx ctx)
>  {
>  	struct virtio_net_config_ll *ll_dev;
> @@ -450,12 +452,6 @@ set_mem_table(struct vhost_device_ctx ctx,
>  	if (dev == NULL)
>  		return -1;
>  
> -	if (dev->mem) {
> -		munmap((void *)(uintptr_t)dev->mem->mapped_address,
> -			(size_t)dev->mem->mapped_size);
> -		free(dev->mem);
> -	}
> -
>  	/* Malloc the memory structure depending on the number of regions. */
>  	mem = calloc(1, sizeof(struct virtio_memory) +
>  		(sizeof(struct virtio_memory_regions) * nregions));
> diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_vhost/virtio-net.h
> new file mode 100644
> index 0000000..da4ade0
> --- /dev/null
> +++ b/lib/librte_vhost/virtio-net.h
> @@ -0,0 +1,43 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_H
> +#define _VIRTIO_NET_H
> +
> +#include "vhost-net.h"
> +#include "rte_virtio_net.h"
> +
> +struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net * get_device(struct vhost_device_ctx ctx);
> +
> +#endif
  
Huawei Xie Dec. 17, 2014, 1:06 a.m. UTC | #7
> > +{
> > +	struct virtio_net *dev = get_device(ctx);
> > +
> > +	/* We have to stop the queue (virtio) if it is running. */
> > +	if (dev->flags & VIRTIO_DEV_RUNNING)
> > +		notify_ops->destroy_device(dev);
> 
> I have an one concern about finalization of vrings.
> Can vhost-backend stop accessing RX/TX to the vring before replying to
> this message?
> 
> QEMU sends this message when virtio-net device is finalized by
> virtio-net driver on the guest.
> After finalization, memories used by the vring will be freed by
> virtio-net driver, because these memories are allocated by virtio-net
> driver.
> Because of this, I guess vhost-backend must stop accessing to vring
> before replying to this message.
> 
> I am not sure what is a good way to stop accessing.
> One idea is adding a condition checking when rte_vhost_dequeue_burst()
> and rte_vhost_enqueue_burst() is called.
> Anyway we probably need to wait for stopping access before replying.
> 
> Thanks,
> Tetsuya
> 
I think we have discussed the similar question.
It is actually the same issue whether the virtio APP in guest is crashed, or is finalized.
The virtio APP will only write to the STATUS register without waiting/syncing to vhost backend.
After that, not only the guest memory pointed by vring entry but also the vring itself isn't usable any more.
The memory for vring or pointed by vring entry might be used by other APPs.
This will crash guest(rather than the vhost, do you agree?).

If you mean this issue, I think we have no solution but one walk around: keep the huge page files of crashed app, and 
bind virtio to igb_uio and then delete the huge page files.

In our implementation, when vhost sends the message,  we will call the destroy_device provided by the vSwitch to ask the
vSwitch to stop processing the vring, but this willn't solve the issue I mention above, because the virtio APP in guest will n't 
wait us.

Could you explain a bit more? Is it the same issue?


-huawei
  
Tetsuya Mukawa Dec. 17, 2014, 3:31 a.m. UTC | #8
(2014/12/17 10:06), Xie, Huawei wrote:
>>> +{
>>> +	struct virtio_net *dev = get_device(ctx);
>>> +
>>> +	/* We have to stop the queue (virtio) if it is running. */
>>> +	if (dev->flags & VIRTIO_DEV_RUNNING)
>>> +		notify_ops->destroy_device(dev);
>> I have an one concern about finalization of vrings.
>> Can vhost-backend stop accessing RX/TX to the vring before replying to
>> this message?
>>
>> QEMU sends this message when virtio-net device is finalized by
>> virtio-net driver on the guest.
>> After finalization, memories used by the vring will be freed by
>> virtio-net driver, because these memories are allocated by virtio-net
>> driver.
>> Because of this, I guess vhost-backend must stop accessing to vring
>> before replying to this message.
>>
>> I am not sure what is a good way to stop accessing.
>> One idea is adding a condition checking when rte_vhost_dequeue_burst()
>> and rte_vhost_enqueue_burst() is called.
>> Anyway we probably need to wait for stopping access before replying.
>>
>> Thanks,
>> Tetsuya
>>
> I think we have discussed the similar question.

Sorry, probably I might not be able to got your point correctly at the
former email.

> It is actually the same issue whether the virtio APP in guest is crashed, or is finalized.

I guess when the APP is finalized correctly, we can have a solution.
Could you please read comment I wrote later?

> The virtio APP will only write to the STATUS register without waiting/syncing to vhost backend.

Yes, virtio APP only write to the STATUS register. I agree with it.

When the register is written by guest, KVM will catch it, and the
context will be change to QEMU. And QEMU will works like below.
(Also while doing following steps, guest is waiting because the context
is in QEMU)

Could you please see below with latest QEMU code?
1. virtio_ioport_write() [hw/virtio/virtio-pci.c] <= virtio APP will
wait for replying of this function.
2. virtio_set_status() [hw/virtio/virtio.c]
3. virtio_net_set_status() [hw/net/virtio-net.c]
4. virtio_net_vhost_status() [hw/net/virtio-net.c]
5. vhost_net_stop() [hw/net/vhost_net.c]
6. vhost_net_stop_one() [hw/net/vhost_net.c]
7. vhost_dev_stop() [hw/virtio/vhost.c]
8. vhost_virtqueue_stop() [hw/virtio/vhost.c]
9. vhost_user_call() [virtio/vhost-user.c]
10. VHOST_USER_GET_VRING_BASE message is sent to backend. And waiting
for backend reply.

When the vhost-user backend receives GET_VRING_BASE, I guess the guest
APP is stopped.
Also QEMU will wait for vhost-user backend reply because GET_VRING_BASE
is synchronous message.
Because of above, I guess virtio APP can wait for vhost-backend
finalization.

> After that, not only the guest memory pointed by vring entry but also the vring itself isn't usable any more.
> The memory for vring or pointed by vring entry might be used by other APPs.

I agree.

> This will crash guest(rather than the vhost, do you agree?).

I guess we cannot assume how the freed memory is used.
In some cases, a new APP still works, but vhost backend can access
inconsistency vring structure.
In the case vhost backend could receive illegal packets.
For example, avail_idx value might be changed to be 0xFFFF by a new APP.
(I am not sure RX/TX functions can handle such a value correctly)

Anyway, my point is if we can finalize vhost backend correctly, we only
need to take care of crashing case.
(If so, it's very nice :))
So let's make sure whether virtio APP can wait for finalization, or not.
I am thinking how to do it now.


> If you mean this issue, I think we have no solution but one walk around: keep the huge page files of crashed app, and 
> bind virtio to igb_uio and then delete the huge page files.

Yes I agree.
If the virtio APP is crashed, this will be a solution.

Thanks,
Tetsuya

>
> In our implementation, when vhost sends the message,  we will call the destroy_device provided by the vSwitch to ask the
> vSwitch to stop processing the vring, but this willn't solve the issue I mention above, because the virtio APP in guest will n't 
> wait us.
>
> Could you explain a bit more? Is it the same issue?
>
>
> -huawei
>
>
>
  
Tetsuya Mukawa Dec. 24, 2014, 7:21 a.m. UTC | #9
(2014/12/11 6:37), Huawei Xie wrote:
> vhost-user support
>
>
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  lib/librte_vhost/Makefile                     |   5 +-
>  lib/librte_vhost/vhost-net.h                  |   4 +
>  lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |   9 +
>  lib/librte_vhost/vhost_user/vhost-net-user.c  | 422 ++++++++++++++++++++++++++
>  lib/librte_vhost/vhost_user/vhost-net-user.h  | 108 +++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 199 ++++++++++++
>  lib/librte_vhost/vhost_user/virtio-net-user.h |  48 +++
>  lib/librte_vhost/virtio-net.c                 |  16 +-
>  lib/librte_vhost/virtio-net.h                 |  43 +++
>  9 files changed, 842 insertions(+), 12 deletions(-)
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
>  create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
>  create mode 100644 lib/librte_vhost/virtio-net.h
>
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index e0d0ef6..b2f14a0 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -34,10 +34,11 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  # library name
>  LIB = librte_vhost.a
>  
> -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -I vhost_user -O3 -D_FILE_OFFSET_BITS=64 -lfuse
>  LDFLAGS += -lfuse
>  # all source are stored in SRCS-y
> -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c virtio-net.c vhost_rxtx.c
>  
>  # install includes
>  SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index f7e96fd..f9ec40b 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -41,8 +41,12 @@
>  
>  #include <rte_log.h>
>  
> +#include "rte_virtio_net.h"
> +
>  #define VHOST_MEMORY_MAX_NREGIONS 8
>  
> +extern struct vhost_net_device_ops const *ops;
> +
>  /* Macros for printing using RTE_LOG */
>  #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
>  #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
> diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> index edcbc10..8ac3360 100644
> --- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> +++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
> @@ -268,6 +268,7 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  	struct vhost_memory_region *mem_regions = (void *)(uintptr_t)
>  		((uint64_t)(uintptr_t)mem_regions_addr + size);
>  	uint64_t base_address = 0, mapped_address, mapped_size;
> +	struct virtio_dev *dev;
>  
>  	for (idx = 0; idx < nregions; idx++) {
>  		regions[idx].guest_phys_address =
> @@ -335,6 +336,14 @@ cuse_set_mem_table(struct vhost_device_ctx ctx,
>  			regions[idx].guest_phys_address;
>  	}
>  
> +	dev = get_device(ctx);
> +	if (dev && dev->mem && dev->mmaped_address) {
> +		munmap((void *)(uintptr_t)dev->mmaped_address,
> +			(size_t)dev->mmaped_size);
> +		free(dev->mem);
> +		dev->mem = NULL;
> +	}
> +
>  	ops->set_mem_table(ctx, &regions[0], valid_regions);
>  	return 0;
>  }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> new file mode 100644
> index 0000000..841d7e6
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -0,0 +1,422 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <limits.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +
> +#include <rte_log.h>
> +#include <rte_virtio_net.h>
> +
> +#include "fd_man.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +#include "virtio-net-user.h"
> +
> +static void vserver_new_vq_conn(int fd, uint64_t data);
> +static void vserver_message_handler(int fd, uint64_t dat);
> +struct vhost_net_device_ops const *ops;
> +
> +static struct vhost_server *g_vhost_server;
> +
> +static const char *vhost_message_str[VHOST_USER_MAX] = {
> +	[VHOST_USER_NONE] = "VHOST_USER_NONE",
> +	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
> +	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
> +	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
> +	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
> +	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
> +	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
> +	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
> +	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
> +	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
> +	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
> +	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
> +	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
> +	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
> +	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR"
> +};
> +
> +/**
> + * Create a unix domain socket, bind to path and listen for connection.
> + * @return
> + *  socket fd or -1 on failure
> + */
> +static int
> +uds_socket(const char *path)
> +{
> +	struct sockaddr_un un;
> +	int sockfd;
> +	int ret;
> +
> +	if (path == NULL)
> +		return -1;
> +
> +	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (sockfd < 0)
> +		return -1;
> +	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
> +
> +	memset(&un, 0, sizeof(un));
> +	un.sun_family = AF_UNIX;
> +	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
> +	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
> +	if (ret == -1)
> +		goto err;
> +	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
> +
> +	ret = listen(sockfd, 1);
> +	if (ret == -1)
> +		goto err;
> +
> +	return sockfd;
> +
> +err:
> +	close(sockfd);
> +	return -1;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len  = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +	msgh.msg_control = control;
> +	msgh.msg_controllen = sizeof(control);
> +
> +	ret = recvmsg(sockfd, &msgh, 0);
> +	if (ret <= 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
> +		return ret;
> +	}
> +
> +	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "Truncted msg\n");
> +		return -1;
> +	}
> +
> +	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> +		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> +		if ((cmsg->cmsg_level == SOL_SOCKET) &&
> +			(cmsg->cmsg_type == SCM_RIGHTS)) {
> +			memcpy(fds, CMSG_DATA(cmsg), fdsize);
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
> +		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
> +	if (ret <= 0)
> +		return ret;
> +
> +	if (msg && msg->size) {
> +		if (msg->size > sizeof(msg->payload)) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"invalid msg size: %d\n", msg->size);
> +			return -1;
> +		}
> +		ret = read(sockfd, &msg->payload, msg->size);
> +		if (ret <= 0)
> +			return ret;
> +		if (ret != (int)msg->size) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"read control message failed\n");
> +			return -1;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +
> +	struct iovec iov;
> +	struct msghdr msgh = { 0 };
> +	size_t fdsize = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fdsize)];
> +	struct cmsghdr *cmsg;
> +	int ret;
> +
> +	iov.iov_base = buf;
> +	iov.iov_len = buflen;
> +
> +	msgh.msg_iov = &iov;
> +	msgh.msg_iovlen = 1;
> +
> +	if (fds && fd_num > 0) {
> +		msgh.msg_control = control;
> +		msgh.msg_controllen = sizeof(control);
> +		cmsg = CMSG_FIRSTHDR(&msgh);
> +		cmsg->cmsg_len = CMSG_LEN(fdsize);
> +		cmsg->cmsg_level = SOL_SOCKET;
> +		cmsg->cmsg_type = SCM_RIGHTS;
> +		memcpy(CMSG_DATA(cmsg), fds, fdsize);
> +	} else {
> +		msgh.msg_control = NULL;
> +		msgh.msg_controllen = 0;
> +	}
> +
> +	do {
> +		ret = sendmsg(sockfd, &msgh, 0);
> +	} while (ret < 0 && errno == EINTR);
> +
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +send_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> +	int ret;
> +
> +	if (!msg)
> +		return 0;
> +
> +	msg->flags &= ~VHOST_USER_VERSION_MASK;
> +	msg->flags |= VHOST_USER_VERSION;
> +	sg->flags |= VHOST_USER_REPLY_MASK;
> +
> +	ret = send_fd_message(sockfd, (char *)msg,
> +		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
> +
> +	return ret;
> +}
> +
> +/* call back when there is new virtio connection.  */
> +static void
> +vserver_new_vq_conn(int fd, uint64_t dat)
> +{
> +	struct vhost_server *vserver = (void *)(uintptr_t)dat;
> +	int conn_fd;
> +	uint32_t fh;
> +	struct vhost_device_ctx vdev_ctx = { 0 };
> +
> +	conn_fd = accept(fd, NULL, NULL);
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"new virtio connection is %d\n", conn_fd);
> +	if (conn_fd < 0)
> +		return;
> +
> +	fh = ops->new_device(vdev_ctx);
> +	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
> +
> +	fdset_add(&vserver->fdset,
> +		conn_fd, vserver_message_handler, NULL, fh);
> +}
> +
> +/* callback when there is message on the connfd */
> +static void
> +vserver_message_handler(int connfd, uint64_t dat)
> +{
> +	struct vhost_device_ctx ctx;
> +	uint32_t fh = (uint32_t)dat;
> +	struct VhostUserMsg msg;
> +	uint64_t features;
> +	int ret;
> +
> +	ctx.fh = fh;
> +	ret = read_vhost_message(connfd, &msg);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read message failed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	} else if (ret == 0) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"vhost peer closed\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +		ops->destroy_device(ctx);
> +
> +		return;
> +	}
> +	if (msg.request > VHOST_USER_MAX) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"vhost read incorrect message\n");
> +
> +		/*TODO: cleanup */
> +		close(connfd);
> +		fdset_del(&g_vhost_server->fdset, connfd);
> +
> +		return;
> +	}
> +
> +	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
> +		vhost_message_str[msg.request]);
> +	switch (msg.request) {
> +	case VHOST_USER_GET_FEATURES:
> +		ret = ops->get_features(ctx, &features);
> +		msg.payload.u64 = ret;

I guess 'feature' should be filled as payload like below.
msg.payload.u64 = features;

Thanks,
Tetsuya

> +		msg.size = sizeof(msg.payload.u64);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +	case VHOST_USER_SET_FEATURES:
> +		ops->set_features(ctx, &features);
> +		break;
> +
> +	case VHOST_USER_SET_OWNER:
> +		ops->set_owner(ctx);
> +		break;
> +	case VHOST_USER_RESET_OWNER:
> +		ops->reset_owner(ctx);
> +		break;
> +
> +	case VHOST_USER_SET_MEM_TABLE:
> +		user_set_mem_table(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_LOG_BASE:
> +	case VHOST_USER_SET_LOG_FD:
> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> +		break;
> +
> +	case VHOST_USER_SET_VRING_NUM:
> +		ops->set_vring_num(ctx, &msg.payload.state);
> +		break;
> +	case VHOST_USER_SET_VRING_ADDR:
> +		ops->set_vring_addr(ctx, &msg.payload.addr);
> +		break;
> +	case VHOST_USER_SET_VRING_BASE:
> +		ops->set_vring_base(ctx, &msg.payload.state);
> +		break;
> +
> +	case VHOST_USER_GET_VRING_BASE:
> +		ret = user_get_vring_base(ctx, &msg.payload.state);
> +		msg.size = sizeof(msg.payload.state);
> +		send_vhost_message(connfd, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_KICK:
> +		user_set_vring_kick(ctx, &msg);
> +		break;
> +	case VHOST_USER_SET_VRING_CALL:
> +		user_set_vring_call(ctx, &msg);
> +		break;
> +
> +	case VHOST_USER_SET_VRING_ERR:
> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> +		break;
> +
> +	default:
> +		break;
> +
> +	}
> +}
> +
> +
> +/**
> + * Creates and initialise the vhost server.
> + */
> +int
> +rte_vhost_driver_register(const char *path)
> +{
> +
> +	struct vhost_server *vserver;
> +
> +	if (g_vhost_server != NULL)
> +		return -1;
> +
> +	vserver = calloc(sizeof(struct vhost_server), 1);
> +	if (vserver == NULL)
> +		return -1;
> +
> +	fdset_init(&vserver->fdset);
> +
> +	unlink(path);
> +
> +	vserver->listenfd = uds_socket(path);
> +	if (vserver->listenfd < 0) {
> +		free(vserver);
> +		return -1;
> +	}
> +	vserver->path = path;
> +
> +	fdset_add(&vserver->fdset, vserver->listenfd,
> +		vserver_new_vq_conn, NULL,
> +		(uint64_t)(uintptr_t)vserver);
> +
> +	ops = get_virtio_net_callbacks();
> +
> +	g_vhost_server = vserver;
> +
> +	return 0;
> +}
> +
> +
> +int
> +rte_vhost_driver_session_start(void)
> +{
> +	fdset_event_dispatch(&g_vhost_server->fdset);
> +	return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> new file mode 100644
> index 0000000..c138844
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -0,0 +1,108 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VHOST_NET_USER_H
> +#define _VHOST_NET_USER_H
> +
> +#include <stdint.h>
> +#include <linux/vhost.h>
> +
> +#include "fd_man.h"
> +
> +struct vhost_server {
> +	const char *path; /**< The path the uds is bind to. */
> +	int listenfd;     /**< The listener sockfd. */
> +	struct fdset fdset; /**< The fd list this vhost server manages. */
> +};
> +
> +/* refer to hw/virtio/vhost-user.c */
> +
> +#define VHOST_MEMORY_MAX_NREGIONS    8
> +
> +typedef enum VhostUserRequest {
> +	VHOST_USER_NONE = 0,
> +	VHOST_USER_GET_FEATURES = 1,
> +	VHOST_USER_SET_FEATURES = 2,
> +	VHOST_USER_SET_OWNER = 3,
> +	VHOST_USER_RESET_OWNER = 4,
> +	VHOST_USER_SET_MEM_TABLE = 5,
> +	VHOST_USER_SET_LOG_BASE = 6,
> +	VHOST_USER_SET_LOG_FD = 7,
> +	VHOST_USER_SET_VRING_NUM = 8,
> +	VHOST_USER_SET_VRING_ADDR = 9,
> +	VHOST_USER_SET_VRING_BASE = 10,
> +	VHOST_USER_GET_VRING_BASE = 11,
> +	VHOST_USER_SET_VRING_KICK = 12,
> +	VHOST_USER_SET_VRING_CALL = 13,
> +	VHOST_USER_SET_VRING_ERR = 14,
> +	VHOST_USER_MAX
> +} VhostUserRequest;
> +
> +typedef struct VhostUserMemoryRegion {
> +	uint64_t guest_phys_addr;
> +	uint64_t memory_size;
> +	uint64_t userspace_addr;
> +	uint64_t mmap_offset;
> +} VhostUserMemoryRegion;
> +
> +typedef struct VhostUserMemory {
> +	uint32_t nregions;
> +	uint32_t padding;
> +	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> +} VhostUserMemory;
> +
> +typedef struct VhostUserMsg {
> +	VhostUserRequest request;
> +
> +#define VHOST_USER_VERSION_MASK     (0x3)
> +#define VHOST_USER_REPLY_MASK       (0x1 << 2)
> +	uint32_t flags;
> +	uint32_t size; /* the following payload size */
> +	union {
> +#define VHOST_USER_VRING_IDX_MASK   (0xff)
> +#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
> +		uint64_t u64;
> +		struct vhost_vring_state state;
> +		struct vhost_vring_addr addr;
> +	VhostUserMemory memory;
> +	} payload;
> +	int fds[VHOST_MEMORY_MAX_NREGIONS];
> +} __attribute((packed)) VhostUserMsg;
> +
> +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION    (0x1)
> +
> +/*****************************************************************************/
> +#endif
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> new file mode 100644
> index 0000000..ad59fcc
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -0,0 +1,199 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +
> +#include <rte_log.h>
> +
> +#include "virtio-net.h"
> +#include "virtio-net-user.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +
> +int
> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	unsigned int idx;
> +	struct VhostUserMemory memory = pmsg->payload.memory;
> +	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> +	uint64_t mapped_address, base_address = 0;
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		if (memory.regions[idx].guest_phys_addr == 0)
> +			base_address = memory.regions[idx].userspace_addr;
> +	}
> +	if (base_address == 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"couldn't find the mem region whose GPA is 0.\n");
> +		return -1;
> +	}
> +
> +	for (idx = 0; idx < memory.nregions; idx++) {
> +		regions[idx].guest_phys_address =
> +			memory.regions[idx].guest_phys_addr;
> +		regions[idx].guest_phys_address_end =
> +			memory.regions[idx].guest_phys_addr +
> +			memory.regions[idx].memory_size;
> +		regions[idx].memory_size = memory.regions[idx].memory_size;
> +		regions[idx].userspace_address =
> +			memory.regions[idx].userspace_addr;
> +
> +		/* This is ugly */
> +		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> +			regions[idx].memory_size +
> +				memory.regions[idx].mmap_offset,
> +			PROT_READ | PROT_WRITE, MAP_SHARED,
> +			pmsg->fds[idx],
> +			0);
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"mapped region %d to %p\n",
> +			idx, (void *)mapped_address);
> +
> +		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
> +			RTE_LOG(ERR, VHOST_CONFIG,
> +				"mmap qemu guest failed.\n");
> +			return -1;
> +		}
> +
> +		mapped_address +=  memory.regions[idx].mmap_offset;
> +
> +		regions[idx].address_offset = mapped_address -
> +			regions[idx].guest_phys_address;
> +		LOG_DEBUG(VHOST_CONFIG,
> +			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
> +			idx,
> +			(void *)(uintptr_t)regions[idx].guest_phys_address,
> +			(void *)(uintptr_t)regions[idx].userspace_address,
> +			 regions[idx].memory_size);
> +	}
> +	ops->set_mem_table(ctx, regions, memory.nregions);
> +	return 0;
> +}
> +
> +
> +static int
> +virtio_is_ready(struct virtio_net *dev)
> +{
> +	struct vhost_virtqueue *rvq, *tvq;
> +
> +	/* mq support in future.*/
> +	rvq = dev->virtqueue[VIRTIO_RXQ];
> +	tvq = dev->virtqueue[VIRTIO_TXQ];
> +	if (rvq && tvq && rvq->desc && tvq->desc &&
> +		(rvq->kickfd != (eventfd_t)-1) &&
> +		(rvq->callfd != (eventfd_t)-1) &&
> +		(tvq->kickfd != (eventfd_t)-1) &&
> +		(tvq->callfd != (eventfd_t)-1)) {
> +		RTE_LOG(INFO, VHOST_CONFIG,
> +			"virtio is now ready for processing.\n");
> +		return 1;
> +	}
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"virtio isn't ready for processing.\n");
> +	return 0;
> +}
> +
> +void
> +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring call idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_call(ctx, &file);
> +}
> +
> +
> +/*
> + *  In vhost-user, when we receive kick message, will test whether virtio
> + *  device is ready for packet processing.
> + */
> +void
> +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> +	struct vhost_vring_file file;
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> +	file.fd = pmsg->fds[0];
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring kick idx:%d file:%d\n", file.index, file.fd);
> +	ops->set_vring_kick(ctx, &file);
> +
> +	if (virtio_is_ready(dev) &&
> +		!(dev->flags & VIRTIO_DEV_RUNNING))
> +			notify_ops->new_device(dev);
> +
> +}
> +
> +/*
> + * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
> + */
> +int
> +user_get_vring_base(struct vhost_device_ctx ctx,
> +	struct vhost_vring_state *state)
> +{
> +	struct virtio_net *dev = get_device(ctx);
> +
> +	/* We have to stop the queue (virtio) if it is running. */
> +	if (dev->flags & VIRTIO_DEV_RUNNING)
> +		notify_ops->destroy_device(dev);
> +
> +	/* Here we are safe to get the last used index */
> +	ops->get_vring_base(ctx, state->index, state);
> +
> +	RTE_LOG(INFO, VHOST_CONFIG,
> +		"vring base idx:%d file:%d\n", state->index, state->num);
> +	/*
> +	 * Based on current qemu vhost-user implementation, this message is
> +	 * sent and only sent in vhost_vring_stop.
> +	 * TODO: cleanup the vring, it isn't usable since here.
> +	 */
> +	if (dev->virtqueue[VIRTIO_RXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_RXQ]->callfd);
> +		dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
> +	}
> +	if (dev->virtqueue[VIRTIO_TXQ]->callfd) {
> +		close(dev->virtqueue[VIRTIO_TXQ]->callfd);
> +		dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
> +	}
> +
> +	return 0;
> +
> +}
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> new file mode 100644
> index 0000000..0f6a75a
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -0,0 +1,48 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_USER_H
> +#define _VIRTIO_NET_USER_H
> +
> +#include "vhost-net.h"
> +#include "vhost-net-user.h"
> +
> +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
> +
> +#endif
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index f81e459..0b49f1b 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -46,6 +46,7 @@
>  #include <rte_virtio_net.h>
>  
>  #include "vhost-net.h"
> +#include "virtio-net.h"
>  
>  /*
>   * Device linked list structure for configuration.
> @@ -56,7 +57,7 @@ struct virtio_net_config_ll {
>  };
>  
>  /* device ops to add/remove device to/from data core. */
> -static struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net_device_ops const *notify_ops;
>  /* root address of the linked list of managed virtio devices */
>  static struct virtio_net_config_ll *ll_root;
>  
> @@ -83,8 +84,9 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
>  		if ((qemu_va >= region->userspace_address) &&
>  			(qemu_va <= region->userspace_address +
>  			region->memory_size)) {
> -			vhost_va = dev->mem->mapped_address + qemu_va -
> -					dev->mem->base_address;
> +			vhost_va = qemu_va + region->guest_phys_address +
> +				region->address_offset -
> +				region->userspace_address;
>  			break;
>  		}
>  	}
> @@ -114,7 +116,7 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
>   * Searches the configuration core linked list and
>   * retrieves the device if it exists.
>   */
> -static struct virtio_net *
> +struct virtio_net *
>  get_device(struct vhost_device_ctx ctx)
>  {
>  	struct virtio_net_config_ll *ll_dev;
> @@ -450,12 +452,6 @@ set_mem_table(struct vhost_device_ctx ctx,
>  	if (dev == NULL)
>  		return -1;
>  
> -	if (dev->mem) {
> -		munmap((void *)(uintptr_t)dev->mem->mapped_address,
> -			(size_t)dev->mem->mapped_size);
> -		free(dev->mem);
> -	}
> -
>  	/* Malloc the memory structure depending on the number of regions. */
>  	mem = calloc(1, sizeof(struct virtio_memory) +
>  		(sizeof(struct virtio_memory_regions) * nregions));
> diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_vhost/virtio-net.h
> new file mode 100644
> index 0000000..da4ade0
> --- /dev/null
> +++ b/lib/librte_vhost/virtio-net.h
> @@ -0,0 +1,43 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_H
> +#define _VIRTIO_NET_H
> +
> +#include "vhost-net.h"
> +#include "rte_virtio_net.h"
> +
> +struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net * get_device(struct vhost_device_ctx ctx);
> +
> +#endif
  
Huawei Xie Jan. 4, 2015, 9:53 a.m. UTC | #10
> -----Original Message-----
> From: Tetsuya Mukawa [mailto:mukawa@igel.co.jp]
> Sent: Wednesday, December 24, 2014 12:21 AM
> To: Xie, Huawei; dev@dpdk.org
> Cc: haifeng.lin@intel.com
> Subject: Re: [PATCH RFC v2 08/12] lib/librte_vhost: vhost-user support
> 
> (2014/12/11 6:37), Huawei Xie wrote:
> > vhost-user support
> >
> >
> 
> I guess 'feature' should be filled as payload like below.
> msg.payload.u64 = features;
>

Applied. Thanks.
 
> Thanks,
> Tetsuya
>
  
Huawei Xie Jan. 5, 2015, 10:21 a.m. UTC | #11
> -----Original Message-----
> From: Linhaifeng [mailto:haifeng.lin@huawei.com]
> Sent: Wednesday, December 10, 2014 10:36 PM
> To: Xie, Huawei; dev@dpdk.org
> Cc: haifeng.lin@intel.com
> Subject: Re: [dpdk-dev] [PATCH RFC v2 08/12] lib/librte_vhost: vhost-user
> support
> 
> 
> 
> On 2014/12/11 5:37, Huawei Xie wrote:
> > vhost-user support
> >
> >
> 
> should close fd for fd leak when receive VHOST_USER_SET_LOG_FD msg?
> 
> > +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> > +	case VHOST_USER_SET_VRING_ERR:
> 
> should close fd for fd leak?
> 
Thanks for careful review.
> > +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> > +		break;
  
Huawei Xie Jan. 23, 2015, 3:36 a.m. UTC | #12
> -----Original Message-----
> From: Linhaifeng [mailto:haifeng.lin@huawei.com]
> Sent: Thursday, December 11, 2014 2:04 PM
> To: Xie, Huawei; dev@dpdk.org
> Cc: haifeng.lin@intel.com
> Subject: Re: [dpdk-dev] [PATCH RFC v2 08/12] lib/librte_vhost: vhost-user
> support
> 
> 
> 
> On 2014/12/11 5:37, Huawei Xie wrote:
> > vhost-user support
> >
> >
> > Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> > ---
> >  lib/librte_vhost/Makefile                     |   5 +-
> >  lib/librte_vhost/vhost-net.h                  |   4 +
> >  lib/librte_vhost/vhost_cuse/virtio-net-cdev.c |   9 +
> >  lib/librte_vhost/vhost_user/vhost-net-user.c  | 422
> ++++++++++++++++++++++++++
> > +
> > +static struct vhost_server *g_vhost_server;
> 
> Only support one vhost-user port ?

The newer patch supports multiple vhost-user port.
Will send next week.
> 
> > +
> > +static const char *vhost_message_str[VHOST_USER_MAX] = {

> 
> why not use vserver_new_vq_conn(int fd, void* dat) ?
> 
>
Since currently we only pass memory address and 32bit number, will change to void *.
 > > +{
> > +	struct vhost_server *vserver = (void *)(uintptr_t)dat;
> regions[VHOST_MEMORY_MAX_NREGIONS];
> > +	uint64_t mapped_address, base_address = 0;
> > +
> > +	for (idx = 0; idx < memory.nregions; idx++) {
> > +		if (memory.regions[idx].guest_phys_addr == 0)
> > +			base_address = memory.regions[idx].userspace_addr;
> > +	}
> > +	if (base_address == 0) {
> 
> when will base_address is 0? how to test to run this code?

base_address with 0 is the requirement for vhost-cuse implementation.
Normally it is the region [0, 0xA0000).
In this version, we would keep this and not fix this. In future, will remove this.

> 
> > +		RTE_LOG(ERR, VHOST_CONFIG,
> > +			"couldn't find the mem region whose GPA is 0.\n");
> > +		return -1;
> > +	}
> > +
> > +	for (idx = 0; idx < memory.nregions; idx++) {
> > +		regions[idx].guest_phys_address =
> > +			memory.regions[idx].guest_phys_addr;
> > +		regions[idx].guest_phys_address_end =
> > +			memory.regions[idx].guest_phys_addr +
> > +			memory.regions[idx].memory_size;
> > +		regions[idx].memory_size = memory.regions[idx].memory_size;
> > +		regions[idx].userspace_address =
> > +			memory.regions[idx].userspace_addr;
> > +
> > +		/* This is ugly */
> > +		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> > +			regions[idx].memory_size +
> > +				memory.regions[idx].mmap_offset,
> > +			PROT_READ | PROT_WRITE, MAP_SHARED,
> > +			pmsg->fds[idx],
> > +			0);
> 
> Can you mmap the region if gpa is 0? When i run VM with two numa node (qemu
> will create two hugepage file) found that always failed to mmap with the region
> which gpa is 0.
> 
> BTW can we ensure the memory regions cover with all the memory of hugepage
> for VM?
> 
We had discussed this. Seemed a qemu bug.
Do you have update on this?

> > +		RTE_LOG(INFO, VHOST_CONFIG,
> > +			"mapped region %d to %p\n",
> > +struct virtio_net_device_ops const *notify_ops;
> > +struct virtio_net * get_device(struct vhost_device_ctx ctx);
> > +
> > +#endif
> >
> 
> --
> Regards,
> Haifeng

Thanks Haifeng for your comments.
Next time, could you help cut the code not commented?
They are too long. It is really hard for me to find your comment.
  
Huawei Xie Jan. 23, 2015, 3:40 a.m. UTC | #13
> -----Original Message-----
> From: Linhaifeng [mailto:haifeng.lin@huawei.com]
> Sent: Thursday, December 11, 2014 1:36 PM
> To: Xie, Huawei; dev@dpdk.org
> Cc: haifeng.lin@intel.com
> Subject: Re: [dpdk-dev] [PATCH RFC v2 08/12] lib/librte_vhost: vhost-user
> support
> 
> 
> 
> On 2014/12/11 5:37, Huawei Xie wrote:
> > vhost-user support
> >
> >
> > Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> > ---
> >  lib/librte_vhost/Makefile                     |   5 +-
> > +	case VHOST_USER_SET_LOG_FD:
> 
> should close fd for fd leak when receive VHOST_USER_SET_LOG_FD msg?
Ok, would at least close the fd though we don't support.
Do you know how to test this?
> 
> > +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> > +
> > +	case VHOST_USER_SET_VRING_ERR:
> 
> should close fd for fd leak?
Would do.
> 
> > +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> > +		break;
> > +#endif
> >
> 
> --
> Regards,
> Haifeng
  
Linhaifeng Jan. 23, 2015, 3:53 a.m. UTC | #14
On 2015/1/23 11:40, Xie, Huawei wrote:
> 
> 
>> -----Original Message-----
>> From: Linhaifeng [mailto:haifeng.lin@huawei.com]
>> Sent: Thursday, December 11, 2014 1:36 PM
>> To: Xie, Huawei; dev@dpdk.org
>> Cc: haifeng.lin@intel.com
>> Subject: Re: [dpdk-dev] [PATCH RFC v2 08/12] lib/librte_vhost: vhost-user
>> support
>>
>>
>>
>> On 2014/12/11 5:37, Huawei Xie wrote:
>>> vhost-user support
>>>
>>>
>>> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
>>> ---
>>>  lib/librte_vhost/Makefile                     |   5 +-
>>> +	case VHOST_USER_SET_LOG_FD:
>>
>> should close fd for fd leak when receive VHOST_USER_SET_LOG_FD msg?
> Ok, would at least close the fd though we don't support.
> Do you know how to test this?

reboot VM many times then watch the number of fd with command 'ls /proc/pidof vhost/fd |wc -l'

>>
>>> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
>>> +
>>> +	case VHOST_USER_SET_VRING_ERR:
>>
>> should close fd for fd leak?
> Would do.
>>
>>> +		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
>>> +		break;
>>> +#endif
>>>
>>
>> --
>> Regards,
>> Haifeng
> 
> 
>
  
Linhaifeng Jan. 23, 2015, 8:36 a.m. UTC | #15
>>
>> Can you mmap the region if gpa is 0? When i run VM with two numa node (qemu
>> will create two hugepage file) found that always failed to mmap with the region
>> which gpa is 0.
>>
>> BTW can we ensure the memory regions cover with all the memory of hugepage
>> for VM?
>>
> We had discussed this. Seemed a qemu bug.
> Do you have update on this?
> 

Yes,because qemu not send all hugepage file fd to vhost-user.I have send the patch bug no body reply.


> 
> Thanks Haifeng for your comments.
> Next time, could you help cut the code not commented?
> They are too long. It is really hard for me to find your comment.
> 
> 
> .
> 

OK:)
  

Patch

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index e0d0ef6..b2f14a0 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -34,10 +34,11 @@  include $(RTE_SDK)/mk/rte.vars.mk
 # library name
 LIB = librte_vhost.a
 
-CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -I vhost_user -O3 -D_FILE_OFFSET_BITS=64 -lfuse
 LDFLAGS += -lfuse
 # all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
+#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c virtio-net.c vhost_rxtx.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c virtio-net.c vhost_rxtx.c
 
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index f7e96fd..f9ec40b 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -41,8 +41,12 @@ 
 
 #include <rte_log.h>
 
+#include "rte_virtio_net.h"
+
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
+extern struct vhost_net_device_ops const *ops;
+
 /* Macros for printing using RTE_LOG */
 #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
 #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
diff --git a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
index edcbc10..8ac3360 100644
--- a/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
+++ b/lib/librte_vhost/vhost_cuse/virtio-net-cdev.c
@@ -268,6 +268,7 @@  cuse_set_mem_table(struct vhost_device_ctx ctx,
 	struct vhost_memory_region *mem_regions = (void *)(uintptr_t)
 		((uint64_t)(uintptr_t)mem_regions_addr + size);
 	uint64_t base_address = 0, mapped_address, mapped_size;
+	struct virtio_dev *dev;
 
 	for (idx = 0; idx < nregions; idx++) {
 		regions[idx].guest_phys_address =
@@ -335,6 +336,14 @@  cuse_set_mem_table(struct vhost_device_ctx ctx,
 			regions[idx].guest_phys_address;
 	}
 
+	dev = get_device(ctx);
+	if (dev && dev->mem && dev->mmaped_address) {
+		munmap((void *)(uintptr_t)dev->mmaped_address,
+			(size_t)dev->mmaped_size);
+		free(dev->mem);
+		dev->mem = NULL;
+	}
+
 	ops->set_mem_table(ctx, &regions[0], valid_regions);
 	return 0;
 }
diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
new file mode 100644
index 0000000..841d7e6
--- /dev/null
+++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
@@ -0,0 +1,422 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+
+#include <rte_log.h>
+#include <rte_virtio_net.h>
+
+#include "fd_man.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+#include "virtio-net-user.h"
+
+static void vserver_new_vq_conn(int fd, uint64_t data);
+static void vserver_message_handler(int fd, uint64_t dat);
+struct vhost_net_device_ops const *ops;
+
+static struct vhost_server *g_vhost_server;
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+	[VHOST_USER_NONE] = "VHOST_USER_NONE",
+	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR"
+};
+
+/**
+ * Create a unix domain socket, bind to path and listen for connection.
+ * @return
+ *  socket fd or -1 on failure
+ */
+static int
+uds_socket(const char *path)
+{
+	struct sockaddr_un un;
+	int sockfd;
+	int ret;
+
+	if (path == NULL)
+		return -1;
+
+	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sockfd < 0)
+		return -1;
+	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
+	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
+	if (ret == -1)
+		goto err;
+	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+	ret = listen(sockfd, 1);
+	if (ret == -1)
+		goto err;
+
+	return sockfd;
+
+err:
+	close(sockfd);
+	return -1;
+}
+
+/* return bytes# of read on success or negative val on failure. */
+static int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+	struct iovec iov;
+	struct msghdr msgh = { 0 };
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len  = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	ret = recvmsg(sockfd, &msgh, 0);
+	if (ret <= 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+		return ret;
+	}
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, VHOST_CONFIG, "Truncted msg\n");
+		return -1;
+	}
+
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			memcpy(fds, CMSG_DATA(cmsg), fdsize);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/* return bytes# of read on success or negative val on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+	if (ret <= 0)
+		return ret;
+
+	if (msg && msg->size) {
+		if (msg->size > sizeof(msg->payload)) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"invalid msg size: %d\n", msg->size);
+			return -1;
+		}
+		ret = read(sockfd, &msg->payload, msg->size);
+		if (ret <= 0)
+			return ret;
+		if (ret != (int)msg->size) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"read control message failed\n");
+			return -1;
+		}
+	}
+
+	return ret;
+}
+
+static int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+	struct iovec iov;
+	struct msghdr msgh = { 0 };
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+
+	if (fds && fd_num > 0) {
+		msgh.msg_control = control;
+		msgh.msg_controllen = sizeof(control);
+		cmsg = CMSG_FIRSTHDR(&msgh);
+		cmsg->cmsg_len = CMSG_LEN(fdsize);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		memcpy(CMSG_DATA(cmsg), fds, fdsize);
+	} else {
+		msgh.msg_control = NULL;
+		msgh.msg_controllen = 0;
+	}
+
+	do {
+		ret = sendmsg(sockfd, &msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
+		return ret;
+	}
+
+	return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	if (!msg)
+		return 0;
+
+	msg->flags &= ~VHOST_USER_VERSION_MASK;
+	msg->flags |= VHOST_USER_VERSION;
+	sg->flags |= VHOST_USER_REPLY_MASK;
+
+	ret = send_fd_message(sockfd, (char *)msg,
+		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+	return ret;
+}
+
+/* call back when there is new virtio connection.  */
+static void
+vserver_new_vq_conn(int fd, uint64_t dat)
+{
+	struct vhost_server *vserver = (void *)(uintptr_t)dat;
+	int conn_fd;
+	uint32_t fh;
+	struct vhost_device_ctx vdev_ctx = { 0 };
+
+	conn_fd = accept(fd, NULL, NULL);
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"new virtio connection is %d\n", conn_fd);
+	if (conn_fd < 0)
+		return;
+
+	fh = ops->new_device(vdev_ctx);
+	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
+
+	fdset_add(&vserver->fdset,
+		conn_fd, vserver_message_handler, NULL, fh);
+}
+
+/* callback when there is message on the connfd */
+static void
+vserver_message_handler(int connfd, uint64_t dat)
+{
+	struct vhost_device_ctx ctx;
+	uint32_t fh = (uint32_t)dat;
+	struct VhostUserMsg msg;
+	uint64_t features;
+	int ret;
+
+	ctx.fh = fh;
+	ret = read_vhost_message(connfd, &msg);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"vhost read message failed\n");
+
+		/*TODO: cleanup */
+		close(connfd);
+		fdset_del(&g_vhost_server->fdset, connfd);
+		ops->destroy_device(ctx);
+
+		return;
+	} else if (ret == 0) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"vhost peer closed\n");
+
+		/*TODO: cleanup */
+		close(connfd);
+		fdset_del(&g_vhost_server->fdset, connfd);
+		ops->destroy_device(ctx);
+
+		return;
+	}
+	if (msg.request > VHOST_USER_MAX) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"vhost read incorrect message\n");
+
+		/*TODO: cleanup */
+		close(connfd);
+		fdset_del(&g_vhost_server->fdset, connfd);
+
+		return;
+	}
+
+	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+		vhost_message_str[msg.request]);
+	switch (msg.request) {
+	case VHOST_USER_GET_FEATURES:
+		ret = ops->get_features(ctx, &features);
+		msg.payload.u64 = ret;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(connfd, &msg);
+		break;
+	case VHOST_USER_SET_FEATURES:
+		ops->set_features(ctx, &features);
+		break;
+
+	case VHOST_USER_SET_OWNER:
+		ops->set_owner(ctx);
+		break;
+	case VHOST_USER_RESET_OWNER:
+		ops->reset_owner(ctx);
+		break;
+
+	case VHOST_USER_SET_MEM_TABLE:
+		user_set_mem_table(ctx, &msg);
+		break;
+
+	case VHOST_USER_SET_LOG_BASE:
+	case VHOST_USER_SET_LOG_FD:
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+		break;
+
+	case VHOST_USER_SET_VRING_NUM:
+		ops->set_vring_num(ctx, &msg.payload.state);
+		break;
+	case VHOST_USER_SET_VRING_ADDR:
+		ops->set_vring_addr(ctx, &msg.payload.addr);
+		break;
+	case VHOST_USER_SET_VRING_BASE:
+		ops->set_vring_base(ctx, &msg.payload.state);
+		break;
+
+	case VHOST_USER_GET_VRING_BASE:
+		ret = user_get_vring_base(ctx, &msg.payload.state);
+		msg.size = sizeof(msg.payload.state);
+		send_vhost_message(connfd, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_KICK:
+		user_set_vring_kick(ctx, &msg);
+		break;
+	case VHOST_USER_SET_VRING_CALL:
+		user_set_vring_call(ctx, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_ERR:
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+		break;
+
+	default:
+		break;
+
+	}
+}
+
+
+/**
+ * Creates and initialise the vhost server.
+ */
+int
+rte_vhost_driver_register(const char *path)
+{
+
+	struct vhost_server *vserver;
+
+	if (g_vhost_server != NULL)
+		return -1;
+
+	vserver = calloc(sizeof(struct vhost_server), 1);
+	if (vserver == NULL)
+		return -1;
+
+	fdset_init(&vserver->fdset);
+
+	unlink(path);
+
+	vserver->listenfd = uds_socket(path);
+	if (vserver->listenfd < 0) {
+		free(vserver);
+		return -1;
+	}
+	vserver->path = path;
+
+	fdset_add(&vserver->fdset, vserver->listenfd,
+		vserver_new_vq_conn, NULL,
+		(uint64_t)(uintptr_t)vserver);
+
+	ops = get_virtio_net_callbacks();
+
+	g_vhost_server = vserver;
+
+	return 0;
+}
+
+
+int
+rte_vhost_driver_session_start(void)
+{
+	fdset_event_dispatch(&g_vhost_server->fdset);
+	return 0;
+}
+
diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
new file mode 100644
index 0000000..c138844
--- /dev/null
+++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
@@ -0,0 +1,108 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "fd_man.h"
+
+struct vhost_server {
+	const char *path; /**< The path the uds is bind to. */
+	int listenfd;     /**< The listener sockfd. */
+	struct fdset fdset; /**< The fd list this vhost server manages. */
+};
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS    8
+
+typedef enum VhostUserRequest {
+	VHOST_USER_NONE = 0,
+	VHOST_USER_GET_FEATURES = 1,
+	VHOST_USER_SET_FEATURES = 2,
+	VHOST_USER_SET_OWNER = 3,
+	VHOST_USER_RESET_OWNER = 4,
+	VHOST_USER_SET_MEM_TABLE = 5,
+	VHOST_USER_SET_LOG_BASE = 6,
+	VHOST_USER_SET_LOG_FD = 7,
+	VHOST_USER_SET_VRING_NUM = 8,
+	VHOST_USER_SET_VRING_ADDR = 9,
+	VHOST_USER_SET_VRING_BASE = 10,
+	VHOST_USER_GET_VRING_BASE = 11,
+	VHOST_USER_SET_VRING_KICK = 12,
+	VHOST_USER_SET_VRING_CALL = 13,
+	VHOST_USER_SET_VRING_ERR = 14,
+	VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef struct VhostUserMemoryRegion {
+	uint64_t guest_phys_addr;
+	uint64_t memory_size;
+	uint64_t userspace_addr;
+	uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+	uint32_t nregions;
+	uint32_t padding;
+	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserMsg {
+	VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK     (0x3)
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+	uint32_t flags;
+	uint32_t size; /* the following payload size */
+	union {
+#define VHOST_USER_VRING_IDX_MASK   (0xff)
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
+		uint64_t u64;
+		struct vhost_vring_state state;
+		struct vhost_vring_addr addr;
+	VhostUserMemory memory;
+	} payload;
+	int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION    (0x1)
+
+/*****************************************************************************/
+#endif
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
new file mode 100644
index 0000000..ad59fcc
--- /dev/null
+++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
@@ -0,0 +1,199 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <rte_log.h>
+
+#include "virtio-net.h"
+#include "virtio-net-user.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+
+int
+user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	unsigned int idx;
+	struct VhostUserMemory memory = pmsg->payload.memory;
+	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
+	uint64_t mapped_address, base_address = 0;
+
+	for (idx = 0; idx < memory.nregions; idx++) {
+		if (memory.regions[idx].guest_phys_addr == 0)
+			base_address = memory.regions[idx].userspace_addr;
+	}
+	if (base_address == 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"couldn't find the mem region whose GPA is 0.\n");
+		return -1;
+	}
+
+	for (idx = 0; idx < memory.nregions; idx++) {
+		regions[idx].guest_phys_address =
+			memory.regions[idx].guest_phys_addr;
+		regions[idx].guest_phys_address_end =
+			memory.regions[idx].guest_phys_addr +
+			memory.regions[idx].memory_size;
+		regions[idx].memory_size = memory.regions[idx].memory_size;
+		regions[idx].userspace_address =
+			memory.regions[idx].userspace_addr;
+
+		/* This is ugly */
+		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
+			regions[idx].memory_size +
+				memory.regions[idx].mmap_offset,
+			PROT_READ | PROT_WRITE, MAP_SHARED,
+			pmsg->fds[idx],
+			0);
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"mapped region %d to %p\n",
+			idx, (void *)mapped_address);
+
+		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"mmap qemu guest failed.\n");
+			return -1;
+		}
+
+		mapped_address +=  memory.regions[idx].mmap_offset;
+
+		regions[idx].address_offset = mapped_address -
+			regions[idx].guest_phys_address;
+		LOG_DEBUG(VHOST_CONFIG,
+			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
+			idx,
+			(void *)(uintptr_t)regions[idx].guest_phys_address,
+			(void *)(uintptr_t)regions[idx].userspace_address,
+			 regions[idx].memory_size);
+	}
+	ops->set_mem_table(ctx, regions, memory.nregions);
+	return 0;
+}
+
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+	struct vhost_virtqueue *rvq, *tvq;
+
+	/* mq support in future.*/
+	rvq = dev->virtqueue[VIRTIO_RXQ];
+	tvq = dev->virtqueue[VIRTIO_TXQ];
+	if (rvq && tvq && rvq->desc && tvq->desc &&
+		(rvq->kickfd != (eventfd_t)-1) &&
+		(rvq->callfd != (eventfd_t)-1) &&
+		(tvq->kickfd != (eventfd_t)-1) &&
+		(tvq->callfd != (eventfd_t)-1)) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"virtio is now ready for processing.\n");
+		return 1;
+	}
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"virtio isn't ready for processing.\n");
+	return 0;
+}
+
+void
+user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring call idx:%d file:%d\n", file.index, file.fd);
+	ops->set_vring_call(ctx, &file);
+}
+
+
+/*
+ *  In vhost-user, when we receive kick message, will test whether virtio
+ *  device is ready for packet processing.
+ */
+void
+user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+	struct virtio_net *dev = get_device(ctx);
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring kick idx:%d file:%d\n", file.index, file.fd);
+	ops->set_vring_kick(ctx, &file);
+
+	if (virtio_is_ready(dev) &&
+		!(dev->flags & VIRTIO_DEV_RUNNING))
+			notify_ops->new_device(dev);
+
+}
+
+/*
+ * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
+ */
+int
+user_get_vring_base(struct vhost_device_ctx ctx,
+	struct vhost_vring_state *state)
+{
+	struct virtio_net *dev = get_device(ctx);
+
+	/* We have to stop the queue (virtio) if it is running. */
+	if (dev->flags & VIRTIO_DEV_RUNNING)
+		notify_ops->destroy_device(dev);
+
+	/* Here we are safe to get the last used index */
+	ops->get_vring_base(ctx, state->index, state);
+
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring base idx:%d file:%d\n", state->index, state->num);
+	/*
+	 * Based on current qemu vhost-user implementation, this message is
+	 * sent and only sent in vhost_vring_stop.
+	 * TODO: cleanup the vring, it isn't usable since here.
+	 */
+	if (dev->virtqueue[VIRTIO_RXQ]->callfd) {
+		close(dev->virtqueue[VIRTIO_RXQ]->callfd);
+		dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
+	}
+	if (dev->virtqueue[VIRTIO_TXQ]->callfd) {
+		close(dev->virtqueue[VIRTIO_TXQ]->callfd);
+		dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
+	}
+
+	return 0;
+
+}
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
new file mode 100644
index 0000000..0f6a75a
--- /dev/null
+++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
@@ -0,0 +1,48 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_NET_USER_H
+#define _VIRTIO_NET_USER_H
+
+#include "vhost-net.h"
+#include "vhost-net-user.h"
+
+int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
+
+void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
+
+void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
+
+int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
+
+#endif
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
index f81e459..0b49f1b 100644
--- a/lib/librte_vhost/virtio-net.c
+++ b/lib/librte_vhost/virtio-net.c
@@ -46,6 +46,7 @@ 
 #include <rte_virtio_net.h>
 
 #include "vhost-net.h"
+#include "virtio-net.h"
 
 /*
  * Device linked list structure for configuration.
@@ -56,7 +57,7 @@  struct virtio_net_config_ll {
 };
 
 /* device ops to add/remove device to/from data core. */
-static struct virtio_net_device_ops const *notify_ops;
+struct virtio_net_device_ops const *notify_ops;
 /* root address of the linked list of managed virtio devices */
 static struct virtio_net_config_ll *ll_root;
 
@@ -83,8 +84,9 @@  qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
 		if ((qemu_va >= region->userspace_address) &&
 			(qemu_va <= region->userspace_address +
 			region->memory_size)) {
-			vhost_va = dev->mem->mapped_address + qemu_va -
-					dev->mem->base_address;
+			vhost_va = qemu_va + region->guest_phys_address +
+				region->address_offset -
+				region->userspace_address;
 			break;
 		}
 	}
@@ -114,7 +116,7 @@  get_config_ll_entry(struct vhost_device_ctx ctx)
  * Searches the configuration core linked list and
  * retrieves the device if it exists.
  */
-static struct virtio_net *
+struct virtio_net *
 get_device(struct vhost_device_ctx ctx)
 {
 	struct virtio_net_config_ll *ll_dev;
@@ -450,12 +452,6 @@  set_mem_table(struct vhost_device_ctx ctx,
 	if (dev == NULL)
 		return -1;
 
-	if (dev->mem) {
-		munmap((void *)(uintptr_t)dev->mem->mapped_address,
-			(size_t)dev->mem->mapped_size);
-		free(dev->mem);
-	}
-
 	/* Malloc the memory structure depending on the number of regions. */
 	mem = calloc(1, sizeof(struct virtio_memory) +
 		(sizeof(struct virtio_memory_regions) * nregions));
diff --git a/lib/librte_vhost/virtio-net.h b/lib/librte_vhost/virtio-net.h
new file mode 100644
index 0000000..da4ade0
--- /dev/null
+++ b/lib/librte_vhost/virtio-net.h
@@ -0,0 +1,43 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_NET_H
+#define _VIRTIO_NET_H
+
+#include "vhost-net.h"
+#include "rte_virtio_net.h"
+
+struct virtio_net_device_ops const *notify_ops;
+struct virtio_net * get_device(struct vhost_device_ctx ctx);
+
+#endif