[dpdk-dev,3/4] vhost: log vring changes

Message ID 1449027793-30975-4-git-send-email-yuanhan.liu@linux.intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Yuanhan Liu Dec. 2, 2015, 3:43 a.m. UTC
  Invoking vhost_log_write() to mark corresponding page as dirty while
updating used vring.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 74 +++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 24 deletions(-)
  

Comments

Victor Kaplansky Dec. 2, 2015, 2:07 p.m. UTC | #1
On Wed, Dec 02, 2015 at 11:43:12AM +0800, Yuanhan Liu wrote:
> Invoking vhost_log_write() to mark corresponding page as dirty while
> updating used vring.

Looks good, thanks!

I didn't find where you log the dirty pages in result of data
written to the buffers pointed by the descriptors in RX vring.
AFAIU, the buffers of RX queue reside in guest's memory and have
to be marked as dirty if they are written. What do you say?

-- Victor

> 
> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> ---
>  lib/librte_vhost/vhost_rxtx.c | 74 +++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 24 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 9322ce6..d4805d8 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -129,6 +129,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  		uint32_t offset = 0, vb_offset = 0;
>  		uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
>  		uint8_t hdr = 0, uncompleted_pkt = 0;
> +		uint16_t idx;
>  
>  		/* Get descriptor from available ring */
>  		desc = &vq->desc[head[packet_success]];
> @@ -200,16 +201,22 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  		}
>  
>  		/* Update used ring with desc information */
> -		vq->used->ring[res_cur_idx & (vq->size - 1)].id =
> -							head[packet_success];
> +		idx = res_cur_idx & (vq->size - 1);
> +		vq->used->ring[idx].id = head[packet_success];
>  
>  		/* Drop the packet if it is uncompleted */
>  		if (unlikely(uncompleted_pkt == 1))
> -			vq->used->ring[res_cur_idx & (vq->size - 1)].len =
> -							vq->vhost_hlen;
> +			vq->used->ring[idx].len = vq->vhost_hlen;
>  		else
> -			vq->used->ring[res_cur_idx & (vq->size - 1)].len =
> -							pkt_len + vq->vhost_hlen;
> +			vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;
> +
> +		/*
> +		 * to defer the update to when updating used->idx,
> +		 * and batch them?
> +		 */
> +		vhost_log_write(dev, vq,
> +			offsetof(struct vring_used, ring[idx]),
> +			sizeof(vq->used->ring[idx]));
>  
>  		res_cur_idx++;
>  		packet_success++;
> @@ -236,6 +243,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  
>  	*(volatile uint16_t *)&vq->used->idx += count;
>  	vq->last_used_idx = res_end_idx;
> +	vhost_log_write(dev, vq,
> +		offsetof(struct vring_used, idx),
> +		sizeof(vq->used->idx));
>  
>  	/* flush used->idx update before we read avail->flags. */
>  	rte_mb();
> @@ -265,6 +275,7 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
>  	uint32_t seg_avail;
>  	uint32_t vb_avail;
>  	uint32_t cpy_len, entry_len;
> +	uint16_t idx;
>  
>  	if (pkt == NULL)
>  		return 0;
> @@ -302,16 +313,18 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
>  	entry_len = vq->vhost_hlen;
>  
>  	if (vb_avail == 0) {
> -		uint32_t desc_idx =
> -			vq->buf_vec[vec_idx].desc_idx;
> +		uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;
> +
> +		if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
> +			idx = cur_idx & (vq->size - 1);
>  
> -		if ((vq->desc[desc_idx].flags
> -			& VRING_DESC_F_NEXT) == 0) {
>  			/* Update used ring with desc information */
> -			vq->used->ring[cur_idx & (vq->size - 1)].id
> -				= vq->buf_vec[vec_idx].desc_idx;
> -			vq->used->ring[cur_idx & (vq->size - 1)].len
> -				= entry_len;
> +			vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
> +			vq->used->ring[idx].len = entry_len;
> +
> +			vhost_log_write(dev, vq,
> +					offsetof(struct vring_used, ring[idx]),
> +					sizeof(vq->used->ring[idx]));
>  
>  			entry_len = 0;
>  			cur_idx++;
> @@ -354,10 +367,13 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
>  			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
>  				VRING_DESC_F_NEXT) == 0) {
>  				/* Update used ring with desc information */
> -				vq->used->ring[cur_idx & (vq->size - 1)].id
> +				idx = cur_idx & (vq->size - 1);
> +				vq->used->ring[idx].id
>  					= vq->buf_vec[vec_idx].desc_idx;
> -				vq->used->ring[cur_idx & (vq->size - 1)].len
> -					= entry_len;
> +				vq->used->ring[idx].len = entry_len;
> +				vhost_log_write(dev, vq,
> +					offsetof(struct vring_used, ring[idx]),
> +					sizeof(vq->used->ring[idx]));
>  				entry_len = 0;
>  				cur_idx++;
>  				entry_success++;
> @@ -390,16 +406,18 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
>  
>  					if ((vq->desc[desc_idx].flags &
>  						VRING_DESC_F_NEXT) == 0) {
> -						uint16_t wrapped_idx =
> -							cur_idx & (vq->size - 1);
> +						idx = cur_idx & (vq->size - 1);
>  						/*
>  						 * Update used ring with the
>  						 * descriptor information
>  						 */
> -						vq->used->ring[wrapped_idx].id
> +						vq->used->ring[idx].id
>  							= desc_idx;
> -						vq->used->ring[wrapped_idx].len
> +						vq->used->ring[idx].len
>  							= entry_len;
> +						vhost_log_write(dev, vq,
> +							offsetof(struct vring_used, ring[idx]),
> +							sizeof(vq->used->ring[idx]));
>  						entry_success++;
>  						entry_len = 0;
>  						cur_idx++;
> @@ -422,10 +440,13 @@ copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
>  				 * This whole packet completes.
>  				 */
>  				/* Update used ring with desc information */
> -				vq->used->ring[cur_idx & (vq->size - 1)].id
> +				idx = cur_idx & (vq->size - 1);
> +				vq->used->ring[idx].id
>  					= vq->buf_vec[vec_idx].desc_idx;
> -				vq->used->ring[cur_idx & (vq->size - 1)].len
> -					= entry_len;
> +				vq->used->ring[idx].len = entry_len;
> +				vhost_log_write(dev, vq,
> +					offsetof(struct vring_used, ring[idx]),
> +					sizeof(vq->used->ring[idx]));
>  				entry_success++;
>  				break;
>  			}
> @@ -658,6 +679,9 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  		/* Update used index buffer information. */
>  		vq->used->ring[used_idx].id = head[entry_success];
>  		vq->used->ring[used_idx].len = 0;
> +		vhost_log_write(dev, vq,
> +				offsetof(struct vring_used, ring[used_idx]),
> +				sizeof(vq->used->ring[used_idx]));
>  
>  		/* Allocate an mbuf and populate the structure. */
>  		m = rte_pktmbuf_alloc(mbuf_pool);
> @@ -778,6 +802,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>  
>  	rte_compiler_barrier();
>  	vq->used->idx += entry_success;
> +	vhost_log_write(dev, vq, offsetof(struct vring_used, idx),
> +			sizeof(vq->used->idx));
>  	/* Kick guest if required. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -- 
> 1.9.0
  
Yuanhan Liu Dec. 2, 2015, 2:38 p.m. UTC | #2
On Wed, Dec 02, 2015 at 04:07:02PM +0200, Victor Kaplansky wrote:
> On Wed, Dec 02, 2015 at 11:43:12AM +0800, Yuanhan Liu wrote:
> > Invoking vhost_log_write() to mark corresponding page as dirty while
> > updating used vring.
> 
> Looks good, thanks!
> 
> I didn't find where you log the dirty pages in result of data
> written to the buffers pointed by the descriptors in RX vring.
> AFAIU, the buffers of RX queue reside in guest's memory and have
> to be marked as dirty if they are written. What do you say?

Yeah, we should. I got a question then: why log_guest_addr is set
to the physical address of used vring in guest? I mean, apparently,
we need log more changes other than used vring only.

	--yliu
  
Victor Kaplansky Dec. 2, 2015, 3:58 p.m. UTC | #3
On Wed, Dec 02, 2015 at 10:38:02PM +0800, Yuanhan Liu wrote:
> On Wed, Dec 02, 2015 at 04:07:02PM +0200, Victor Kaplansky wrote:
> > On Wed, Dec 02, 2015 at 11:43:12AM +0800, Yuanhan Liu wrote:
> > > Invoking vhost_log_write() to mark corresponding page as dirty while
> > > updating used vring.
> > 
> > Looks good, thanks!
> > 
> > I didn't find where you log the dirty pages in result of data
> > written to the buffers pointed by the descriptors in RX vring.
> > AFAIU, the buffers of RX queue reside in guest's memory and have
> > to be marked as dirty if they are written. What do you say?
> 
> Yeah, we should. I got a question then: why log_guest_addr is set
> to the physical address of used vring in guest? I mean, apparently,
> we need log more changes other than used vring only.

The physical address of used vring sent to the back-end, since
otherwise back-end has to perform virtual to physical
translation, and we want to avoid this. The dirty buffers has to
be marked as well, but their guest's physical address is known
directly from the descriptors.

> 
> 	--yliu
  
Michael S. Tsirkin Dec. 2, 2015, 4:26 p.m. UTC | #4
On Wed, Dec 02, 2015 at 05:58:24PM +0200, Victor Kaplansky wrote:
> On Wed, Dec 02, 2015 at 10:38:02PM +0800, Yuanhan Liu wrote:
> > On Wed, Dec 02, 2015 at 04:07:02PM +0200, Victor Kaplansky wrote:
> > > On Wed, Dec 02, 2015 at 11:43:12AM +0800, Yuanhan Liu wrote:
> > > > Invoking vhost_log_write() to mark corresponding page as dirty while
> > > > updating used vring.
> > > 
> > > Looks good, thanks!
> > > 
> > > I didn't find where you log the dirty pages in result of data
> > > written to the buffers pointed by the descriptors in RX vring.
> > > AFAIU, the buffers of RX queue reside in guest's memory and have
> > > to be marked as dirty if they are written. What do you say?
> > 
> > Yeah, we should. I got a question then: why log_guest_addr is set
> > to the physical address of used vring in guest? I mean, apparently,
> > we need log more changes other than used vring only.
> 
> The physical address of used vring sent to the back-end, since
> otherwise back-end has to perform virtual to physical
> translation, and we want to avoid this. The dirty buffers has to
> be marked as well, but their guest's physical address is known
> directly from the descriptors.

Yes, people wanted to be able to do multiple physical
addresses to one virtual so you do not want to translate
virt to phys.

> > 
> > 	--yliu
  
Yuanhan Liu Dec. 3, 2015, 2:31 a.m. UTC | #5
On Wed, Dec 02, 2015 at 06:26:37PM +0200, Michael S. Tsirkin wrote:
> On Wed, Dec 02, 2015 at 05:58:24PM +0200, Victor Kaplansky wrote:
> > On Wed, Dec 02, 2015 at 10:38:02PM +0800, Yuanhan Liu wrote:
> > > On Wed, Dec 02, 2015 at 04:07:02PM +0200, Victor Kaplansky wrote:
> > > > On Wed, Dec 02, 2015 at 11:43:12AM +0800, Yuanhan Liu wrote:
> > > > > Invoking vhost_log_write() to mark corresponding page as dirty while
> > > > > updating used vring.
> > > > 
> > > > Looks good, thanks!
> > > > 
> > > > I didn't find where you log the dirty pages in result of data
> > > > written to the buffers pointed by the descriptors in RX vring.
> > > > AFAIU, the buffers of RX queue reside in guest's memory and have
> > > > to be marked as dirty if they are written. What do you say?
> > > 
> > > Yeah, we should. I got a question then: why log_guest_addr is set
> > > to the physical address of used vring in guest? I mean, apparently,
> > > we need log more changes other than used vring only.
> > 
> > The physical address of used vring sent to the back-end, since
> > otherwise back-end has to perform virtual to physical
> > translation, and we want to avoid this. The dirty buffers has to
> > be marked as well, but their guest's physical address is known
> > directly from the descriptors.
> 
> Yes, people wanted to be able to do multiple physical
> addresses to one virtual so you do not want to translate
> virt to phys.

Thank you both, it's clear to me then.

	--yliu
  
Huawei Xie Dec. 9, 2015, 2:45 a.m. UTC | #6
On 12/2/2015 10:09 PM, Victor Kaplansky wrote:
> On Wed, Dec 02, 2015 at 11:43:12AM +0800, Yuanhan Liu wrote:
>> Invoking vhost_log_write() to mark corresponding page as dirty while
>> updating used vring.
> Looks good, thanks!
>
> I didn't find where you log the dirty pages in result of data
> written to the buffers pointed by the descriptors in RX vring.
> AFAIU, the buffers of RX queue reside in guest's memory and have
> to be marked as dirty if they are written. What do you say?
Yes, that is actually the majority of the work.
Besides, in the first version, we could temporarily ignore zero copy,
which is much more complicated, as we have no idea when the page has
been accessed.
-- Huawei
>
> -- Victor
>
>
  

Patch

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index 9322ce6..d4805d8 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -129,6 +129,7 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 		uint32_t offset = 0, vb_offset = 0;
 		uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
 		uint8_t hdr = 0, uncompleted_pkt = 0;
+		uint16_t idx;
 
 		/* Get descriptor from available ring */
 		desc = &vq->desc[head[packet_success]];
@@ -200,16 +201,22 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 		}
 
 		/* Update used ring with desc information */
-		vq->used->ring[res_cur_idx & (vq->size - 1)].id =
-							head[packet_success];
+		idx = res_cur_idx & (vq->size - 1);
+		vq->used->ring[idx].id = head[packet_success];
 
 		/* Drop the packet if it is uncompleted */
 		if (unlikely(uncompleted_pkt == 1))
-			vq->used->ring[res_cur_idx & (vq->size - 1)].len =
-							vq->vhost_hlen;
+			vq->used->ring[idx].len = vq->vhost_hlen;
 		else
-			vq->used->ring[res_cur_idx & (vq->size - 1)].len =
-							pkt_len + vq->vhost_hlen;
+			vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;
+
+		/*
+		 * to defer the update to when updating used->idx,
+		 * and batch them?
+		 */
+		vhost_log_write(dev, vq,
+			offsetof(struct vring_used, ring[idx]),
+			sizeof(vq->used->ring[idx]));
 
 		res_cur_idx++;
 		packet_success++;
@@ -236,6 +243,9 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 
 	*(volatile uint16_t *)&vq->used->idx += count;
 	vq->last_used_idx = res_end_idx;
+	vhost_log_write(dev, vq,
+		offsetof(struct vring_used, idx),
+		sizeof(vq->used->idx));
 
 	/* flush used->idx update before we read avail->flags. */
 	rte_mb();
@@ -265,6 +275,7 @@  copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
 	uint32_t seg_avail;
 	uint32_t vb_avail;
 	uint32_t cpy_len, entry_len;
+	uint16_t idx;
 
 	if (pkt == NULL)
 		return 0;
@@ -302,16 +313,18 @@  copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
 	entry_len = vq->vhost_hlen;
 
 	if (vb_avail == 0) {
-		uint32_t desc_idx =
-			vq->buf_vec[vec_idx].desc_idx;
+		uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;
+
+		if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
+			idx = cur_idx & (vq->size - 1);
 
-		if ((vq->desc[desc_idx].flags
-			& VRING_DESC_F_NEXT) == 0) {
 			/* Update used ring with desc information */
-			vq->used->ring[cur_idx & (vq->size - 1)].id
-				= vq->buf_vec[vec_idx].desc_idx;
-			vq->used->ring[cur_idx & (vq->size - 1)].len
-				= entry_len;
+			vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
+			vq->used->ring[idx].len = entry_len;
+
+			vhost_log_write(dev, vq,
+					offsetof(struct vring_used, ring[idx]),
+					sizeof(vq->used->ring[idx]));
 
 			entry_len = 0;
 			cur_idx++;
@@ -354,10 +367,13 @@  copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
 			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
 				VRING_DESC_F_NEXT) == 0) {
 				/* Update used ring with desc information */
-				vq->used->ring[cur_idx & (vq->size - 1)].id
+				idx = cur_idx & (vq->size - 1);
+				vq->used->ring[idx].id
 					= vq->buf_vec[vec_idx].desc_idx;
-				vq->used->ring[cur_idx & (vq->size - 1)].len
-					= entry_len;
+				vq->used->ring[idx].len = entry_len;
+				vhost_log_write(dev, vq,
+					offsetof(struct vring_used, ring[idx]),
+					sizeof(vq->used->ring[idx]));
 				entry_len = 0;
 				cur_idx++;
 				entry_success++;
@@ -390,16 +406,18 @@  copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
 
 					if ((vq->desc[desc_idx].flags &
 						VRING_DESC_F_NEXT) == 0) {
-						uint16_t wrapped_idx =
-							cur_idx & (vq->size - 1);
+						idx = cur_idx & (vq->size - 1);
 						/*
 						 * Update used ring with the
 						 * descriptor information
 						 */
-						vq->used->ring[wrapped_idx].id
+						vq->used->ring[idx].id
 							= desc_idx;
-						vq->used->ring[wrapped_idx].len
+						vq->used->ring[idx].len
 							= entry_len;
+						vhost_log_write(dev, vq,
+							offsetof(struct vring_used, ring[idx]),
+							sizeof(vq->used->ring[idx]));
 						entry_success++;
 						entry_len = 0;
 						cur_idx++;
@@ -422,10 +440,13 @@  copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
 				 * This whole packet completes.
 				 */
 				/* Update used ring with desc information */
-				vq->used->ring[cur_idx & (vq->size - 1)].id
+				idx = cur_idx & (vq->size - 1);
+				vq->used->ring[idx].id
 					= vq->buf_vec[vec_idx].desc_idx;
-				vq->used->ring[cur_idx & (vq->size - 1)].len
-					= entry_len;
+				vq->used->ring[idx].len = entry_len;
+				vhost_log_write(dev, vq,
+					offsetof(struct vring_used, ring[idx]),
+					sizeof(vq->used->ring[idx]));
 				entry_success++;
 				break;
 			}
@@ -658,6 +679,9 @@  rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
 		/* Update used index buffer information. */
 		vq->used->ring[used_idx].id = head[entry_success];
 		vq->used->ring[used_idx].len = 0;
+		vhost_log_write(dev, vq,
+				offsetof(struct vring_used, ring[used_idx]),
+				sizeof(vq->used->ring[used_idx]));
 
 		/* Allocate an mbuf and populate the structure. */
 		m = rte_pktmbuf_alloc(mbuf_pool);
@@ -778,6 +802,8 @@  rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
 
 	rte_compiler_barrier();
 	vq->used->idx += entry_success;
+	vhost_log_write(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
 	/* Kick guest if required. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
 		eventfd_write(vq->callfd, (eventfd_t)1);