[dpdk-dev,v2,6/7] virtio: simple tx routine

Message ID 1445149744-3217-7-git-send-email-huawei.xie@intel.com (mailing list archive)
State Superseded, archived

Commit Message

Huawei Xie Oct. 18, 2015, 6:29 a.m. UTC
  Bulk free of mbufs when cleaning the used ring.
The shift operation on idx could be further avoided if vq_free_cnt meant
free slots rather than free descriptors.

Signed-off-by: Huawei Xie <huawei.xie@intel.com>
---
 drivers/net/virtio/virtio_ethdev.h      |  3 ++
 drivers/net/virtio/virtio_rxtx_simple.c | 95 +++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
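
For context, a minimal sketch of the accounting the commit message alludes to. Everything below is illustrative rather than part of the patch: VIRTIO_TX_FREE_NR comes from the patch, while tx_free_slots is a hypothetical counter. In this simple TX path one packet occupies one slot backed by two descriptors, so counting free descriptors forces a shift in each direction, whereas counting free slots would not.

#include <stdint.h>

#define VIRTIO_TX_FREE_NR 32

/* Descriptor-based accounting, as in this patch: vq_free_cnt counts
 * descriptors, so the hot path converts with shifts. */
static inline uint16_t
tx_slots_available(uint16_t vq_free_cnt)
{
	return (uint16_t)(vq_free_cnt >> 1);       /* descriptors -> slots */
}

static inline uint16_t
descs_reclaimed_per_cleanup(void)
{
	return (uint16_t)(VIRTIO_TX_FREE_NR << 1); /* slots -> descriptors */
}

/* Hypothetical slot-based accounting: keep a free-slot counter instead
 * and both shifts disappear from the fast path. */
static inline uint16_t
tx_slots_available_direct(uint16_t tx_free_slots)
{
	return tx_free_slots;
}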
  

Comments

Stephen Hemminger Oct. 19, 2015, 4:16 a.m. UTC | #1
On Sun, 18 Oct 2015 14:29:03 +0800
Huawei Xie <huawei.xie@intel.com> wrote:

> Bulk free of mbufs when cleaning the used ring.
> The shift operation on idx could be further avoided if vq_free_cnt meant
> free slots rather than free descriptors.
> 
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>

Did you measure this? I finished my transmit optimizations and got a 25% performance improvement
without any of these restrictions.
  
Stephen Hemminger Oct. 19, 2015, 4:18 a.m. UTC | #2
On Sun, 18 Oct 2015 14:29:03 +0800
Huawei Xie <huawei.xie@intel.com> wrote:

> +
> +	for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
> +		m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
> +		if (likely(m->pool == free[0]->pool))
> +			free[nb_free++] = m;
> +		else {
> +			rte_mempool_put_bulk(free[0]->pool, (void **)free,
> +				nb_free);
> +			free[0] = m;
> +			nb_free = 1;
> +		}
> +	}

This assumes all transmits are from the same pool, which is not necessarily true.
  
Stephen Hemminger Oct. 19, 2015, 4:19 a.m. UTC | #3
+static inline void __attribute__((always_inline))
+virtio_xmit_cleanup(struct virtqueue *vq)
+{

Please don't use always_inline; frustrating the compiler isn't going
to help.

+	uint16_t i, desc_idx;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
+		((vq->vq_nentries >> 1) - 1));
+	free[0] = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+	nb_free = 1;
+
+	for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+		m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+		if (likely(m->pool == free[0]->pool))
+			free[nb_free++] = m;
+		else {
+			rte_mempool_put_bulk(free[0]->pool, (void **)free,
+				nb_free);
+			free[0] = m;
+			nb_free = 1;
+		}
+	}
+
+	rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
+	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
+
+	return;
+}

Don't add return; at the end of void functions. It only clutters
things for no reason.
  
Huawei Xie Oct. 19, 2015, 5:12 a.m. UTC | #4
On 10/19/2015 12:19 PM, Stephen Hemminger wrote:
> +static inline void __attribute__((always_inline))
> +virtio_xmit_cleanup(struct virtqueue *vq)
> +{
>
> Please don't use always_inline; frustrating the compiler isn't going
> to help.
always_inline is scattered elsewhere in the DPDK code.
What is the negative effect? Should we remove all of them?
> +	uint16_t i, desc_idx;
> +	int nb_free = 0;
> +	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
> +
> +	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
> +		((vq->vq_nentries >> 1) - 1));
> +	free[0] = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
> +	nb_free = 1;
> +
> +	for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
> +		m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
> +		if (likely(m->pool == free[0]->pool))
> +			free[nb_free++] = m;
> +		else {
> +			rte_mempool_put_bulk(free[0]->pool, (void **)free,
> +				nb_free);
> +			free[0] = m;
> +			nb_free = 1;
> +		}
> +	}
> +
> +	rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
> +	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
> +	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
> +
> +	return;
> +}
>
> Don't add return; at the end of void functions. It only clutters
> things for no reason.
Agree.
>
  
Huawei Xie Oct. 19, 2015, 5:15 a.m. UTC | #5
On 10/19/2015 12:17 PM, Stephen Hemminger wrote:
> On Sun, 18 Oct 2015 14:29:03 +0800
> Huawei Xie <huawei.xie@intel.com> wrote:
>
>> +
>> +	for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
>> +		m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
>> +		if (likely(m->pool == free[0]->pool))
>> +			free[nb_free++] = m;
>> +		else {
>> +			rte_mempool_put_bulk(free[0]->pool, (void **)free,
>> +				nb_free);
>> +			free[0] = m;
>> +			nb_free = 1;
>> +		}
>> +	}
> This assumes all transmits are from the same pool, which is not necessarily true.

I don't get you. It accumulates mbufs from the same pool and frees them
all in one bulk call when it meets one from a different pool.

	if (likely(m->pool == free[0]->pool))
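
A hedged, self-contained illustration of that batching behaviour (the function name, BATCH_SZ, and the assumption that the mbufs are single-segment with a reference count of 1, so they may be returned straight to their mempool, are mine and not part of the patch):

#include <rte_mbuf.h>
#include <rte_mempool.h>

#define BATCH_SZ 32

/* Free n completed TX mbufs, grouping consecutive mbufs that share a
 * mempool into one rte_mempool_put_bulk() call.  A mbuf from a
 * different pool (or a full batch) flushes the current batch and
 * starts a new one. */
static void
tx_free_bulk_by_pool(struct rte_mbuf **bufs, uint16_t n)
{
	void *batch[BATCH_SZ];
	struct rte_mempool *pool = NULL;
	uint16_t nb_batch = 0, i;

	for (i = 0; i < n; i++) {
		struct rte_mbuf *m = bufs[i];

		if (nb_batch == BATCH_SZ ||
		    (nb_batch > 0 && m->pool != pool)) {
			rte_mempool_put_bulk(pool, batch, nb_batch);
			nb_batch = 0;
		}
		pool = m->pool;
		batch[nb_batch++] = m;
	}
	if (nb_batch > 0)
		rte_mempool_put_bulk(pool, batch, nb_batch);
}

Mixed-pool bursts are therefore handled correctly, just at the cost of one extra bulk call per pool change, which is the point being made in the reply above.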


>
  
Huawei Xie Oct. 19, 2015, 5:22 a.m. UTC | #6
On 10/19/2015 12:17 PM, Stephen Hemminger wrote:
> On Sun, 18 Oct 2015 14:29:03 +0800
> Huawei Xie <huawei.xie@intel.com> wrote:
>
>> Bulk free of mbufs when cleaning the used ring.
>> The shift operation on idx could be further avoided if vq_free_cnt meant
>> free slots rather than free descriptors.
>>
>> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> Did you measure this? I finished my transmit optimizations and got a 25% performance improvement
> without any of these restrictions.
Which restriction do you mean? For the ring layout optimization, this
is the core idea. The single-segment mbuf restriction is what all other
PMDs assume for the fastest rx/tx path.
With all the vhost and virtio optimizations, it could achieve approximately
a 3~4x performance improvement (depending on the workload).
Do you mean the indirect feature support or additional optimizations not
yet submitted? I will review your patch this week.
>
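
For readers following along, a hedged sketch of the index arithmetic the "core idea" relies on, as visible in this patch (the helper and its parameter names are local to the sketch, not driver API): the simple TX path uses the first half of the descriptor ring, whose slot count is a power of two, so a mask replaces a modulo, and a wrap is handled by splitting the fill into a run up to the end of the half-ring and a second run from slot 0.

#include <stdint.h>

/* Given the current avail index and the number of packets to commit,
 * compute how many descriptors are filled before the half-ring wraps
 * and how many continue from slot 0 (mirrors the nb_tail/nb_commit
 * split in virtio_xmit_pkts_simple below). */
static void
split_at_wrap(uint16_t vq_nentries, uint16_t vq_avail_idx, uint16_t nb_commit,
	      uint16_t *first_run, uint16_t *second_run)
{
	uint16_t mask = (uint16_t)((vq_nentries >> 1) - 1); /* half ring, power of 2 */
	uint16_t slot = (uint16_t)(vq_avail_idx & mask);    /* cheap wrap via mask */
	uint16_t tail = (uint16_t)(mask + 1 - slot);        /* slots left before wrap */

	if (nb_commit >= tail) {
		*first_run = tail;                           /* fill to end of the half */
		*second_run = (uint16_t)(nb_commit - tail);  /* then restart at slot 0 */
	} else {
		*first_run = nb_commit;
		*second_run = 0;
	}
}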
  

Patch

diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
index d7797ab..ae2d47d 100644
--- a/drivers/net/virtio/virtio_ethdev.h
+++ b/drivers/net/virtio/virtio_ethdev.h
@@ -111,6 +111,9 @@  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts);
 
+uint16_t virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts);
+
 /*
  * The VIRTIO_NET_F_GUEST_TSO[46] features permit the host to send us
  * frames larger than 1514 bytes. We do not yet support software LRO
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index ef17562..3339a24 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -288,6 +288,101 @@  virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return nb_pkts_received;
 }
 
+#define VIRTIO_TX_FREE_THRESH 32
+#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
+#define VIRTIO_TX_FREE_NR 32
+/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
+static inline void __attribute__((always_inline))
+virtio_xmit_cleanup(struct virtqueue *vq)
+{
+	uint16_t i, desc_idx;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
+		((vq->vq_nentries >> 1) - 1));
+	free[0] = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+	nb_free = 1;
+
+	for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+		m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+		if (likely(m->pool == free[0]->pool))
+			free[nb_free++] = m;
+		else {
+			rte_mempool_put_bulk(free[0]->pool, (void **)free,
+				nb_free);
+			free[0] = m;
+			nb_free = 1;
+		}
+	}
+
+	rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
+	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
+
+	return;
+}
+
+uint16_t
+virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtqueue *txvq = tx_queue;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_desc *start_dp;
+	uint16_t nb_tail, nb_commit;
+	int i;
+	uint16_t desc_idx_max = (txvq->vq_nentries >> 1) - 1;
+
+	nb_used = VIRTQUEUE_NUSED(txvq);
+	rte_compiler_barrier();
+
+	nb_commit = nb_pkts = RTE_MIN((txvq->vq_free_cnt >> 1), nb_pkts);
+	desc_idx = (uint16_t) (txvq->vq_avail_idx & desc_idx_max);
+	start_dp = txvq->vq_ring.desc;
+	nb_tail = (uint16_t) (desc_idx_max + 1 - desc_idx);
+
+	if (nb_used >= VIRTIO_TX_FREE_THRESH)
+		virtio_xmit_cleanup(tx_queue);
+
+	if (nb_commit >= nb_tail) {
+		for (i = 0; i < nb_tail; i++)
+			txvq->vq_descx[desc_idx + i].cookie = tx_pkts[i];
+		for (i = 0; i < nb_tail; i++) {
+			start_dp[desc_idx].addr =
+				RTE_MBUF_DATA_DMA_ADDR(*tx_pkts);
+			start_dp[desc_idx].len = (*tx_pkts)->pkt_len;
+			tx_pkts++;
+			desc_idx++;
+		}
+		nb_commit -= nb_tail;
+		desc_idx = 0;
+	}
+	for (i = 0; i < nb_commit; i++)
+		txvq->vq_descx[desc_idx + i].cookie = tx_pkts[i];
+	for (i = 0; i < nb_commit; i++) {
+		start_dp[desc_idx].addr = RTE_MBUF_DATA_DMA_ADDR(*tx_pkts);
+		start_dp[desc_idx].len = (*tx_pkts)->pkt_len;
+		tx_pkts++;
+		desc_idx++;
+	}
+
+	rte_compiler_barrier();
+
+	txvq->vq_free_cnt -= (uint16_t)(nb_pkts << 1);
+	txvq->vq_avail_idx += nb_pkts;
+	txvq->vq_ring.avail->idx = txvq->vq_avail_idx;
+	txvq->packets += nb_pkts;
+
+	if (likely(nb_pkts)) {
+		if (unlikely(virtqueue_kick_prepare(txvq)))
+			virtqueue_notify(txvq);
+	}
+
+	return nb_pkts;
+}
+
 int __attribute__((cold))
 virtio_rxq_vec_setup(struct virtqueue *rxq)
 {