[v1,2/4] net/virtio: add vectorized packed ring Rx NEON path

Message ID 20201117100635.27690-3-joyce.kong@arm.com (mailing list archive)
State Accepted, archived
Delegated to: Maxime Coquelin
Series Vectorize packed ring RX/TX path with NEON

Checks

Context        Check    Description
ci/checkpatch  warning  coding style issues

Commit Message

Joyce Kong Nov. 17, 2020, 10:06 a.m. UTC
  Optimize packed ring Rx batch path with NEON instructions.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
 drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
  

Comments

Maxime Coquelin Jan. 5, 2021, 2:16 p.m. UTC | #1
On 11/17/20 11:06 AM, Joyce Kong wrote:
> Optimize packed ring Rx batch path with NEON instructions.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
>  2 files changed, 165 insertions(+)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
> 
> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
> index b0b1d63ec..8f5198ad7 100644
> --- a/drivers/net/virtio/virtio_rxtx_packed.h
> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
> @@ -19,9 +19,16 @@
>  #include "virtqueue.h"
>  
>  #define BYTE_SIZE 8
> +
> +#ifdef CC_AVX512_SUPPORT
>  /* flag bits offset in packed ring desc higher 64bits */
>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> +#elif defined(RTE_ARCH_ARM)
> +/* flag bits offset in packed ring desc from ID */
> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
> +#endif
>  
>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
>  	FLAGS_BITS_OFFSET)
> @@ -44,8 +51,16 @@
>  /* net hdr short size mask */
>  #define NET_HDR_MASK 0x3F
>  
> +#ifdef RTE_ARCH_ARM
> +/* The cache line size on different Arm platforms are different, so
> + * put a four batch size here to match with the minimum cache line
> + * size and accommodate NEON register size.
> + */
> +#define PACKED_BATCH_SIZE 4
> +#else
>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>  	sizeof(struct vring_packed_desc))
> +#endif
>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>  
>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
> new file mode 100644
> index 000000000..fb1e49909
> --- /dev/null
> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
> @@ -0,0 +1,150 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Arm Corporation
> + */
> +
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <rte_net.h>
> +#include <rte_vect.h>
> +
> +#include "virtio_ethdev.h"
> +#include "virtio_pci.h"
> +#include "virtio_rxtx_packed.h"
> +#include "virtqueue.h"
> +
> +static inline uint16_t
> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
> +				   struct rte_mbuf **rx_pkts)
> +{
> +	struct virtqueue *vq = rxvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t head_size = hw->vtnet_hdr_size;
> +	uint16_t id = vq->vq_used_cons_idx;
> +	struct vring_packed_desc *p_desc;
> +	uint16_t i;
> +
> +	if (id & PACKED_BATCH_MASK)
> +		return -1;
> +
> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
> +		return -1;

This function returns an unsigned short, so I think you should return 0
here since it failed to dequeue packets.

> +	/* Map packed descriptor to mbuf fields. */
> +	uint8x16_t shuf_msk1 = {
> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
> +		0, 1,			/* octet 1~0, 16 bits data_len */
> +		0xFF, 0xFF,		/* vlan tci set as unknown */
> +		0xFF, 0xFF, 0xFF, 0xFF
> +	};
> +
> +	uint8x16_t shuf_msk2 = {
> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
> +		8, 9,			/* octet 9~8, 16 bits data_len */
> +		0xFF, 0xFF,		/* vlan tci set as unknown */
> +		0xFF, 0xFF, 0xFF, 0xFF
> +	};
> +
> +	/* Subtract the header length. */
> +	uint16x8_t len_adjust = {
> +		0, 0,		/* ignore pkt_type field */
> +		head_size,	/* sub head_size on pkt_len */
> +		0,		/* ignore high 16 bits of pkt_len */
> +		head_size,	/* sub head_size on data_len */
> +		0, 0, 0		/* ignore non-length fields */
> +	};
> +
> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
> +
> +	p_desc = &vq->vq_packed.ring.desc[id];
> +	/* Load high 64 bits of packed descriptor 0,1. */
> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
> +	/* Load high 64 bits of packed descriptor 2,3. */
> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
> +
> +	/* Only care avail/used bits. */
> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
> +	/* Extract high 32 bits of packed descriptor (id, flags). */
> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
> +				vreinterpretq_u32_u64(desc[1]));
> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
> +
> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
> +	if (vq->vq_packed.used_wrap_counter)
> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
> +
> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
> +
> +	/* Check all descs are used. */
> +	if (desc_stats)
> +		return -1;

Same here. You should return 0 here as the queue is full.

> +
> +	/* Load 2 mbuf pointers per time. */
> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
> +
> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
> +
> +	/**
> +	 *  Update data length and packet length for descriptor.
> +	 *  structure of pkt_mb:
> +	 *  --------------------------------------------------------------------
> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
> +	 *  --------------------------------------------------------------------
> +	 */
> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
> +
> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
> +
> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
> +
> +	if (hw->has_rx_offload) {
> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			char *addr = (char *)rx_pkts[i]->buf_addr +
> +				RTE_PKTMBUF_HEADROOM - head_size;
> +			virtio_vec_rx_offload(rx_pkts[i],
> +					(struct virtio_net_hdr *)addr);
> +		}
> +	}
> +
> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
> +			rx_pkts[3]->pkt_len);
> +
> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
> +
> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
> +		vq->vq_used_cons_idx -= vq->vq_nentries;
> +		vq->vq_packed.used_wrap_counter ^= 1;
> +	}
> +
> +	return 0;
> +}
>
  
Maxime Coquelin Jan. 5, 2021, 2:27 p.m. UTC | #2
On 1/5/21 3:16 PM, Maxime Coquelin wrote:
> 
> 
> On 11/17/20 11:06 AM, Joyce Kong wrote:
>> Optimize packed ring Rx batch path with NEON instructions.
>>
>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
>>  2 files changed, 165 insertions(+)
>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
>>
>> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
>> index b0b1d63ec..8f5198ad7 100644
>> --- a/drivers/net/virtio/virtio_rxtx_packed.h
>> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
>> @@ -19,9 +19,16 @@
>>  #include "virtqueue.h"
>>  
>>  #define BYTE_SIZE 8
>> +
>> +#ifdef CC_AVX512_SUPPORT
>>  /* flag bits offset in packed ring desc higher 64bits */
>>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>> +#elif defined(RTE_ARCH_ARM)
>> +/* flag bits offset in packed ring desc from ID */
>> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
>> +#endif
>>  
>>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
>>  	FLAGS_BITS_OFFSET)
>> @@ -44,8 +51,16 @@
>>  /* net hdr short size mask */
>>  #define NET_HDR_MASK 0x3F
>>  
>> +#ifdef RTE_ARCH_ARM
>> +/* The cache line size on different Arm platforms are different, so
>> + * put a four batch size here to match with the minimum cache line
>> + * size and accommodate NEON register size.
>> + */
>> +#define PACKED_BATCH_SIZE 4
>> +#else
>>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>>  	sizeof(struct vring_packed_desc))
>> +#endif
>>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>>  
>>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>> new file mode 100644
>> index 000000000..fb1e49909
>> --- /dev/null
>> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>> @@ -0,0 +1,150 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2020 Arm Corporation
>> + */
>> +
>> +#include <stdlib.h>
>> +#include <stdint.h>
>> +#include <stdio.h>
>> +#include <string.h>
>> +#include <errno.h>
>> +
>> +#include <rte_net.h>
>> +#include <rte_vect.h>
>> +
>> +#include "virtio_ethdev.h"
>> +#include "virtio_pci.h"
>> +#include "virtio_rxtx_packed.h"
>> +#include "virtqueue.h"
>> +
>> +static inline uint16_t
>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>> +				   struct rte_mbuf **rx_pkts)
>> +{
>> +	struct virtqueue *vq = rxvq->vq;
>> +	struct virtio_hw *hw = vq->hw;
>> +	uint16_t head_size = hw->vtnet_hdr_size;
>> +	uint16_t id = vq->vq_used_cons_idx;
>> +	struct vring_packed_desc *p_desc;
>> +	uint16_t i;
>> +
>> +	if (id & PACKED_BATCH_MASK)
>> +		return -1;
>> +
>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
>> +		return -1;
> 
> This function returns an unsigned short, so I think you should return 0
> here since it failed to dequeue packets.
> 
>> +	/* Map packed descriptor to mbuf fields. */
>> +	uint8x16_t shuf_msk1 = {
>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>> +		0, 1,			/* octet 1~0, 16 bits data_len */
>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>> +		0xFF, 0xFF, 0xFF, 0xFF
>> +	};
>> +
>> +	uint8x16_t shuf_msk2 = {
>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>> +		8, 9,			/* octet 9~8, 16 bits data_len */
>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>> +		0xFF, 0xFF, 0xFF, 0xFF
>> +	};
>> +
>> +	/* Subtract the header length. */
>> +	uint16x8_t len_adjust = {
>> +		0, 0,		/* ignore pkt_type field */
>> +		head_size,	/* sub head_size on pkt_len */
>> +		0,		/* ignore high 16 bits of pkt_len */
>> +		head_size,	/* sub head_size on data_len */
>> +		0, 0, 0		/* ignore non-length fields */
>> +	};
>> +
>> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
>> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
>> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
>> +
>> +	p_desc = &vq->vq_packed.ring.desc[id];
>> +	/* Load high 64 bits of packed descriptor 0,1. */
>> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
>> +	/* Load high 64 bits of packed descriptor 2,3. */
>> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
>> +
>> +	/* Only care avail/used bits. */
>> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
>> +	/* Extract high 32 bits of packed descriptor (id, flags). */
>> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
>> +				vreinterpretq_u32_u64(desc[1]));
>> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
>> +
>> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
>> +	if (vq->vq_packed.used_wrap_counter)
>> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
>> +
>> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
>> +
>> +	/* Check all descs are used. */
>> +	if (desc_stats)
>> +		return -1;
> 
> Same here. You should return 0 here as the queue is full.

Just looked again at the code and at the AVX implementation.
It should not return 0 here, but any positive value.

Maybe the cleanest way would be to change the function prototype to int:
0: success
-1: failure
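
For illustration, a minimal sketch of that suggested prototype change
(only the early-exit paths are shown; the NEON body stays as in the
patch above, and this sketch is not the code that was finally applied):

	static inline int
	virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
					   struct rte_mbuf **rx_pkts)
	{
		struct virtqueue *vq = rxvq->vq;
		uint16_t id = vq->vq_used_cons_idx;

		/* Consumer index not on a batch boundary: no batch dequeued. */
		if (id & PACKED_BATCH_MASK)
			return -1;

		/* Batch would run past the end of the ring: no batch dequeued. */
		if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
			return -1;

		/* ... vectorized dequeue of PACKED_BATCH_SIZE descriptors ... */

		return 0;	/* success: a full batch was dequeued */
	}

A caller would then treat any negative return as "no batch available".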

>> +
>> +	/* Load 2 mbuf pointers per time. */
>> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
>> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
>> +
>> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
>> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
>> +
>> +	/**
>> +	 *  Update data length and packet length for descriptor.
>> +	 *  structure of pkt_mb:
>> +	 *  --------------------------------------------------------------------
>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>> +	 *  --------------------------------------------------------------------
>> +	 */
>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
>> +
>> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
>> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
>> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
>> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
>> +
>> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
>> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
>> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
>> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
>> +
>> +	if (hw->has_rx_offload) {
>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
>> +				RTE_PKTMBUF_HEADROOM - head_size;
>> +			virtio_vec_rx_offload(rx_pkts[i],
>> +					(struct virtio_net_hdr *)addr);
>> +		}
>> +	}
>> +
>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
>> +			rx_pkts[3]->pkt_len);
>> +
>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
>> +
>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
>> +		vq->vq_packed.used_wrap_counter ^= 1;
>> +	}
>> +
>> +	return 0;
>> +}
>>
>
  
Maxime Coquelin Jan. 7, 2021, 10:39 a.m. UTC | #3
On 1/5/21 3:27 PM, Maxime Coquelin wrote:
> 
> 
> On 1/5/21 3:16 PM, Maxime Coquelin wrote:
>>
>>
>> On 11/17/20 11:06 AM, Joyce Kong wrote:
>>> Optimize packed ring Rx batch path with NEON instructions.
>>>
>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>>> ---
>>>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>>>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
>>>  2 files changed, 165 insertions(+)
>>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>
>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
>>> index b0b1d63ec..8f5198ad7 100644
>>> --- a/drivers/net/virtio/virtio_rxtx_packed.h
>>> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
>>> @@ -19,9 +19,16 @@
>>>  #include "virtqueue.h"
>>>  
>>>  #define BYTE_SIZE 8
>>> +
>>> +#ifdef CC_AVX512_SUPPORT
>>>  /* flag bits offset in packed ring desc higher 64bits */
>>>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>>> +#elif defined(RTE_ARCH_ARM)
>>> +/* flag bits offset in packed ring desc from ID */
>>> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
>>> +#endif
>>>  
>>>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
>>>  	FLAGS_BITS_OFFSET)
>>> @@ -44,8 +51,16 @@
>>>  /* net hdr short size mask */
>>>  #define NET_HDR_MASK 0x3F
>>>  
>>> +#ifdef RTE_ARCH_ARM
>>> +/* The cache line size on different Arm platforms are different, so
>>> + * put a four batch size here to match with the minimum cache line
>>> + * size and accommodate NEON register size.
>>> + */
>>> +#define PACKED_BATCH_SIZE 4
>>> +#else
>>>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>>>  	sizeof(struct vring_packed_desc))
>>> +#endif
>>>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>>>  
>>>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>> new file mode 100644
>>> index 000000000..fb1e49909
>>> --- /dev/null
>>> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>> @@ -0,0 +1,150 @@
>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>> + * Copyright(c) 2020 Arm Corporation
>>> + */
>>> +
>>> +#include <stdlib.h>
>>> +#include <stdint.h>
>>> +#include <stdio.h>
>>> +#include <string.h>
>>> +#include <errno.h>
>>> +
>>> +#include <rte_net.h>
>>> +#include <rte_vect.h>
>>> +
>>> +#include "virtio_ethdev.h"
>>> +#include "virtio_pci.h"
>>> +#include "virtio_rxtx_packed.h"
>>> +#include "virtqueue.h"
>>> +
>>> +static inline uint16_t
>>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>>> +				   struct rte_mbuf **rx_pkts)
>>> +{
>>> +	struct virtqueue *vq = rxvq->vq;
>>> +	struct virtio_hw *hw = vq->hw;
>>> +	uint16_t head_size = hw->vtnet_hdr_size;
>>> +	uint16_t id = vq->vq_used_cons_idx;
>>> +	struct vring_packed_desc *p_desc;
>>> +	uint16_t i;
>>> +
>>> +	if (id & PACKED_BATCH_MASK)
>>> +		return -1;
>>> +
>>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
>>> +		return -1;
>>
>> This function returns an unsigned short, so I think you should return 0
>> here since it failed to dequeue packets.
>>
>>> +	/* Map packed descriptor to mbuf fields. */
>>> +	uint8x16_t shuf_msk1 = {
>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>>> +		0, 1,			/* octet 1~0, 16 bits data_len */
>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>> +	};
>>> +
>>> +	uint8x16_t shuf_msk2 = {
>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>>> +		8, 9,			/* octet 9~8, 16 bits data_len */
>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>> +	};
>>> +
>>> +	/* Subtract the header length. */
>>> +	uint16x8_t len_adjust = {
>>> +		0, 0,		/* ignore pkt_type field */
>>> +		head_size,	/* sub head_size on pkt_len */
>>> +		0,		/* ignore high 16 bits of pkt_len */
>>> +		head_size,	/* sub head_size on data_len */
>>> +		0, 0, 0		/* ignore non-length fields */
>>> +	};
>>> +
>>> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
>>> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
>>> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
>>> +
>>> +	p_desc = &vq->vq_packed.ring.desc[id];
>>> +	/* Load high 64 bits of packed descriptor 0,1. */
>>> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
>>> +	/* Load high 64 bits of packed descriptor 2,3. */
>>> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
>>> +
>>> +	/* Only care avail/used bits. */
>>> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
>>> +	/* Extract high 32 bits of packed descriptor (id, flags). */
>>> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
>>> +				vreinterpretq_u32_u64(desc[1]));
>>> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
>>> +
>>> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
>>> +	if (vq->vq_packed.used_wrap_counter)
>>> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
>>> +
>>> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
>>> +
>>> +	/* Check all descs are used. */
>>> +	if (desc_stats)
>>> +		return -1;
>>
>> Same here. You should return 0 here as the queue is full.
> 
> Just looked again at the code and at the AVX implementation.
> It should not return 0 here, but any positive value.
> 
> Maybe the cleanest way would be to change the function prototype to int:
> 0: success
> -1: failure


Joyce, are you fine if I do the change while applying?
I have a big series that will conflicts with your patch set, so I'd like
to have yours merged ASAP so I can start the rebase.

Thanks,
Maxime

>>> +
>>> +	/* Load 2 mbuf pointers per time. */
>>> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
>>> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
>>> +
>>> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
>>> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
>>> +
>>> +	/**
>>> +	 *  Update data length and packet length for descriptor.
>>> +	 *  structure of pkt_mb:
>>> +	 *  --------------------------------------------------------------------
>>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>>> +	 *  --------------------------------------------------------------------
>>> +	 */
>>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>>> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
>>> +
>>> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
>>> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
>>> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
>>> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
>>> +
>>> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
>>> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
>>> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
>>> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
>>> +
>>> +	if (hw->has_rx_offload) {
>>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
>>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
>>> +				RTE_PKTMBUF_HEADROOM - head_size;
>>> +			virtio_vec_rx_offload(rx_pkts[i],
>>> +					(struct virtio_net_hdr *)addr);
>>> +		}
>>> +	}
>>> +
>>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
>>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
>>> +			rx_pkts[3]->pkt_len);
>>> +
>>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
>>> +
>>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
>>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
>>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
>>> +		vq->vq_packed.used_wrap_counter ^= 1;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>>
>>
  
Joyce Kong Jan. 8, 2021, 7:29 a.m. UTC | #4
>On 1/5/21 3:27 PM, Maxime Coquelin wrote:
>>
>>
>> On 1/5/21 3:16 PM, Maxime Coquelin wrote:
>>>
>>>
>>> On 11/17/20 11:06 AM, Joyce Kong wrote:
>>>> Optimize packed ring Rx batch path with NEON instructions.
>>>>
>>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>>>> ---
>>>>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>>>>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150
>>>> +++++++++++++++++++
>>>>  2 files changed, 165 insertions(+)
>>>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>>
>>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h
>>>> b/drivers/net/virtio/virtio_rxtx_packed.h
>>>> index b0b1d63ec..8f5198ad7 100644
>>>> --- a/drivers/net/virtio/virtio_rxtx_packed.h
>>>> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
>>>> @@ -19,9 +19,16 @@
>>>>  #include "virtqueue.h"
>>>>
>>>>  #define BYTE_SIZE 8
>>>> +
>>>> +#ifdef CC_AVX512_SUPPORT
>>>>  /* flag bits offset in packed ring desc higher 64bits */  #define
>>>> FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>>>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>>>> +#elif defined(RTE_ARCH_ARM)
>>>> +/* flag bits offset in packed ring desc from ID */ #define
>>>> +FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>>> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE) #endif
>>>>
>>>>  #define PACKED_FLAGS_MASK ((0ULL |
>VRING_PACKED_DESC_F_AVAIL_USED) << \
>>>>  	FLAGS_BITS_OFFSET)
>>>> @@ -44,8 +51,16 @@
>>>>  /* net hdr short size mask */
>>>>  #define NET_HDR_MASK 0x3F
>>>>
>>>> +#ifdef RTE_ARCH_ARM
>>>> +/* The cache line size on different Arm platforms are different, so
>>>> + * put a four batch size here to match with the minimum cache line
>>>> + * size and accommodate NEON register size.
>>>> + */
>>>> +#define PACKED_BATCH_SIZE 4
>>>> +#else
>>>>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>>>>  	sizeof(struct vring_packed_desc))
>>>> +#endif
>>>>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>>>>
>>>>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
>>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>> b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>> new file mode 100644
>>>> index 000000000..fb1e49909
>>>> --- /dev/null
>>>> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>> @@ -0,0 +1,150 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2020 Arm Corporation  */
>>>> +
>>>> +#include <stdlib.h>
>>>> +#include <stdint.h>
>>>> +#include <stdio.h>
>>>> +#include <string.h>
>>>> +#include <errno.h>
>>>> +
>>>> +#include <rte_net.h>
>>>> +#include <rte_vect.h>
>>>> +
>>>> +#include "virtio_ethdev.h"
>>>> +#include "virtio_pci.h"
>>>> +#include "virtio_rxtx_packed.h"
>>>> +#include "virtqueue.h"
>>>> +
>>>> +static inline uint16_t
>>>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>>>> +				   struct rte_mbuf **rx_pkts)
>>>> +{
>>>> +	struct virtqueue *vq = rxvq->vq;
>>>> +	struct virtio_hw *hw = vq->hw;
>>>> +	uint16_t head_size = hw->vtnet_hdr_size;
>>>> +	uint16_t id = vq->vq_used_cons_idx;
>>>> +	struct vring_packed_desc *p_desc;
>>>> +	uint16_t i;
>>>> +
>>>> +	if (id & PACKED_BATCH_MASK)
>>>> +		return -1;
>>>> +
>>>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
>>>> +		return -1;
>>>
>>> This function returns an unsigned short, so I think you should return 0
>>> here since it failed to dequeue packets.
>>>
>>>> +	/* Map packed descriptor to mbuf fields. */
>>>> +	uint8x16_t shuf_msk1 = {
>>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>>> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
>>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out
>*/
>>>> +		0, 1,			/* octet 1~0, 16 bits data_len */
>>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>>> +	};
>>>> +
>>>> +	uint8x16_t shuf_msk2 = {
>>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>>> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
>>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out
>*/
>>>> +		8, 9,			/* octet 9~8, 16 bits data_len */
>>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>>> +	};
>>>> +
>>>> +	/* Subtract the header length. */
>>>> +	uint16x8_t len_adjust = {
>>>> +		0, 0,		/* ignore pkt_type field */
>>>> +		head_size,	/* sub head_size on pkt_len */
>>>> +		0,		/* ignore high 16 bits of pkt_len */
>>>> +		head_size,	/* sub head_size on data_len */
>>>> +		0, 0, 0		/* ignore non-length fields */
>>>> +	};
>>>> +
>>>> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
>>>> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
>>>> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
>>>> +
>>>> +	p_desc = &vq->vq_packed.ring.desc[id];
>>>> +	/* Load high 64 bits of packed descriptor 0,1. */
>>>> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
>>>> +	/* Load high 64 bits of packed descriptor 2,3. */
>>>> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
>>>> +
>>>> +	/* Only care avail/used bits. */
>>>> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
>>>> +	/* Extract high 32 bits of packed descriptor (id, flags). */
>>>> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
>>>> +				vreinterpretq_u32_u64(desc[1]));
>>>> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
>>>> +
>>>> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
>>>> +	if (vq->vq_packed.used_wrap_counter)
>>>> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
>>>> +
>>>> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag,
>>>> +v_used_flag));
>>>> +
>>>> +	/* Check all descs are used. */
>>>> +	if (desc_stats)
>>>> +		return -1;
>>>
>>> Same here. You should return 0 here as the queue is full.
>>
>> Just looked again at the code and at the AVX implementation.
>> It should not return 0 here, but any positive value.
>>
>> Maybe the cleanest way would be to change the function prototype to int:
>> 0: success
>> -1: failure
>
>
>Joyce, are you fine if I do the change while applying?
>I have a big series that will conflicts with your patch set, so I'd like to have
>yours merged ASAP so I can start the rebase.
>
>Thanks,
>Maxime
>

Maxime, it's OK if you do the change while applying.

Thanks,
Joyce
 
>>>> +
>>>> +	/* Load 2 mbuf pointers per time. */
>>>> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
>>>> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
>>>> +
>>>> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
>>>> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
>>>> +
>>>> +	/**
>>>> +	 *  Update data length and packet length for descriptor.
>>>> +	 *  structure of pkt_mb:
>>>> +	 *  --------------------------------------------------------------------
>>>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>>>> +	 *  --------------------------------------------------------------------
>>>> +	 */
>>>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>>>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>>>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>>>> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
>>>> +
>>>> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
>>>> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
>>>> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
>>>> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
>>>> +
>>>> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
>>>> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
>>>> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
>>>> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
>>>> +
>>>> +	if (hw->has_rx_offload) {
>>>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
>>>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
>>>> +				RTE_PKTMBUF_HEADROOM - head_size;
>>>> +			virtio_vec_rx_offload(rx_pkts[i],
>>>> +					(struct virtio_net_hdr *)addr);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
>>>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
>>>> +			rx_pkts[3]->pkt_len);
>>>> +
>>>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
>>>> +
>>>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
>>>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
>>>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
>>>> +		vq->vq_packed.used_wrap_counter ^= 1;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>>
>>>
  
Ferruh Yigit Jan. 8, 2021, 5:02 p.m. UTC | #5
On 11/17/2020 10:06 AM, Joyce Kong wrote:
> +	/**
> +	 *  Update data length and packet length for descriptor.
> +	 *  structure of pkt_mb:
> +	 *  --------------------------------------------------------------------
> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
> +	 *  --------------------------------------------------------------------
> +	 */
> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'

s\'\;
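
That is, the stray closing quote should be a semicolon so the assignment
matches its neighbours:

	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
			vreinterpretq_u8_u64(desc[1]), shuf_msk1));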

I will fix in next-net but my concern is why this has not been caught by any of
our automated builds?

In patchwork, the only test report seems to be from the 'checkpatch':
https://patches.dpdk.org/patch/84260/
  
Honnappa Nagarahalli Jan. 8, 2021, 10:26 p.m. UTC | #6
<snip>

> 
> On 11/17/2020 10:06 AM, Joyce Kong wrote:
> > +	/**
> > +	 *  Update data length and packet length for descriptor.
> > +	 *  structure of pkt_mb:
> > +	 *  --------------------------------------------------------------------
> > +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits
> vlan_tci|
> > +	 *  --------------------------------------------------------------------
> > +	 */
> > +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
> > +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
> > +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
> > +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
> 
> s\'\;
> 
> I will fix in next-net but my concern is why this has been not caught by any of
> our automated builds?
> 
> In patchwork only test report seems from the 'checkpatch':
> https://patches.dpdk.org/patch/84260/
 
Looking at [1], Travis CI has not run and the UNH CI did not have Arm builds enabled at the time this patch was submitted.

[1] https://patches.dpdk.org/patch/84262/
  
Maxime Coquelin Jan. 11, 2021, 10:45 a.m. UTC | #7
On 1/8/21 6:02 PM, Ferruh Yigit wrote:
> On 11/17/2020 10:06 AM, Joyce Kong wrote:
>> +    /**
>> +     *  Update data length and packet length for descriptor.
>> +     *  structure of pkt_mb:
>> +     * 
>> --------------------------------------------------------------------
>> +     *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits
>> vlan_tci|
>> +     * 
>> --------------------------------------------------------------------
>> +     */
>> +    pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +            vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> +    pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +            vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> +    pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +            vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
> 
> s\'\;
> 
> I will fix in next-net but my concern is why this has not been caught by
> any of our automated builds?
> 
> In patchwork, the only test report seems to be from the 'checkpatch':
> https://patches.dpdk.org/patch/84260/
> 

Thanks Ferruh for spotting and fixing it.

I think the CI was broken at the time it was submitted; it would be
great to have a way to manually trigger the CI again!

That, plus my changing laptops recently and not having a full multi-arch
build system up and running again, made this build issue pass through...

Regards,
Maxime
  
Aaron Conole Jan. 11, 2021, 1:04 p.m. UTC | #8
Ferruh Yigit <ferruh.yigit@intel.com> writes:

> On 11/17/2020 10:06 AM, Joyce Kong wrote:
>> +	/**
>> +	 *  Update data length and packet length for descriptor.
>> +	 *  structure of pkt_mb:
>> +	 *  --------------------------------------------------------------------
>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>> +	 *  --------------------------------------------------------------------
>> +	 */
>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>
> s\'\;
>
> I will fix in next-net but my concern is why this has not been caught
> by any of our automated builds?

That series was flagged for an error by Travis:

http://mails.dpdk.org/archives/test-report/2020-November/167602.html

Unfortunately, the build seems to have been purged (since it's from
November).  But Travis did flag the build as failing.  With github
actions we hope to pull the full logs into the email.

> In patchwork, the only test report seems to be from the 'checkpatch':
> https://patches.dpdk.org/patch/84260/

At least the 0-day robot does not submit each patch for a separate build.
We did that at first, and the robot's queue reached a week of backlog
because the build takes a while, especially when we get 20+ patch
series followed by v2-v4 fixing build errors or compile errors.
  
Aaron Conole Jan. 11, 2021, 1:05 p.m. UTC | #9
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> writes:

> <snip>
>
>> 
>> On 11/17/2020 10:06 AM, Joyce Kong wrote:
>> > +	/**
>> > +	 *  Update data length and packet length for descriptor.
>> > +	 *  structure of pkt_mb:
>> > +	 *  --------------------------------------------------------------------
>> > +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits
>> vlan_tci|
>> > +	 *  --------------------------------------------------------------------
>> > +	 */
>> > +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> > +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> > +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> > +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>> 
>> s\'\;
>> 
>> I will fix in next-net but my concern is why this has not been caught by any of
>> our automated builds?
>> 
>> In patchwork, the only test report seems to be from the 'checkpatch':
>> https://patches.dpdk.org/patch/84260/
>  
> Looking at [1], Travis CI has not run and the UNH CI did not have Arm
> builds enabled at the time this patch was submitted.

See my other email.  Travis CI doesn't run 'patch-at-a-time' for
execution time reasons.

> [1] https://patches.dpdk.org/patch/84262/
  

Patch

diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
index b0b1d63ec..8f5198ad7 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.h
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -19,9 +19,16 @@ 
 #include "virtqueue.h"
 
 #define BYTE_SIZE 8
+
+#ifdef CC_AVX512_SUPPORT
 /* flag bits offset in packed ring desc higher 64bits */
 #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
 	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+#elif defined(RTE_ARCH_ARM)
+/* flag bits offset in packed ring desc from ID */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
+#endif
 
 #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
 	FLAGS_BITS_OFFSET)
@@ -44,8 +51,16 @@ 
 /* net hdr short size mask */
 #define NET_HDR_MASK 0x3F
 
+#ifdef RTE_ARCH_ARM
+/* The cache line size on different Arm platforms are different, so
+ * put a four batch size here to match with the minimum cache line
+ * size and accommodate NEON register size.
+ */
+#define PACKED_BATCH_SIZE 4
+#else
 #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
 	sizeof(struct vring_packed_desc))
+#endif
 #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
 
 #ifdef VIRTIO_GCC_UNROLL_PRAGMA
diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
new file mode 100644
index 000000000..fb1e49909
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
@@ -0,0 +1,150 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Arm Corporation
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+#include <rte_vect.h>
+
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t head_size = hw->vtnet_hdr_size;
+	uint16_t id = vq->vq_used_cons_idx;
+	struct vring_packed_desc *p_desc;
+	uint16_t i;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Map packed descriptor to mbuf fields. */
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		0, 1,			/* octet 1~0, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		0, 1,			/* octet 1~0, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		8, 9,			/* octet 9~8, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		8, 9,			/* octet 9~8, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length. */
+	uint16x8_t len_adjust = {
+		0, 0,		/* ignore pkt_type field */
+		head_size,	/* sub head_size on pkt_len */
+		0,		/* ignore high 16 bits of pkt_len */
+		head_size,	/* sub head_size on data_len */
+		0, 0, 0		/* ignore non-length fields */
+	};
+
+	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
+	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
+	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
+
+	p_desc = &vq->vq_packed.ring.desc[id];
+	/* Load high 64 bits of packed descriptor 0,1. */
+	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
+	/* Load high 64 bits of packed descriptor 2,3. */
+	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
+
+	/* Only care avail/used bits. */
+	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
+	/* Extract high 32 bits of packed descriptor (id, flags). */
+	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
+				vreinterpretq_u32_u64(desc[1]));
+	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
+
+	uint32x4_t v_used_flag = vdupq_n_u32(0);
+	if (vq->vq_packed.used_wrap_counter)
+		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
+
+	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
+
+	/* Check all descs are used. */
+	if (desc_stats)
+		return -1;
+
+	/* Load 2 mbuf pointers per time. */
+	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
+	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
+
+	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
+	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
+
+	/**
+	 *  Update data length and packet length for descriptor.
+	 *  structure of pkt_mb:
+	 *  --------------------------------------------------------------------
+	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
+	 *  --------------------------------------------------------------------
+	 */
+	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
+	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+
+	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+
+	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
+	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
+	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
+	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - head_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}