From patchwork Tue Nov 17 10:06:33 2020
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Joyce Kong <joyce.kong@arm.com>
X-Patchwork-Id: 84260
X-Patchwork-Delegate: maxime.coquelin@redhat.com
From: Joyce Kong <joyce.kong@arm.com>
To: maxime.coquelin@redhat.com, chenbo.xia@intel.com, jerinj@marvell.com,
 ruifeng.wang@arm.com, honnappa.nagarahalli@arm.com
Cc: dev@dpdk.org, nd@arm.com
Date: Tue, 17 Nov 2020 18:06:33 +0800
Message-Id: <20201117100635.27690-3-joyce.kong@arm.com>
X-Mailer: git-send-email 2.28.0
In-Reply-To: <20201117100635.27690-1-joyce.kong@arm.com>
References: <20200911120906.45995-1-joyce.kong@arm.com>
 <20201117100635.27690-1-joyce.kong@arm.com>
Subject: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx
 NEON path

Optimize packed ring Rx batch path with NEON instructions.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
 drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h

diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
index b0b1d63ec..8f5198ad7 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.h
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -19,9 +19,16 @@
 #include "virtqueue.h"
 
 #define BYTE_SIZE 8
+
+#ifdef CC_AVX512_SUPPORT
 /* flag bits offset in packed ring desc higher 64bits */
 #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
 	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+#elif defined(RTE_ARCH_ARM)
+/* flag bits offset in packed ring desc from ID */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
+#endif
 
 #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
 	FLAGS_BITS_OFFSET)
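The two FLAGS_BITS_OFFSET variants above differ because each vector path
inspects a different slice of the 16-byte packed descriptor: the AVX512 path
tests the flags inside the high 64 bits {len, id, flags}, while the NEON path
narrows further to the 32-bit {id, flags} word. A minimal standalone sketch of
that arithmetic, with the descriptor layout re-declared locally since it
normally comes from the virtio headers:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BYTE_SIZE 8

/* Packed virtqueue descriptor layout (16 bytes, per the virtio 1.1 spec). */
struct vring_packed_desc {
	uint64_t addr;   /* bytes  0..7  */
	uint32_t len;    /* bytes  8..11 */
	uint16_t id;     /* bytes 12..13 */
	uint16_t flags;  /* bytes 14..15 */
};

int main(void)
{
	/* AVX512 path: offset of flags within {len, id, flags},
	 * i.e. (14 - 8) * 8 = 48 bits. */
	printf("AVX512 FLAGS_BITS_OFFSET = %zu bits\n",
	       (offsetof(struct vring_packed_desc, flags) -
		offsetof(struct vring_packed_desc, len)) * BYTE_SIZE);
	/* NEON path: offset of flags within {id, flags},
	 * i.e. (14 - 12) * 8 = 16 bits. */
	printf("NEON FLAGS_BITS_OFFSET = %zu bits\n",
	       (offsetof(struct vring_packed_desc, flags) -
		offsetof(struct vring_packed_desc, id)) * BYTE_SIZE);
	return 0;
}

PACKED_FLAGS_MASK then shifts the avail/used flag bits to the matching
position, so one vector AND plus compare can test all lanes at once.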
@@ -44,8 +51,16 @@
 /* net hdr short size mask */
 #define NET_HDR_MASK 0x3F
 
+#ifdef RTE_ARCH_ARM
+/* The cache line size differs across Arm platforms, so use a batch
+ * size of four here to match the minimum cache line size and the
+ * NEON register width.
+ */
+#define PACKED_BATCH_SIZE 4
+#else
 #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
 	sizeof(struct vring_packed_desc))
+#endif
 #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
 
 #ifdef VIRTIO_GCC_UNROLL_PRAGMA

diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
new file mode 100644
index 000000000..fb1e49909
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Arm Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+#include <rte_vect.h>
+
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t head_size = hw->vtnet_hdr_size;
+	uint16_t id = vq->vq_used_cons_idx;
+	struct vring_packed_desc *p_desc;
+	uint16_t i;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Map packed descriptor to mbuf fields. */
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		0, 1,			/* octet 1~0, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		0, 1,			/* octet 1~0, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		8, 9,			/* octet 9~8, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		8, 9,			/* octet 9~8, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length. */
+	uint16x8_t len_adjust = {
+		0, 0,		/* ignore pkt_type field */
+		head_size,	/* sub head_size on pkt_len */
+		0,		/* ignore high 16 bits of pkt_len */
+		head_size,	/* sub head_size on data_len */
+		0, 0, 0		/* ignore non-length fields */
+	};
+
+	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
+	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
+	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
+
+	p_desc = &vq->vq_packed.ring.desc[id];
+	/* Load high 64 bits of packed descriptor 0,1. */
+	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
+	/* Load high 64 bits of packed descriptor 2,3. */
+	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
+
+	/* Only care about the avail/used bits. */
+	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
+	/* Extract high 32 bits of packed descriptor (id, flags). */
+	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
+				vreinterpretq_u32_u64(desc[1]));
+	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
+
+	uint32x4_t v_used_flag = vdupq_n_u32(0);
+	if (vq->vq_packed.used_wrap_counter)
+		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
+
+	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag,
+					v_used_flag));
+
+	/* Check all descs are used. */
+	if (desc_stats)
+		return -1;
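In scalar terms, the vector test just performed is the standard packed-ring
availability check applied to four descriptors at once: every descriptor's
AVAIL and USED flag bits must match the ring's current used_wrap_counter
before the batch can be consumed. A simplified sketch of the equivalent
scalar logic (illustrative only, not part of the patch; the 0x8080 constant
assumes VRING_PACKED_DESC_F_AVAIL_USED is (1 << 7) | (1 << 15), per the
virtio spec):

#include <stdbool.h>
#include <stdint.h>

/* Assumed value: AVAIL (1 << 7) and USED (1 << 15) bits together. */
#define DESC_F_AVAIL_USED 0x8080

/* Returns true when all four descriptors of the batch are used,
 * mirroring the vandq_u32/vceqq_u32 sequence above. */
static bool
batch_descs_used(const uint16_t flags[4], bool wrap_counter)
{
	/* Expected pattern: both bits set while the wrap counter is 1,
	 * both clear after it flips to 0. */
	uint16_t expect = wrap_counter ? DESC_F_AVAIL_USED : 0;
	int i;

	for (i = 0; i < 4; i++)
		if ((flags[i] & DESC_F_AVAIL_USED) != expect)
			return false;
	return true;
}

The NEON version folds the four comparisons into one 128-bit result
(desc_stats), so a single scalar test decides whether the batch proceeds.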
+
+	/* Load two mbuf pointers at a time. */
+	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
+	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
+
+	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
+	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
+
+	/**
+	 * Update data length and packet length for descriptor.
+	 * Structure of pkt_mb:
+	 * --------------------------------------------------------------------
+	 * |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
+	 * --------------------------------------------------------------------
+	 */
+	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+
+	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+
+	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
+	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
+	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
+	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - head_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
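Not shown in this patch is the burst receive routine that drives the helper.
For context, the caller is expected to consume full batches while the
function returns 0 and fall back to the scalar single-descriptor path
otherwise. A simplified sketch of that calling pattern (the function name and
the omitted refill/fallback handling are illustrative assumptions about how
the rest of this series wires it in, not code from this patch):

/* Illustrative burst-receive loop built on the batch dequeue above. */
static uint16_t
virtio_recv_pkts_packed_vec_sketch(struct virtnet_rx *rxvq,
		struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	uint16_t nb_rx = 0;

	/* Consume PACKED_BATCH_SIZE descriptors at a time until the
	 * next batch is not fully used (the helper returns non-zero). */
	while (nb_rx + PACKED_BATCH_SIZE <= nb_pkts) {
		if (virtqueue_dequeue_batch_packed_vec(rxvq,
				&rx_pkts[nb_rx]))
			break;
		nb_rx += PACKED_BATCH_SIZE;
	}

	/* A real implementation would continue with the scalar
	 * single-descriptor dequeue here and refill the ring. */
	return nb_rx;
}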