@@ -44,6 +44,133 @@
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif
+/* Handling the offload flags (olflags) field takes computation
+ * time when receiving packets. Therefore we provide a flag to disable
+ * the processing of the olflags field when they are not needed. This
+ * gives improved performance, at the cost of losing the offload info
+ * in the received packet
+ */
+#ifdef RTE_LIBRTE_FM10K_RX_OLFLAGS_ENABLE
+
+/* Vlan present flag shift */
+#define VP_SHIFT (2)
+/* L3 type shift */
+#define L3TYPE_SHIFT (4)
+/* L4 type shift */
+#define L4TYPE_SHIFT (7)
+
+static inline void
+fm10k_desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
+{
+ __m128i ptype0, ptype1, vtag0, vtag1;
+ union {
+ uint16_t e[4];
+ uint64_t dword;
+ } vol;
+
+ const __m128i pkttype_msk = _mm_set_epi16(
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
+ PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);
+
+ /* mask everything except rss type */
+ const __m128i rsstype_msk = _mm_set_epi16(
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x000F, 0x000F, 0x000F, 0x000F);
+
+ /* map rss type to rss hash flag */
+ const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
+ 0, 0, 0, PKT_RX_RSS_HASH,
+ PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
+ PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);
+
+ ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
+ ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
+ vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
+ vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);
+
+ ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
+ ptype0 = _mm_and_si128(ptype0, rsstype_msk);
+ ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);
+
+ vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
+ vtag1 = _mm_srli_epi16(vtag1, VP_SHIFT);
+ vtag1 = _mm_and_si128(vtag1, pkttype_msk);
+
+ vtag1 = _mm_or_si128(ptype0, vtag1);
+ vol.dword = _mm_cvtsi128_si64(vtag1);
+
+ rx_pkts[0]->ol_flags = vol.e[0];
+ rx_pkts[1]->ol_flags = vol.e[1];
+ rx_pkts[2]->ol_flags = vol.e[2];
+ rx_pkts[3]->ol_flags = vol.e[3];
+}
+
+static inline void
+fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
+{
+ __m128i l3l4type0, l3l4type1, l3type, l4type;
+ union {
+ uint16_t e[4];
+ uint64_t dword;
+ } vol;
+
+ /* L3 pkt type mask Bit4 to Bit6 */
+ const __m128i l3type_msk = _mm_set_epi16(
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0070, 0x0070, 0x0070, 0x0070);
+
+ /* L4 pkt type mask Bit7 to Bit9 */
+ const __m128i l4type_msk = _mm_set_epi16(
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0380, 0x0380, 0x0380, 0x0380);
+
+ /* convert RRC l3 type to mbuf format */
+ const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
+ RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
+ RTE_PTYPE_L3_IPV4, 0);
+
+ /* Convert RRC l4 type to mbuf format l4type_flags shift-left 8 bits
+ * to fill into8 bits length.
+ */
+ const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
+ RTE_PTYPE_TUNNEL_GENEVE >> 8,
+ RTE_PTYPE_TUNNEL_NVGRE >> 8,
+ RTE_PTYPE_TUNNEL_VXLAN >> 8,
+ RTE_PTYPE_TUNNEL_GRE >> 8,
+ RTE_PTYPE_L4_UDP >> 8,
+ RTE_PTYPE_L4_TCP >> 8,
+ 0);
+
+ l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
+ l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
+ l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);
+
+ l3type = _mm_and_si128(l3l4type0, l3type_msk);
+ l4type = _mm_and_si128(l3l4type0, l4type_msk);
+
+ l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
+ l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);
+
+ l3type = _mm_shuffle_epi8(l3type_flags, l3type);
+ /* l4type_flags shift-left for 8 bits, need shift-right back */
+ l4type = _mm_shuffle_epi8(l4type_flags, l4type);
+
+ l4type = _mm_slli_epi16(l4type, 8);
+ l3l4type0 = _mm_or_si128(l3type, l4type);
+ vol.dword = _mm_cvtsi128_si64(l3l4type0);
+
+ rx_pkts[0]->packet_type = vol.e[0];
+ rx_pkts[1]->packet_type = vol.e[1];
+ rx_pkts[2]->packet_type = vol.e[2];
+ rx_pkts[3]->packet_type = vol.e[3];
+}
+#else
+#define fm10k_desc_to_olflags_v(desc, rx_pkts) do {} while (0)
+#define fm10k_desc_to_pktype_v(desc, rx_pkts) do {} while (0)
+#endif
+
int __attribute__((cold))
fm10k_rxq_vec_setup(struct fm10k_rx_queue *rxq)
{