From patchwork Tue Jan 20 18:41:02 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Ananyev, Konstantin" X-Patchwork-Id: 2412 Return-Path: X-Original-To: patchwork@dpdk.org Delivered-To: patchwork@dpdk.org Received: from [92.243.14.124] (localhost [IPv6:::1]) by dpdk.org (Postfix) with ESMTP id A54035AC3; Tue, 20 Jan 2015 19:41:54 +0100 (CET) Received: from mga09.intel.com (mga09.intel.com [134.134.136.24]) by dpdk.org (Postfix) with ESMTP id E122C5AD8 for ; Tue, 20 Jan 2015 19:41:43 +0100 (CET) Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by orsmga102.jf.intel.com with ESMTP; 20 Jan 2015 10:38:25 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.09,435,1418112000"; d="scan'208";a="664782666" Received: from irvmail001.ir.intel.com ([163.33.26.43]) by fmsmga002.fm.intel.com with ESMTP; 20 Jan 2015 10:41:21 -0800 Received: from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com [10.237.217.46]) by irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id t0KIfLNZ029511; Tue, 20 Jan 2015 18:41:21 GMT Received: from sivswdev02.ir.intel.com (localhost [127.0.0.1]) by sivswdev02.ir.intel.com with ESMTP id t0KIfLQr018877; Tue, 20 Jan 2015 18:41:21 GMT Received: (from kananye1@localhost) by sivswdev02.ir.intel.com with id t0KIfLRk018873; Tue, 20 Jan 2015 18:41:21 GMT From: Konstantin Ananyev To: dev@dpdk.org Date: Tue, 20 Jan 2015 18:41:02 +0000 Message-Id: <1421779267-18492-14-git-send-email-konstantin.ananyev@intel.com> X-Mailer: git-send-email 1.7.4.1 In-Reply-To: <1421779267-18492-1-git-send-email-konstantin.ananyev@intel.com> References: <1421779267-18492-1-git-send-email-konstantin.ananyev@intel.com> Subject: [dpdk-dev] [PATCH v3 13/18] librte_acl: Remove search_sse_2 and relatives. 
X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Previous improvements made the scalar method the fastest one for a tiny bunch of packets (< 4). That allows us to remove the specific vector code-path for a small number of packets (search_sse_2) and always use the scalar method for such cases. Signed-off-by: Konstantin Ananyev --- lib/librte_acl/acl_run_avx2.c | 2 +- lib/librte_acl/acl_run_sse.c | 3 +- lib/librte_acl/acl_run_sse.h | 110 ------------------------------------------ 3 files changed, 3 insertions(+), 112 deletions(-) diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c index 0a42f72..79ebbd6 100644 --- a/lib/librte_acl/acl_run_avx2.c +++ b/lib/librte_acl/acl_run_avx2.c @@ -49,6 +49,6 @@ rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data, else if (num >= MAX_SEARCHES_SSE4) return search_sse_4(ctx, data, results, num, categories); else - return search_sse_2(ctx, data, results, num, + return rte_acl_classify_scalar(ctx, data, results, num, categories); } diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c index 77b32b3..a5a7d36 100644 --- a/lib/librte_acl/acl_run_sse.c +++ b/lib/librte_acl/acl_run_sse.c @@ -42,5 +42,6 @@ rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, else if (num >= MAX_SEARCHES_SSE4) return search_sse_4(ctx, data, results, num, categories); else - return search_sse_2(ctx, data, results, num, categories); + return rte_acl_classify_scalar(ctx, data, results, num, + categories); } diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h index e33e16b..1b7870e 100644 --- a/lib/librte_acl/acl_run_sse.h +++ b/lib/librte_acl/acl_run_sse.h @@ -45,10 +45,6 @@ static const rte_xmm_t xmm_shuffle_input = { .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}, }; -static const 
rte_xmm_t xmm_shuffle_input64 = { - .u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080}, -}; - static const rte_xmm_t xmm_ones_16 = { .u16 = {1, 1, 1, 1, 1, 1, 1, 1}, }; @@ -62,15 +58,6 @@ static const rte_xmm_t xmm_match_mask = { }, }; -static const rte_xmm_t xmm_match_mask64 = { - .u32 = { - RTE_ACL_NODE_MATCH, - 0, - RTE_ACL_NODE_MATCH, - 0, - }, -}; - static const rte_xmm_t xmm_index_mask = { .u32 = { RTE_ACL_NODE_INDEX, @@ -80,16 +67,6 @@ static const rte_xmm_t xmm_index_mask = { }, }; -static const rte_xmm_t xmm_index_mask64 = { - .u32 = { - RTE_ACL_NODE_INDEX, - RTE_ACL_NODE_INDEX, - 0, - 0, - }, -}; - - /* * Resolve priority for multiple results (sse version). * This consists comparing the priority of the current traversal with the @@ -161,22 +138,6 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx, } /* - * Check for a match in 2 transitions (contained in SSE register) - */ -static inline __attribute__((always_inline)) void -acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, - struct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask) -{ - xmm_t temp; - - temp = MM_AND(match_mask, *indices); - while (!MM_TESTZ(temp, temp)) { - acl_process_matches(indices, slot, ctx, parms, flows); - temp = MM_AND(match_mask, *indices); - } -} - -/* * Check for any match in 4 transitions (contained in 2 SSE registers) */ static inline __attribute__((always_inline)) void @@ -460,74 +421,3 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, return 0; } - -static inline __attribute__((always_inline)) xmm_t -transition2(xmm_t next_input, const uint64_t *trans, xmm_t *indices1) -{ - uint64_t t; - xmm_t addr, indices2; - - indices2 = _mm_setzero_si128(); - - addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x, - xmm_ones_16.x, *indices1, indices2); - - /* Gather 64 bit transitions and pack 2 per register. 
*/ - - t = trans[MM_CVT32(addr)]; - - /* get slot 1 */ - addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1); - *indices1 = MM_SET64(trans[MM_CVT32(addr)], t); - - return MM_SRL32(next_input, CHAR_BIT); -} - -/* - * Execute trie traversal with 2 traversals in parallel. - */ -static inline int -search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data, - uint32_t *results, uint32_t total_packets, uint32_t categories) -{ - int n; - struct acl_flow_data flows; - uint64_t index_array[MAX_SEARCHES_SSE2]; - struct completion cmplt[MAX_SEARCHES_SSE2]; - struct parms parms[MAX_SEARCHES_SSE2]; - xmm_t input, indices; - - acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, - total_packets, categories, ctx->trans_table); - - for (n = 0; n < MAX_SEARCHES_SSE2; n++) { - cmplt[n].count = 0; - index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); - } - - indices = MM_LOADU((xmm_t *) &index_array[0]); - - /* Check for any matches. */ - acl_match_check_x2(0, ctx, parms, &flows, &indices, - xmm_match_mask64.x); - - while (flows.started > 0) { - - /* Gather 4 bytes of input data for each stream. */ - input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0)); - input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1); - - /* Process the 4 bytes of input on each stream. */ - - input = transition2(input, flows.trans, &indices); - input = transition2(input, flows.trans, &indices); - input = transition2(input, flows.trans, &indices); - input = transition2(input, flows.trans, &indices); - - /* Check for any matches. */ - acl_match_check_x2(0, ctx, parms, &flows, &indices, - xmm_match_mask64.x); - } - - return 0; -}