@@ -169,6 +169,7 @@ struct rte_acl_ctx {
int32_t socket_id;
/** Socket ID to allocate memory from. */
enum rte_acl_classify_alg alg;
+ uint32_t first_load_sz;
void *rules;
uint32_t max_rules;
uint32_t rule_sz;
@@ -1581,6 +1581,37 @@ acl_check_bld_param(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg)
return 0;
}
+/*
+ * With the current ACL implementation the first field in the rule
+ * definition always has to be one byte long. Though for optimising
+ * the *classify* implementation it might be useful to be able to use
+ * 4B reads (as we do for the rest of the fields).
+ * This function checks the input config to determine whether it is
+ * safe to do 4B loads for the first ACL field. For that we need to
+ * make sure that the first field in our rule definition doesn't have
+ * the biggest offset, i.e. we still have other fields located after
+ * the first one. Conversely, if the first field has the largest
+ * offset, then it means the first field can occupy the very last byte
+ * in the input data buffer, and we have to do a single byte load for it.
+ */
+static uint32_t
+get_first_load_size(const struct rte_acl_config *cfg)
+{
+ uint32_t i, max_ofs, ofs;
+
+ ofs = 0;
+ max_ofs = 0;
+
+ for (i = 0; i != cfg->num_fields; i++) {
+ if (cfg->defs[i].field_index == 0)
+ ofs = cfg->defs[i].offset;
+ else if (max_ofs < cfg->defs[i].offset)
+ max_ofs = cfg->defs[i].offset;
+ }
+
+ return (ofs < max_ofs) ? sizeof(uint32_t) : sizeof(uint8_t);
+}
+
int
rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg)
{
@@ -1618,6 +1649,9 @@ rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg)
/* set data indexes. */
acl_set_data_indexes(ctx);
+ /* determine whether we can always do a 4B load */
+ ctx->first_load_sz = get_first_load_size(cfg);
+
/* copy in build config. */
ctx->config = *cfg;
}
@@ -16,6 +16,7 @@ struct acl_flow_avx512 {
uint32_t num_packets; /* number of packets processed */
uint32_t total_packets; /* max number of packets to process */
uint32_t root_index; /* current root index */
+ uint32_t first_load_sz; /* first load size for new packet */
const uint64_t *trans; /* transition table */
const uint32_t *data_index; /* input data indexes */
const uint8_t **idata; /* input data */
@@ -29,6 +30,7 @@ acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx,
{
flow->num_packets = 0;
flow->total_packets = total_packets;
+ flow->first_load_sz = ctx->first_load_sz;
flow->root_index = ctx->trie[trie].root_index;
flow->trans = ctx->trans_table;
flow->data_index = ctx->trie[trie].data_index;
@@ -155,6 +157,11 @@ resolve_mcgt8_avx512x1(uint32_t result[],
}
}
+/*
+ * Unfortunately the current AVX512 ISA doesn't provide the ability
+ * to do a gather load on byte quantities. So we have to mimic it in
+ * SW, by doing 8x1B scalar loads.
+ */
static inline ymm_t
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
{
@@ -413,7 +413,7 @@ match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
if (n[0] != 0) {
inp[0] = get_next_bytes_avx512x16(flow, &pdata[0],
- rm[0], &di[0], sizeof(uint8_t));
+ rm[0], &di[0], flow->first_load_sz);
first_trans16(flow, inp[0], rm[0], &tr_lo[0],
&tr_hi[0]);
rm[0] = _mm512_test_epi32_mask(tr_lo[0],
@@ -422,7 +422,7 @@ match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
if (n[1] != 0) {
inp[1] = get_next_bytes_avx512x16(flow, &pdata[2],
- rm[1], &di[1], sizeof(uint8_t));
+ rm[1], &di[1], flow->first_load_sz);
first_trans16(flow, inp[1], rm[1], &tr_lo[1],
&tr_hi[1]);
rm[1] = _mm512_test_epi32_mask(tr_lo[1],
@@ -447,9 +447,9 @@ search_trie_avx512x16x2(struct acl_flow_avx512 *flow)
start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);
in[0] = get_next_bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0],
- sizeof(uint8_t));
+ flow->first_load_sz);
in[1] = get_next_bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1],
- sizeof(uint8_t));
+ flow->first_load_sz);
first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);
first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);
@@ -325,7 +325,7 @@ match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
if (n[0] != 0) {
inp[0] = get_next_bytes_avx512x8(flow, pdata[0], rm[0],
- &di[0], sizeof(uint8_t));
+ &di[0], flow->first_load_sz);
first_trans8(flow, inp[0], rm[0], &tr_lo[0], &tr_hi[0]);
rm[0] = _mm256_test_epi32_mask(tr_lo[0],
@@ -334,7 +334,7 @@ match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
if (n[1] != 0) {
inp[1] = get_next_bytes_avx512x8(flow, pdata[1], rm[1],
- &di[1], sizeof(uint8_t));
+ &di[1], flow->first_load_sz);
first_trans8(flow, inp[1], rm[1], &tr_lo[1], &tr_hi[1]);
rm[1] = _mm256_test_epi32_mask(tr_lo[1],
@@ -360,9 +360,9 @@ search_trie_avx512x8x2(struct acl_flow_avx512 *flow)
start_flow8(flow, CHAR_BIT, UINT8_MAX, &pdata[1], &idx[1], &di[1]);
inp[0] = get_next_bytes_avx512x8(flow, pdata[0], UINT8_MAX, &di[0],
- sizeof(uint8_t));
+ flow->first_load_sz);
inp[1] = get_next_bytes_avx512x8(flow, pdata[1], UINT8_MAX, &di[1],
- sizeof(uint8_t));
+ flow->first_load_sz);
first_trans8(flow, inp[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]);
first_trans8(flow, inp[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]);
@@ -486,6 +486,7 @@ rte_acl_dump(const struct rte_acl_ctx *ctx)
printf("acl context <%s>@%p\n", ctx->name, ctx);
printf(" socket_id=%"PRId32"\n", ctx->socket_id);
printf(" alg=%"PRId32"\n", ctx->alg);
+ printf(" first_load_sz=%"PRIu32"\n", ctx->first_load_sz);
printf(" max_rules=%"PRIu32"\n", ctx->max_rules);
printf(" rule_size=%"PRIu32"\n", ctx->rule_sz);
printf(" num_rules=%"PRIu32"\n", ctx->num_rules);