From patchwork Fri Oct 23 19:59:06 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Stephen Hemminger X-Patchwork-Id: 7981 Return-Path: X-Original-To: patchwork@dpdk.org Delivered-To: patchwork@dpdk.org Received: from [92.243.14.124] (localhost [IPv6:::1]) by dpdk.org (Postfix) with ESMTP id 3815C5A1F; Fri, 23 Oct 2015 21:59:01 +0200 (CEST) Received: from mail-pa0-f50.google.com (mail-pa0-f50.google.com [209.85.220.50]) by dpdk.org (Postfix) with ESMTP id D3E29592B for ; Fri, 23 Oct 2015 21:58:59 +0200 (CEST) Received: by pacfv9 with SMTP id fv9so132230619pac.3 for ; Fri, 23 Oct 2015 12:58:59 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:date:from:to:cc:subject:message-id:in-reply-to :references:mime-version:content-type:content-transfer-encoding; bh=E17LhrnyW24m2xyJ0iCL6sG7FiXk+vJ2JkWJHHbn8Rs=; b=EeefJ6/GytlZnGcR+pSNwZ40H+7su0G/bDvBmPntfCxC7SMWgunUYC5WjdXe7zIE6E 5ZFGw0sYIc2URK8GgsbTUOEKBCydI2GO80oB5QyaQS1WuoexhF4f6Cw/6V7M6W926FZG 6IrKfn+0qtf3oj02LDBjnltq+t6th58dhGlrA0B1LBSmX38Rzh8DxPWnD4YJP1KfbzNA +UVBjdIyIg9GHNEt8lq2aQff9k9I+oJ83xCRktZUTAusEs6JbWT1+HnEJPOto45/iVy2 0wnaGc9jLTA1TbYh1qfFD4qCSm8yxadkM7t0xYwZNNG48+DKUca2ga99tjeAmcKPcdSL Djqg== X-Gm-Message-State: ALoCoQn6w2YoGf3OwtvPX318jK1AJvDDfm3YUtCNt9vQLM7zJU1MQh2xh3yy3DMGVtSPHSaYuW+X X-Received: by 10.68.65.37 with SMTP id u5mr25741794pbs.76.1445630338943; Fri, 23 Oct 2015 12:58:58 -0700 (PDT) Received: from xeon-e3 (static-50-53-82-155.bvtn.or.frontiernet.net. 
[50.53.82.155]) by smtp.gmail.com with ESMTPSA id xm4sm20469280pab.27.2015.10.23.12.58.58 (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Fri, 23 Oct 2015 12:58:58 -0700 (PDT) Date: Fri, 23 Oct 2015 12:59:06 -0700 From: Stephen Hemminger To: Matthew Hall Message-ID: <20151023125906.36fd3856@xeon-e3> In-Reply-To: <20151023183811.GA11859@mhcomputing.net> References: <1445608311-8092-1-git-send-email-michalx.k.jastrzebski@intel.com> <20151023162033.GA10036@mhcomputing.net> <20151023093305.2e971298@xeon-e3> <20151023183811.GA11859@mhcomputing.net> MIME-Version: 1.0 Cc: dev@dpdk.org Subject: Re: [dpdk-dev] [PATCH v1 0/3] lpm: increase number of next hops for lpm (ipv4) X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From 9efec4571eec4db455a29773b95cf9264c046a03 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 23 Oct 2015 12:55:05 -0700 Subject: [PATCH] lpm: brocade extensions This is a brute-force merge of the Brocade extension to LPM to current DPDK source tree. No API/ABI compatibility is expected. 1. Allow arbitrary number of rules 2. Get rid of N^2 search for rule add/delete 3. Add route scope 4. Extend nexthop to 16 bits 5. Extend to allow for more info on delete, (callback and nexthop) 6. Dynamically grow /8 table (requires RCU) 7. Support full /0 and /32 rules --- lib/librte_lpm/rte_lpm.c | 814 ++++++++++++++++++++++++++--------------------- lib/librte_lpm/rte_lpm.h | 381 +++++++--------------- 2 files changed, 567 insertions(+), 628 deletions(-) diff --git a/lib/librte_lpm/rte_lpm.c b/lib/librte_lpm/rte_lpm.c index 163ba3c..ef1f0bf 100644 --- a/lib/librte_lpm/rte_lpm.c +++ b/lib/librte_lpm/rte_lpm.c @@ -2,6 +2,7 @@ * BSD LICENSE * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 
+ * Copyright(c) 2012-2015 Brocade Communications Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,13 +39,15 @@ #include #include #include +#include #include #include #include -#include /* for definition of RTE_CACHE_LINE_SIZE */ +#include /* for definition of RTE_CACHE_LINE_SIZE */ #include #include +#include #include #include #include @@ -52,9 +55,25 @@ #include #include #include +#include #include "rte_lpm.h" +#include + +/** Auto-growth of tbl8 */ +#define RTE_LPM_TBL8_INIT_GROUPS 256 /* power of 2 */ +#define RTE_LPM_TBL8_INIT_ENTRIES (RTE_LPM_TBL8_INIT_GROUPS * \ + RTE_LPM_TBL8_GROUP_NUM_ENTRIES) +/** Rule structure. */ +struct rte_lpm_rule { + uint32_t ip; /**< Rule IP address. */ + uint16_t next_hop; /**< Rule next hop. */ + uint8_t scope; /**< Rule scope */ + uint8_t reserved; + RB_ENTRY(rte_lpm_rule) link; +}; + TAILQ_HEAD(rte_lpm_list, rte_tailq_entry); static struct rte_tailq_elem rte_lpm_tailq = { @@ -71,31 +90,55 @@ enum valid_flag { /* Macro to enable/disable run-time checks. */ #if defined(RTE_LIBRTE_LPM_DEBUG) -#include -#define VERIFY_DEPTH(depth) do { \ - if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH)) \ +#define VERIFY_DEPTH(depth) do { \ + if (depth > RTE_LPM_MAX_DEPTH) \ rte_panic("LPM: Invalid depth (%u) at line %d", \ - (unsigned)(depth), __LINE__); \ + (unsigned)(depth), __LINE__); \ } while (0) #else #define VERIFY_DEPTH(depth) #endif +/* Comparison function for red-black tree nodes. + "If the first argument is smaller than the second, the function + returns a value smaller than zero. If they are equal, the function + returns zero. Otherwise, it should return a value greater than zero." 
+*/ +static inline int rules_cmp(const struct rte_lpm_rule *r1, + const struct rte_lpm_rule *r2) +{ + if (r1->ip < r2->ip) + return -1; + else if (r1->ip > r2->ip) + return 1; + else + return r1->scope - r2->scope; +} + +/* Satisfy old style attribute in tree.h header */ +#ifndef __unused +#define __unused __attribute__ ((unused)) +#endif + +/* Generate internal functions and make them static. */ +RB_GENERATE_STATIC(rte_lpm_rules_tree, rte_lpm_rule, link, rules_cmp) + /* * Converts a given depth value to its corresponding mask value. * * depth (IN) : range = 1 - 32 - * mask (OUT) : 32bit mask + * mask (OUT) : 32bit mask */ static uint32_t __attribute__((pure)) depth_to_mask(uint8_t depth) { VERIFY_DEPTH(depth); - /* To calculate a mask start with a 1 on the left hand side and right - * shift while populating the left hand side with 1's - */ - return (int)0x80000000 >> (depth - 1); + /* per C std. shift of 32 bits is undefined */ + if (depth == 0) + return 0; + + return ~0u << (32 - depth); } /* @@ -113,7 +156,7 @@ depth_to_range(uint8_t depth) return 1 << (MAX_DEPTH_TBL24 - depth); /* Else if depth is greater than 24 */ - return (1 << (RTE_LPM_MAX_DEPTH - depth)); + return 1 << (32 - depth); } /* @@ -148,31 +191,28 @@ rte_lpm_find_existing(const char *name) * Allocates memory for LPM object */ struct rte_lpm * -rte_lpm_create(const char *name, int socket_id, int max_rules, - __rte_unused int flags) +rte_lpm_create(const char *name, int socket_id) { char mem_name[RTE_LPM_NAMESIZE]; struct rte_lpm *lpm = NULL; struct rte_tailq_entry *te; - uint32_t mem_size; + unsigned int depth; struct rte_lpm_list *lpm_list; + /* check that we have an initialized tail queue */ lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list); - RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl24_entry) != 2); - RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl8_entry) != 2); + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl24_entry) != 4); + RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl8_entry) != 4); /* 
Check user arguments. */ - if ((name == NULL) || (socket_id < -1) || (max_rules == 0)){ + if ((name == NULL) || (socket_id < -1)) { rte_errno = EINVAL; return NULL; } snprintf(mem_name, sizeof(mem_name), "LPM_%s", name); - /* Determine the amount of memory to allocate. */ - mem_size = sizeof(*lpm) + (sizeof(lpm->rules_tbl[0]) * max_rules); - rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK); /* guarantee there's no existing */ @@ -192,17 +232,33 @@ rte_lpm_create(const char *name, int socket_id, int max_rules, } /* Allocate memory to store the LPM data structures. */ - lpm = (struct rte_lpm *)rte_zmalloc_socket(mem_name, mem_size, - RTE_CACHE_LINE_SIZE, socket_id); + lpm = rte_zmalloc_socket(mem_name, sizeof(*lpm), RTE_CACHE_LINE_SIZE, + socket_id); if (lpm == NULL) { RTE_LOG(ERR, LPM, "LPM memory allocation failed\n"); - rte_free(te); goto exit; } /* Save user arguments. */ - lpm->max_rules = max_rules; snprintf(lpm->name, sizeof(lpm->name), "%s", name); + lpm->socket_id = socket_id; + + /* Vyatta change to use red-black tree */ + for (depth = 0; depth < RTE_LPM_MAX_DEPTH; ++depth) + RB_INIT(&lpm->rules[depth]); + + /* Vyatta change to dynamically grow tbl8 */ + lpm->tbl8_num_groups = RTE_LPM_TBL8_INIT_GROUPS; + lpm->tbl8_rover = RTE_LPM_TBL8_INIT_GROUPS - 1; + lpm->tbl8 = rte_calloc_socket(NULL, RTE_LPM_TBL8_INIT_ENTRIES, + sizeof(struct rte_lpm_tbl8_entry), + RTE_CACHE_LINE_SIZE, socket_id); + if (lpm->tbl8 == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 group allocation failed\n"); + rte_free(lpm); + lpm = NULL; + goto exit; + } te->data = (void *) lpm; @@ -245,248 +301,237 @@ rte_lpm_free(struct rte_lpm *lpm) rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK); + rte_free(lpm->tbl8); rte_free(lpm); rte_free(te); } + /* - * Adds a rule to the rule table. - * - * NOTE: The rule table is split into 32 groups. Each group contains rules that - * apply to a specific prefix depth (i.e. group 1 contains rules that apply to - * prefixes with a depth of 1 etc.). 
In the following code (depth - 1) is used - * to refer to depth 1 because even though the depth range is 1 - 32, depths - * are stored in the rule table from 0 - 31. - * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. + * Finds a rule in rule table. */ -static inline int32_t -rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, - uint8_t next_hop) +static struct rte_lpm_rule * +rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, uint8_t scope) { - uint32_t rule_gindex, rule_index, last_rule; - int i; - - VERIFY_DEPTH(depth); - - /* Scan through rule group to see if rule already exists. */ - if (lpm->rule_info[depth - 1].used_rules > 0) { - - /* rule_gindex stands for rule group index. */ - rule_gindex = lpm->rule_info[depth - 1].first_rule; - /* Initialise rule_index to point to start of rule group. */ - rule_index = rule_gindex; - /* Last rule = Last used rule in this rule group. */ - last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; - - for (; rule_index < last_rule; rule_index++) { + struct rte_lpm_rules_tree *head = &lpm->rules[depth]; + struct rte_lpm_rule k = { + .ip = ip_masked, + .scope = scope, + }; - /* If rule already exists update its next_hop and return. */ - if (lpm->rules_tbl[rule_index].ip == ip_masked) { - lpm->rules_tbl[rule_index].next_hop = next_hop; - - return rule_index; - } - } - - if (rule_index == lpm->max_rules) - return -ENOSPC; - } else { - /* Calculate the position in which the rule will be stored. 
*/ - rule_index = 0; + return RB_FIND(rte_lpm_rules_tree, head, &k); +} - for (i = depth - 1; i > 0; i--) { - if (lpm->rule_info[i - 1].used_rules > 0) { - rule_index = lpm->rule_info[i - 1].first_rule + lpm->rule_info[i - 1].used_rules; - break; - } - } - if (rule_index == lpm->max_rules) - return -ENOSPC; +/* Finds rule in table in scope order */ +static struct rte_lpm_rule * +rule_find_any(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +{ + struct rte_lpm_rule *r; + int scope; - lpm->rule_info[depth - 1].first_rule = rule_index; + for (scope = 255; scope >= 0; --scope) { + r = rule_find(lpm, ip_masked, depth, scope); + if (r) + return r; } - /* Make room for the new rule in the array. */ - for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) { - if (lpm->rule_info[i - 1].first_rule + lpm->rule_info[i - 1].used_rules == lpm->max_rules) - return -ENOSPC; + return NULL; +} - if (lpm->rule_info[i - 1].used_rules > 0) { - lpm->rules_tbl[lpm->rule_info[i - 1].first_rule + lpm->rule_info[i - 1].used_rules] - = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule]; - lpm->rule_info[i - 1].first_rule++; - } - } +/* + * Adds a rule to the rule table. + * + * NOTE: The rule table is split into 32 groups. Each group contains rules that + * apply to a specific prefix depth (i.e. group 1 contains rules that apply to + * prefixes with a depth of 1 etc.). + * NOTE: Valid range for depth parameter is 0 .. 32 inclusive. + */ +static struct rte_lpm_rule * +rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + uint16_t next_hop, uint8_t scope) +{ + struct rte_lpm_rules_tree *head = &lpm->rules[depth]; + struct rte_lpm_rule *r, *old; - /* Add the new rule. */ - lpm->rules_tbl[rule_index].ip = ip_masked; - lpm->rules_tbl[rule_index].next_hop = next_hop; + /* + * NB: uses regular malloc to avoid chewing up precious + * memory pool space for rules. + */ + r = malloc(sizeof(*r)); + if (!r) + return NULL; - /* Increment the used rules counter for this rule group. 
*/ - lpm->rule_info[depth - 1].used_rules++; + r->ip = ip_masked; + r->next_hop = next_hop; + r->scope = scope; - return rule_index; + old = RB_INSERT(rte_lpm_rules_tree, head, r); + if (!old) + return r; + + /* collision with existing rule */ + free(r); + return old; } /* * Delete a rule from the rule table. * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. */ -static inline void -rule_delete(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth) +static void +rule_delete(struct rte_lpm *lpm, struct rte_lpm_rule *r, uint8_t depth) { - int i; + struct rte_lpm_rules_tree *head = &lpm->rules[depth]; - VERIFY_DEPTH(depth); - - lpm->rules_tbl[rule_index] = lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule - + lpm->rule_info[depth - 1].used_rules - 1]; + RB_REMOVE(rte_lpm_rules_tree, head, r); - for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) { - if (lpm->rule_info[i].used_rules > 0) { - lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] = - lpm->rules_tbl[lpm->rule_info[i].first_rule + lpm->rule_info[i].used_rules - 1]; - lpm->rule_info[i].first_rule--; - } - } - - lpm->rule_info[depth - 1].used_rules--; + free(r); } /* - * Finds a rule in rule table. - * NOTE: Valid range for depth parameter is 1 .. 32 inclusive. 
+ * Dynamically increase size of tbl8 */ -static inline int32_t -rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth) +static int +tbl8_grow(struct rte_lpm *lpm) { - uint32_t rule_gindex, last_rule, rule_index; - - VERIFY_DEPTH(depth); + size_t old_size, new_size; + struct rte_lpm_tbl8_entry *new_tbl8; + + /* This should not happen, + * worst case is each /24 can point to one tbl8 */ + if (lpm->tbl8_num_groups >= RTE_LPM_TBL24_NUM_ENTRIES) + rte_panic("LPM: tbl8 grow already at %u", + lpm->tbl8_num_groups); + + old_size = lpm->tbl8_num_groups; + new_size = old_size << 1; + new_tbl8 = rte_calloc_socket(NULL, + new_size * RTE_LPM_TBL8_GROUP_NUM_ENTRIES, + sizeof(struct rte_lpm_tbl8_entry), + RTE_CACHE_LINE_SIZE, + lpm->socket_id); + if (new_tbl8 == NULL) { + RTE_LOG(ERR, LPM, "LPM tbl8 group expand allocation failed\n"); + return -ENOMEM; + } - rule_gindex = lpm->rule_info[depth - 1].first_rule; - last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules; + memcpy(new_tbl8, lpm->tbl8, + old_size * RTE_LPM_TBL8_GROUP_NUM_ENTRIES + * sizeof(struct rte_lpm_tbl8_entry)); - /* Scan used rules at given depth to find rule. */ - for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) { - /* If rule is found return the rule index. */ - if (lpm->rules_tbl[rule_index].ip == ip_masked) - return rule_index; - } + /* swap in new table */ + defer_rcu(rte_free, lpm->tbl8); + rcu_assign_pointer(lpm->tbl8, new_tbl8); + lpm->tbl8_num_groups = new_size; - /* If rule is not found return -EINVAL. */ - return -EINVAL; + return 0; } /* * Find, clean and allocate a tbl8. */ -static inline int32_t -tbl8_alloc(struct rte_lpm_tbl8_entry *tbl8) +static int32_t +tbl8_alloc(struct rte_lpm *lpm) { uint32_t tbl8_gindex; /* tbl8 group index. */ struct rte_lpm_tbl8_entry *tbl8_entry; /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. 
*/ - for (tbl8_gindex = 0; tbl8_gindex < RTE_LPM_TBL8_NUM_GROUPS; - tbl8_gindex++) { - tbl8_entry = &tbl8[tbl8_gindex * - RTE_LPM_TBL8_GROUP_NUM_ENTRIES]; + for (tbl8_gindex = (lpm->tbl8_rover + 1) & (lpm->tbl8_num_groups - 1); + tbl8_gindex != lpm->tbl8_rover; + tbl8_gindex = (tbl8_gindex + 1) & (lpm->tbl8_num_groups - 1)) { + tbl8_entry = lpm->tbl8 + + tbl8_gindex * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + /* If a free tbl8 group is found clean it and set as VALID. */ - if (!tbl8_entry->valid_group) { - memset(&tbl8_entry[0], 0, - RTE_LPM_TBL8_GROUP_NUM_ENTRIES * - sizeof(tbl8_entry[0])); + if (likely(!tbl8_entry->valid_group)) + goto found; + } - tbl8_entry->valid_group = VALID; + /* Out of space expand */ + tbl8_gindex = lpm->tbl8_num_groups; + if (tbl8_grow(lpm) < 0) + return -ENOSPC; - /* Return group index for allocated tbl8 group. */ - return tbl8_gindex; - } - } + tbl8_entry = lpm->tbl8 + + tbl8_gindex * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + found: + memset(tbl8_entry, 0, + RTE_LPM_TBL8_GROUP_NUM_ENTRIES * sizeof(tbl8_entry[0])); + + tbl8_entry->valid_group = VALID; - /* If there are no tbl8 groups free then return error. */ - return -ENOSPC; + /* Remember last slot to start looking there */ + lpm->tbl8_rover = tbl8_gindex; + + /* Return group index for allocated tbl8 group. 
*/ + return tbl8_gindex; } static inline void -tbl8_free(struct rte_lpm_tbl8_entry *tbl8, uint32_t tbl8_group_start) +tbl8_free(struct rte_lpm *lpm, uint32_t tbl8_group_start) { /* Set tbl8 group invalid*/ - tbl8[tbl8_group_start].valid_group = INVALID; + lpm->tbl8[tbl8_group_start].valid_group = INVALID; } -static inline int32_t +static void add_depth_small(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, - uint8_t next_hop) + uint16_t next_hop) { uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j; + struct rte_lpm_tbl24_entry new_tbl24_entry = { + .valid = VALID, + .ext_entry = 0, + .depth = depth, + { .next_hop = next_hop, } + }; + struct rte_lpm_tbl8_entry new_tbl8_entry = { + .valid_group = VALID, + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + }; + + /* Force compiler to initialize before assignment */ + rte_barrier(); /* Calculate the index into Table24. */ tbl24_index = ip >> 8; tbl24_range = depth_to_range(depth); - for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { /* * For invalid OR valid and non-extended tbl 24 entries set * entry. */ - if (!lpm->tbl24[i].valid || (lpm->tbl24[i].ext_entry == 0 && - lpm->tbl24[i].depth <= depth)) { - - struct rte_lpm_tbl24_entry new_tbl24_entry = { - { .next_hop = next_hop, }, - .valid = VALID, - .ext_entry = 0, - .depth = depth, - }; - - /* Setting tbl24 entry in one go to avoid race - * conditions - */ - lpm->tbl24[i] = new_tbl24_entry; - + if (!lpm->tbl24[i].valid || lpm->tbl24[i].ext_entry == 0) { + if (!lpm->tbl24[i].valid || + lpm->tbl24[i].depth <= depth) + lpm->tbl24[i] = new_tbl24_entry; continue; } - if (lpm->tbl24[i].ext_entry == 1) { - /* If tbl24 entry is valid and extended calculate the - * index into tbl8. 
- */ - tbl8_index = lpm->tbl24[i].tbl8_gindex * - RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - tbl8_group_end = tbl8_index + - RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - - for (j = tbl8_index; j < tbl8_group_end; j++) { - if (!lpm->tbl8[j].valid || - lpm->tbl8[j].depth <= depth) { - struct rte_lpm_tbl8_entry - new_tbl8_entry = { - .valid = VALID, - .valid_group = VALID, - .depth = depth, - .next_hop = next_hop, - }; - - /* - * Setting tbl8 entry in one go to avoid - * race conditions - */ - lpm->tbl8[j] = new_tbl8_entry; - - continue; - } + /* If tbl24 entry is valid and extended calculate the index + * into tbl8. */ + tbl8_index = lpm->tbl24[i].tbl8_gindex + * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + tbl8_group_end = tbl8_index + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + for (j = tbl8_index; j < tbl8_group_end; j++) { + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) { + /* + * Setting tbl8 entry in one go to avoid race + * conditions + */ + lpm->tbl8[j] = new_tbl8_entry; } } } - - return 0; } -static inline int32_t +static int32_t add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, - uint8_t next_hop) + uint16_t next_hop) { uint32_t tbl24_index; int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index, @@ -497,12 +542,11 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, if (!lpm->tbl24[tbl24_index].valid) { /* Search for a free tbl8 group. */ - tbl8_group_index = tbl8_alloc(lpm->tbl8); + tbl8_group_index = tbl8_alloc(lpm); - /* Check tbl8 allocation was successful. */ - if (tbl8_group_index < 0) { + /* Check tbl8 allocation was unsuccessful. */ + if (tbl8_group_index < 0) return tbl8_group_index; - } /* Find index into tbl8 and range. */ tbl8_index = (tbl8_group_index * @@ -510,35 +554,38 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, (ip_masked & 0xFF); /* Set tbl8 entry. 
*/ - for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { - lpm->tbl8[i].depth = depth; - lpm->tbl8[i].next_hop = next_hop; - lpm->tbl8[i].valid = VALID; - } + struct rte_lpm_tbl8_entry new_tbl8_entry = { + .valid_group = VALID, + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + }; + + for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) + lpm->tbl8[i] = new_tbl8_entry; /* * Update tbl24 entry to point to new tbl8 entry. Note: The * ext_flag and tbl8_index need to be updated simultaneously, * so assign whole structure in one go */ - struct rte_lpm_tbl24_entry new_tbl24_entry = { - { .tbl8_gindex = (uint8_t)tbl8_group_index, }, .valid = VALID, .ext_entry = 1, .depth = 0, + { .tbl8_gindex = tbl8_group_index, } }; + rte_barrier(); lpm->tbl24[tbl24_index] = new_tbl24_entry; - - }/* If valid entry but not extended calculate the index into Table8. */ + } + /* If valid entry but not extended calculate the index into Table8. */ else if (lpm->tbl24[tbl24_index].ext_entry == 0) { /* Search for free tbl8 group. */ - tbl8_group_index = tbl8_alloc(lpm->tbl8); + tbl8_group_index = tbl8_alloc(lpm); - if (tbl8_group_index < 0) { + if (tbl8_group_index < 0) return tbl8_group_index; - } tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; @@ -546,69 +593,68 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, RTE_LPM_TBL8_GROUP_NUM_ENTRIES; /* Populate new tbl8 with tbl24 value. 
*/ - for (i = tbl8_group_start; i < tbl8_group_end; i++) { - lpm->tbl8[i].valid = VALID; - lpm->tbl8[i].depth = lpm->tbl24[tbl24_index].depth; - lpm->tbl8[i].next_hop = - lpm->tbl24[tbl24_index].next_hop; - } + struct rte_lpm_tbl8_entry new_tbl8_entry = { + .valid_group = VALID, + .valid = VALID, + .depth = lpm->tbl24[tbl24_index].depth, + .next_hop = lpm->tbl24[tbl24_index].next_hop, + }; + + for (i = tbl8_group_start; i < tbl8_group_end; i++) + lpm->tbl8[i] = new_tbl8_entry; tbl8_index = tbl8_group_start + (ip_masked & 0xFF); - /* Insert new rule into the tbl8 entry. */ - for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) { - if (!lpm->tbl8[i].valid || - lpm->tbl8[i].depth <= depth) { - lpm->tbl8[i].valid = VALID; - lpm->tbl8[i].depth = depth; - lpm->tbl8[i].next_hop = next_hop; - - continue; - } - } + /* Insert new specific rule into the tbl8 entry. */ + new_tbl8_entry.depth = depth; + new_tbl8_entry.next_hop = next_hop; + for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) + lpm->tbl8[i] = new_tbl8_entry; /* * Update tbl24 entry to point to new tbl8 entry. Note: The * ext_flag and tbl8_index need to be updated simultaneously, * so assign whole structure in one go. */ - struct rte_lpm_tbl24_entry new_tbl24_entry = { - { .tbl8_gindex = (uint8_t)tbl8_group_index, }, .valid = VALID, .ext_entry = 1, .depth = 0, + { .tbl8_gindex = tbl8_group_index, } }; + /* + * Ensure compiler isn't doing something completely odd + * like updating tbl24 before tbl8. + */ + rte_barrier(); lpm->tbl24[tbl24_index] = new_tbl24_entry; - } - else { /* - * If it is valid, extended entry calculate the index into tbl8. - */ + } else { + /* + * If it is valid, extended entry calculate the index into tbl8. 
+ */ + struct rte_lpm_tbl8_entry new_tbl8_entry = { + .valid_group = VALID, + .valid = VALID, + .depth = depth, + .next_hop = next_hop, + }; + rte_barrier(); + tbl8_group_index = lpm->tbl24[tbl24_index].tbl8_gindex; tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; tbl8_index = tbl8_group_start + (ip_masked & 0xFF); for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { - if (!lpm->tbl8[i].valid || - lpm->tbl8[i].depth <= depth) { - struct rte_lpm_tbl8_entry new_tbl8_entry = { - .valid = VALID, - .depth = depth, - .next_hop = next_hop, - .valid_group = lpm->tbl8[i].valid_group, - }; - + lpm->tbl8[i].depth <= depth) { /* * Setting tbl8 entry in one go to avoid race * condition */ lpm->tbl8[i] = new_tbl8_entry; - - continue; } } } @@ -621,38 +667,32 @@ add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, */ int rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, - uint8_t next_hop) + uint16_t next_hop, uint8_t scope) { - int32_t rule_index, status = 0; - uint32_t ip_masked; + struct rte_lpm_rule *rule; + uint32_t ip_masked = (ip & depth_to_mask(depth)); /* Check user arguments. */ - if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) + if ((lpm == NULL) || (depth >= RTE_LPM_MAX_DEPTH)) return -EINVAL; - ip_masked = ip & depth_to_mask(depth); - /* Add the rule to the rule table. */ - rule_index = rule_add(lpm, ip_masked, depth, next_hop); + rule = rule_add(lpm, ip_masked, depth, next_hop, scope); /* If the is no space available for new rule return error. 
*/ - if (rule_index < 0) { - return rule_index; - } - - if (depth <= MAX_DEPTH_TBL24) { - status = add_depth_small(lpm, ip_masked, depth, next_hop); - } - else { /* If depth > RTE_LPM_MAX_DEPTH_TBL24 */ - status = add_depth_big(lpm, ip_masked, depth, next_hop); + if (rule == NULL) + return -ENOSPC; + if (depth <= MAX_DEPTH_TBL24) + add_depth_small(lpm, ip_masked, depth, next_hop); + else { /* * If add fails due to exhaustion of tbl8 extensions delete * rule that was added to rule table. */ + int status = add_depth_big(lpm, ip_masked, depth, next_hop); if (status < 0) { - rule_delete(lpm, rule_index, depth); - + rule_delete(lpm, rule, depth); return status; } } @@ -665,10 +705,10 @@ rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, */ int rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, -uint8_t *next_hop) + uint16_t *next_hop, uint8_t scope) { uint32_t ip_masked; - int32_t rule_index; + struct rte_lpm_rule *rule; /* Check user arguments. */ if ((lpm == NULL) || @@ -678,10 +718,10 @@ uint8_t *next_hop) /* Look for the rule using rule_find. 
*/ ip_masked = ip & depth_to_mask(depth); - rule_index = rule_find(lpm, ip_masked, depth); + rule = rule_find(lpm, ip_masked, depth, scope); - if (rule_index >= 0) { - *next_hop = lpm->rules_tbl[rule_index].next_hop; + if (rule != NULL) { + *next_hop = rule->next_hop; return 1; } @@ -689,30 +729,29 @@ uint8_t *next_hop) return 0; } -static inline int32_t -find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint8_t *sub_rule_depth) +static struct rte_lpm_rule * +find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint8_t *sub_rule_depth) { - int32_t rule_index; + struct rte_lpm_rule *rule; uint32_t ip_masked; - uint8_t prev_depth; + int prev_depth; - for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) { + for (prev_depth = depth - 1; prev_depth >= 0; prev_depth--) { ip_masked = ip & depth_to_mask(prev_depth); - - rule_index = rule_find(lpm, ip_masked, prev_depth); - - if (rule_index >= 0) { + rule = rule_find_any(lpm, ip_masked, prev_depth); + if (rule) { *sub_rule_depth = prev_depth; - return rule_index; + return rule; } } - return -1; + return NULL; } -static inline int32_t -delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, - uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +static void +delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + struct rte_lpm_rule *sub_rule, uint8_t new_depth) { uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j; @@ -720,28 +759,22 @@ delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, tbl24_range = depth_to_range(depth); tbl24_index = (ip_masked >> 8); - /* - * Firstly check the sub_rule_index. A -1 indicates no replacement rule - * and a positive number indicates a sub_rule_index. - */ - if (sub_rule_index < 0) { + /* Firstly check the sub_rule. */ + if (sub_rule == NULL) { /* * If no replacement rule exists then invalidate entries * associated with this rule. 
*/ for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { - - if (lpm->tbl24[i].ext_entry == 0 && - lpm->tbl24[i].depth <= depth ) { - lpm->tbl24[i].valid = INVALID; - } - else { + if (lpm->tbl24[i].ext_entry == 0) { + if (lpm->tbl24[i].depth <= depth) + lpm->tbl24[i].valid = INVALID; + } else { /* * If TBL24 entry is extended, then there has * to be a rule with depth >= 25 in the * associated TBL8 group. */ - tbl8_group_index = lpm->tbl24[i].tbl8_gindex; tbl8_index = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; @@ -749,60 +782,54 @@ delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, for (j = tbl8_index; j < (tbl8_index + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { - if (lpm->tbl8[j].depth <= depth) + if (lpm->tbl8[j].valid && + lpm->tbl8[j].depth <= depth) lpm->tbl8[j].valid = INVALID; } } } - } - else { + } else { /* * If a replacement rule exists then modify entries * associated with this rule. */ - struct rte_lpm_tbl24_entry new_tbl24_entry = { - {.next_hop = lpm->rules_tbl[sub_rule_index].next_hop,}, .valid = VALID, .ext_entry = 0, - .depth = sub_rule_depth, + .depth = new_depth, + { .next_hop = sub_rule->next_hop, } }; struct rte_lpm_tbl8_entry new_tbl8_entry = { + .valid_group = VALID, .valid = VALID, - .depth = sub_rule_depth, - .next_hop = lpm->rules_tbl - [sub_rule_index].next_hop, + .depth = new_depth, + .next_hop = sub_rule->next_hop, }; for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) { - - if (lpm->tbl24[i].ext_entry == 0 && - lpm->tbl24[i].depth <= depth ) { - lpm->tbl24[i] = new_tbl24_entry; - } - else { + if (lpm->tbl24[i].ext_entry == 0) { + if (lpm->tbl24[i].depth <= depth) + lpm->tbl24[i] = new_tbl24_entry; + } else { /* * If TBL24 entry is extended, then there has * to be a rule with depth >= 25 in the * associated TBL8 group. 
*/ - tbl8_group_index = lpm->tbl24[i].tbl8_gindex; tbl8_index = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; for (j = tbl8_index; j < (tbl8_index + RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) { - - if (lpm->tbl8[j].depth <= depth) + if (!lpm->tbl8[j].valid || + lpm->tbl8[j].depth <= depth) lpm->tbl8[j] = new_tbl8_entry; } } } } - - return 0; } /* @@ -813,8 +840,9 @@ delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked, * Return of value > -1 means tbl8 is in use but has all the same values and * thus can be recycled */ -static inline int32_t -tbl8_recycle_check(struct rte_lpm_tbl8_entry *tbl8, uint32_t tbl8_group_start) +static int32_t +tbl8_recycle_check(const struct rte_lpm_tbl8_entry *tbl8, + uint32_t tbl8_group_start) { uint32_t tbl8_group_end, i; tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES; @@ -855,13 +883,14 @@ tbl8_recycle_check(struct rte_lpm_tbl8_entry *tbl8, uint32_t tbl8_group_start) if (tbl8[i].valid) return -EEXIST; } + /* If no valid entries are found then return -EINVAL. */ return -EINVAL; } -static inline int32_t -delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, - uint8_t depth, int32_t sub_rule_index, uint8_t sub_rule_depth) +static void +delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth, + struct rte_lpm_rule *sub_rule, uint8_t new_depth) { uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index, tbl8_range, i; @@ -879,23 +908,22 @@ delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, tbl8_index = tbl8_group_start + (ip_masked & 0xFF); tbl8_range = depth_to_range(depth); - if (sub_rule_index < 0) { + if (sub_rule == NULL) { /* * Loop through the range of entries on tbl8 for which the * rule_to_delete must be removed or modified. */ for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { - if (lpm->tbl8[i].depth <= depth) + if (lpm->tbl8[i].valid && lpm->tbl8[i].depth <= depth) lpm->tbl8[i].valid = INVALID; } - } - else { + } else { /* Set new tbl8 entry. 
*/ struct rte_lpm_tbl8_entry new_tbl8_entry = { + .valid_group = VALID, .valid = VALID, - .depth = sub_rule_depth, - .valid_group = lpm->tbl8[tbl8_group_start].valid_group, - .next_hop = lpm->rules_tbl[sub_rule_index].next_hop, + .depth = new_depth, + .next_hop = sub_rule->next_hop, }; /* @@ -903,7 +931,7 @@ delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, * rule_to_delete must be modified. */ for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) { - if (lpm->tbl8[i].depth <= depth) + if (!lpm->tbl8[i].valid || lpm->tbl8[i].depth <= depth) lpm->tbl8[i] = new_tbl8_entry; } } @@ -915,100 +943,158 @@ delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, */ tbl8_recycle_index = tbl8_recycle_check(lpm->tbl8, tbl8_group_start); - - if (tbl8_recycle_index == -EINVAL){ + if (tbl8_recycle_index == -EINVAL) { /* Set tbl24 before freeing tbl8 to avoid race condition. */ lpm->tbl24[tbl24_index].valid = 0; - tbl8_free(lpm->tbl8, tbl8_group_start); - } - else if (tbl8_recycle_index > -1) { + rte_barrier(); + tbl8_free(lpm, tbl8_group_start); + } else if (tbl8_recycle_index > -1) { /* Update tbl24 entry. */ struct rte_lpm_tbl24_entry new_tbl24_entry = { - { .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, }, .valid = VALID, .ext_entry = 0, .depth = lpm->tbl8[tbl8_recycle_index].depth, + { .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop, } }; /* Set tbl24 before freeing tbl8 to avoid race condition. */ lpm->tbl24[tbl24_index] = new_tbl24_entry; - tbl8_free(lpm->tbl8, tbl8_group_start); + rte_barrier(); + tbl8_free(lpm, tbl8_group_start); } +} - return 0; +/* + * Find rule to replace the just deleted. If there is no rule to + * replace the rule_to_delete we return NULL and invalidate the table + * entries associated with this rule. 
+ */ +static void rule_replace(struct rte_lpm *lpm, uint32_t ip, uint8_t depth) +{ + uint32_t ip_masked; + struct rte_lpm_rule *sub_rule; + uint8_t sub_depth = 0; + + ip_masked = ip & depth_to_mask(depth); + sub_rule = find_previous_rule(lpm, ip, depth, &sub_depth); + + /* + * If the input depth value is less than 25 use function + * delete_depth_small otherwise use delete_depth_big. + */ + if (depth <= MAX_DEPTH_TBL24) + delete_depth_small(lpm, ip_masked, depth, sub_rule, sub_depth); + else + delete_depth_big(lpm, ip_masked, depth, sub_rule, sub_depth); } /* * Deletes a rule */ int -rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth) +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint16_t *next_hop, uint8_t scope) { - int32_t rule_to_delete_index, sub_rule_index; + struct rte_lpm_rule *rule; uint32_t ip_masked; - uint8_t sub_rule_depth; + /* * Check input arguments. Note: IP must be a positive integer of 32 * bits in length therefore it need not be checked. */ - if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) { + if ((lpm == NULL) || (depth >= RTE_LPM_MAX_DEPTH)) return -EINVAL; - } ip_masked = ip & depth_to_mask(depth); /* - * Find the index of the input rule, that needs to be deleted, in the + * Find the input rule, that needs to be deleted, in the * rule table. */ - rule_to_delete_index = rule_find(lpm, ip_masked, depth); + rule = rule_find(lpm, ip_masked, depth, scope); /* * Check if rule_to_delete_index was found. If no rule was found the - * function rule_find returns -EINVAL. + * function rule_find returns NULL. */ - if (rule_to_delete_index < 0) + if (rule == NULL) return -EINVAL; - /* Delete the rule from the rule table. */ - rule_delete(lpm, rule_to_delete_index, depth); - /* - * Find rule to replace the rule_to_delete. If there is no rule to - * replace the rule_to_delete we return -1 and invalidate the table - * entries associated with this rule. + * Return next hop so caller can avoid lookup.
*/ - sub_rule_depth = 0; - sub_rule_index = find_previous_rule(lpm, ip, depth, &sub_rule_depth); + if (next_hop) + *next_hop = rule->next_hop; - /* - * If the input depth value is less than 25 use function - * delete_depth_small otherwise use delete_depth_big. - */ - if (depth <= MAX_DEPTH_TBL24) { - return delete_depth_small(lpm, ip_masked, depth, - sub_rule_index, sub_rule_depth); - } - else { /* If depth > MAX_DEPTH_TBL24 */ - return delete_depth_big(lpm, ip_masked, depth, sub_rule_index, sub_rule_depth); - } + /* Delete the rule from the rule table. */ + rule_delete(lpm, rule, depth); + + /* Replace with next level up rule */ + rule_replace(lpm, ip, depth); + + return 0; } /* * Delete all rules from the LPM table. */ void -rte_lpm_delete_all(struct rte_lpm *lpm) +rte_lpm_delete_all(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *arg) { - /* Zero rule information. */ - memset(lpm->rule_info, 0, sizeof(lpm->rule_info)); + uint8_t depth; /* Zero tbl24. */ memset(lpm->tbl24, 0, sizeof(lpm->tbl24)); /* Zero tbl8. */ - memset(lpm->tbl8, 0, sizeof(lpm->tbl8)); + memset(lpm->tbl8, 0, + lpm->tbl8_num_groups * RTE_LPM_TBL8_GROUP_NUM_ENTRIES + * sizeof(struct rte_lpm_tbl8_entry)); + lpm->tbl8_rover = lpm->tbl8_num_groups - 1; /* Delete all rules form the rules table. 
*/ - memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules); + for (depth = 0; depth < RTE_LPM_MAX_DEPTH; ++depth) { + struct rte_lpm_rules_tree *head = &lpm->rules[depth]; + struct rte_lpm_rule *r, *n; + + RB_FOREACH_SAFE(r, rte_lpm_rules_tree, head, n) { + if (func) + func(lpm, r->ip, depth, r->scope, + r->next_hop, arg); + rule_delete(lpm, r, depth); + } + } +} + +/* + * Iterate over LPM rules + */ +void +rte_lpm_walk(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *arg) +{ + uint8_t depth; + + for (depth = 0; depth < RTE_LPM_MAX_DEPTH; depth++) { + struct rte_lpm_rules_tree *head = &lpm->rules[depth]; + struct rte_lpm_rule *r, *n; + + RB_FOREACH_SAFE(r, rte_lpm_rules_tree, head, n) { + func(lpm, r->ip, depth, r->scope, r->next_hop, arg); + } + } +} + +/* Count usage of tbl8 */ +unsigned +rte_lpm_tbl8_count(const struct rte_lpm *lpm) +{ + unsigned i, count = 0; + + for (i = 0; i < lpm->tbl8_num_groups; i++) { + const struct rte_lpm_tbl8_entry *tbl8_entry + = lpm->tbl8 + i * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; + if (tbl8_entry->valid_group) + ++count; + } + return count; } diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index c299ce2..a39e3b5 100644 --- a/lib/librte_lpm/rte_lpm.h +++ b/lib/librte_lpm/rte_lpm.h @@ -2,6 +2,7 @@ * BSD LICENSE * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2012-2015 Brocade Communications Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -43,11 +44,9 @@ #include #include #include +#include #include -#include #include -#include -#include #ifdef __cplusplus extern "C" { @@ -55,130 +54,89 @@ extern "C" { /** Max number of characters in LPM name. */ #define RTE_LPM_NAMESIZE 32 + + /** Maximum depth value possible for IPv4 LPM. */ +#define RTE_LPM_MAX_DEPTH 33 + +/** Total number of tbl24 entries. */ +#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) -/** Maximum depth value possible for IPv4 LPM. 
*/ -#define RTE_LPM_MAX_DEPTH 32 +/** Number of entries in a tbl8 group. */ +#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 -/** @internal Total number of tbl24 entries. */ -#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24) - -/** @internal Number of entries in a tbl8 group. */ -#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256 - -/** @internal Total number of tbl8 groups in the tbl8. */ -#define RTE_LPM_TBL8_NUM_GROUPS 256 - -/** @internal Total number of tbl8 entries. */ -#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \ - RTE_LPM_TBL8_GROUP_NUM_ENTRIES) - -/** @internal Macro to enable/disable run-time checks. */ -#if defined(RTE_LIBRTE_LPM_DEBUG) -#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \ - if (cond) return (retval); \ -} while (0) -#else -#define RTE_LPM_RETURN_IF_TRUE(cond, retval) -#endif - -/** @internal bitmask with valid and ext_entry/valid_group fields set */ -#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x0300 - -/** Bitmask used to indicate successful lookup */ -#define RTE_LPM_LOOKUP_SUCCESS 0x0100 - -#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN -/** @internal Tbl24 entry structure. */ +/** Tbl24 entry structure. */ struct rte_lpm_tbl24_entry { + /* Using single uint8_t to store 3 values. */ + uint8_t valid :1; /**< Validation flag. */ + uint8_t ext_entry :1; /**< external entry? */ + uint8_t depth; /**< Rule depth. */ /* Stores Next hop or group index (i.e. gindex)into tbl8. */ union { - uint8_t next_hop; - uint8_t tbl8_gindex; + uint16_t next_hop; + uint16_t tbl8_gindex; }; - /* Using single uint8_t to store 3 values. */ - uint8_t valid :1; /**< Validation flag. */ - uint8_t ext_entry :1; /**< External entry. */ - uint8_t depth :6; /**< Rule depth. */ }; -/** @internal Tbl8 entry structure. */ +/** Tbl8 entry structure. */ struct rte_lpm_tbl8_entry { - uint8_t next_hop; /**< next hop. */ - /* Using single uint8_t to store 3 values. */ + uint16_t next_hop; /**< next hop. */ + uint8_t depth; /**< Rule depth. */ uint8_t valid :1; /**< Validation flag. 
*/ uint8_t valid_group :1; /**< Group validation flag. */ - uint8_t depth :6; /**< Rule depth. */ -}; -#else -struct rte_lpm_tbl24_entry { - uint8_t depth :6; - uint8_t ext_entry :1; - uint8_t valid :1; - union { - uint8_t tbl8_gindex; - uint8_t next_hop; - }; -}; - -struct rte_lpm_tbl8_entry { - uint8_t depth :6; - uint8_t valid_group :1; - uint8_t valid :1; - uint8_t next_hop; -}; -#endif - -/** @internal Rule structure. */ -struct rte_lpm_rule { - uint32_t ip; /**< Rule IP address. */ - uint8_t next_hop; /**< Rule next hop. */ -}; - -/** @internal Contains metadata about the rules table. */ -struct rte_lpm_rule_info { - uint32_t used_rules; /**< Used rules so far. */ - uint32_t first_rule; /**< Indexes the first rule of a given depth. */ }; /** @internal LPM structure. */ struct rte_lpm { + TAILQ_ENTRY(rte_lpm) next; /**< Next in list. */ + /* LPM metadata. */ - char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ - uint32_t max_rules; /**< Max. balanced rules per lpm. */ - struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */ + char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */ + + /**< LPM rules. */ + int socket_id; /**< socket to allocate rules on */ + RB_HEAD(rte_lpm_rules_tree, rte_lpm_rule) rules[RTE_LPM_MAX_DEPTH]; /* LPM Tables. */ - struct rte_lpm_tbl24_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] \ + uint32_t tbl8_num_groups; /* Number of slots */ + uint32_t tbl8_rover; /* Next slot to check */ + struct rte_lpm_tbl8_entry *tbl8; /* Actual table */ + + struct rte_lpm_tbl24_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES] __rte_cache_aligned; /**< LPM tbl24 table. */ - struct rte_lpm_tbl8_entry tbl8[RTE_LPM_TBL8_NUM_ENTRIES] \ - __rte_cache_aligned; /**< LPM tbl8 table. */ - struct rte_lpm_rule rules_tbl[0] \ - __rte_cache_aligned; /**< LPM rules. */ }; /** + * Compiler memory barrier. + * + * Protects against compiler optimization of ordered operations. 
+ */ +#ifdef __GNUC__ +#define rte_barrier() asm volatile("": : :"memory") +#else +/* Intel compiler has intrinsic for this. */ +#define rte_barrier() __memory_barrier() +#endif + +/** * Create an LPM object. * * @param name * LPM object name * @param socket_id * NUMA socket ID for LPM table memory allocation - * @param max_rules - * Maximum number of LPM rules that can be added - * @param flags - * This parameter is currently unused * @return * Handle to LPM object on success, NULL otherwise with rte_errno set * to an appropriate values. Possible rte_errno values include: * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure * - E_RTE_SECONDARY - function was called from a secondary process instance + * - E_RTE_NO_TAILQ - no tailq list could be got for the lpm object list * - EINVAL - invalid parameter passed to function * - ENOSPC - the maximum number of memzones has already been allocated * - EEXIST - a memzone with the same name already exists * - ENOMEM - no appropriate memory area found in which to create memzone */ struct rte_lpm * -rte_lpm_create(const char *name, int socket_id, int max_rules, int flags); +rte_lpm_create(const char *name, int socket_id); /** * Find an existing LPM object and return a pointer to it. 
@@ -215,11 +173,14 @@ rte_lpm_free(struct rte_lpm *lpm); * Depth of the rule to be added to the LPM table * @param next_hop * Next hop of the rule to be added to the LPM table + * @param scope + * Priority scope of this route rule * @return * 0 on success, negative value otherwise */ int -rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint8_t next_hop); +rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint16_t next_hop, uint8_t scope); /** * Check if a rule is present in the LPM table, @@ -231,6 +192,8 @@ rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint8_t next_hop); * IP of the rule to be searched * @param depth * Depth of the rule to searched + * @param scope + * Priority scope of the rule * @param next_hop * Next hop of the rule (valid only if it is found) * @return @@ -238,7 +201,7 @@ rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint8_t next_hop); */ int rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, -uint8_t *next_hop); + uint16_t *next_hop, uint8_t scope); /** * Delete a rule from the LPM table. @@ -249,20 +212,30 @@ uint8_t *next_hop); * IP of the rule to be deleted from the LPM table * @param depth * Depth of the rule to be deleted from the LPM table + * @param scope + * Priority scope of this route rule * @return * 0 on success, negative value otherwise */ int -rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth); +rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, + uint16_t *next_hop, uint8_t scope); + +/** iterator function for LPM rule */ +typedef void (*rte_lpm_walk_func_t)(struct rte_lpm *lpm, + uint32_t ip, uint8_t depth, uint8_t scope, + uint16_t next_hop, void *arg); /** * Delete all rules from the LPM table. 
* * @param lpm * LPM object handle + * @param func + * Optional callback for each entry */ void -rte_lpm_delete_all(struct rte_lpm *lpm); +rte_lpm_delete_all(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *arg); /** * Lookup an IP into the LPM table. @@ -277,200 +250,80 @@ rte_lpm_delete_all(struct rte_lpm *lpm); * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit */ static inline int -rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint8_t *next_hop) +rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint16_t *next_hop) { - unsigned tbl24_index = (ip >> 8); - uint16_t tbl_entry; + struct rte_lpm_tbl24_entry tbl24; + struct rte_lpm_tbl8_entry tbl8; - /* DEBUG: Check user input arguments. */ - RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (next_hop == NULL)), -EINVAL); + /* Copy tbl24 entry (to avoid concurrency issues) */ + tbl24 = lpm->tbl24[ip >> 8]; + rte_barrier(); - /* Copy tbl24 entry */ - tbl_entry = *(const uint16_t *)&lpm->tbl24[tbl24_index]; + /* + * Use the tbl24_index to access the required tbl24 entry then check if + * the tbl24 entry is INVALID, if so return -ENOENT. + */ + if (unlikely(!tbl24.valid)) + return -ENOENT; /* Lookup miss. */ - /* Copy tbl8 entry (only if needed) */ - if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { + /* + * If tbl24 entry is valid check if it is NOT extended (i.e. it does + * not use a tbl8 extension) if so return the next hop. + */ + if (tbl24.ext_entry == 0) { + *next_hop = tbl24.next_hop; + return 0; /* Lookup hit. */ + } - unsigned tbl8_index = (uint8_t)ip + - ((uint8_t)tbl_entry * RTE_LPM_TBL8_GROUP_NUM_ENTRIES); + /* + * If tbl24 entry is valid and extended calculate the index into the + * tbl8 entry. + */ + tbl8 = lpm->tbl8[tbl24.tbl8_gindex * RTE_LPM_TBL8_GROUP_NUM_ENTRIES + + (ip & 0xFF)]; + rte_barrier(); - tbl_entry = *(const uint16_t *)&lpm->tbl8[tbl8_index]; - } + /* Check if the tbl8 entry is invalid and if so return -ENOENT.
*/ + if (unlikely(!tbl8.valid)) + return -ENOENT; /* Lookup miss. */ - *next_hop = (uint8_t)tbl_entry; - return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT; + /* If the tbl8 entry is valid return the next_hop. */ + *next_hop = tbl8.next_hop; + return 0; /* Lookup hit. */ } /** - * Lookup multiple IP addresses in an LPM table. This may be implemented as a - * macro, so the address of the function should not be used. + * Iterate over all rules in the LPM table. * * @param lpm * LPM object handle - * @param ips - * Array of IPs to be looked up in the LPM table - * @param next_hops - * Next hop of the most specific rule found for IP (valid on lookup hit only). - * This is an array of two byte values. The most significant byte in each - * value says whether the lookup was successful (bitmask - * RTE_LPM_LOOKUP_SUCCESS is set). The least significant byte is the - * actual next hop. - * @param n - * Number of elements in ips (and next_hops) array to lookup. This should be a - * compile time constant, and divisible by 8 for best performance. - * @return - * -EINVAL for incorrect arguments, otherwise 0 + * @param func + * Callback invoked for each rule + * @param arg + * Argument passed to iterator */ -#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \ - rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n) - -static inline int -rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips, - uint16_t * next_hops, const unsigned n) -{ - unsigned i; - unsigned tbl24_indexes[n]; - - /* DEBUG: Check user input arguments.
*/ - RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (ips == NULL) || - (next_hops == NULL)), -EINVAL); - - for (i = 0; i < n; i++) { - tbl24_indexes[i] = ips[i] >> 8; - } - - for (i = 0; i < n; i++) { - /* Simply copy tbl24 entry to output */ - next_hops[i] = *(const uint16_t *)&lpm->tbl24[tbl24_indexes[i]]; - - /* Overwrite output with tbl8 entry if needed */ - if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { - - unsigned tbl8_index = (uint8_t)ips[i] + - ((uint8_t)next_hops[i] * - RTE_LPM_TBL8_GROUP_NUM_ENTRIES); - - next_hops[i] = *(const uint16_t *)&lpm->tbl8[tbl8_index]; - } - } - return 0; -} +void +rte_lpm_walk(struct rte_lpm *lpm, rte_lpm_walk_func_t func, void *arg); -/* Mask four results. */ -#define RTE_LPM_MASKX4_RES UINT64_C(0x00ff00ff00ff00ff) +/** + * Return the number of tbl8 groups in use + * + * @param lpm + * LPM object handle + */ +unsigned +rte_lpm_tbl8_count(const struct rte_lpm *lpm); /** - * Lookup four IP addresses in an LPM table. + * Return the number of free tbl8 groups * * @param lpm * LPM object handle - * @param ip - * Four IPs to be looked up in the LPM table - * @param hop - * Next hop of the most specific rule found for IP (valid on lookup hit only). - * This is an 4 elements array of two byte values. - * If the lookup was succesfull for the given IP, then least significant byte - * of the corresponding element is the actual next hop and the most - * significant byte is zero. - * If the lookup for the given IP failed, then corresponding element would - * contain default value, see description of then next parameter. - * @param defv - * Default value to populate into corresponding element of hop[] array, - * if lookup would fail.
*/ -static inline void -rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4], - uint16_t defv) +static inline unsigned +rte_lpm_tbl8_free_count(const struct rte_lpm *lpm) { - __m128i i24; - rte_xmm_t i8; - uint16_t tbl[4]; - uint64_t idx, pt; - - const __m128i mask8 = - _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX); - - /* - * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries - * as one 64-bit value (0x0300030003000300). - */ - const uint64_t mask_xv = - ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK | - (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 | - (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 | - (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48); - - /* - * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries - * as one 64-bit value (0x0100010001000100). - */ - const uint64_t mask_v = - ((uint64_t)RTE_LPM_LOOKUP_SUCCESS | - (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 | - (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 | - (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48); - - /* get 4 indexes for tbl24[]. */ - i24 = _mm_srli_epi32(ip, CHAR_BIT); - - /* extract values from tbl24[] */ - idx = _mm_cvtsi128_si64(i24); - i24 = _mm_srli_si128(i24, sizeof(uint64_t)); - - tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx]; - tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32]; - - idx = _mm_cvtsi128_si64(i24); - - tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx]; - tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32]; - - /* get 4 indexes for tbl8[]. */ - i8.x = _mm_and_si128(ip, mask8); - - pt = (uint64_t)tbl[0] | - (uint64_t)tbl[1] << 16 | - (uint64_t)tbl[2] << 32 | - (uint64_t)tbl[3] << 48; - - /* search successfully finished for all 4 IP addresses. 
*/ - if (likely((pt & mask_xv) == mask_v)) { - uintptr_t ph = (uintptr_t)hop; - *(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES; - return; - } - - if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { - i8.u32[0] = i8.u32[0] + - (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]]; - } - if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { - i8.u32[1] = i8.u32[1] + - (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]]; - } - if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { - i8.u32[2] = i8.u32[2] + - (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]]; - } - if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) == - RTE_LPM_VALID_EXT_ENTRY_BITMASK)) { - i8.u32[3] = i8.u32[3] + - (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES; - tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]]; - } - - hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv; - hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv; - hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv; - hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv; + return lpm->tbl8_num_groups - rte_lpm_tbl8_count(lpm); } #ifdef __cplusplus