[v2,5/7] hash: add extendable bucket feature

Message ID 1537550255-252066-6-git-send-email-yipeng1.wang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series hash: add extendable bucket and partial key hashing

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Wang, Yipeng1 Sept. 21, 2018, 5:17 p.m. UTC
  In use cases where hash table capacity needs to be guaranteed,
the extendable bucket feature can be used to hold extra keys in
linked lists when conflicts happen. This is a similar concept to
the extendable bucket hash table in the packet framework.

This commit adds the extendable bucket feature. The user can turn
it on or off through the extra flag field at table creation time.

The extendable bucket table is composed of buckets that can be
linked, as lists, to the buckets of the main table. When the
extendable bucket feature is enabled, the table utilization can
always achieve 100%. Although keys ending up in the ext buckets
may have a longer lookup time, they should be rare thanks to the
cuckoo algorithm.

Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
---
 lib/librte_hash/rte_cuckoo_hash.c | 326 +++++++++++++++++++++++++++++++++-----
 lib/librte_hash/rte_cuckoo_hash.h |   5 +
 lib/librte_hash/rte_hash.h        |   3 +
 3 files changed, 292 insertions(+), 42 deletions(-)
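
For readers coming to the series fresh, a minimal usage sketch of
enabling the feature at table-creation time (field values are
illustrative; only RTE_HASH_EXTRA_FLAGS_EXT_TABLE comes from this
patch set):

    #include <rte_hash.h>
    #include <rte_jhash.h>

    static struct rte_hash *
    create_guaranteed_capacity_table(void)
    {
            struct rte_hash_parameters params = {
                    .name = "flow_table",
                    .entries = 1 << 20,     /* capacity to guarantee */
                    .key_len = 16,          /* application-defined key size */
                    .hash_func = rte_jhash,
                    .hash_func_init_val = 0,
                    .socket_id = 0,         /* NUMA node for allocations */
                    /* Chain overflowing keys into extendable buckets. */
                    .extra_flag = RTE_HASH_EXTRA_FLAGS_EXT_TABLE,
            };

            /* With the flag set, inserts are expected to succeed for up
             * to 'entries' keys; keys that cannot be placed by cuckoo
             * displacement go to the linked extendable buckets instead
             * of failing.
             */
            return rte_hash_create(&params);
    }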
  

Comments

Honnappa Nagarahalli Sept. 27, 2018, 4:23 a.m. UTC | #1
> -----Original Message-----
> From: Yipeng Wang <yipeng1.wang@intel.com>
> Sent: Friday, September 21, 2018 12:18 PM
> To: bruce.richardson@intel.com
> Cc: dev@dpdk.org; yipeng1.wang@intel.com; michel@digirati.com.br;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Subject: [PATCH v2 5/7] hash: add extendable bucket feature
>
> In use cases that hash table capacity needs to be guaranteed, the extendable
> bucket feature can be used to contain extra keys in linked lists when conflict
> happens. This is similar concept to the extendable bucket hash table in packet
> framework.
>
> This commit adds the extendable bucket feature. User can turn it on or off
> through the extra flag field during table creation time.
>
> Extendable bucket table composes of buckets that can be linked list to current
> main table. When extendable bucket is enabled, the table utilization can
> always acheive 100%.
IMO, referring to this as 'table utilization' suggests a statement about memory-utilization efficiency. Please consider changing this to indicate that all of the configured number of entries will be accommodated.

> Although keys ending up in the ext buckets may have longer look up time, they
> should be rare due to the cuckoo algorithm.
>
> Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> ---
>  lib/librte_hash/rte_cuckoo_hash.c | 326
> +++++++++++++++++++++++++++++++++-----
>  lib/librte_hash/rte_cuckoo_hash.h |   5 +
>  lib/librte_hash/rte_hash.h        |   3 +
>  3 files changed, 292 insertions(+), 42 deletions(-)
>
> diff --git a/lib/librte_hash/rte_cuckoo_hash.c
> b/lib/librte_hash/rte_cuckoo_hash.c
> index f7b86c8..616900b 100644
> --- a/lib/librte_hash/rte_cuckoo_hash.c
> +++ b/lib/librte_hash/rte_cuckoo_hash.c
> @@ -31,6 +31,10 @@
>  #include "rte_hash.h"
>  #include "rte_cuckoo_hash.h"
>
> +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)                    \
> +for (CURRENT_BKT = START_BUCKET;                                      \
> +CURRENT_BKT != NULL;                                          \
> +CURRENT_BKT = CURRENT_BKT->next)
>
>  TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
>
> @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)
>  return h;
>  }
>
> +static inline struct rte_hash_bucket *
> +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) {
> +while (lst_bkt->next != NULL)
> +lst_bkt = lst_bkt->next;
> +return lst_bkt;
> +}
> +
>  void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)  {
>  h->cmp_jump_table_idx = KEY_CUSTOM;
> @@ -85,13 +97,17 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  struct rte_tailq_entry *te = NULL;
>  struct rte_hash_list *hash_list;
>  struct rte_ring *r = NULL;
> +struct rte_ring *r_ext = NULL;
>  char hash_name[RTE_HASH_NAMESIZE];
>  void *k = NULL;
>  void *buckets = NULL;
> +void *buckets_ext = NULL;
>  char ring_name[RTE_RING_NAMESIZE];
> +char ext_ring_name[RTE_RING_NAMESIZE];
>  unsigned num_key_slots;
>  unsigned i;
>  unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
> +unsigned int ext_table_support = 0;
>  unsigned int readwrite_concur_support = 0;
>
>  rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
> @@ -124,6 +140,9 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  multi_writer_support = 1;
>  }
>
> +if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
> +ext_table_support = 1;
> +
>  /* Store all keys and leave the first entry as a dummy entry for
> lookup_bulk */
>  if (multi_writer_support)
>  /*
> @@ -145,6 +164,24 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  goto err;
>  }
>
> +const uint32_t num_buckets = rte_align32pow2(params->entries) /
> +RTE_HASH_BUCKET_ENTRIES;
> +
> +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> +		params->name);
Can be inside the if statement below.

> +/* Create ring for extendable buckets. */
> +if (ext_table_support) {
> +r_ext = rte_ring_create(ext_ring_name,
> +rte_align32pow2(num_buckets + 1),
> +params->socket_id, 0);
> +
> +if (r_ext == NULL) {
> +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> "
> +"failed\n");
> +goto err;
> +}
> +}
> +
>  snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
>
>  rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
> @@ -177,18 +214,34 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  goto err_unlock;
>  }
>
> -const uint32_t num_buckets = rte_align32pow2(params->entries)
> -/ RTE_HASH_BUCKET_ENTRIES;
> -
>  buckets = rte_zmalloc_socket(NULL,
>  num_buckets * sizeof(struct rte_hash_bucket),
>  RTE_CACHE_LINE_SIZE, params->socket_id);
>
>  if (buckets == NULL) {
> -RTE_LOG(ERR, HASH, "memory allocation failed\n");
> +RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
>  goto err_unlock;
>  }
>
> +/* Allocate same number of extendable buckets */
IMO, we are allocating too much memory to support this feature. Especially, when we claim that keys ending up in the extendable table is a rare occurrence. By doubling the memory we are effectively saying that the main table might have 50% utilization. It will also significantly increase the cycles required to iterate the complete hash table (in rte_hash_iterate API) even when we expect that the extendable table contains very few entries.

I am wondering if we can provide options to control the amount of extra memory that gets allocated and make the memory allocation dynamic (or on demand basis). I think this also goes well with the general direction DPDK is taking - allocate resources as needed rather than allocating all the resources during initialization.

> +if (ext_table_support) {
> +buckets_ext = rte_zmalloc_socket(NULL,
> +num_buckets * sizeof(struct rte_hash_bucket),
> +RTE_CACHE_LINE_SIZE, params->socket_id);
> +if (buckets_ext == NULL) {
> +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> "
> +"failed\n");
> +goto err_unlock;
> +}
> +/* Populate ext bkt ring. We reserve 0 similar to the
> + * key-data slot, just in case in future we want to
> + * use bucket index for the linked list and 0 means NULL
> + * for next bucket
> + */
> +for (i = 1; i <= num_buckets; i++)
Since bucket index 0 is reserved, this should be 'i < num_buckets'.

> +rte_ring_sp_enqueue(r_ext, (void *)((uintptr_t) i));
> +}
> +
>  const uint32_t key_entry_size = sizeof(struct rte_hash_key) + params->key_len;
>  const uint64_t key_tbl_size = (uint64_t) key_entry_size * num_key_slots;
>
> @@ -262,6 +315,8 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  h->num_buckets = num_buckets;
>  h->bucket_bitmask = h->num_buckets - 1;
>  h->buckets = buckets;
> +h->buckets_ext = buckets_ext;
> +h->free_ext_bkts = r_ext;
>  h->hash_func = (params->hash_func == NULL) ?
>  default_hash_func : params->hash_func;
>  h->key_store = k;
> @@ -269,6 +324,7 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  h->hw_trans_mem_support = hw_trans_mem_support;
>  h->multi_writer_support = multi_writer_support;
>  h->readwrite_concur_support = readwrite_concur_support;
> +h->ext_table_support = ext_table_support;
>
>  #if defined(RTE_ARCH_X86)
>  if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
> @@ -304,9 +360,11 @@ rte_hash_create(const struct rte_hash_parameters
> *params)
>  rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
>  err:
>  rte_ring_free(r);
> +rte_ring_free(r_ext);
>  rte_free(te);
>  rte_free(h);
>  rte_free(buckets);
> +rte_free(buckets_ext);
>  rte_free(k);
>  return NULL;
>  }
> @@ -344,6 +402,7 @@ rte_hash_free(struct rte_hash *h)
>  rte_free(h->readwrite_lock);
>  }
>  rte_ring_free(h->free_slots);
> +rte_ring_free(h->free_ext_bkts);
>  rte_free(h->key_store);
>  rte_free(h->buckets);
Add rte_free(h->buckets_ext);

>  rte_free(h);
> @@ -403,7 +462,6 @@ __hash_rw_writer_lock(const struct rte_hash *h)
>  rte_rwlock_write_lock(h->readwrite_lock);
>  }
>
> -
>  static inline void
>  __hash_rw_reader_lock(const struct rte_hash *h)  {
> @@ -448,6 +506,14 @@ rte_hash_reset(struct rte_hash *h)
>  while (rte_ring_dequeue(h->free_slots, &ptr) == 0)
>  rte_pause();
>
> +/* clear free extendable bucket ring and memory */
> +if (h->ext_table_support) {
> +memset(h->buckets_ext, 0, h->num_buckets *
> +		sizeof(struct rte_hash_bucket));
> +while (rte_ring_dequeue(h->free_ext_bkts, &ptr) == 0)
> +rte_pause();
> +}
> +
>  /* Repopulate the free slots ring. Entry zero is reserved for key misses
> */
>  if (h->multi_writer_support)
>  tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) *
> @@ -458,6 +524,12 @@ rte_hash_reset(struct rte_hash *h)
>  for (i = 1; i < tot_ring_cnt + 1; i++)
>  rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i));
>
> +/* Repopulate the free ext bkt ring. */
> +if (h->ext_table_support)
> +for (i = 1; i < h->num_buckets + 1; i++)
Index 0 is reserved as per the comments. Condition should be 'i < h->num_buckets'.

> +rte_ring_sp_enqueue(h->free_ext_bkts,
> +(void *)((uintptr_t) i));
> +
>  if (h->multi_writer_support) {
>  /* Reset local caches per lcore */
>  for (i = 0; i < RTE_MAX_LCORE; i++)
> @@ -524,24 +596,27 @@ rte_hash_cuckoo_insert_mw(const struct rte_hash
> *h,
>  int32_t *ret_val)
>  {
>  unsigned int i;
> -struct rte_hash_bucket *cur_bkt = prim_bkt;
> +struct rte_hash_bucket *cur_bkt;
>  int32_t ret;
>
>  __hash_rw_writer_lock(h);
>  /* Check if key was inserted after last check but before this
>   * protected region in case of inserting duplicated keys.
>   */
> -ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash);
> +ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash);
>  if (ret != -1) {
>  __hash_rw_writer_unlock(h);
>  *ret_val = ret;
>  return 1;
>  }
> -ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig);
> -if (ret != -1) {
> -__hash_rw_writer_unlock(h);
> -*ret_val = ret;
> -return 1;
> +
> +FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
> +ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
> +if (ret != -1) {
> +__hash_rw_writer_unlock(h);
> +*ret_val = ret;
> +return 1;
> +}
>  }
>
>  /* Insert new entry if there is room in the primary
> @@ -580,7 +655,7 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
>  int32_t *ret_val)
>  {
>  uint32_t prev_alt_bkt_idx;
> -struct rte_hash_bucket *cur_bkt = bkt;
> +struct rte_hash_bucket *cur_bkt;
>  struct queue_node *prev_node, *curr_node = leaf;
>  struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt;
>  uint32_t prev_slot, curr_slot = leaf_slot;
> @@ -597,18 +672,20 @@ rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
>  /* Check if key was inserted after last check but before this
>   * protected region.
>   */
> -ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash);
> +ret = search_and_update(h, data, key, bkt, sig, alt_hash);
>  if (ret != -1) {
>  __hash_rw_writer_unlock(h);
>  *ret_val = ret;
>  return 1;
>  }
>
> -ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig);
> -if (ret != -1) {
> -__hash_rw_writer_unlock(h);
> -*ret_val = ret;
> -return 1;
> +FOR_EACH_BUCKET(cur_bkt, alt_bkt) {
> +ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
> +if (ret != -1) {
> +__hash_rw_writer_unlock(h);
> +*ret_val = ret;
> +return 1;
> +}
>  }
>
>  while (likely(curr_node->prev != NULL)) {
> @@ -711,15 +788,18 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,  {
>  hash_sig_t alt_hash;
>  uint32_t prim_bucket_idx, sec_bucket_idx;
> -struct rte_hash_bucket *prim_bkt, *sec_bkt;
> +struct rte_hash_bucket *prim_bkt, *sec_bkt, *cur_bkt;
>  struct rte_hash_key *new_k, *keys = h->key_store;
>  void *slot_id = NULL;
> -uint32_t new_idx;
> +void *ext_bkt_id = NULL;
> +uint32_t new_idx, bkt_id;
>  int ret;
>  unsigned n_slots;
>  unsigned lcore_id;
> +unsigned int i;
>  struct lcore_cache *cached_free_slots = NULL;
>  int32_t ret_val;
> +struct rte_hash_bucket *last;
>
>  prim_bucket_idx = sig & h->bucket_bitmask;
>  prim_bkt = &h->buckets[prim_bucket_idx];
> @@ -739,10 +819,12 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
>  }
>
>  /* Check if key is already inserted in secondary location */
> -ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig);
> -if (ret != -1) {
> -__hash_rw_writer_unlock(h);
> -return ret;
> +FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
> +ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
> +if (ret != -1) {
> +__hash_rw_writer_unlock(h);
> +return ret;
> +}
>  }
>  __hash_rw_writer_unlock(h);
>
> @@ -808,10 +890,71 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
>  else if (ret == 1) {
>  enqueue_slot_back(h, cached_free_slots, slot_id);
>  return ret_val;
> -} else {
> +}
> +
> +/* if ext table not enabled, we failed the insertion */
> +if (!h->ext_table_support) {
>  enqueue_slot_back(h, cached_free_slots, slot_id);
>  return ret;
>  }
> +
> +/* Now we need to go through the extendable bucket. Protection is needed
> + * to protect all extendable bucket processes.
> + */
> +__hash_rw_writer_lock(h);
> +/* We check for duplicates again since could be inserted before the lock */
> +ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash);
> +if (ret != -1) {
> +enqueue_slot_back(h, cached_free_slots, slot_id);
> +goto failure;
> +}
> +
> +FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
> +ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
> +if (ret != -1) {
> +enqueue_slot_back(h, cached_free_slots, slot_id);
> +goto failure;
> +}
> +}
> +
> +/* Search extendable buckets to find an empty entry to insert. */
> +struct rte_hash_bucket *next_bkt = sec_bkt->next;
> +FOR_EACH_BUCKET(cur_bkt, next_bkt) {
> +for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +/* Check if slot is available */
> +if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) {
> +cur_bkt->sig_current[i] = alt_hash;
> +cur_bkt->sig_alt[i] = sig;
> +cur_bkt->key_idx[i] = new_idx;
> +__hash_rw_writer_unlock(h);
> +return new_idx - 1;
> +}
> +}
> +}
> +
> +/* Failed to get an empty entry from extendable buckets. Link a new
> + * extendable bucket. We first get a free bucket from ring.
> + */
> +if (rte_ring_sc_dequeue(h->free_ext_bkts, &ext_bkt_id) != 0) {
> +ret = -ENOSPC;
> +goto failure;
> +}
> +
> +bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
If index 0 is reserved, -1 is not required.

> +/* Use the first location of the new bucket */
> +(h->buckets_ext[bkt_id]).sig_current[0] = alt_hash;
> +(h->buckets_ext[bkt_id]).sig_alt[0] = sig;
> +(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
> +/* Link the new bucket to sec bucket linked list */
> +last = rte_hash_get_last_bkt(sec_bkt);
> +last->next = &h->buckets_ext[bkt_id];
> +__hash_rw_writer_unlock(h);
> +return new_idx - 1;
> +
> +failure:
> +__hash_rw_writer_unlock(h);
> +return ret;
> +
>  }
>
>  int32_t
> @@ -890,7 +1033,7 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key,  {
>  uint32_t bucket_idx;
>  hash_sig_t alt_hash;
> -struct rte_hash_bucket *bkt;
> +struct rte_hash_bucket *bkt, *cur_bkt;
>  int ret;
>
>  bucket_idx = sig & h->bucket_bitmask;
> @@ -910,10 +1053,12 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key,
>  bkt = &h->buckets[bucket_idx];
>
>  /* Check if key is in secondary location */
> -ret = search_one_bucket(h, key, alt_hash, data, bkt);
> -if (ret != -1) {
> -__hash_rw_reader_unlock(h);
> -return ret;
> +FOR_EACH_BUCKET(cur_bkt, bkt) {
> +ret = search_one_bucket(h, key, alt_hash, data, cur_bkt);
> +if (ret != -1) {
> +__hash_rw_reader_unlock(h);
> +return ret;
> +}
>  }
>  __hash_rw_reader_unlock(h);
>  return -ENOENT;
> @@ -1015,15 +1160,17 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,  {
>  uint32_t bucket_idx;
>  hash_sig_t alt_hash;
> -struct rte_hash_bucket *bkt;
> -int32_t ret;
> +struct rte_hash_bucket *prim_bkt, *sec_bkt;
> +struct rte_hash_bucket *cur_bkt, *prev_bkt, *next_bkt;
> +int32_t ret, i;
> +struct rte_hash_bucket *tobe_removed_bkt = NULL;
>
>  bucket_idx = sig & h->bucket_bitmask;
> -bkt = &h->buckets[bucket_idx];
> +prim_bkt = &h->buckets[bucket_idx];
>
>  __hash_rw_writer_lock(h);
>  /* look for key in primary bucket */
> -ret = search_and_remove(h, key, bkt, sig);
> +ret = search_and_remove(h, key, prim_bkt, sig);
>  if (ret != -1) {
>  __hash_rw_writer_unlock(h);
>  return ret;
> @@ -1032,17 +1179,51 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
>  /* Calculate secondary hash */
>  alt_hash = rte_hash_secondary_hash(sig);
>  bucket_idx = alt_hash & h->bucket_bitmask;
> -bkt = &h->buckets[bucket_idx];
> +sec_bkt = &h->buckets[bucket_idx];
>
>  /* look for key in secondary bucket */
> -ret = search_and_remove(h, key, bkt, alt_hash);
> +ret = search_and_remove(h, key, sec_bkt, alt_hash);
>  if (ret != -1) {
>  __hash_rw_writer_unlock(h);
>  return ret;
>  }
>
> +/* Not in main table, we need to search ext buckets */
> +if (h->ext_table_support) {
> +next_bkt = sec_bkt->next;
> +FOR_EACH_BUCKET(cur_bkt, next_bkt) {
> +ret = search_and_remove(h, key, cur_bkt, alt_hash);
> +if (ret != -1)
> +goto return_bkt;
> +}
> +}
> +
>  __hash_rw_writer_unlock(h);
>  return -ENOENT;
> +
> +/* Search extendable buckets to see if any empty bucket need to be recycled */
> +return_bkt:
> +for (cur_bkt = sec_bkt->next, prev_bkt = sec_bkt; cur_bkt != NULL;
> +prev_bkt = cur_bkt, cur_bkt = cur_bkt->next) {
> +for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +if (cur_bkt->key_idx[i] != EMPTY_SLOT)
> +break;
> +}
> +if (i == RTE_HASH_BUCKET_ENTRIES) {
> +prev_bkt->next = cur_bkt->next;
> +cur_bkt->next = NULL;
> +tobe_removed_bkt = cur_bkt;
> +break;
> +}
> +}
> +
> +__hash_rw_writer_unlock(h);
> +
> +if (tobe_removed_bkt) {
> +uint32_t index = tobe_removed_bkt - h->buckets_ext + 1;
No need to increase the index by 1 if entry 0 is reserved.

> +rte_ring_mp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
> +}
> +return ret;
>  }
>
>  int32_t
> @@ -1143,12 +1324,14 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,  {
>  uint64_t hits = 0;
>  int32_t i;
> +int32_t ret;
>  uint32_t prim_hash[RTE_HASH_LOOKUP_BULK_MAX];
>  uint32_t sec_hash[RTE_HASH_LOOKUP_BULK_MAX];
>  const struct rte_hash_bucket *primary_bkt[RTE_HASH_LOOKUP_BULK_MAX];
>  const struct rte_hash_bucket *secondary_bkt[RTE_HASH_LOOKUP_BULK_MAX];
>  uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
>  uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
> +struct rte_hash_bucket *cur_bkt, *next_bkt;
>
>  /* Prefetch first keys */
>  for (i = 0; i < PREFETCH_OFFSET && i < num_keys; i++)
> @@ -1266,6 +1449,34 @@ __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
>  continue;
>  }
>
> +/* all found, do not need to go through ext bkt */
> +if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
> +if (hit_mask != NULL)
> +*hit_mask = hits;
> +__hash_rw_reader_unlock(h);
> +return;
> +}
> +
> +/* need to check ext buckets for match */
> +for (i = 0; i < num_keys; i++) {
> +if ((hits & (1ULL << i)) != 0)
> +continue;
> +next_bkt = secondary_bkt[i]->next;
> +FOR_EACH_BUCKET(cur_bkt, next_bkt) {
> +if (data != NULL)
> +ret = search_one_bucket(h, keys[i],
> +sec_hash[i], &data[i], cur_bkt);
> +else
> +ret = search_one_bucket(h, keys[i],
> +sec_hash[i], NULL, cur_bkt);
> +if (ret != -1) {
> +positions[i] = ret;
> +hits |= 1ULL << i;
> +break;
> +}
> +}
> +}
> +
>  __hash_rw_reader_unlock(h);
>
>  if (hit_mask != NULL)
> @@ -1308,10 +1519,13 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
>
>  RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL);
>
> -const uint32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES;
> +const uint32_t total_entries_main = h->num_buckets *
> +RTE_HASH_BUCKET_ENTRIES;
> +const uint32_t total_entries = total_entries_main << 1;
> +
>  /* Out of bounds */
Minor: update the comment to reflect the new code.

> -if (*next >= total_entries)
> -return -ENOENT;
> +if (*next >= total_entries_main)
> +goto extend_table;
>
>  /* Calculate bucket and index of current iterator */
>  bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
> @@ -1321,8 +1535,8 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
>  while (h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
>  (*next)++;
>  /* End of table */
> -if (*next == total_entries)
> -return -ENOENT;
> +if (*next == total_entries_main)
> +goto extend_table;
>  bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
>  idx = *next % RTE_HASH_BUCKET_ENTRIES;
>  }
> @@ -1341,4 +1555,32 @@ rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
>  (*next)++;
>
>  return position - 1;
> +
> +extend_table:
> +/* Out of bounds */
> +if (*next >= total_entries || !h->ext_table_support)
> +return -ENOENT;
> +
> +bucket_idx = (*next - total_entries_main) /
> +RTE_HASH_BUCKET_ENTRIES;
> +idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
> +
> +while (h->buckets_ext[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
> +(*next)++;
> +if (*next == total_entries)
> +return -ENOENT;
> +bucket_idx = (*next - total_entries_main) /
> +RTE_HASH_BUCKET_ENTRIES;
> +idx = (*next - total_entries_main) %
> +RTE_HASH_BUCKET_ENTRIES;
> +}
> +/* Get position of entry in key table */
> +position = h->buckets_ext[bucket_idx].key_idx[idx];
There is a possibility that 'position' is not the same value that was read in the while loop, which is a problem if 'position' becomes EMPTY_SLOT. 'position' should be read as part of the while loop. Since it is a 32-bit value, the read should be atomic on most platforms. This issue applies to the existing code as well.

__hash_rw_reader_lock(h) required
> +next_key = (struct rte_hash_key *) ((char *)h->key_store +
> +position * h->key_entry_size);
> +/* Return key and data */
> +*key = next_key->key;
> +*data = next_key->pdata;
> +
__hash_rw_reader_unlock(h) required

> +/* Increment iterator */
> +(*next)++;
> +return position - 1;
>  }
> diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h
> index fc0e5c2..e601520 100644
> --- a/lib/librte_hash/rte_cuckoo_hash.h
> +++ b/lib/librte_hash/rte_cuckoo_hash.h
> @@ -142,6 +142,8 @@ struct rte_hash_bucket {
>  hash_sig_t sig_alt[RTE_HASH_BUCKET_ENTRIES];
>
>  uint8_t flag[RTE_HASH_BUCKET_ENTRIES];
> +
> +void *next;
>  } __rte_cache_aligned;
>
>  /** A hash table structure. */
> @@ -166,6 +168,7 @@ struct rte_hash {
>  /**< If multi-writer support is enabled. */
>  uint8_t readwrite_concur_support;
>  /**< If read-write concurrency support is enabled */
> +uint8_t ext_table_support;     /**< Enable extendable bucket table */
>  rte_hash_function hash_func;    /**< Function used to calculate hash.
> */
>  uint32_t hash_func_init_val;    /**< Init value used by hash_func. */
>  rte_hash_cmp_eq_t rte_hash_custom_cmp_eq;
> @@ -184,6 +187,8 @@ struct rte_hash {
>   * to the key table.
>   */
>  rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. */
> +struct rte_hash_bucket *buckets_ext; /**< Extra buckets array */
> +struct rte_ring *free_ext_bkts; /**< Ring of indexes of free buckets */
>  } __rte_cache_aligned;
>
>  struct queue_node {
> diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h
> index 9e7d931..11d8e28 100644
> --- a/lib/librte_hash/rte_hash.h
> +++ b/lib/librte_hash/rte_hash.h
> @@ -37,6 +37,9 @@ extern "C" {
>  /** Flag to support reader writer concurrency */
>  #define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04
>
> +/** Flag to indicate the extendabe bucket table feature should be used */
> +#define RTE_HASH_EXTRA_FLAGS_EXT_TABLE 0x08
> +
>  /** Signature of key that is stored internally. */  typedef uint32_t hash_sig_t;
>
> --
> 2.7.4

  
Bruce Richardson Sept. 27, 2018, 11:15 a.m. UTC | #2
On Thu, Sep 27, 2018 at 04:23:48AM +0000, Honnappa Nagarahalli wrote:
> 
> 
> > -----Original Message-----
> > From: Yipeng Wang <yipeng1.wang@intel.com>
> > Sent: Friday, September 21, 2018 12:18 PM
> > To: bruce.richardson@intel.com
> > Cc: dev@dpdk.org; yipeng1.wang@intel.com; michel@digirati.com.br;
> > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > Subject: [PATCH v2 5/7] hash: add extendable bucket feature
> >
> > In use cases that hash table capacity needs to be guaranteed, the extendable
> > bucket feature can be used to contain extra keys in linked lists when conflict
> > happens. This is similar concept to the extendable bucket hash table in packet
> > framework.
> >
> > This commit adds the extendable bucket feature. User can turn it on or off
> > through the extra flag field during table creation time.
> >
> > Extendable bucket table composes of buckets that can be linked list to current
> > main table. When extendable bucket is enabled, the table utilization can
> > always acheive 100%.
> IMO, referring to this as 'table utilization' indicates an efficiency about memory utilization. Please consider changing this to indicate that all of the configured number of entries will be accommodated?
> 
> > Although keys ending up in the ext buckets may have longer look up time, they
> > should be rare due to the cuckoo algorithm.
> >
> > Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> > ---
> >  lib/librte_hash/rte_cuckoo_hash.c | 326
> > +++++++++++++++++++++++++++++++++-----
> >  lib/librte_hash/rte_cuckoo_hash.h |   5 +
> >  lib/librte_hash/rte_hash.h        |   3 +
> >  3 files changed, 292 insertions(+), 42 deletions(-)
> >
> > diff --git a/lib/librte_hash/rte_cuckoo_hash.c
> > b/lib/librte_hash/rte_cuckoo_hash.c
> > index f7b86c8..616900b 100644
> > --- a/lib/librte_hash/rte_cuckoo_hash.c
> > +++ b/lib/librte_hash/rte_cuckoo_hash.c
> > @@ -31,6 +31,10 @@
> >  #include "rte_hash.h"
> >  #include "rte_cuckoo_hash.h"
> >
> > +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)
> > \
> > +for (CURRENT_BKT = START_BUCKET;                                      \
> > +CURRENT_BKT != NULL;                                          \
> > +CURRENT_BKT = CURRENT_BKT->next)
> >
> >  TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
> >
> > @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)
> >  return h;
> >  }
> >
> > +static inline struct rte_hash_bucket *
> > +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) {
> > +while (lst_bkt->next != NULL)
> > +lst_bkt = lst_bkt->next;
> > +return lst_bkt;
> > +}
> > +
> >  void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)  {
> >  h->cmp_jump_table_idx = KEY_CUSTOM;
> > @@ -85,13 +97,17 @@ rte_hash_create(const struct rte_hash_parameters
> > *params)
> >  struct rte_tailq_entry *te = NULL;
> >  struct rte_hash_list *hash_list;
> >  struct rte_ring *r = NULL;
> > +struct rte_ring *r_ext = NULL;
> >  char hash_name[RTE_HASH_NAMESIZE];
> >  void *k = NULL;
> >  void *buckets = NULL;
> > +void *buckets_ext = NULL;
> >  char ring_name[RTE_RING_NAMESIZE];
> > +char ext_ring_name[RTE_RING_NAMESIZE];
> >  unsigned num_key_slots;
> >  unsigned i;
> >  unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
> > +unsigned int ext_table_support = 0;
> >  unsigned int readwrite_concur_support = 0;
> >
> >  rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
> > @@ -124,6 +140,9 @@ rte_hash_create(const struct rte_hash_parameters
> > *params)
> >  multi_writer_support = 1;
> >  }
> >
> > +if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
> > +ext_table_support = 1;
> > +
> >  /* Store all keys and leave the first entry as a dummy entry for
> > lookup_bulk */
> >  if (multi_writer_support)
> >  /*
> > @@ -145,6 +164,24 @@ rte_hash_create(const struct rte_hash_parameters
> > *params)
> >  goto err;
> >  }
> >
> > +const uint32_t num_buckets = rte_align32pow2(params->entries) /
> > +RTE_HASH_BUCKET_ENTRIES;
> > +
> > +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> > +params-
> > >name);
> Can be inside the if statement below.
> 
> > +/* Create ring for extendable buckets. */
> > +if (ext_table_support) {
> > +r_ext = rte_ring_create(ext_ring_name,
> > +rte_align32pow2(num_buckets + 1),
> > +params->socket_id, 0);
> > +
> > +if (r_ext == NULL) {
> > +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> > "
> > +"failed\n");
> > +goto err;
> > +}
> > +}
> > +
> >  snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
> >
> >  rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
> > @@ -177,18 +214,34 @@ rte_hash_create(const struct rte_hash_parameters
> > *params)
> >  goto err_unlock;
> >  }
> >
> > -const uint32_t num_buckets = rte_align32pow2(params->entries)
> > -/ RTE_HASH_BUCKET_ENTRIES;
> > -
> >  buckets = rte_zmalloc_socket(NULL,
> >  num_buckets * sizeof(struct rte_hash_bucket),
> >  RTE_CACHE_LINE_SIZE, params->socket_id);
> >
> >  if (buckets == NULL) {
> > -RTE_LOG(ERR, HASH, "memory allocation failed\n");
> > +RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
> >  goto err_unlock;
> >  }
> >
> > +/* Allocate same number of extendable buckets */
> IMO, we are allocating too much memory to support this feature. Especially, when we claim that keys ending up in the extendable table is a rare occurrence. By doubling the memory we are effectively saying that the main table might have 50% utilization. It will also significantly increase the cycles required to iterate the complete hash table (in rte_hash_iterate API) even when we expect that the extendable table contains very few entries.
> 
> I am wondering if we can provide options to control the amount of extra memory that gets allocated and make the memory allocation dynamic (or on demand basis). I think this also goes well with the general direction DPDK is taking - allocate resources as needed rather than allocating all the resources during initialization.
> 

Given that adding new entries should not normally be a fast-path function,
how about allowing memory allocation in add itself. Why not initialize with
a fairly small number of extra bucket entries, and then each time they are
all used, double the number of entries. That will give efficient resource
scaling, I think.

/Bruce
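
The doubling scheme described above is the standard amortized-growth
pattern. A generic sketch of the idea in plain C (not the rte_hash API;
the pool structure and names are illustrative only):

    #include <stdlib.h>
    #include <string.h>

    struct ext_bkt_pool {
            void   **slots; /* free extendable-bucket pointers */
            size_t   len;   /* slots currently allocated */
            size_t   used;  /* slots handed out so far */
    };

    /* Double the pool whenever it is exhausted: the allocation cost is
     * amortized O(1) per new bucket and memory stays proportional to
     * what was actually needed, at the price of an occasional realloc
     * on the add path.
     */
    static int
    ext_bkt_pool_grow(struct ext_bkt_pool *p)
    {
            size_t new_len = p->len ? p->len * 2 : 8; /* small initial reserve */
            void **tmp = realloc(p->slots, new_len * sizeof(*tmp));

            if (tmp == NULL)
                    return -1;
            memset(tmp + p->len, 0, (new_len - p->len) * sizeof(*tmp));
            p->slots = tmp;
            p->len = new_len;
            return 0;
    }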
  
Ananyev, Konstantin Sept. 27, 2018, 11:27 a.m. UTC | #3
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Thursday, September 27, 2018 12:15 PM
> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Cc: Wang, Yipeng1 <yipeng1.wang@intel.com>; dev@dpdk.org; michel@digirati.com.br
> Subject: Re: [dpdk-dev] [PATCH v2 5/7] hash: add extendable bucket feature
> 
> On Thu, Sep 27, 2018 at 04:23:48AM +0000, Honnappa Nagarahalli wrote:
> >
> >
> > > -----Original Message-----
> > > From: Yipeng Wang <yipeng1.wang@intel.com>
> > > Sent: Friday, September 21, 2018 12:18 PM
> > > To: bruce.richardson@intel.com
> > > Cc: dev@dpdk.org; yipeng1.wang@intel.com; michel@digirati.com.br;
> > > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > Subject: [PATCH v2 5/7] hash: add extendable bucket feature
> > >
> > > In use cases that hash table capacity needs to be guaranteed, the extendable
> > > bucket feature can be used to contain extra keys in linked lists when conflict
> > > happens. This is similar concept to the extendable bucket hash table in packet
> > > framework.
> > >
> > > This commit adds the extendable bucket feature. User can turn it on or off
> > > through the extra flag field during table creation time.
> > >
> > > Extendable bucket table composes of buckets that can be linked list to current
> > > main table. When extendable bucket is enabled, the table utilization can
> > > always acheive 100%.
> > IMO, referring to this as 'table utilization' indicates an efficiency about memory utilization. Please consider changing this to indicate that
> all of the configured number of entries will be accommodated?
> >
> > > Although keys ending up in the ext buckets may have longer look up time, they
> > > should be rare due to the cuckoo algorithm.
> > >
> > > Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> > > ---
> > >  lib/librte_hash/rte_cuckoo_hash.c | 326
> > > +++++++++++++++++++++++++++++++++-----
> > >  lib/librte_hash/rte_cuckoo_hash.h |   5 +
> > >  lib/librte_hash/rte_hash.h        |   3 +
> > >  3 files changed, 292 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/lib/librte_hash/rte_cuckoo_hash.c
> > > b/lib/librte_hash/rte_cuckoo_hash.c
> > > index f7b86c8..616900b 100644
> > > --- a/lib/librte_hash/rte_cuckoo_hash.c
> > > +++ b/lib/librte_hash/rte_cuckoo_hash.c
> > > @@ -31,6 +31,10 @@
> > >  #include "rte_hash.h"
> > >  #include "rte_cuckoo_hash.h"
> > >
> > > +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)
> > > \
> > > +for (CURRENT_BKT = START_BUCKET;                                      \
> > > +CURRENT_BKT != NULL;                                          \
> > > +CURRENT_BKT = CURRENT_BKT->next)
> > >
> > >  TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
> > >
> > > @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)
> > >  return h;
> > >  }
> > >
> > > +static inline struct rte_hash_bucket *
> > > +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) {
> > > +while (lst_bkt->next != NULL)
> > > +lst_bkt = lst_bkt->next;
> > > +return lst_bkt;
> > > +}
> > > +
> > >  void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)  {
> > >  h->cmp_jump_table_idx = KEY_CUSTOM;
> > > @@ -85,13 +97,17 @@ rte_hash_create(const struct rte_hash_parameters
> > > *params)
> > >  struct rte_tailq_entry *te = NULL;
> > >  struct rte_hash_list *hash_list;
> > >  struct rte_ring *r = NULL;
> > > +struct rte_ring *r_ext = NULL;
> > >  char hash_name[RTE_HASH_NAMESIZE];
> > >  void *k = NULL;
> > >  void *buckets = NULL;
> > > +void *buckets_ext = NULL;
> > >  char ring_name[RTE_RING_NAMESIZE];
> > > +char ext_ring_name[RTE_RING_NAMESIZE];
> > >  unsigned num_key_slots;
> > >  unsigned i;
> > >  unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
> > > +unsigned int ext_table_support = 0;
> > >  unsigned int readwrite_concur_support = 0;
> > >
> > >  rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
> > > @@ -124,6 +140,9 @@ rte_hash_create(const struct rte_hash_parameters
> > > *params)
> > >  multi_writer_support = 1;
> > >  }
> > >
> > > +if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
> > > +ext_table_support = 1;
> > > +
> > >  /* Store all keys and leave the first entry as a dummy entry for
> > > lookup_bulk */
> > >  if (multi_writer_support)
> > >  /*
> > > @@ -145,6 +164,24 @@ rte_hash_create(const struct rte_hash_parameters
> > > *params)
> > >  goto err;
> > >  }
> > >
> > > +const uint32_t num_buckets = rte_align32pow2(params->entries) /
> > > +RTE_HASH_BUCKET_ENTRIES;
> > > +
> > > +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> > > +params-
> > > >name);
> > Can be inside the if statement below.
> >
> > > +/* Create ring for extendable buckets. */
> > > +if (ext_table_support) {
> > > +r_ext = rte_ring_create(ext_ring_name,
> > > +rte_align32pow2(num_buckets + 1),
> > > +params->socket_id, 0);
> > > +
> > > +if (r_ext == NULL) {
> > > +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> > > "
> > > +"failed\n");
> > > +goto err;
> > > +}
> > > +}
> > > +
> > >  snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
> > >
> > >  rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
> > > @@ -177,18 +214,34 @@ rte_hash_create(const struct rte_hash_parameters
> > > *params)
> > >  goto err_unlock;
> > >  }
> > >
> > > -const uint32_t num_buckets = rte_align32pow2(params->entries)
> > > -/ RTE_HASH_BUCKET_ENTRIES;
> > > -
> > >  buckets = rte_zmalloc_socket(NULL,
> > >  num_buckets * sizeof(struct rte_hash_bucket),
> > >  RTE_CACHE_LINE_SIZE, params->socket_id);
> > >
> > >  if (buckets == NULL) {
> > > -RTE_LOG(ERR, HASH, "memory allocation failed\n");
> > > +RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
> > >  goto err_unlock;
> > >  }
> > >
> > > +/* Allocate same number of extendable buckets */
> > IMO, we are allocating too much memory to support this feature. Especially, when we claim that keys ending up in the extendable table is
> a rare occurrence. By doubling the memory we are effectively saying that the main table might have 50% utilization. It will also significantly
> increase the cycles required to iterate the complete hash table (in rte_hash_iterate API) even when we expect that the extendable table
> contains very few entries.
> >
> > I am wondering if we can provide options to control the amount of extra memory that gets allocated and make the memory allocation
> dynamic (or on demand basis). I think this also goes well with the general direction DPDK is taking - allocate resources as needed rather than
> allocating all the resources during initialization.
> >
> 
> Given that adding new entries should not normally be a fast-path function,

Umm, I don't think I agree with that.
There are plenty of cases where add/delete speed is important.
Konstantin

> how about allowing memory allocation in add itself. Why not initialize with
> a fairly small number of extra bucket entries, and then each time they are
> all used, double the number of entries. That will give efficient resource
> scaling, I think.
> 
> /Bruce
  
Bruce Richardson Sept. 27, 2018, 12:27 p.m. UTC | #4
On Thu, Sep 27, 2018 at 12:27:21PM +0100, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Thursday, September 27, 2018 12:15 PM
> > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > Cc: Wang, Yipeng1 <yipeng1.wang@intel.com>; dev@dpdk.org; michel@digirati.com.br
> > Subject: Re: [dpdk-dev] [PATCH v2 5/7] hash: add extendable bucket feature
> > 
> > On Thu, Sep 27, 2018 at 04:23:48AM +0000, Honnappa Nagarahalli wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Yipeng Wang <yipeng1.wang@intel.com>
> > > > Sent: Friday, September 21, 2018 12:18 PM
> > > > To: bruce.richardson@intel.com
> > > > Cc: dev@dpdk.org; yipeng1.wang@intel.com; michel@digirati.com.br;
> > > > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > > Subject: [PATCH v2 5/7] hash: add extendable bucket feature
> > > >
> > > > In use cases that hash table capacity needs to be guaranteed, the extendable
> > > > bucket feature can be used to contain extra keys in linked lists when conflict
> > > > happens. This is similar concept to the extendable bucket hash table in packet
> > > > framework.
> > > >
> > > > This commit adds the extendable bucket feature. User can turn it on or off
> > > > through the extra flag field during table creation time.
> > > >
> > > > Extendable bucket table composes of buckets that can be linked list to current
> > > > main table. When extendable bucket is enabled, the table utilization can
> > > > always acheive 100%.
> > > IMO, referring to this as 'table utilization' indicates an efficiency about memory utilization. Please consider changing this to indicate that
> > all of the configured number of entries will be accommodated?
> > >
> > > > Although keys ending up in the ext buckets may have longer look up time, they
> > > > should be rare due to the cuckoo algorithm.
> > > >
> > > > Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> > > > ---
> > > >  lib/librte_hash/rte_cuckoo_hash.c | 326
> > > > +++++++++++++++++++++++++++++++++-----
> > > >  lib/librte_hash/rte_cuckoo_hash.h |   5 +
> > > >  lib/librte_hash/rte_hash.h        |   3 +
> > > >  3 files changed, 292 insertions(+), 42 deletions(-)
> > > >
> > > > diff --git a/lib/librte_hash/rte_cuckoo_hash.c
> > > > b/lib/librte_hash/rte_cuckoo_hash.c
> > > > index f7b86c8..616900b 100644
> > > > --- a/lib/librte_hash/rte_cuckoo_hash.c
> > > > +++ b/lib/librte_hash/rte_cuckoo_hash.c
> > > > @@ -31,6 +31,10 @@
> > > >  #include "rte_hash.h"
> > > >  #include "rte_cuckoo_hash.h"
> > > >
> > > > +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)
> > > > \
> > > > +for (CURRENT_BKT = START_BUCKET;                                      \
> > > > +CURRENT_BKT != NULL;                                          \
> > > > +CURRENT_BKT = CURRENT_BKT->next)
> > > >
> > > >  TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
> > > >
> > > > @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)
> > > >  return h;
> > > >  }
> > > >
> > > > +static inline struct rte_hash_bucket *
> > > > +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) {
> > > > +while (lst_bkt->next != NULL)
> > > > +lst_bkt = lst_bkt->next;
> > > > +return lst_bkt;
> > > > +}
> > > > +
> > > >  void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)  {
> > > >  h->cmp_jump_table_idx = KEY_CUSTOM;
> > > > @@ -85,13 +97,17 @@ rte_hash_create(const struct rte_hash_parameters
> > > > *params)
> > > >  struct rte_tailq_entry *te = NULL;
> > > >  struct rte_hash_list *hash_list;
> > > >  struct rte_ring *r = NULL;
> > > > +struct rte_ring *r_ext = NULL;
> > > >  char hash_name[RTE_HASH_NAMESIZE];
> > > >  void *k = NULL;
> > > >  void *buckets = NULL;
> > > > +void *buckets_ext = NULL;
> > > >  char ring_name[RTE_RING_NAMESIZE];
> > > > +char ext_ring_name[RTE_RING_NAMESIZE];
> > > >  unsigned num_key_slots;
> > > >  unsigned i;
> > > >  unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
> > > > +unsigned int ext_table_support = 0;
> > > >  unsigned int readwrite_concur_support = 0;
> > > >
> > > >  rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
> > > > @@ -124,6 +140,9 @@ rte_hash_create(const struct rte_hash_parameters
> > > > *params)
> > > >  multi_writer_support = 1;
> > > >  }
> > > >
> > > > +if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
> > > > +ext_table_support = 1;
> > > > +
> > > >  /* Store all keys and leave the first entry as a dummy entry for
> > > > lookup_bulk */
> > > >  if (multi_writer_support)
> > > >  /*
> > > > @@ -145,6 +164,24 @@ rte_hash_create(const struct rte_hash_parameters
> > > > *params)
> > > >  goto err;
> > > >  }
> > > >
> > > > +const uint32_t num_buckets = rte_align32pow2(params->entries) /
> > > > +RTE_HASH_BUCKET_ENTRIES;
> > > > +
> > > > +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> > > > +params-
> > > > >name);
> > > Can be inside the if statement below.
> > >
> > > > +/* Create ring for extendable buckets. */
> > > > +if (ext_table_support) {
> > > > +r_ext = rte_ring_create(ext_ring_name,
> > > > +rte_align32pow2(num_buckets + 1),
> > > > +params->socket_id, 0);
> > > > +
> > > > +if (r_ext == NULL) {
> > > > +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> > > > "
> > > > +"failed\n");
> > > > +goto err;
> > > > +}
> > > > +}
> > > > +
> > > >  snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
> > > >
> > > >  rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
> > > > @@ -177,18 +214,34 @@ rte_hash_create(const struct rte_hash_parameters
> > > > *params)
> > > >  goto err_unlock;
> > > >  }
> > > >
> > > > -const uint32_t num_buckets = rte_align32pow2(params->entries)
> > > > -/ RTE_HASH_BUCKET_ENTRIES;
> > > > -
> > > >  buckets = rte_zmalloc_socket(NULL,
> > > >  num_buckets * sizeof(struct rte_hash_bucket),
> > > >  RTE_CACHE_LINE_SIZE, params->socket_id);
> > > >
> > > >  if (buckets == NULL) {
> > > > -RTE_LOG(ERR, HASH, "memory allocation failed\n");
> > > > +RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
> > > >  goto err_unlock;
> > > >  }
> > > >
> > > > +/* Allocate same number of extendable buckets */
> > > IMO, we are allocating too much memory to support this feature. Especially, when we claim that keys ending up in the extendable table is
> > a rare occurrence. By doubling the memory we are effectively saying that the main table might have 50% utilization. It will also significantly
> > increase the cycles required to iterate the complete hash table (in rte_hash_iterate API) even when we expect that the extendable table
> > contains very few entries.
> > >
> > > I am wondering if we can provide options to control the amount of extra memory that gets allocated and make the memory allocation
> > dynamic (or on demand basis). I think this also goes well with the general direction DPDK is taking - allocate resources as needed rather than
> > allocating all the resources during initialization.
> > >
> > 
> > Given that adding new entries should not normally be a fast-path function,
> 
> Umm, I don't think I agree with that.
> There are plenty of cases where add/delete speed is important.
> Konstantin

True, I suppose.
Perhaps then the best approach is to give a couple of options to the
developer. Allow specifying an initial amount of memory, and then an option
to allow the memory to grow or not. If the cost of memory allocation is a
problem for a specific app, then they can provide a large default and not
allow growing, while other apps, for whom speed is not that critical, can
provide a small default and allow growing.

Does that seem reasonable?

/Bruce
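
A purely hypothetical shape for those two knobs (these fields do not
exist in rte_hash_parameters; the names are illustrative only):

    #include <stdint.h>

    /* Hypothetical creation-time options, not part of the DPDK API. */
    struct ext_bkt_opts {
            uint32_t initial_ext_buckets; /* pre-allocated at create time */
            int      allow_growth;        /* allocate more buckets on demand */
    };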
  
Ananyev, Konstantin Sept. 27, 2018, 12:33 p.m. UTC | #5
> -----Original Message-----
> From: Richardson, Bruce
> Sent: Thursday, September 27, 2018 1:28 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Wang, Yipeng1 <yipeng1.wang@intel.com>; dev@dpdk.org;
> michel@digirati.com.br
> Subject: Re: [dpdk-dev] [PATCH v2 5/7] hash: add extendable bucket feature
> 
> On Thu, Sep 27, 2018 at 12:27:21PM +0100, Ananyev, Konstantin wrote:
> >
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > > Sent: Thursday, September 27, 2018 12:15 PM
> > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > Cc: Wang, Yipeng1 <yipeng1.wang@intel.com>; dev@dpdk.org; michel@digirati.com.br
> > > Subject: Re: [dpdk-dev] [PATCH v2 5/7] hash: add extendable bucket feature
> > >
> > > On Thu, Sep 27, 2018 at 04:23:48AM +0000, Honnappa Nagarahalli wrote:
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Yipeng Wang <yipeng1.wang@intel.com>
> > > > > Sent: Friday, September 21, 2018 12:18 PM
> > > > > To: bruce.richardson@intel.com
> > > > > Cc: dev@dpdk.org; yipeng1.wang@intel.com; michel@digirati.com.br;
> > > > > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > > > Subject: [PATCH v2 5/7] hash: add extendable bucket feature
> > > > >
> > > > > In use cases that hash table capacity needs to be guaranteed, the extendable
> > > > > bucket feature can be used to contain extra keys in linked lists when conflict
> > > > > happens. This is similar concept to the extendable bucket hash table in packet
> > > > > framework.
> > > > >
> > > > > This commit adds the extendable bucket feature. User can turn it on or off
> > > > > through the extra flag field during table creation time.
> > > > >
> > > > > Extendable bucket table composes of buckets that can be linked list to current
> > > > > main table. When extendable bucket is enabled, the table utilization can
> > > > > always acheive 100%.
> > > > IMO, referring to this as 'table utilization' indicates an efficiency about memory utilization. Please consider changing this to indicate
> that
> > > all of the configured number of entries will be accommodated?
> > > >
> > > > > Although keys ending up in the ext buckets may have longer look up time, they
> > > > > should be rare due to the cuckoo algorithm.
> > > > >
> > > > > Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> > > > > ---
> > > > >  lib/librte_hash/rte_cuckoo_hash.c | 326
> > > > > +++++++++++++++++++++++++++++++++-----
> > > > >  lib/librte_hash/rte_cuckoo_hash.h |   5 +
> > > > >  lib/librte_hash/rte_hash.h        |   3 +
> > > > >  3 files changed, 292 insertions(+), 42 deletions(-)
> > > > >
> > > > > diff --git a/lib/librte_hash/rte_cuckoo_hash.c
> > > > > b/lib/librte_hash/rte_cuckoo_hash.c
> > > > > index f7b86c8..616900b 100644
> > > > > --- a/lib/librte_hash/rte_cuckoo_hash.c
> > > > > +++ b/lib/librte_hash/rte_cuckoo_hash.c
> > > > > @@ -31,6 +31,10 @@
> > > > >  #include "rte_hash.h"
> > > > >  #include "rte_cuckoo_hash.h"
> > > > >
> > > > > +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)
> > > > > \
> > > > > +for (CURRENT_BKT = START_BUCKET;                                      \
> > > > > +CURRENT_BKT != NULL;                                          \
> > > > > +CURRENT_BKT = CURRENT_BKT->next)
> > > > >
> > > > >  TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
> > > > >
> > > > > @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)
> > > > >  return h;
> > > > >  }
> > > > >
> > > > > +static inline struct rte_hash_bucket *
> > > > > +rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt) {
> > > > > +while (lst_bkt->next != NULL)
> > > > > +lst_bkt = lst_bkt->next;
> > > > > +return lst_bkt;
> > > > > +}
> > > > > +
> > > > >  void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)  {
> > > > >  h->cmp_jump_table_idx = KEY_CUSTOM;
> > > > > @@ -85,13 +97,17 @@ rte_hash_create(const struct rte_hash_parameters
> > > > > *params)
> > > > >  struct rte_tailq_entry *te = NULL;
> > > > >  struct rte_hash_list *hash_list;
> > > > >  struct rte_ring *r = NULL;
> > > > > +struct rte_ring *r_ext = NULL;
> > > > >  char hash_name[RTE_HASH_NAMESIZE];
> > > > >  void *k = NULL;
> > > > >  void *buckets = NULL;
> > > > > +void *buckets_ext = NULL;
> > > > >  char ring_name[RTE_RING_NAMESIZE];
> > > > > +char ext_ring_name[RTE_RING_NAMESIZE];
> > > > >  unsigned num_key_slots;
> > > > >  unsigned i;
> > > > >  unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
> > > > > +unsigned int ext_table_support = 0;
> > > > >  unsigned int readwrite_concur_support = 0;
> > > > >
> > > > >  rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
> > > > > @@ -124,6 +140,9 @@ rte_hash_create(const struct rte_hash_parameters
> > > > > *params)
> > > > >  multi_writer_support = 1;
> > > > >  }
> > > > >
> > > > > +if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
> > > > > +ext_table_support = 1;
> > > > > +
> > > > >  /* Store all keys and leave the first entry as a dummy entry for
> > > > > lookup_bulk */
> > > > >  if (multi_writer_support)
> > > > >  /*
> > > > > @@ -145,6 +164,24 @@ rte_hash_create(const struct rte_hash_parameters
> > > > > *params)
> > > > >  goto err;
> > > > >  }
> > > > >
> > > > > +const uint32_t num_buckets = rte_align32pow2(params->entries) /
> > > > > +RTE_HASH_BUCKET_ENTRIES;
> > > > > +
> > > > > +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> > > > > +params-
> > > > > >name);
> > > > Can be inside the if statement below.
> > > >
> > > > > +/* Create ring for extendable buckets. */
> > > > > +if (ext_table_support) {
> > > > > +r_ext = rte_ring_create(ext_ring_name,
> > > > > +rte_align32pow2(num_buckets + 1),
> > > > > +params->socket_id, 0);
> > > > > +
> > > > > +if (r_ext == NULL) {
> > > > > +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> > > > > "
> > > > > +"failed\n");
> > > > > +goto err;
> > > > > +}
> > > > > +}
> > > > > +
> > > > >  snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
> > > > >
> > > > >  rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
> > > > > @@ -177,18 +214,34 @@ rte_hash_create(const struct rte_hash_parameters
> > > > > *params)
> > > > >  goto err_unlock;
> > > > >  }
> > > > >
> > > > > -const uint32_t num_buckets = rte_align32pow2(params->entries)
> > > > > -/ RTE_HASH_BUCKET_ENTRIES;
> > > > > -
> > > > >  buckets = rte_zmalloc_socket(NULL,
> > > > >  num_buckets * sizeof(struct rte_hash_bucket),
> > > > >  RTE_CACHE_LINE_SIZE, params->socket_id);
> > > > >
> > > > >  if (buckets == NULL) {
> > > > > -RTE_LOG(ERR, HASH, "memory allocation failed\n");
> > > > > +RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
> > > > >  goto err_unlock;
> > > > >  }
> > > > >
> > > > > +/* Allocate same number of extendable buckets */
> > > > IMO, we are allocating too much memory to support this feature. Especially, when we claim that keys ending up in the extendable
> table is
> > > a rare occurrence. By doubling the memory we are effectively saying that the main table might have 50% utilization. It will also
> significantly
> > > increase the cycles required to iterate the complete hash table (in rte_hash_iterate API) even when we expect that the extendable table
> > > contains very few entries.
> > > >
> > > > I am wondering if we can provide options to control the amount of extra memory that gets allocated and make the memory allocation
> > > dynamic (or on demand basis). I think this also goes well with the general direction DPDK is taking - allocate resources as needed rather
> than
> > > allocating all the resources during initialization.
> > > >
> > >
> > > Given that adding new entries should not normally be a fast-path function,
> >
> > Umm, I don't think I agree with that.
> > There are plenty of cases where add/delete speed is important.
> > Konstantin
> 
> True, I suppose.
> Perhaps then the best approach is to give a couple of options to the
> developer. Allow specifying an initial amount of memory, and then an option
> to allow the memory to grow or not. If the cost of memory allocation is a
> problem for a specific app, then they can provide a large default and not
> allow growing, while other apps, for whom speed is not that critical, can
> provide a small default and allow growing.
> 
> Does that seem reasonable?

Yes, I think it does.
Konstantin
  
Honnappa Nagarahalli Sept. 27, 2018, 7:21 p.m. UTC | #6
> > >
> > > Signed-off-by: Yipeng Wang <yipeng1.wang@intel.com>
> > > ---
> > >  lib/librte_hash/rte_cuckoo_hash.c | 326
> > > +++++++++++++++++++++++++++++++++-----
> > >  lib/librte_hash/rte_cuckoo_hash.h |   5 +
> > >  lib/librte_hash/rte_hash.h        |   3 +
> > >  3 files changed, 292 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/lib/librte_hash/rte_cuckoo_hash.c
> > > b/lib/librte_hash/rte_cuckoo_hash.c
> > > index f7b86c8..616900b 100644
> > > --- a/lib/librte_hash/rte_cuckoo_hash.c
> > > +++ b/lib/librte_hash/rte_cuckoo_hash.c
> > > @@ -31,6 +31,10 @@
> > >  #include "rte_hash.h"
> > >  #include "rte_cuckoo_hash.h"
> > >
> > > +#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)
> > > \
> > > +for (CURRENT_BKT = START_BUCKET;                                      \
> > > +CURRENT_BKT != NULL;                                          \
> > > +CURRENT_BKT = CURRENT_BKT->next)
> > >
> > >  TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
> > >
> > > @@ -63,6 +67,14 @@ rte_hash_find_existing(const char *name)  return
> > > h;  }
> > >
> > > +static inline struct rte_hash_bucket * rte_hash_get_last_bkt(struct
> > > +rte_hash_bucket *lst_bkt) { while (lst_bkt->next != NULL) lst_bkt =
> > > +lst_bkt->next; return lst_bkt; }
> > > +
> > >  void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t
> > > func)  {  h->cmp_jump_table_idx = KEY_CUSTOM; @@ -85,13 +97,17 @@
> > > rte_hash_create(const struct rte_hash_parameters
> > > *params)
> > >  struct rte_tailq_entry *te = NULL;
> > >  struct rte_hash_list *hash_list;
> > >  struct rte_ring *r = NULL;
> > > +struct rte_ring *r_ext = NULL;
> > >  char hash_name[RTE_HASH_NAMESIZE];
> > >  void *k = NULL;
> > >  void *buckets = NULL;
> > > +void *buckets_ext = NULL;
> > >  char ring_name[RTE_RING_NAMESIZE];
> > > +char ext_ring_name[RTE_RING_NAMESIZE];
> > >  unsigned num_key_slots;
> > >  unsigned i;
> > >  unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
> > > +unsigned int ext_table_support = 0;
> > >  unsigned int readwrite_concur_support = 0;
> > >
> > >  rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
> > > @@ -124,6 +140,9 @@ rte_hash_create(const struct
> rte_hash_parameters
> > > *params)
> > >  multi_writer_support = 1;
> > >  }
> > >
> > > +if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
> > > +ext_table_support = 1;
> > > +
> > >  /* Store all keys and leave the first entry as a dummy entry for
> > > lookup_bulk */  if (multi_writer_support)
> > >  /*
> > > @@ -145,6 +164,24 @@ rte_hash_create(const struct
> > > rte_hash_parameters
> > > *params)
> > >  goto err;
> > >  }
> > >
> > > +const uint32_t num_buckets = rte_align32pow2(params->entries) /
> > > +RTE_HASH_BUCKET_ENTRIES;
> > > +
> > > +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> > > +params-
> > > >name);
> > Can be inside the if statement below.
> >
> > > +/* Create ring for extendable buckets. */ if (ext_table_support) {
> > > +r_ext = rte_ring_create(ext_ring_name, rte_align32pow2(num_buckets
> > > ++ 1),
> > > +params->socket_id, 0);
> > > +
> > > +if (r_ext == NULL) {
> > > +RTE_LOG(ERR, HASH, "ext buckets memory allocation
> > > "
> > > +"failed\n");
> > > +goto err;
> > > +}
> > > +}
> > > +
> > >  snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
> > >
> > >  rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
> > > @@ -177,18 +214,34 @@ rte_hash_create(const struct
> > > rte_hash_parameters
> > > *params)
> > >  goto err_unlock;
> > >  }
> > >
> > > -const uint32_t num_buckets = rte_align32pow2(params->entries) -/
> > > RTE_HASH_BUCKET_ENTRIES;
> > > -
> > >  buckets = rte_zmalloc_socket(NULL,
> > >  num_buckets * sizeof(struct rte_hash_bucket),  RTE_CACHE_LINE_SIZE,
> > > params->socket_id);
> > >
> > >  if (buckets == NULL) {
> > > -RTE_LOG(ERR, HASH, "memory allocation failed\n");
> > > +RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
> > >  goto err_unlock;
> > >  }
> > >
> > > +/* Allocate same number of extendable buckets */
> > IMO, we are allocating too much memory to support this feature. Especially,
> when we claim that keys ending up in the extendable table is a rare
> occurrence. By doubling the memory we are effectively saying that the main
> table might have 50% utilization. It will also significantly increase the cycles
> required to iterate the complete hash table (in rte_hash_iterate API) even
> when we expect that the extendable table contains very few entries.
> >
> > I am wondering if we can provide options to control the amount of extra
> memory that gets allocated and make the memory allocation dynamic (or on
> demand basis). I think this also goes well with the general direction DPDK is
> taking - allocate resources as needed rather than allocating all the resources
> during initialization.
> >
>
> Given that adding new entries should not normally be a fast-path function,
> how about allowing memory allocation in add itself. Why not initialize with a
> fairly small number of extra bucket entries, and then each time they are all
> used, double the number of entries. That will give efficient resource scaling, I
> think.
>
+1
'small number of extra bucket entries' == 5% of total capacity requested (assuming cuckoo hash will provide 95% efficiency)

> /Bruce
  
Wang, Yipeng1 Sept. 28, 2018, 5:35 p.m. UTC | #7
> -----Original Message-----
> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Thursday, September 27, 2018 12:22 PM
> To: Richardson, Bruce <bruce.richardson@intel.com>
> Cc: Wang, Yipeng1 <yipeng1.wang@intel.com>; dev@dpdk.org;
> michel@digirati.com.br
> Subject: RE: [PATCH v2 5/7] hash: add extendable bucket feature
> 
> > > > +/* Allocate same number of extendable buckets */
> > > IMO, we are allocating too much memory to support this feature.
> > > Especially,
> > when we claim that keys ending up in the extendable table is a rare
> > occurrence. By doubling the memory we are effectively saying that the
> > main table might have 50% utilization. It will also significantly
> > increase the cycles required to iterate the complete hash table (in
> > rte_hash_iterate API) even when we expect that the extendable table
> contains very few entries.
> > >
> > > I am wondering if we can provide options to control the amount of
> > > extra
> > memory that gets allocated and make the memory allocation dynamic (or
> > on demand basis). I think this also goes well with the general
> > direction DPDK is taking - allocate resources as needed rather than
> > allocating all the resources during initialization.
> > >
> >
> > Given that adding new entries should not normally be a fast-path
> > function, how about allowing memory allocation in add itself. Why not
> > initialize with a fairly small number of extra bucket entries, and
> > then each time they are all used, double the number of entries. That
> > will give efficient resource scaling, I think.
> >
> +1
> 'small number of extra bucket entries' == 5% of total capacity requested
> (assuming cuckoo hash will provide 95% efficiency)
> 
> > /Bruce
 [Wang, Yipeng] 
Thanks for the comments.
We allocate as many extendable buckets as the main table at creation
because the purpose is to provide a capacity guarantee even in the worst case
(all keys colliding into the same buckets).
Applications (e.g. Telco workloads) that require a 100% capacity guarantee can be sure
that insertion always succeeds below the specified table size.
With dynamic memory allocation or fewer buckets, this guarantee is
broken (even if such cases are very rare), since a dynamic allocation could fail.

Users who do not need this behavior can leave the feature disabled.
Given that the cuckoo algorithm already ensures
very high utilization, they usually do not need the extendable buckets.
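
For illustration, a minimal sketch (not part of this patch set) of how an application
would request that guarantee at creation time; the parameter values are made up, and only
the new RTE_HASH_EXTRA_FLAGS_EXT_TABLE flag comes from this series:

#include <rte_hash.h>
#include <rte_jhash.h>
#include <rte_lcore.h>

static struct rte_hash *
create_guaranteed_table(void)
{
	struct rte_hash_parameters params = {
		.name = "flow_table",
		.entries = 1 << 20,	/* capacity that must always be met */
		.key_len = 16,
		.hash_func = rte_jhash,
		.hash_func_init_val = 0,
		.socket_id = rte_socket_id(),
		.extra_flag = RTE_HASH_EXTRA_FLAGS_EXT_TABLE,
	};

	/* With the flag set, inserting up to 'entries' keys always succeeds,
	 * falling back to linked ext buckets when cuckoo paths are exhausted.
	 */
	return rte_hash_create(&params);
}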
  
Wang, Yipeng1 Sept. 29, 2018, 1:10 a.m. UTC | #8
>-----Original Message-----
>From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
>>
>> Extendable bucket table composes of buckets that can be linked list to current
>> main table. When extendable bucket is enabled, the table utilization can
>> always acheive 100%.
>IMO, referring to this as 'table utilization' indicates an efficiency about memory utilization. Please consider changing this to indicate
>that all of the configured number of entries will be accommodated?
>
[Wang, Yipeng] Improved in V4, please check! Thanks!

>> +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
>> +params-
>> >name);
>Can be inside the if statement below.
[Wang, Yipeng] Done in V3, Thanks!
>
>> +/* Populate ext bkt ring. We reserve 0 similar to the
>> + * key-data slot, just in case in future we want to
>> + * use bucket index for the linked list and 0 means NULL
>> + * for next bucket
>> + */
>> +for (i = 1; i <= num_buckets; i++)
>Since, the bucket index 0 is reserved, should be 'i < num_buckets'
[Wang, Yipeng] The bucket array runs from 0 to num_buckets - 1, while the ring index
runs from 1 to num_buckets. Reserving 0 only reserves the index value 0; it does
not reduce the usable bucket count.
So we still need to enqueue indexes 1 through num_buckets into the free
bucket ring for use.
>
>>  rte_free(h->key_store);
>>  rte_free(h->buckets);
>Add rte_free(h->buckets_ext);
[Wang, Yipeng] Done in V3, thanks!
>
>> +for (i = 1; i < h->num_buckets + 1; i++)
>Index 0 is reserved as per the comments. Condition should be 'i < h->num_buckets'.
[Wang, Yipeng] Same as the previous one: we still need all num_buckets indexes
to be inserted into the ring.
>
>> +bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
>If index 0 is reserved, -1 is not required.
>
[Wang, Yipeng] Same as the previous one: bkt_id is the subscript into the array, so it ranges from 0 to num_buckets - 1,
while the ring index ranges from 1 to num_buckets. Every time we get a bucket index from the ring we need to subtract 1
to get the bucket array subscript.
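
To make the convention concrete, a small sketch using the patch's field names
(illustration only, not a proposed change):

/* Illustration of the reserved-0 convention described above, using the
 * patch's fields (internal to rte_cuckoo_hash.c).
 */
static void
ext_bkt_index_example(struct rte_hash *h)
{
	void *ext_bkt_id;
	uint32_t bkt_id;

	/* The free ring holds indexes 1..num_buckets; 0 is reserved so a
	 * future linked-list encoding can use 0 to mean "no next bucket".
	 */
	if (rte_ring_sc_dequeue(h->free_ext_bkts, &ext_bkt_id) != 0)
		return;

	/* ring index -> array subscript 0..num_buckets-1 */
	bkt_id = (uint32_t)(uintptr_t)ext_bkt_id - 1;
	struct rte_hash_bucket *bkt = &h->buckets_ext[bkt_id];

	/* array subscript -> ring index, when recycling an empty bucket */
	uint32_t index = (uint32_t)(bkt - h->buckets_ext) + 1;
	rte_ring_mp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
}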
>> +if (tobe_removed_bkt) {
>> +uint32_t index = tobe_removed_bkt - h->buckets_ext + 1;
>No need to increase the index by 1 if entry 0 is reserved.
>
[Wang, Yipeng] Similar to previous one.
>> @@ -1308,10 +1519,13 @@ rte_hash_iterate(const struct rte_hash *h, const
>> void **key, void **data, uint32
>>
>>  RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL);
>>
>> -const uint32_t total_entries = h->num_buckets *
>> RTE_HASH_BUCKET_ENTRIES;
>> +const uint32_t total_entries_main = h->num_buckets *
>> +
>> RTE_HASH_BUCKET_ENTRIES;
>> +const uint32_t total_entries = total_entries_main << 1;
>> +
>>  /* Out of bounds */
>Minor: update the comment to reflect the new code.
[Wang, Yipeng] Done in V4, thanks!
>
>> @@ -1341,4 +1555,32 @@ rte_hash_iterate(const struct rte_hash *h, const
>> void **key, void **data, uint32
>>  (*next)++;
>>
>>  return position - 1;
>> +
>> +extend_table:
>> +/* Out of bounds */
>> +if (*next >= total_entries || !h->ext_table_support)
>> +return -ENOENT;
>> +
>> +bucket_idx = (*next - total_entries_main) /
>> RTE_HASH_BUCKET_ENTRIES;
>> +idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
>> +
>> +while (h->buckets_ext[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
>> +(*next)++;
>> +if (*next == total_entries)
>> +return -ENOENT;
>> +bucket_idx = (*next - total_entries_main) /
>> +RTE_HASH_BUCKET_ENTRIES;
>> +idx = (*next - total_entries_main) %
>> RTE_HASH_BUCKET_ENTRIES;
>> +}
>> +/* Get position of entry in key table */
>> +position = h->buckets_ext[bucket_idx].key_idx[idx];
>There is a possibility that 'position' is not the same value read in the while loop. It presents a problem if 'position' becomes
>EMPTY_SLOT. 'position' should be read as part of the while loop. Since it is 32b value, it should be atomic on most platforms. This issue
>applies to existing code as well.
>
[Wang, Yipeng] I agree. I added a new bug-fix commit for this in V4; it simply extends the current critical region to cover
the while loop. Please check whether that works. Thanks.
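
For reference, a sketch of what that extended critical section could look like for the
main-table part of rte_hash_iterate() (illustrative only; the actual V4 commit may differ):

/* Sketch: the reader lock now covers both the slot scan and the
 * key/data read, so 'position' cannot change to EMPTY_SLOT between
 * the check and the use.
 */
__hash_rw_reader_lock(h);
while (h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
	(*next)++;
	/* End of the main table */
	if (*next == total_entries_main) {
		__hash_rw_reader_unlock(h);
		goto extend_table;
	}
	bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
	idx = *next % RTE_HASH_BUCKET_ENTRIES;
}
/* Get position of entry in key table */
position = h->buckets[bucket_idx].key_idx[idx];
next_key = (struct rte_hash_key *)((char *)h->key_store +
			position * h->key_entry_size);
*key = next_key->key;
*data = next_key->pdata;
__hash_rw_reader_unlock(h);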

>__hash_rw_reader_lock(h) required
>> +next_key = (struct rte_hash_key *) ((char *)h->key_store +
>> +position * h->key_entry_size);
>> +/* Return key and data */
>> +*key = next_key->key;
>> +*data = next_key->pdata;
>> +
>__hash_rw_reader_unlock(h) required
[Wang, Yipeng] Agree, done in V4.  Thanks!
>
  
Honnappa Nagarahalli Sept. 29, 2018, 9:09 p.m. UTC | #9
> >
> > > > > +/* Allocate same number of extendable buckets */
> > > > IMO, we are allocating too much memory to support this feature.
> > > > Especially,
> > > when we claim that keys ending up in the extendable table is a rare
> > > occurrence. By doubling the memory we are effectively saying that
> > > the main table might have 50% utilization. It will also
> > > significantly increase the cycles required to iterate the complete
> > > hash table (in rte_hash_iterate API) even when we expect that the
> > > extendable table
> > contains very few entries.
> > > >
> > > > I am wondering if we can provide options to control the amount of
> > > > extra
> > > memory that gets allocated and make the memory allocation dynamic
> > > (or on demand basis). I think this also goes well with the general
> > > direction DPDK is taking - allocate resources as needed rather than
> > > allocating all the resources during initialization.
> > > >
> > >
> > > Given that adding new entries should not normally be a fast-path
> > > function, how about allowing memory allocation in add itself. Why
> > > not initialize with a fairly small number of extra bucket entries,
> > > and then each time they are all used, double the number of entries.
> > > That will give efficient resource scaling, I think.
> > >
> > +1
> > 'small number of extra bucket entries' == 5% of total capacity
> > requested (assuming cuckoo hash will provide 95% efficiency)
> >
> > > /Bruce
>  [Wang, Yipeng]
> Thanks for the comments.
> We allocate same as table size for extendable buckets at creation because the
> purpose is to provide capacity guarantee even for the worst scenario (all keys
> collide in same buckets).
> Applications (e.g. Telco workloads) that require 100% capacity guarantee will
> be sure that insertion always succeeds below the specified table size.
> With any dynamic memory allocation or less buckets, this guarantee is broken
> (even if it is very rare). The dynamic memory allocation could fail.
> 
> Users who do not need such behavior can disable this feature.
> Given that the cuckoo algorithm already ensures very high utilization, they
> usually do not need the extendable buckets.
Adding dynamic memory allocation would make the code complicated. It is also possible, with this feature disabled, to create the table with more than the required number of entries. I suggest we document the reason for doubling the memory. If someone sees a concrete requirement for dynamic allocation in the future, the code can be changed.
  
Honnappa Nagarahalli Oct. 1, 2018, 8:56 p.m. UTC | #10
> >-----Original Message-----
> >From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> >>
> >> Extendable bucket table composes of buckets that can be linked list
> >> to current main table. When extendable bucket is enabled, the table
> >> utilization can always acheive 100%.
> >IMO, referring to this as 'table utilization' indicates an efficiency
> >about memory utilization. Please consider changing this to indicate that all of
> the configured number of entries will be accommodated?
> >
> [Wang, Yipeng] Improved in V4, please check! Thanks!
> 
> >> +snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
> >> +params-
> >> >name);
> >Can be inside the if statement below.
> [Wang, Yipeng] Done in V3, Thanks!
> >
> >> +/* Populate ext bkt ring. We reserve 0 similar to the
> >> + * key-data slot, just in case in future we want to
> >> + * use bucket index for the linked list and 0 means NULL
> >> + * for next bucket
> >> + */
> >> +for (i = 1; i <= num_buckets; i++)
> >Since, the bucket index 0 is reserved, should be 'i < num_buckets'
> [Wang, Yipeng]  So the bucket array is from 0 to num_buckets - 1, and the
> index Is from 1 to num_buckets. So I guess reserving 0 means reserving the
> index 0 but not reduce the usable bucket count.
> So I guess we still need to enqueue index of 1 to num_buckets into the free
> Bucket ring for use?
I understand it now. I misread the 'similar to the key-data slot' comment; I see that the changes are correct.
Minor comment (I am not particular about it): I think it makes sense to follow the same logic as the key-data slots, i.e. allocate one extra bucket.

> >
> >>  rte_free(h->key_store);
> >>  rte_free(h->buckets);
> >Add rte_free(h->buckets_ext);
> [Wang, Yipeng] Done in V3, thanks!
> >
> >> +for (i = 1; i < h->num_buckets + 1; i++)
Minor comment:
If we are not changing the logic, I suggest we change the for loop as follows (like it is done earlier)
for (i = 1; i <= h->num_buckets; i++)


> >Index 0 is reserved as per the comments. Condition should be 'i < h-
> >num_buckets'.
> [Wang, Yipeng] Similar to previous one, I guess we still need the number of
> num_buckets index To be inserted in the ring.
> >
> >> +bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
> >If index 0 is reserved, -1 is not required.
> >
> [Wang, Yipeng] Similar to previous one, bkt_id is the subscript to the array so
> it ranges from 0 to num_bucket - 1.
> While the index is ranged from 1 to num_bucket. So I guess every time we got
> a bucket index we need to -1 To get the bucket array subscript.
> >> +if (tobe_removed_bkt) {
> >> +uint32_t index = tobe_removed_bkt - h->buckets_ext + 1;
> >No need to increase the index by 1 if entry 0 is reserved.
> >
> [Wang, Yipeng] Similar to previous one.
> >> @@ -1308,10 +1519,13 @@ rte_hash_iterate(const struct rte_hash *h,
> >> const void **key, void **data, uint32
> >>
> >>  RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL);
> >>
> >> -const uint32_t total_entries = h->num_buckets *
> >> RTE_HASH_BUCKET_ENTRIES;
> >> +const uint32_t total_entries_main = h->num_buckets *
> >> +
> >> RTE_HASH_BUCKET_ENTRIES;
> >> +const uint32_t total_entries = total_entries_main << 1;
> >> +
> >>  /* Out of bounds */
> >Minor: update the comment to reflect the new code.
> [Wang, Yipeng] Done in V4, thanks!
> >
> >> @@ -1341,4 +1555,32 @@ rte_hash_iterate(const struct rte_hash *h,
> >> const void **key, void **data, uint32  (*next)++;
> >>
> >>  return position - 1;
> >> +
> >> +extend_table:
> >> +/* Out of bounds */
> >> +if (*next >= total_entries || !h->ext_table_support) return -ENOENT;
> >> +
> >> +bucket_idx = (*next - total_entries_main) /
> >> RTE_HASH_BUCKET_ENTRIES;
> >> +idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
> >> +
> >> +while (h->buckets_ext[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
> >> +(*next)++; if (*next == total_entries) return -ENOENT; bucket_idx =
> >> +(*next - total_entries_main) / RTE_HASH_BUCKET_ENTRIES; idx = (*next
> >> +- total_entries_main) %
> >> RTE_HASH_BUCKET_ENTRIES;
> >> +}
> >> +/* Get position of entry in key table */ position =
> >> +h->buckets_ext[bucket_idx].key_idx[idx];
> >There is a possibility that 'position' is not the same value read in
> >the while loop. It presents a problem if 'position' becomes EMPTY_SLOT.
> >'position' should be read as part of the while loop. Since it is 32b value, it
> should be atomic on most platforms. This issue applies to existing code as
> well.
> >
> [Wang, Yipeng] I agree. I add a new bug fix commit to fix this in V4. Basically I
> just extend the current critical region to cover The while loop. Please check if
> that works. Thanks.
> 
> >__hash_rw_reader_lock(h) required
> >> +next_key = (struct rte_hash_key *) ((char *)h->key_store + position
> >> +* h->key_entry_size);
> >> +/* Return key and data */
> >> +*key = next_key->key;
> >> +*data = next_key->pdata;
> >> +
> >__hash_rw_reader_unlock(h) required
> [Wang, Yipeng] Agree, done in V4.  Thanks!
> >
  
Wang, Yipeng1 Oct. 2, 2018, 1:56 a.m. UTC | #11
>-----Original Message-----
>From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
>> >> +for (i = 1; i <= num_buckets; i++)
>> >Since, the bucket index 0 is reserved, should be 'i < num_buckets'
>> [Wang, Yipeng]  So the bucket array is from 0 to num_buckets - 1, and the
>> index Is from 1 to num_buckets. So I guess reserving 0 means reserving the
>> index 0 but not reduce the usable bucket count.
>> So I guess we still need to enqueue index of 1 to num_buckets into the free
>> Bucket ring for use?
>I understand it now. I mis-read the 'similar to the key-data slot' comment. I see that the changes are correct.
>Minor comment, I am not particular: I think it makes sense to change it to the same logic followed for key-data slot. i.e. allocate an
>extra bucket.
[Wang, Yipeng] Hmm, I think the key-data slots do a similar thing and allocate the same number of slots.
As I re-read the code, maybe the current code allocates one more than needed?
>
>> >
>> >>  rte_free(h->key_store);
>> >>  rte_free(h->buckets);
>> >Add rte_free(h->buckets_ext);
>> [Wang, Yipeng] Done in V3, thanks!
>> >
>> >> +for (i = 1; i < h->num_buckets + 1; i++)
>Minor comment:
>If we are not changing the logic, I suggest we change the for loop as follows (like it is done earlier)
>for (i = 1; i <= h->num_buckets; i++)
>
[Wang, Yipeng] Thanks, I did this in V5.
  

Patch

diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c
index f7b86c8..616900b 100644
--- a/lib/librte_hash/rte_cuckoo_hash.c
+++ b/lib/librte_hash/rte_cuckoo_hash.c
@@ -31,6 +31,10 @@ 
 #include "rte_hash.h"
 #include "rte_cuckoo_hash.h"
 
+#define FOR_EACH_BUCKET(CURRENT_BKT, START_BUCKET)                            \
+	for (CURRENT_BKT = START_BUCKET;                                      \
+		CURRENT_BKT != NULL;                                          \
+		CURRENT_BKT = CURRENT_BKT->next)
 
 TAILQ_HEAD(rte_hash_list, rte_tailq_entry);
 
@@ -63,6 +67,14 @@  rte_hash_find_existing(const char *name)
 	return h;
 }
 
+static inline struct rte_hash_bucket *
+rte_hash_get_last_bkt(struct rte_hash_bucket *lst_bkt)
+{
+	while (lst_bkt->next != NULL)
+		lst_bkt = lst_bkt->next;
+	return lst_bkt;
+}
+
 void rte_hash_set_cmp_func(struct rte_hash *h, rte_hash_cmp_eq_t func)
 {
 	h->cmp_jump_table_idx = KEY_CUSTOM;
@@ -85,13 +97,17 @@  rte_hash_create(const struct rte_hash_parameters *params)
 	struct rte_tailq_entry *te = NULL;
 	struct rte_hash_list *hash_list;
 	struct rte_ring *r = NULL;
+	struct rte_ring *r_ext = NULL;
 	char hash_name[RTE_HASH_NAMESIZE];
 	void *k = NULL;
 	void *buckets = NULL;
+	void *buckets_ext = NULL;
 	char ring_name[RTE_RING_NAMESIZE];
+	char ext_ring_name[RTE_RING_NAMESIZE];
 	unsigned num_key_slots;
 	unsigned i;
 	unsigned int hw_trans_mem_support = 0, multi_writer_support = 0;
+	unsigned int ext_table_support = 0;
 	unsigned int readwrite_concur_support = 0;
 
 	rte_hash_function default_hash_func = (rte_hash_function)rte_jhash;
@@ -124,6 +140,9 @@  rte_hash_create(const struct rte_hash_parameters *params)
 		multi_writer_support = 1;
 	}
 
+	if (params->extra_flag & RTE_HASH_EXTRA_FLAGS_EXT_TABLE)
+		ext_table_support = 1;
+
 	/* Store all keys and leave the first entry as a dummy entry for lookup_bulk */
 	if (multi_writer_support)
 		/*
@@ -145,6 +164,24 @@  rte_hash_create(const struct rte_hash_parameters *params)
 		goto err;
 	}
 
+	const uint32_t num_buckets = rte_align32pow2(params->entries) /
+						RTE_HASH_BUCKET_ENTRIES;
+
+	snprintf(ext_ring_name, sizeof(ext_ring_name), "HT_EXT_%s",
+								params->name);
+	/* Create ring for extendable buckets. */
+	if (ext_table_support) {
+		r_ext = rte_ring_create(ext_ring_name,
+				rte_align32pow2(num_buckets + 1),
+				params->socket_id, 0);
+
+		if (r_ext == NULL) {
+			RTE_LOG(ERR, HASH, "ext buckets memory allocation "
+								"failed\n");
+			goto err;
+		}
+	}
+
 	snprintf(hash_name, sizeof(hash_name), "HT_%s", params->name);
 
 	rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
@@ -177,18 +214,34 @@  rte_hash_create(const struct rte_hash_parameters *params)
 		goto err_unlock;
 	}
 
-	const uint32_t num_buckets = rte_align32pow2(params->entries)
-					/ RTE_HASH_BUCKET_ENTRIES;
-
 	buckets = rte_zmalloc_socket(NULL,
 				num_buckets * sizeof(struct rte_hash_bucket),
 				RTE_CACHE_LINE_SIZE, params->socket_id);
 
 	if (buckets == NULL) {
-		RTE_LOG(ERR, HASH, "memory allocation failed\n");
+		RTE_LOG(ERR, HASH, "buckets memory allocation failed\n");
 		goto err_unlock;
 	}
 
+	/* Allocate same number of extendable buckets */
+	if (ext_table_support) {
+		buckets_ext = rte_zmalloc_socket(NULL,
+				num_buckets * sizeof(struct rte_hash_bucket),
+				RTE_CACHE_LINE_SIZE, params->socket_id);
+		if (buckets_ext == NULL) {
+			RTE_LOG(ERR, HASH, "ext buckets memory allocation "
+							"failed\n");
+			goto err_unlock;
+		}
+		/* Populate ext bkt ring. We reserve 0 similar to the
+		 * key-data slot, just in case in future we want to
+		 * use bucket index for the linked list and 0 means NULL
+		 * for next bucket
+		 */
+		for (i = 1; i <= num_buckets; i++)
+			rte_ring_sp_enqueue(r_ext, (void *)((uintptr_t) i));
+	}
+
 	const uint32_t key_entry_size = sizeof(struct rte_hash_key) + params->key_len;
 	const uint64_t key_tbl_size = (uint64_t) key_entry_size * num_key_slots;
 
@@ -262,6 +315,8 @@  rte_hash_create(const struct rte_hash_parameters *params)
 	h->num_buckets = num_buckets;
 	h->bucket_bitmask = h->num_buckets - 1;
 	h->buckets = buckets;
+	h->buckets_ext = buckets_ext;
+	h->free_ext_bkts = r_ext;
 	h->hash_func = (params->hash_func == NULL) ?
 		default_hash_func : params->hash_func;
 	h->key_store = k;
@@ -269,6 +324,7 @@  rte_hash_create(const struct rte_hash_parameters *params)
 	h->hw_trans_mem_support = hw_trans_mem_support;
 	h->multi_writer_support = multi_writer_support;
 	h->readwrite_concur_support = readwrite_concur_support;
+	h->ext_table_support = ext_table_support;
 
 #if defined(RTE_ARCH_X86)
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
@@ -304,9 +360,11 @@  rte_hash_create(const struct rte_hash_parameters *params)
 	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
 err:
 	rte_ring_free(r);
+	rte_ring_free(r_ext);
 	rte_free(te);
 	rte_free(h);
 	rte_free(buckets);
+	rte_free(buckets_ext);
 	rte_free(k);
 	return NULL;
 }
@@ -344,6 +402,7 @@  rte_hash_free(struct rte_hash *h)
 		rte_free(h->readwrite_lock);
 	}
 	rte_ring_free(h->free_slots);
+	rte_ring_free(h->free_ext_bkts);
 	rte_free(h->key_store);
 	rte_free(h->buckets);
 	rte_free(h);
@@ -403,7 +462,6 @@  __hash_rw_writer_lock(const struct rte_hash *h)
 		rte_rwlock_write_lock(h->readwrite_lock);
 }
 
-
 static inline void
 __hash_rw_reader_lock(const struct rte_hash *h)
 {
@@ -448,6 +506,14 @@  rte_hash_reset(struct rte_hash *h)
 	while (rte_ring_dequeue(h->free_slots, &ptr) == 0)
 		rte_pause();
 
+	/* clear free extendable bucket ring and memory */
+	if (h->ext_table_support) {
+		memset(h->buckets_ext, 0, h->num_buckets *
+						sizeof(struct rte_hash_bucket));
+		while (rte_ring_dequeue(h->free_ext_bkts, &ptr) == 0)
+			rte_pause();
+	}
+
 	/* Repopulate the free slots ring. Entry zero is reserved for key misses */
 	if (h->multi_writer_support)
 		tot_ring_cnt = h->entries + (RTE_MAX_LCORE - 1) *
@@ -458,6 +524,12 @@  rte_hash_reset(struct rte_hash *h)
 	for (i = 1; i < tot_ring_cnt + 1; i++)
 		rte_ring_sp_enqueue(h->free_slots, (void *)((uintptr_t) i));
 
+	/* Repopulate the free ext bkt ring. */
+	if (h->ext_table_support)
+		for (i = 1; i < h->num_buckets + 1; i++)
+			rte_ring_sp_enqueue(h->free_ext_bkts,
+						(void *)((uintptr_t) i));
+
 	if (h->multi_writer_support) {
 		/* Reset local caches per lcore */
 		for (i = 0; i < RTE_MAX_LCORE; i++)
@@ -524,24 +596,27 @@  rte_hash_cuckoo_insert_mw(const struct rte_hash *h,
 		int32_t *ret_val)
 {
 	unsigned int i;
-	struct rte_hash_bucket *cur_bkt = prim_bkt;
+	struct rte_hash_bucket *cur_bkt;
 	int32_t ret;
 
 	__hash_rw_writer_lock(h);
 	/* Check if key was inserted after last check but before this
 	 * protected region in case of inserting duplicated keys.
 	 */
-	ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash);
+	ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash);
 	if (ret != -1) {
 		__hash_rw_writer_unlock(h);
 		*ret_val = ret;
 		return 1;
 	}
-	ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig);
-	if (ret != -1) {
-		__hash_rw_writer_unlock(h);
-		*ret_val = ret;
-		return 1;
+
+	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+		ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
+		if (ret != -1) {
+			__hash_rw_writer_unlock(h);
+			*ret_val = ret;
+			return 1;
+		}
 	}
 
 	/* Insert new entry if there is room in the primary
@@ -580,7 +655,7 @@  rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
 			int32_t *ret_val)
 {
 	uint32_t prev_alt_bkt_idx;
-	struct rte_hash_bucket *cur_bkt = bkt;
+	struct rte_hash_bucket *cur_bkt;
 	struct queue_node *prev_node, *curr_node = leaf;
 	struct rte_hash_bucket *prev_bkt, *curr_bkt = leaf->bkt;
 	uint32_t prev_slot, curr_slot = leaf_slot;
@@ -597,18 +672,20 @@  rte_hash_cuckoo_move_insert_mw(const struct rte_hash *h,
 	/* Check if key was inserted after last check but before this
 	 * protected region.
 	 */
-	ret = search_and_update(h, data, key, cur_bkt, sig, alt_hash);
+	ret = search_and_update(h, data, key, bkt, sig, alt_hash);
 	if (ret != -1) {
 		__hash_rw_writer_unlock(h);
 		*ret_val = ret;
 		return 1;
 	}
 
-	ret = search_and_update(h, data, key, alt_bkt, alt_hash, sig);
-	if (ret != -1) {
-		__hash_rw_writer_unlock(h);
-		*ret_val = ret;
-		return 1;
+	FOR_EACH_BUCKET(cur_bkt, alt_bkt) {
+		ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
+		if (ret != -1) {
+			__hash_rw_writer_unlock(h);
+			*ret_val = ret;
+			return 1;
+		}
 	}
 
 	while (likely(curr_node->prev != NULL)) {
@@ -711,15 +788,18 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
 {
 	hash_sig_t alt_hash;
 	uint32_t prim_bucket_idx, sec_bucket_idx;
-	struct rte_hash_bucket *prim_bkt, *sec_bkt;
+	struct rte_hash_bucket *prim_bkt, *sec_bkt, *cur_bkt;
 	struct rte_hash_key *new_k, *keys = h->key_store;
 	void *slot_id = NULL;
-	uint32_t new_idx;
+	void *ext_bkt_id = NULL;
+	uint32_t new_idx, bkt_id;
 	int ret;
 	unsigned n_slots;
 	unsigned lcore_id;
+	unsigned int i;
 	struct lcore_cache *cached_free_slots = NULL;
 	int32_t ret_val;
+	struct rte_hash_bucket *last;
 
 	prim_bucket_idx = sig & h->bucket_bitmask;
 	prim_bkt = &h->buckets[prim_bucket_idx];
@@ -739,10 +819,12 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
 	}
 
 	/* Check if key is already inserted in secondary location */
-	ret = search_and_update(h, data, key, sec_bkt, alt_hash, sig);
-	if (ret != -1) {
-		__hash_rw_writer_unlock(h);
-		return ret;
+	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+		ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
+		if (ret != -1) {
+			__hash_rw_writer_unlock(h);
+			return ret;
+		}
 	}
 	__hash_rw_writer_unlock(h);
 
@@ -808,10 +890,71 @@  __rte_hash_add_key_with_hash(const struct rte_hash *h, const void *key,
 	else if (ret == 1) {
 		enqueue_slot_back(h, cached_free_slots, slot_id);
 		return ret_val;
-	} else {
+	}
+
+	/* if ext table not enabled, we failed the insertion */
+	if (!h->ext_table_support) {
 		enqueue_slot_back(h, cached_free_slots, slot_id);
 		return ret;
 	}
+
+	/* Now we need to go through the extendable bucket. Protection is needed
+	 * to protect all extendable bucket processes.
+	 */
+	__hash_rw_writer_lock(h);
+	/* We check for duplicates again since could be inserted before the lock */
+	ret = search_and_update(h, data, key, prim_bkt, sig, alt_hash);
+	if (ret != -1) {
+		enqueue_slot_back(h, cached_free_slots, slot_id);
+		goto failure;
+	}
+
+	FOR_EACH_BUCKET(cur_bkt, sec_bkt) {
+		ret = search_and_update(h, data, key, cur_bkt, alt_hash, sig);
+		if (ret != -1) {
+			enqueue_slot_back(h, cached_free_slots, slot_id);
+			goto failure;
+		}
+	}
+
+	/* Search extendable buckets to find an empty entry to insert. */
+	struct rte_hash_bucket *next_bkt = sec_bkt->next;
+	FOR_EACH_BUCKET(cur_bkt, next_bkt) {
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			/* Check if slot is available */
+			if (likely(cur_bkt->key_idx[i] == EMPTY_SLOT)) {
+				cur_bkt->sig_current[i] = alt_hash;
+				cur_bkt->sig_alt[i] = sig;
+				cur_bkt->key_idx[i] = new_idx;
+				__hash_rw_writer_unlock(h);
+				return new_idx - 1;
+			}
+		}
+	}
+
+	/* Failed to get an empty entry from extendable buckets. Link a new
+	 * extendable bucket. We first get a free bucket from ring.
+	 */
+	if (rte_ring_sc_dequeue(h->free_ext_bkts, &ext_bkt_id) != 0) {
+		ret = -ENOSPC;
+		goto failure;
+	}
+
+	bkt_id = (uint32_t)((uintptr_t)ext_bkt_id) - 1;
+	/* Use the first location of the new bucket */
+	(h->buckets_ext[bkt_id]).sig_current[0] = alt_hash;
+	(h->buckets_ext[bkt_id]).sig_alt[0] = sig;
+	(h->buckets_ext[bkt_id]).key_idx[0] = new_idx;
+	/* Link the new bucket to sec bucket linked list */
+	last = rte_hash_get_last_bkt(sec_bkt);
+	last->next = &h->buckets_ext[bkt_id];
+	__hash_rw_writer_unlock(h);
+	return new_idx - 1;
+
+failure:
+	__hash_rw_writer_unlock(h);
+	return ret;
+
 }
 
 int32_t
@@ -890,7 +1033,7 @@  __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key,
 {
 	uint32_t bucket_idx;
 	hash_sig_t alt_hash;
-	struct rte_hash_bucket *bkt;
+	struct rte_hash_bucket *bkt, *cur_bkt;
 	int ret;
 
 	bucket_idx = sig & h->bucket_bitmask;
@@ -910,10 +1053,12 @@  __rte_hash_lookup_with_hash(const struct rte_hash *h, const void *key,
 	bkt = &h->buckets[bucket_idx];
 
 	/* Check if key is in secondary location */
-	ret = search_one_bucket(h, key, alt_hash, data, bkt);
-	if (ret != -1) {
-		__hash_rw_reader_unlock(h);
-		return ret;
+	FOR_EACH_BUCKET(cur_bkt, bkt) {
+		ret = search_one_bucket(h, key, alt_hash, data, cur_bkt);
+		if (ret != -1) {
+			__hash_rw_reader_unlock(h);
+			return ret;
+		}
 	}
 	__hash_rw_reader_unlock(h);
 	return -ENOENT;
@@ -1015,15 +1160,17 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
 {
 	uint32_t bucket_idx;
 	hash_sig_t alt_hash;
-	struct rte_hash_bucket *bkt;
-	int32_t ret;
+	struct rte_hash_bucket *prim_bkt, *sec_bkt;
+	struct rte_hash_bucket *cur_bkt, *prev_bkt, *next_bkt;
+	int32_t ret, i;
+	struct rte_hash_bucket *tobe_removed_bkt = NULL;
 
 	bucket_idx = sig & h->bucket_bitmask;
-	bkt = &h->buckets[bucket_idx];
+	prim_bkt = &h->buckets[bucket_idx];
 
 	__hash_rw_writer_lock(h);
 	/* look for key in primary bucket */
-	ret = search_and_remove(h, key, bkt, sig);
+	ret = search_and_remove(h, key, prim_bkt, sig);
 	if (ret != -1) {
 		__hash_rw_writer_unlock(h);
 		return ret;
@@ -1032,17 +1179,51 @@  __rte_hash_del_key_with_hash(const struct rte_hash *h, const void *key,
 	/* Calculate secondary hash */
 	alt_hash = rte_hash_secondary_hash(sig);
 	bucket_idx = alt_hash & h->bucket_bitmask;
-	bkt = &h->buckets[bucket_idx];
+	sec_bkt = &h->buckets[bucket_idx];
 
 	/* look for key in secondary bucket */
-	ret = search_and_remove(h, key, bkt, alt_hash);
+	ret = search_and_remove(h, key, sec_bkt, alt_hash);
 	if (ret != -1) {
 		__hash_rw_writer_unlock(h);
 		return ret;
 	}
 
+	/* Not in main table, we need to search ext buckets */
+	if (h->ext_table_support) {
+		next_bkt = sec_bkt->next;
+		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
+			ret = search_and_remove(h, key, cur_bkt, alt_hash);
+			if (ret != -1)
+				goto return_bkt;
+		}
+	}
+
 	__hash_rw_writer_unlock(h);
 	return -ENOENT;
+
+/* Search extendable buckets to see if any empty bucket needs to be recycled */
+return_bkt:
+	for (cur_bkt = sec_bkt->next, prev_bkt = sec_bkt; cur_bkt != NULL;
+			prev_bkt = cur_bkt, cur_bkt = cur_bkt->next) {
+		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			if (cur_bkt->key_idx[i] != EMPTY_SLOT)
+				break;
+		}
+		if (i == RTE_HASH_BUCKET_ENTRIES) {
+			prev_bkt->next = cur_bkt->next;
+			cur_bkt->next = NULL;
+			tobe_removed_bkt = cur_bkt;
+			break;
+		}
+	}
+
+	__hash_rw_writer_unlock(h);
+
+	if (tobe_removed_bkt) {
+		uint32_t index = tobe_removed_bkt - h->buckets_ext + 1;
+		rte_ring_mp_enqueue(h->free_ext_bkts, (void *)(uintptr_t)index);
+	}
+	return ret;
 }
 
 int32_t
@@ -1143,12 +1324,14 @@  __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
 {
 	uint64_t hits = 0;
 	int32_t i;
+	int32_t ret;
 	uint32_t prim_hash[RTE_HASH_LOOKUP_BULK_MAX];
 	uint32_t sec_hash[RTE_HASH_LOOKUP_BULK_MAX];
 	const struct rte_hash_bucket *primary_bkt[RTE_HASH_LOOKUP_BULK_MAX];
 	const struct rte_hash_bucket *secondary_bkt[RTE_HASH_LOOKUP_BULK_MAX];
 	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
 	/* Prefetch first keys */
 	for (i = 0; i < PREFETCH_OFFSET && i < num_keys; i++)
@@ -1266,6 +1449,34 @@  __rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
 		continue;
 	}
 
+	/* all found, do not need to go through ext bkt */
+	if ((hits == ((1ULL << num_keys) - 1)) || !h->ext_table_support) {
+		if (hit_mask != NULL)
+			*hit_mask = hits;
+		__hash_rw_reader_unlock(h);
+		return;
+	}
+
+	/* need to check ext buckets for match */
+	for (i = 0; i < num_keys; i++) {
+		if ((hits & (1ULL << i)) != 0)
+			continue;
+		next_bkt = secondary_bkt[i]->next;
+		FOR_EACH_BUCKET(cur_bkt, next_bkt) {
+			if (data != NULL)
+				ret = search_one_bucket(h, keys[i],
+						sec_hash[i], &data[i], cur_bkt);
+			else
+				ret = search_one_bucket(h, keys[i],
+						sec_hash[i], NULL, cur_bkt);
+			if (ret != -1) {
+				positions[i] = ret;
+				hits |= 1ULL << i;
+				break;
+			}
+		}
+	}
+
 	__hash_rw_reader_unlock(h);
 
 	if (hit_mask != NULL)
@@ -1308,10 +1519,13 @@  rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
 
 	RETURN_IF_TRUE(((h == NULL) || (next == NULL)), -EINVAL);
 
-	const uint32_t total_entries = h->num_buckets * RTE_HASH_BUCKET_ENTRIES;
+	const uint32_t total_entries_main = h->num_buckets *
+							RTE_HASH_BUCKET_ENTRIES;
+	const uint32_t total_entries = total_entries_main << 1;
+
 	/* Out of bounds */
-	if (*next >= total_entries)
-		return -ENOENT;
+	if (*next >= total_entries_main)
+		goto extend_table;
 
 	/* Calculate bucket and index of current iterator */
 	bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
@@ -1321,8 +1535,8 @@  rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
 	while (h->buckets[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
 		(*next)++;
 		/* End of table */
-		if (*next == total_entries)
-			return -ENOENT;
+		if (*next == total_entries_main)
+			goto extend_table;
 		bucket_idx = *next / RTE_HASH_BUCKET_ENTRIES;
 		idx = *next % RTE_HASH_BUCKET_ENTRIES;
 	}
@@ -1341,4 +1555,32 @@  rte_hash_iterate(const struct rte_hash *h, const void **key, void **data, uint32
 	(*next)++;
 
 	return position - 1;
+
+extend_table:
+	/* Out of bounds */
+	if (*next >= total_entries || !h->ext_table_support)
+		return -ENOENT;
+
+	bucket_idx = (*next - total_entries_main) / RTE_HASH_BUCKET_ENTRIES;
+	idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
+
+	while (h->buckets_ext[bucket_idx].key_idx[idx] == EMPTY_SLOT) {
+		(*next)++;
+		if (*next == total_entries)
+			return -ENOENT;
+		bucket_idx = (*next - total_entries_main) /
+						RTE_HASH_BUCKET_ENTRIES;
+		idx = (*next - total_entries_main) % RTE_HASH_BUCKET_ENTRIES;
+	}
+	/* Get position of entry in key table */
+	position = h->buckets_ext[bucket_idx].key_idx[idx];
+	next_key = (struct rte_hash_key *) ((char *)h->key_store +
+				position * h->key_entry_size);
+	/* Return key and data */
+	*key = next_key->key;
+	*data = next_key->pdata;
+
+	/* Increment iterator */
+	(*next)++;
+	return position - 1;
 }
diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h
index fc0e5c2..e601520 100644
--- a/lib/librte_hash/rte_cuckoo_hash.h
+++ b/lib/librte_hash/rte_cuckoo_hash.h
@@ -142,6 +142,8 @@  struct rte_hash_bucket {
 	hash_sig_t sig_alt[RTE_HASH_BUCKET_ENTRIES];
 
 	uint8_t flag[RTE_HASH_BUCKET_ENTRIES];
+
+	void *next;
 } __rte_cache_aligned;
 
 /** A hash table structure. */
@@ -166,6 +168,7 @@  struct rte_hash {
 	/**< If multi-writer support is enabled. */
 	uint8_t readwrite_concur_support;
 	/**< If read-write concurrency support is enabled */
+	uint8_t ext_table_support;     /**< Enable extendable bucket table */
 	rte_hash_function hash_func;    /**< Function used to calculate hash. */
 	uint32_t hash_func_init_val;    /**< Init value used by hash_func. */
 	rte_hash_cmp_eq_t rte_hash_custom_cmp_eq;
@@ -184,6 +187,8 @@  struct rte_hash {
 	 * to the key table.
 	 */
 	rte_rwlock_t *readwrite_lock; /**< Read-write lock thread-safety. */
+	struct rte_hash_bucket *buckets_ext; /**< Extra buckets array */
+	struct rte_ring *free_ext_bkts; /**< Ring of indexes of free buckets */
 } __rte_cache_aligned;
 
 struct queue_node {
diff --git a/lib/librte_hash/rte_hash.h b/lib/librte_hash/rte_hash.h
index 9e7d931..11d8e28 100644
--- a/lib/librte_hash/rte_hash.h
+++ b/lib/librte_hash/rte_hash.h
@@ -37,6 +37,9 @@  extern "C" {
 /** Flag to support reader writer concurrency */
 #define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY 0x04
 
+/** Flag to indicate the extendabe bucket table feature should be used */
+#define RTE_HASH_EXTRA_FLAGS_EXT_TABLE 0x08
+
 /** Signature of key that is stored internally. */
 typedef uint32_t hash_sig_t;