[v4,06/29] graph: populate fastpath memory for graph reel

Message ID 20200405085613.1336841-7-jerinj@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series graph: introduce graph subsystem |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Jerin Jacob Kollanukkaran April 5, 2020, 8:55 a.m. UTC
  From: Jerin Jacob <jerinj@marvell.com>

Add support to create and populate the memory for the graph reel.
This includes reserving the memory in the memzone, populating the nodes,
and allocating memory for node-specific streams to hold objects.

Once it is populated the reel memory contains the following sections.

+---------------------+
|   Graph Header      |
+---------------------+
|   Fence             |
+---------------------+
|   Circular buffer   |
+---------------------+
|   Fence             |
+---------------------+
|   Node Object 0     |
+---------------------+
|   Node Object 1     |
+---------------------+
|   Node Object 2     |
+---------------------+
|   Node Object n     |
+---------------------+

Signed-off-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
---
 lib/librte_graph/Makefile              |   2 +
 lib/librte_graph/graph.c               |  16 ++
 lib/librte_graph/graph_populate.c      | 234 +++++++++++++++++++++++++
 lib/librte_graph/graph_private.h       |  64 +++++++
 lib/librte_graph/meson.build           |   4 +-
 lib/librte_graph/node.c                |   5 +
 lib/librte_graph/rte_graph_version.map |   1 +
 lib/librte_graph/rte_graph_worker.h    | 108 ++++++++++++
 8 files changed, 432 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_graph/graph_populate.c
 create mode 100644 lib/librte_graph/rte_graph_worker.h
  

Comments

Andrzej Ostruszka April 8, 2020, 5:30 p.m. UTC | #1
On 4/5/20 10:55 AM, jerinj@marvell.com wrote:
> From: Jerin Jacob <jerinj@marvell.com>
[...]
> diff --git a/lib/librte_graph/graph_populate.c b/lib/librte_graph/graph_populate.c
> new file mode 100644
> index 000000000..093512efa
> --- /dev/null
> +++ b/lib/librte_graph/graph_populate.c
> @@ -0,0 +1,234 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2020 Marvell International Ltd.
> + */
> +
> +#include <fnmatch.h>
> +#include <stdbool.h>
> +
> +#include <rte_common.h>
> +#include <rte_errno.h>
> +#include <rte_malloc.h>
> +#include <rte_memzone.h>
> +
> +#include "graph_private.h"
> +
> +static size_t
> +graph_fp_mem_calc_size(struct graph *graph)
> +{
> +	struct graph_node *graph_node;
> +	rte_node_t val;
> +	size_t sz;
> +
> +	/* Graph header */
> +	sz = sizeof(struct rte_graph);
> +	/* Source nodes list */
> +	sz += sizeof(rte_graph_off_t) * graph->src_node_count;
> +	/* Circular buffer for pending streams of size number of nodes */
> +	val = rte_align32pow2(graph->node_count * sizeof(rte_graph_off_t));
> +	sz = RTE_ALIGN(sz, val);
> +	graph->cir_start = sz;
> +	graph->cir_mask = rte_align32pow2(graph->node_count) - 1;
> +	sz += val;

Aren't the source nodes counted twice here?  I'm now trying to wrap my head
around how this is all structured and laid out in memory (thus the
slowdown in review), so I am most probably missing something here.

> +	/* Fence */
> +	sz += sizeof(RTE_GRAPH_FENCE);
> +	sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
> +	graph->nodes_start = sz;
> +	/* For 0..N node objects with fence */
> +	STAILQ_FOREACH(graph_node, &graph->node_list, next) {
> +		sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
> +		sz += sizeof(struct rte_node);
> +		/* Pointer to next nodes(edges) */
> +		sz += sizeof(struct rte_node *) * graph_node->node->nb_edges;
> +	}
> +
> +	graph->mem_sz = sz;
> +	return sz;
> +}
> +
> +static void
> +graph_header_popluate(struct graph *_graph)
> +{
> +	struct rte_graph *graph = _graph->graph;
> +
> +	graph->tail = 0;
> +	graph->head = (int32_t)-_graph->src_node_count;
> +	graph->cir_mask = _graph->cir_mask;
> +	graph->nb_nodes = _graph->node_count;
> +	graph->cir_start = RTE_PTR_ADD(graph, _graph->cir_start);
> +	graph->nodes_start = _graph->nodes_start;
> +	graph->socket = _graph->socket;
> +	graph->id = _graph->id;
> +	memcpy(graph->name, _graph->name, RTE_GRAPH_NAMESIZE);

As I've mentioned above, I'm learning the structure of the lib/memory, so
a quick question here.  My understanding is that rte_graph is a "view of
the 'struct graph' sufficient for a worker", so does it need both id &
name?  Both of them seem to be used in error or dump/debug paths.  It
probably doesn't matter (e.g. for performance) - just asking because
'id' seems to be used only in one place (where name could probably
replace it).

> +	graph->fence = RTE_GRAPH_FENCE;
> +}
> +
> +static void
> +graph_nodes_populate(struct graph *_graph)
> +{
> +	rte_graph_off_t off = _graph->nodes_start;
> +	struct rte_graph *graph = _graph->graph;
> +	struct graph_node *graph_node;
> +	rte_edge_t count, nb_edges;
> +	const char *parent;
> +	rte_node_t pid;
> +
> +	STAILQ_FOREACH(graph_node, &_graph->node_list, next) {
> +		struct rte_node *node = RTE_PTR_ADD(graph, off);
> +		memset(node, 0, sizeof(*node));
> +		node->fence = RTE_GRAPH_FENCE;
> +		node->off = off;
> +		node->process = graph_node->node->process;
> +		memcpy(node->name, graph_node->node->name, RTE_GRAPH_NAMESIZE);
> +		pid = graph_node->node->parent_id;
> +		if (pid != RTE_NODE_ID_INVALID) { /* Cloned node */
> +			parent = rte_node_id_to_name(pid);
> +			memcpy(node->parent, parent, RTE_GRAPH_NAMESIZE);
> +		}
> +		node->id = graph_node->node->id;
> +		node->parent_id = pid;
> +		nb_edges = graph_node->node->nb_edges;
> +		node->nb_edges = nb_edges;
> +		off += sizeof(struct rte_node);
> +		/* Copy the name in first pass to replace with rte_node* later*/
> +		for (count = 0; count < nb_edges; count++)
> +			node->nodes[count] = (struct rte_node *)&graph_node
> +						     ->adjacency_list[count]
> +						     ->node->name[0];

I'm not sure I understand what is going here.  Please see below ...

> +
> +		off += sizeof(struct rte_node *) * nb_edges;
> +		off = RTE_ALIGN(off, RTE_CACHE_LINE_SIZE);
> +		node->next = off;
> +		__rte_node_stream_alloc(graph, node);
> +	}
> +}
[...]
> +static int
> +graph_node_nexts_populate(struct graph *_graph)
> +{
> +	rte_node_t count, val;
> +	rte_graph_off_t off;
> +	struct rte_node *node;
> +	const struct rte_graph *graph = _graph->graph;
> +	const char *name;
> +
> +	rte_graph_foreach_node(count, off, graph, node) {
> +		for (val = 0; val < node->nb_edges; val++) {
> +			name = (const char *)node->nodes[val];
> +			node->nodes[val] = graph_node_name_to_ptr(graph, name);

... Is it so that during the first loop above some node might refer
(by name) to another node that is not yet "registered", so instead of
storing an rte_node pointer you actually stored a pointer to the name,
which you now update to the proper rte_node?

> +			if (node->nodes[val] == NULL)
> +				SET_ERR_JMP(EINVAL, fail, "%s not found", name);
> +		}
> +	}
> +
> +	return 0;
> +fail:
> +	return -rte_errno;
> +}
[...]

With regards
Andrzej Ostruszka
  
Kiran Kumar Kokkilagadda April 9, 2020, 2:44 a.m. UTC | #2
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Andrzej Ostruszka
> Sent: Wednesday, April 8, 2020 11:00 PM
> To: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v4 06/29] graph: populate fastpath memory for
> graph reel
> 
> On 4/5/20 10:55 AM, jerinj@marvell.com wrote:
> > From: Jerin Jacob <jerinj@marvell.com>
> [...]
> > diff --git a/lib/librte_graph/graph_populate.c
> > b/lib/librte_graph/graph_populate.c
> > new file mode 100644
> > index 000000000..093512efa
> > --- /dev/null
> > +++ b/lib/librte_graph/graph_populate.c
> > @@ -0,0 +1,234 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2020 Marvell International Ltd.
> > + */
> > +
> > +#include <fnmatch.h>
> > +#include <stdbool.h>
> > +
> > +#include <rte_common.h>
> > +#include <rte_errno.h>
> > +#include <rte_malloc.h>
> > +#include <rte_memzone.h>
> > +
> > +#include "graph_private.h"
> > +
> > +static size_t
> > +graph_fp_mem_calc_size(struct graph *graph) {
> > +	struct graph_node *graph_node;
> > +	rte_node_t val;
> > +	size_t sz;
> > +
> > +	/* Graph header */
> > +	sz = sizeof(struct rte_graph);
> > +	/* Source nodes list */
> > +	sz += sizeof(rte_graph_off_t) * graph->src_node_count;
> > +	/* Circular buffer for pending streams of size number of nodes */
> > +	val = rte_align32pow2(graph->node_count * sizeof(rte_graph_off_t));
> > +	sz = RTE_ALIGN(sz, val);
> > +	graph->cir_start = sz;
> > +	graph->cir_mask = rte_align32pow2(graph->node_count) - 1;
> > +	sz += val;
> 
> Aren't the source nodes counted twice here?  I'm now trying to wrap my head
> around how this is all structured and laid out in memory (thus the slowdown in
> review), so I am most probably missing something here.
> 

Yes, we are counting the source node offsets twice in the circular buffer. In fact, we intentionally allocate the circular buffer larger than the required size (rte_align32pow2).
By allocating a larger circular buffer, we can avoid wraparound at least in some cases.
Let me try to explain how this memory reel and the graph walk work.
This is what the memory reel looks like.

1. Graph_header---> 2. FENCE ---> 3. [Graph walk always starts from here] memory for source node object offsets ---> 4. [circular buffer starts] enqueued node object offset [ circular buffer end] --> 5. FENCE ---> 6. Memory for Node objects

Sections 3 and 4 hold the offsets of their corresponding node objects in section 6.

Initially, before the graph walk starts, we populate 3 (see graph_src_nodes_populate). When the graph walk starts, we first go over 3 and, based on the enqueues, we populate 4; this is where we create the circle (we keep walking in 4 until there are no more enqueues). So the circular buffer walk actually visits the source nodes first and then creates the circle for the enqueued nodes (4).

  
> > +	/* Fence */
> > +	sz += sizeof(RTE_GRAPH_FENCE);
> > +	sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
> > +	graph->nodes_start = sz;
> > +	/* For 0..N node objects with fence */
> > +	STAILQ_FOREACH(graph_node, &graph->node_list, next) {
> > +		sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
> > +		sz += sizeof(struct rte_node);
> > +		/* Pointer to next nodes(edges) */
> > +		sz += sizeof(struct rte_node *) * graph_node->node->nb_edges;
> > +	}
> > +
> > +	graph->mem_sz = sz;
> > +	return sz;
> > +}
> > +
> > +static void
> > +graph_header_popluate(struct graph *_graph) {
> > +	struct rte_graph *graph = _graph->graph;
> > +
> > +	graph->tail = 0;
> > +	graph->head = (int32_t)-_graph->src_node_count;
> > +	graph->cir_mask = _graph->cir_mask;
> > +	graph->nb_nodes = _graph->node_count;
> > +	graph->cir_start = RTE_PTR_ADD(graph, _graph->cir_start);
> > +	graph->nodes_start = _graph->nodes_start;
> > +	graph->socket = _graph->socket;
> > +	graph->id = _graph->id;
> > +	memcpy(graph->name, _graph->name, RTE_GRAPH_NAMESIZE);
> 
> As I've mentioned above, I'm learning the structure of the lib/memory, so a quick
> question here.  My understanding is that rte_graph is a "view of the 'struct
> graph' sufficient for a worker", so does it need both id & name?  Both of them
> seem to be used in error or dump/debug paths.  It probably doesn't matter (e.g.
> for performance) - just asking because 'id' seems to be used only in one place
> (where name could probably replace it).
> 

The user can access the node info either way, by name or by ID. Both are used in the slow path.
It is up to the user how they want to use them.


> > +	graph->fence = RTE_GRAPH_FENCE;
> > +}
> > +
> > +static void
> > +graph_nodes_populate(struct graph *_graph) {
> > +	rte_graph_off_t off = _graph->nodes_start;
> > +	struct rte_graph *graph = _graph->graph;
> > +	struct graph_node *graph_node;
> > +	rte_edge_t count, nb_edges;
> > +	const char *parent;
> > +	rte_node_t pid;
> > +
> > +	STAILQ_FOREACH(graph_node, &_graph->node_list, next) {
> > +		struct rte_node *node = RTE_PTR_ADD(graph, off);
> > +		memset(node, 0, sizeof(*node));
> > +		node->fence = RTE_GRAPH_FENCE;
> > +		node->off = off;
> > +		node->process = graph_node->node->process;
> > +		memcpy(node->name, graph_node->node->name,
> RTE_GRAPH_NAMESIZE);
> > +		pid = graph_node->node->parent_id;
> > +		if (pid != RTE_NODE_ID_INVALID) { /* Cloned node */
> > +			parent = rte_node_id_to_name(pid);
> > +			memcpy(node->parent, parent,
> RTE_GRAPH_NAMESIZE);
> > +		}
> > +		node->id = graph_node->node->id;
> > +		node->parent_id = pid;
> > +		nb_edges = graph_node->node->nb_edges;
> > +		node->nb_edges = nb_edges;
> > +		off += sizeof(struct rte_node);
> > +		/* Copy the name in first pass to replace with rte_node* later*/
> > +		for (count = 0; count < nb_edges; count++)
> > +			node->nodes[count] = (struct rte_node *)&graph_node
> > +						     ->adjacency_list[count]
> > +						     ->node->name[0];
> 
> I'm not sure I understand what is going here.  Please see below ...


See below.

> 
> > +
> > +		off += sizeof(struct rte_node *) * nb_edges;
> > +		off = RTE_ALIGN(off, RTE_CACHE_LINE_SIZE);
> > +		node->next = off;
> > +		__rte_node_stream_alloc(graph, node);
> > +	}
> > +}
> [...]
> > +static int
> > +graph_node_nexts_populate(struct graph *_graph) {
> > +	rte_node_t count, val;
> > +	rte_graph_off_t off;
> > +	struct rte_node *node;
> > +	const struct rte_graph *graph = _graph->graph;
> > +	const char *name;
> > +
> > +	rte_graph_foreach_node(count, off, graph, node) {
> > +		for (val = 0; val < node->nb_edges; val++) {
> > +			name = (const char *)node->nodes[val];
> > +			node->nodes[val] = graph_node_name_to_ptr(graph,
> name);
> 
> ... Is it so that during the first loop above some node might refer (by name)
> to another node that is not yet "registered", so instead of storing an rte_node
> pointer you actually stored a pointer to the name, which you now update to the
> proper rte_node?

Exactly; it is because next nodes are identified by name, not by ID. The only requirement is that the user creates all the nodes before creating the graph, so that at graph-create time we can take care of linking the actual nodes by name.

> 
> > +			if (node->nodes[val] == NULL)
> > +				SET_ERR_JMP(EINVAL, fail, "%s not found",
> name);
> > +		}
> > +	}
> > +
> > +	return 0;
> > +fail:
> > +	return -rte_errno;
> > +}
> [...]
> 
> With regards
> Andrzej Ostruszka
  

Patch

diff --git a/lib/librte_graph/Makefile b/lib/librte_graph/Makefile
index 39ecb2652..7bfd7d51f 100644
--- a/lib/librte_graph/Makefile
+++ b/lib/librte_graph/Makefile
@@ -18,8 +18,10 @@  SRCS-$(CONFIG_RTE_LIBRTE_GRAPH) += node.c
 SRCS-$(CONFIG_RTE_LIBRTE_GRAPH) += graph.c
 SRCS-$(CONFIG_RTE_LIBRTE_GRAPH) += graph_ops.c
 SRCS-$(CONFIG_RTE_LIBRTE_GRAPH) += graph_debug.c
+SRCS-$(CONFIG_RTE_LIBRTE_GRAPH) += graph_populate.c
 
 # install header files
 SYMLINK-$(CONFIG_RTE_LIBRTE_GRAPH)-include += rte_graph.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_GRAPH)-include += rte_graph_worker.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_graph/graph.c b/lib/librte_graph/graph.c
index 4c3f2fe7b..e1930b7d2 100644
--- a/lib/librte_graph/graph.c
+++ b/lib/librte_graph/graph.c
@@ -2,6 +2,7 @@ 
  * Copyright(C) 2020 Marvell International Ltd.
  */
 
+#include <rte_malloc.h>
 #include <rte_spinlock.h>
 
 #include "graph_private.h"
@@ -19,3 +20,18 @@  graph_spinlock_unlock(void)
 {
 	rte_spinlock_unlock(&graph_lock);
 }
+
+void __rte_noinline
+__rte_node_stream_alloc(struct rte_graph *graph, struct rte_node *node)
+{
+	uint16_t size = node->size;
+
+	RTE_VERIFY(size != UINT16_MAX);
+	/* Allocate double amount of size to avoid immediate realloc */
+	size = RTE_MIN(UINT16_MAX, RTE_MAX(RTE_GRAPH_BURST_SIZE, size * 2));
+	node->objs = rte_realloc_socket(node->objs, size * sizeof(void *),
+					RTE_CACHE_LINE_SIZE, graph->socket);
+	RTE_VERIFY(node->objs);
+	node->size = size;
+	node->realloc_count++;
+}
diff --git a/lib/librte_graph/graph_populate.c b/lib/librte_graph/graph_populate.c
new file mode 100644
index 000000000..093512efa
--- /dev/null
+++ b/lib/librte_graph/graph_populate.c
@@ -0,0 +1,234 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#include <fnmatch.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_memzone.h>
+
+#include "graph_private.h"
+
+static size_t
+graph_fp_mem_calc_size(struct graph *graph)
+{
+	struct graph_node *graph_node;
+	rte_node_t val;
+	size_t sz;
+
+	/* Graph header */
+	sz = sizeof(struct rte_graph);
+	/* Source nodes list */
+	sz += sizeof(rte_graph_off_t) * graph->src_node_count;
+	/* Circular buffer for pending streams of size number of nodes */
+	val = rte_align32pow2(graph->node_count * sizeof(rte_graph_off_t));
+	sz = RTE_ALIGN(sz, val);
+	graph->cir_start = sz;
+	graph->cir_mask = rte_align32pow2(graph->node_count) - 1;
+	sz += val;
+	/* Fence */
+	sz += sizeof(RTE_GRAPH_FENCE);
+	sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
+	graph->nodes_start = sz;
+	/* For 0..N node objects with fence */
+	STAILQ_FOREACH(graph_node, &graph->node_list, next) {
+		sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
+		sz += sizeof(struct rte_node);
+		/* Pointer to next nodes(edges) */
+		sz += sizeof(struct rte_node *) * graph_node->node->nb_edges;
+	}
+
+	graph->mem_sz = sz;
+	return sz;
+}
+
+static void
+graph_header_popluate(struct graph *_graph)
+{
+	struct rte_graph *graph = _graph->graph;
+
+	graph->tail = 0;
+	graph->head = (int32_t)-_graph->src_node_count;
+	graph->cir_mask = _graph->cir_mask;
+	graph->nb_nodes = _graph->node_count;
+	graph->cir_start = RTE_PTR_ADD(graph, _graph->cir_start);
+	graph->nodes_start = _graph->nodes_start;
+	graph->socket = _graph->socket;
+	graph->id = _graph->id;
+	memcpy(graph->name, _graph->name, RTE_GRAPH_NAMESIZE);
+	graph->fence = RTE_GRAPH_FENCE;
+}
+
+static void
+graph_nodes_populate(struct graph *_graph)
+{
+	rte_graph_off_t off = _graph->nodes_start;
+	struct rte_graph *graph = _graph->graph;
+	struct graph_node *graph_node;
+	rte_edge_t count, nb_edges;
+	const char *parent;
+	rte_node_t pid;
+
+	STAILQ_FOREACH(graph_node, &_graph->node_list, next) {
+		struct rte_node *node = RTE_PTR_ADD(graph, off);
+		memset(node, 0, sizeof(*node));
+		node->fence = RTE_GRAPH_FENCE;
+		node->off = off;
+		node->process = graph_node->node->process;
+		memcpy(node->name, graph_node->node->name, RTE_GRAPH_NAMESIZE);
+		pid = graph_node->node->parent_id;
+		if (pid != RTE_NODE_ID_INVALID) { /* Cloned node */
+			parent = rte_node_id_to_name(pid);
+			memcpy(node->parent, parent, RTE_GRAPH_NAMESIZE);
+		}
+		node->id = graph_node->node->id;
+		node->parent_id = pid;
+		nb_edges = graph_node->node->nb_edges;
+		node->nb_edges = nb_edges;
+		off += sizeof(struct rte_node);
+		/* Copy the name in first pass to replace with rte_node* later*/
+		for (count = 0; count < nb_edges; count++)
+			node->nodes[count] = (struct rte_node *)&graph_node
+						     ->adjacency_list[count]
+						     ->node->name[0];
+
+		off += sizeof(struct rte_node *) * nb_edges;
+		off = RTE_ALIGN(off, RTE_CACHE_LINE_SIZE);
+		node->next = off;
+		__rte_node_stream_alloc(graph, node);
+	}
+}
+
+struct rte_node *
+graph_node_id_to_ptr(const struct rte_graph *graph, rte_node_t id)
+{
+	rte_node_t count;
+	rte_graph_off_t off;
+	struct rte_node *node;
+
+	rte_graph_foreach_node(count, off, graph, node)
+		if (unlikely(node->id == id))
+			return node;
+
+	return NULL;
+}
+
+struct rte_node *
+graph_node_name_to_ptr(const struct rte_graph *graph, const char *name)
+{
+	rte_node_t count;
+	rte_graph_off_t off;
+	struct rte_node *node;
+
+	rte_graph_foreach_node(count, off, graph, node)
+		if (strncmp(name, node->name, RTE_NODE_NAMESIZE) == 0)
+			return node;
+
+	return NULL;
+}
+
+static int
+graph_node_nexts_populate(struct graph *_graph)
+{
+	rte_node_t count, val;
+	rte_graph_off_t off;
+	struct rte_node *node;
+	const struct rte_graph *graph = _graph->graph;
+	const char *name;
+
+	rte_graph_foreach_node(count, off, graph, node) {
+		for (val = 0; val < node->nb_edges; val++) {
+			name = (const char *)node->nodes[val];
+			node->nodes[val] = graph_node_name_to_ptr(graph, name);
+			if (node->nodes[val] == NULL)
+				SET_ERR_JMP(EINVAL, fail, "%s not found", name);
+		}
+	}
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+static int
+graph_src_nodes_populate(struct graph *_graph)
+{
+	struct rte_graph *graph = _graph->graph;
+	struct graph_node *graph_node;
+	struct rte_node *node;
+	int32_t head = -1;
+	const char *name;
+
+	STAILQ_FOREACH(graph_node, &_graph->node_list, next) {
+		if (graph_node->node->flags & RTE_NODE_SOURCE_F) {
+			name = graph_node->node->name;
+			node = graph_node_name_to_ptr(graph, name);
+			if (node == NULL)
+				SET_ERR_JMP(EINVAL, fail, "%s not found", name);
+
+			__rte_node_stream_alloc(graph, node);
+			graph->cir_start[head--] = node->off;
+		}
+	}
+
+	return 0;
+fail:
+	return -rte_errno;
+}
+
+static int
+graph_fp_mem_populate(struct graph *graph)
+{
+	int rc;
+
+	graph_header_popluate(graph);
+	graph_nodes_populate(graph);
+	rc = graph_node_nexts_populate(graph);
+	rc |= graph_src_nodes_populate(graph);
+
+	return rc;
+}
+
+int
+graph_fp_mem_create(struct graph *graph)
+{
+	const struct rte_memzone *mz;
+	size_t sz;
+
+	sz = graph_fp_mem_calc_size(graph);
+	mz = rte_memzone_reserve(graph->name, sz, graph->socket, 0);
+	if (mz == NULL)
+		SET_ERR_JMP(ENOMEM, fail, "Memzone %s reserve failed",
+			    graph->name);
+
+	graph->graph = mz->addr;
+	graph->mz = mz;
+
+	return graph_fp_mem_populate(graph);
+fail:
+	return -rte_errno;
+}
+
+static void
+graph_nodes_mem_destroy(struct rte_graph *graph)
+{
+	rte_node_t count;
+	rte_graph_off_t off;
+	struct rte_node *node;
+
+	if (graph == NULL)
+		return;
+
+	rte_graph_foreach_node(count, off, graph, node)
+		rte_free(node->objs);
+}
+
+int
+graph_fp_mem_destroy(struct graph *graph)
+{
+	graph_nodes_mem_destroy(graph->graph);
+	return rte_memzone_free(graph->mz);
+}
diff --git a/lib/librte_graph/graph_private.h b/lib/librte_graph/graph_private.h
index 220a35e2a..7fce52e00 100644
--- a/lib/librte_graph/graph_private.h
+++ b/lib/librte_graph/graph_private.h
@@ -12,6 +12,7 @@ 
 #include <rte_eal.h>
 
 #include "rte_graph.h"
+#include "rte_graph_worker.h"
 
 extern int rte_graph_logtype;
 
@@ -254,6 +255,69 @@  rte_node_t graph_nodes_count(struct graph *graph);
  */
 void graph_mark_nodes_as_not_visited(struct graph *graph);
 
+/* Fast path graph memory populate functions */
+
+/**
+ * @internal
+ *
+ * Create fast-path memory for the graph and nodes.
+ *
+ * @param graph
+ *   Pointer to the internal graph object.
+ *
+ * @return
+ *   - 0: Success.
+ *   - -ENOMEM: Not enough memory for graph and nodes.
+ *   - -EINVAL: Graph nodes not found.
+ */
+int graph_fp_mem_create(struct graph *graph);
+
+/**
+ * @internal
+ *
+ * Free fast-path memory used by graph and nodes.
+ *
+ * @param graph
+ *   Pointer to the internal graph object.
+ *
+ * @return
+ *   - 0: Success.
+ *   - <0: Graph memzone related error.
+ */
+int graph_fp_mem_destroy(struct graph *graph);
+
+/* Lookup functions */
+/**
+ * @internal
+ *
+ * Get graph node object from node id.
+ *
+ * @param graph
+ *   Pointer to rte_graph object.
+ * @param id
+ *   Node Identifier.
+ *
+ * @return
+ *   Pointer to rte_node if identifier is valid else NULL.
+ */
+struct rte_node *graph_node_id_to_ptr(const struct rte_graph *graph,
+				      rte_node_t id);
+
+/**
+ * @internal
+ *
+ * Get graph node object from node name.
+ *
+ * @param graph
+ *   Pointer to rte_graph object.
+ * @param node_name
+ *   Pointer to character string holding the node name.
+ *
+ * @return
+ *   Pointer to rte_node if a node with that name exists, else NULL.
+ */
+struct rte_node *graph_node_name_to_ptr(const struct rte_graph *graph,
+					const char *node_name);
 
 /**
  * @internal
diff --git a/lib/librte_graph/meson.build b/lib/librte_graph/meson.build
index 16e0625c1..fb203a5e2 100644
--- a/lib/librte_graph/meson.build
+++ b/lib/librte_graph/meson.build
@@ -4,8 +4,8 @@ 
 
 name = 'graph'
 
-sources = files('node.c', 'graph.c', 'graph_ops.c', 'graph_debug.c')
-headers = files('rte_graph.h')
+sources = files('node.c', 'graph.c', 'graph_ops.c', 'graph_debug.c', 'graph_populate.c')
+headers = files('rte_graph.h', 'rte_graph_worker.h')
 allow_experimental_apis = true
 
 deps += ['eal']
diff --git a/lib/librte_graph/node.c b/lib/librte_graph/node.c
index 8592c1221..e05c4d5ed 100644
--- a/lib/librte_graph/node.c
+++ b/lib/librte_graph/node.c
@@ -61,6 +61,11 @@  __rte_node_register(const struct rte_node_register *reg)
 	rte_edge_t i;
 	size_t sz;
 
+	/* Limit Node specific metadata to one cacheline on 64B CL machine */
+	RTE_BUILD_BUG_ON((offsetof(struct rte_node, nodes) -
+			  offsetof(struct rte_node, ctx)) !=
+			 RTE_CACHE_LINE_MIN_SIZE);
+
 	graph_spinlock_lock();
 
 	/* Check sanity */
diff --git a/lib/librte_graph/rte_graph_version.map b/lib/librte_graph/rte_graph_version.map
index f2c2139c5..a9fe1b610 100644
--- a/lib/librte_graph/rte_graph_version.map
+++ b/lib/librte_graph/rte_graph_version.map
@@ -2,6 +2,7 @@  EXPERIMENTAL {
 	global:
 
 	__rte_node_register;
+	__rte_node_stream_alloc;
 
 	rte_node_clone;
 	rte_node_dump;
diff --git a/lib/librte_graph/rte_graph_worker.h b/lib/librte_graph/rte_graph_worker.h
new file mode 100644
index 000000000..a8133739d
--- /dev/null
+++ b/lib/librte_graph/rte_graph_worker.h
@@ -0,0 +1,108 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#ifndef _RTE_GRAPH_WORKER_H_
+#define _RTE_GRAPH_WORKER_H_
+
+/**
+ * @file rte_graph_worker.h
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * This API allows a worker thread to walk over a graph and nodes to create,
+ * process, enqueue and move streams of objects to the next nodes.
+ */
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_memcpy.h>
+#include <rte_memory.h>
+
+#include "rte_graph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @internal
+ *
+ * Data structure to hold graph data.
+ */
+struct rte_graph {
+	uint32_t tail;		     /**< Tail of circular buffer. */
+	uint32_t head;		     /**< Head of circular buffer. */
+	uint32_t cir_mask;	     /**< Circular buffer wrap around mask. */
+	rte_node_t nb_nodes;	     /**< Number of nodes in the graph. */
+	rte_graph_off_t *cir_start;  /**< Pointer to circular buffer. */
+	rte_graph_off_t nodes_start; /**< Offset at which node memory starts. */
+	rte_graph_t id;	/**< Graph identifier. */
+	int socket;	/**< Socket ID where memory is allocated. */
+	char name[RTE_GRAPH_NAMESIZE];	/**< Name of the graph. */
+	uint64_t fence;			/**< Fence. */
+} __rte_cache_aligned;
+
+/**
+ * @internal
+ *
+ * Data structure to hold node data.
+ */
+struct rte_node {
+	/* Slow path area  */
+	uint64_t fence;		/**< Fence. */
+	rte_graph_off_t next;	/**< Index to next node. */
+	rte_node_t id;		/**< Node identifier. */
+	rte_node_t parent_id;	/**< Parent Node identifier. */
+	rte_edge_t nb_edges;	/**< Number of edges from this node. */
+	uint32_t realloc_count;	/**< Number of times realloced. */
+
+	char parent[RTE_NODE_NAMESIZE];	/**< Parent node name. */
+	char name[RTE_NODE_NAMESIZE];	/**< Name of the node. */
+
+	/* Fast path area  */
+#define RTE_NODE_CTX_SZ 16
+	uint8_t ctx[RTE_NODE_CTX_SZ] __rte_cache_aligned; /**< Node Context. */
+	uint16_t size;		/**< Total number of objects available. */
+	uint16_t idx;		/**< Number of objects used. */
+	rte_graph_off_t off;	/**< Offset of node in the graph reel. */
+	uint64_t total_cycles;	/**< Cycles spent in this node. */
+	uint64_t total_calls;	/**< Calls done to this node. */
+	uint64_t total_objs;	/**< Objects processed by this node. */
+	RTE_STD_C11
+		union {
+			void **objs;	   /**< Array of object pointers. */
+			uint64_t objs_u64;
+		};
+	RTE_STD_C11
+		union {
+			rte_node_process_t process; /**< Process function. */
+			uint64_t process_u64;
+		};
+	struct rte_node *nodes[] __rte_cache_min_aligned; /**< Next nodes. */
+} __rte_cache_aligned;
+
+/**
+ * @internal
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Allocate a stream of objects.
+ *
+ * If stream already exists then re-allocate it to a larger size.
+ *
+ * @param graph
+ *   Pointer to the graph object.
+ * @param node
+ *   Pointer to the node object.
+ */
+__rte_experimental
+void __rte_node_stream_alloc(struct rte_graph *graph, struct rte_node *node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_GRAPH_WORKER_H_ */