app/dma-perf: per device config support

Message ID 20240805135110.2509227-1-amitprakashs@marvell.com (mailing list archive)
State Superseded
Delegated to: Thomas Monjalon
Headers
Series app/dma-perf: per device config support |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/github-robot: build success github build: passed
ci/intel-Functional success Functional PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-marvell-Functional success Functional Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-sample-apps-testing success Testing PASS

Commit Message

Amit Prakash Shukla Aug. 5, 2024, 1:51 p.m. UTC
Add support for configuring device-specific config parameters for a
test case. Example:

lcore_dma0=lcore=11,dev=0000:00:04.1,dir=mem2dev,raddr=0x300000000,
coreid=1,pfid=2,vfid=3
lcore_dma1=lcore=12,dev=0000:00:04.2,dir=dev2mem,raddr=0x200000000,
coreid=3,pfid=2,vfid=1

Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
---
 app/test-dma-perf/benchmark.c | 318 +++++++++++++++++++++-------------
 app/test-dma-perf/config.ini  |  30 ++--
 app/test-dma-perf/main.c      | 199 ++++++---------------
 app/test-dma-perf/main.h      |  20 ++-
 doc/guides/tools/dmaperf.rst  | 107 +++++++++---
 5 files changed, 360 insertions(+), 314 deletions(-)
  

Comments

fengchengwen Sept. 5, 2024, 3:49 a.m. UTC | #1
Hi Amit,

It does indeed provide a more flexible configuration.

There is one small comment below; with that fixed,
Acked-by: Chengwen Feng <fengchengwen@huawei.com>

Thanks

On 2024/8/5 21:51, Amit Prakash Shukla wrote:
> Add support to configure device specific config parameters for a
> testcase. Example:
> 
> lcore_dma0=lcore=11,dev=0000:00:04.1,dir=mem2dev,raddr=0x300000000,
> coreid=1,pfid=2,vfid=3
> lcore_dma1=lcore=12,dev=0000:00:04.2,dir=dev2mem,raddr=0x200000000,
> coreid=3,pfid=2,vfid=1
> 
> Signed-off-by: Amit Prakash Shukla <amitprakashs@marvell.com>
> ---

...

>  
> -static int populate_pcie_config(const char *key, const char *value, void *test)
> +static int populate_dma_dev_config(const char *key, const char *value, void *test)
>  {
> -	struct test_configure *test_case = (struct test_configure *)test;
> +	struct lcore_dma_config *dma_config = (struct lcore_dma_config *)test;
> +	struct vchan_dev_config *vchan_config = &dma_config->vchan_dev;
> +	struct lcore_dma_map_t *lcore_map = &dma_config->lcore_dma_map;
>  	char *endptr;
>  	int ret = 0;
>  
> -	if (strcmp(key, "raddr") == 0)
> -		test_case->vchan_dev.raddr = strtoull(value, &endptr, 16);
> +	if (strcmp(key, "lcore") == 0)
> +		lcore_map->lcore = (uint8_t)atoi(value);

I suggest using uint16_t here, because there may be 256 or more cores and a uint8_t cast would truncate the lcore ID.
  

Patch

diff --git a/app/test-dma-perf/benchmark.c b/app/test-dma-perf/benchmark.c
index d167adc4d2..5eebb4de77 100644
--- a/app/test-dma-perf/benchmark.c
+++ b/app/test-dma-perf/benchmark.c
@@ -148,11 +148,13 @@  cache_flush_buf(__rte_unused struct rte_mbuf **array,
 
 static int
 vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
-		    struct test_configure *cfg)
+		    struct test_configure *cfg, uint16_t dev_num)
 {
+	struct vchan_dev_config *vchan_dconfig;
 	struct rte_dma_info info;
 
-	qconf->direction = cfg->transfer_dir;
+	vchan_dconfig = &cfg->dma_config[dev_num].vchan_dev;
+	qconf->direction = vchan_dconfig->tdir;
 
 	rte_dma_info_get(dev_id, &info);
 	if (!(RTE_BIT64(qconf->direction) & info.dev_capa))
@@ -164,16 +166,16 @@  vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 	case RTE_DMA_DIR_MEM_TO_DEV:
 		qconf->dst_port.pcie.vfen = 1;
 		qconf->dst_port.port_type = RTE_DMA_PORT_PCIE;
-		qconf->dst_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
-		qconf->dst_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
-		qconf->dst_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		qconf->dst_port.pcie.coreid = vchan_dconfig->port.pcie.coreid;
+		qconf->dst_port.pcie.vfid = vchan_dconfig->port.pcie.vfid;
+		qconf->dst_port.pcie.pfid = vchan_dconfig->port.pcie.pfid;
 		break;
 	case RTE_DMA_DIR_DEV_TO_MEM:
 		qconf->src_port.pcie.vfen = 1;
 		qconf->src_port.port_type = RTE_DMA_PORT_PCIE;
-		qconf->src_port.pcie.coreid = cfg->vchan_dev.port.pcie.coreid;
-		qconf->src_port.pcie.vfid = cfg->vchan_dev.port.pcie.vfid;
-		qconf->src_port.pcie.pfid = cfg->vchan_dev.port.pcie.pfid;
+		qconf->src_port.pcie.coreid = vchan_dconfig->port.pcie.coreid;
+		qconf->src_port.pcie.vfid = vchan_dconfig->port.pcie.vfid;
+		qconf->src_port.pcie.pfid = vchan_dconfig->port.pcie.pfid;
 		break;
 	case RTE_DMA_DIR_MEM_TO_MEM:
 	case RTE_DMA_DIR_DEV_TO_DEV:
@@ -185,14 +187,15 @@  vchan_data_populate(uint32_t dev_id, struct rte_dma_vchan_conf *qconf,
 
 /* Configuration of device. */
 static void
-configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t sges_max)
+configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t sges_max,
+		       uint16_t dev_num)
 {
 	uint16_t vchan = 0;
 	struct rte_dma_info info;
 	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
 	struct rte_dma_vchan_conf qconf = { 0 };
 
-	if (vchan_data_populate(dev_id, &qconf, cfg) != 0)
+	if (vchan_data_populate(dev_id, &qconf, cfg, dev_num) != 0)
 		rte_exit(EXIT_FAILURE, "Error with vchan data populate.\n");
 
 	if (rte_dma_configure(dev_id, &dev_config) != 0)
@@ -219,8 +222,8 @@  configure_dmadev_queue(uint32_t dev_id, struct test_configure *cfg, uint8_t sges
 static int
 config_dmadevs(struct test_configure *cfg)
 {
-	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
-	uint32_t nb_workers = ldm->cnt;
+	uint32_t nb_workers = cfg->num_worker;
+	struct lcore_dma_map_t *ldm;
 	uint32_t i;
 	int dev_id;
 	uint16_t nb_dmadevs = 0;
@@ -230,16 +233,17 @@  config_dmadevs(struct test_configure *cfg)
 	if (cfg->is_sg)
 		nb_sges = RTE_MAX(cfg->nb_src_sges, cfg->nb_dst_sges);
 
-	for (i = 0; i < ldm->cnt; i++) {
-		dma_name = ldm->dma_names[i];
+	for (i = 0; i < nb_workers; i++) {
+		ldm = &cfg->dma_config[i].lcore_dma_map;
+		dma_name = ldm->dma_names;
 		dev_id = rte_dma_get_dev_id_by_name(dma_name);
 		if (dev_id < 0) {
 			fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
 			goto end;
 		}
 
-		ldm->dma_ids[i] = dev_id;
-		configure_dmadev_queue(dev_id, cfg, nb_sges);
+		ldm->dma_id = dev_id;
+		configure_dmadev_queue(dev_id, cfg, nb_sges, nb_dmadevs);
 		++nb_dmadevs;
 	}
 
@@ -436,7 +440,6 @@  setup_memory_env(struct test_configure *cfg,
 			 struct rte_mbuf ***srcs, struct rte_mbuf ***dsts,
 			 struct rte_dma_sge **src_sges, struct rte_dma_sge **dst_sges)
 {
-	static struct rte_mbuf_ext_shared_info *ext_buf_info;
 	unsigned int cur_buf_size = cfg->buf_size.cur;
 	unsigned int buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
 	unsigned int nr_sockets;
@@ -499,41 +502,6 @@  setup_memory_env(struct test_configure *cfg,
 		memset(rte_pktmbuf_mtod((*dsts)[i], void *), 0, cur_buf_size);
 	}
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM ||
-	    cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
-		ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
-		if (ext_buf_info == NULL) {
-			printf("Error: ext_buf_info malloc failed.\n");
-			return -1;
-		}
-	}
-
-	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
-		ext_buf_info->free_cb = dummy_free_ext_buf;
-		ext_buf_info->fcb_opaque = NULL;
-		for (i = 0; i < nr_buf; i++) {
-			/* Using mbuf structure to hold remote iova address. */
-			rte_pktmbuf_attach_extbuf((*srcs)[i],
-				(void *)(cfg->vchan_dev.raddr + (i * buf_size)),
-				(rte_iova_t)(cfg->vchan_dev.raddr + (i * buf_size)),
-				0, ext_buf_info);
-			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
-		}
-	}
-
-	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV) {
-		ext_buf_info->free_cb = dummy_free_ext_buf;
-		ext_buf_info->fcb_opaque = NULL;
-		for (i = 0; i < nr_buf; i++) {
-			/* Using mbuf structure to hold remote iova address. */
-			rte_pktmbuf_attach_extbuf((*dsts)[i],
-				(void *)(cfg->vchan_dev.raddr + (i * buf_size)),
-				(rte_iova_t)(cfg->vchan_dev.raddr + (i * buf_size)),
-				0, ext_buf_info);
-			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
-		}
-	}
-
 	if (cfg->is_sg) {
 		uint8_t nb_src_sges = cfg->nb_src_sges;
 		uint8_t nb_dst_sges = cfg->nb_dst_sges;
@@ -575,8 +543,7 @@  setup_memory_env(struct test_configure *cfg,
 static uint32_t
 align_buffer_count(struct test_configure *cfg, uint32_t *nr_sgsrc, uint32_t *nr_sgdst)
 {
-	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
-	uint16_t nb_workers = ldm->cnt;
+	uint16_t nb_workers = cfg->num_worker;
 	uint32_t nr_buf;
 
 	nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
@@ -619,18 +586,98 @@  get_work_function(struct test_configure *cfg)
 	return fn;
 }
 
+static int
+attach_ext_buffer(struct vchan_dev_config *vchan_dev, struct lcore_params *lcore, bool is_sg,
+		  uint32_t nr_sgsrc, uint32_t nr_sgdst)
+{
+	static struct rte_mbuf_ext_shared_info *ext_buf_info;
+	struct rte_dma_sge **src_sges, **dst_sges;
+	struct rte_mbuf **srcs, **dsts;
+	unsigned int cur_buf_size;
+	unsigned int buf_size;
+	uint32_t nr_buf;
+	uint32_t i;
+
+	cur_buf_size = lcore->buf_size;
+	buf_size = cur_buf_size + RTE_PKTMBUF_HEADROOM;
+	nr_buf = lcore->nr_buf;
+	srcs = lcore->srcs;
+	dsts = lcore->dsts;
+
+	ext_buf_info = rte_malloc(NULL, sizeof(struct rte_mbuf_ext_shared_info), 0);
+	if (ext_buf_info == NULL) {
+		printf("Error: ext_buf_info malloc failed.\n");
+		return -1;
+	}
+	ext_buf_info->free_cb = dummy_free_ext_buf;
+	ext_buf_info->fcb_opaque = NULL;
+
+	if (vchan_dev->tdir == RTE_DMA_DIR_DEV_TO_MEM) {
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf(srcs[i],
+				(void *)(vchan_dev->raddr + (i * buf_size)),
+				(rte_iova_t)(vchan_dev->raddr + (i * buf_size)), 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (vchan_dev->tdir == RTE_DMA_DIR_MEM_TO_DEV) {
+		for (i = 0; i < nr_buf; i++) {
+			/* Using mbuf structure to hold remote iova address. */
+			rte_pktmbuf_attach_extbuf(dsts[i],
+				(void *)(vchan_dev->raddr + (i * buf_size)),
+				(rte_iova_t)(vchan_dev->raddr + (i * buf_size)), 0, ext_buf_info);
+			rte_mbuf_ext_refcnt_update(ext_buf_info, 1);
+		}
+	}
+
+	if (is_sg) {
+		uint8_t nb_src_sges = lcore->sge.nb_srcs;
+		uint8_t nb_dst_sges = lcore->sge.nb_dsts;
+		uint32_t sglen_src, sglen_dst;
+
+		src_sges = &lcore->sge.srcs;
+		dst_sges = &lcore->sge.dsts;
+
+		sglen_src = cur_buf_size / nb_src_sges;
+		sglen_dst = cur_buf_size / nb_dst_sges;
+
+		if (vchan_dev->tdir == RTE_DMA_DIR_DEV_TO_MEM) {
+			for (i = 0; i < nr_sgsrc; i++) {
+				(*src_sges)[i].addr = rte_pktmbuf_iova(srcs[i]);
+				(*src_sges)[i].length = sglen_src;
+				if (!((i+1) % nb_src_sges))
+					(*src_sges)[i].length += (cur_buf_size % nb_src_sges);
+			}
+		}
+
+		if (vchan_dev->tdir == RTE_DMA_DIR_MEM_TO_DEV) {
+			for (i = 0; i < nr_sgdst; i++) {
+				(*dst_sges)[i].addr = rte_pktmbuf_iova(dsts[i]);
+				(*dst_sges)[i].length = sglen_dst;
+				if (!((i+1) % nb_dst_sges))
+					(*dst_sges)[i].length += (cur_buf_size % nb_dst_sges);
+			}
+		}
+	}
+
+	return 0;
+}
+
 int
 mem_copy_benchmark(struct test_configure *cfg)
 {
-	uint32_t i, j;
+	uint32_t i, j, k;
 	uint32_t offset;
 	unsigned int lcore_id = 0;
 	struct rte_mbuf **srcs = NULL, **dsts = NULL, **m = NULL;
 	struct rte_dma_sge *src_sges = NULL, *dst_sges = NULL;
-	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
+	struct vchan_dev_config *vchan_dev = NULL;
+	struct lcore_dma_map_t *lcore_dma_map = NULL;
 	unsigned int buf_size = cfg->buf_size.cur;
 	uint16_t kick_batch = cfg->kick_batch.cur;
-	uint16_t nb_workers = ldm->cnt;
+	uint16_t nb_workers = cfg->num_worker;
 	uint16_t test_secs = cfg->test_secs;
 	float memory = 0;
 	uint32_t avg_cycles = 0;
@@ -660,7 +707,10 @@  mem_copy_benchmark(struct test_configure *cfg)
 	printf("Start testing....\n");
 
 	for (i = 0; i < nb_workers; i++) {
-		lcore_id = ldm->lcores[i];
+		lcore_dma_map = &cfg->dma_config[i].lcore_dma_map;
+		vchan_dev = &cfg->dma_config[i].vchan_dev;
+
+		lcore_id = lcore_dma_map->lcore;
 		offset = nr_buf / nb_workers * i;
 		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
 		if (lcores[i] == NULL) {
@@ -668,10 +718,11 @@  mem_copy_benchmark(struct test_configure *cfg)
 			break;
 		}
 		if (cfg->is_dma) {
-			lcores[i]->dma_name = ldm->dma_names[i];
-			lcores[i]->dev_id = ldm->dma_ids[i];
+			lcores[i]->dma_name = lcore_dma_map->dma_names;
+			lcores[i]->dev_id = lcore_dma_map->dma_id;
 			lcores[i]->kick_batch = kick_batch;
 		}
+
 		lcores[i]->worker_id = i;
 		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
 		lcores[i]->buf_size = buf_size;
@@ -688,8 +739,14 @@  mem_copy_benchmark(struct test_configure *cfg)
 			lcores[i]->sge.dsts = dst_sges + (nr_sgdst / nb_workers * i);
 		}
 
-		rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]),
-				lcore_id);
+		if (vchan_dev->tdir == RTE_DMA_DIR_DEV_TO_MEM ||
+		    vchan_dev->tdir == RTE_DMA_DIR_MEM_TO_DEV) {
+			if (attach_ext_buffer(vchan_dev, lcores[i], cfg->is_sg,
+					      (nr_sgsrc/nb_workers), (nr_sgdst/nb_workers)) < 0)
+				goto out;
+		}
+
+		rte_eal_remote_launch(get_work_function(cfg), (void *)(lcores[i]), lcore_id);
 	}
 
 	while (1) {
@@ -721,52 +778,64 @@  mem_copy_benchmark(struct test_configure *cfg)
 
 	rte_eal_mp_wait_lcore();
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM && !cfg->is_sg) {
-		for (i = 0; i < (nr_buf / nb_workers) * nb_workers; i++) {
-			if (memcmp(rte_pktmbuf_mtod(srcs[i], void *),
-				   rte_pktmbuf_mtod(dsts[i], void *),
-				   cfg->buf_size.cur) != 0) {
-				printf("Copy validation fails for buffer number %d\n", i);
-				ret = -1;
-				goto out;
-			}
-		}
-	} else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_MEM && cfg->is_sg) {
-		size_t src_remsz = buf_size % cfg->nb_src_sges;
-		size_t dst_remsz = buf_size % cfg->nb_dst_sges;
-		size_t src_sz = buf_size / cfg->nb_src_sges;
-		size_t dst_sz = buf_size / cfg->nb_dst_sges;
-		uint8_t src[buf_size], dst[buf_size];
-		uint8_t *sbuf, *dbuf, *ptr;
-
-		for (i = 0; i < (nr_buf / RTE_MAX(cfg->nb_src_sges, cfg->nb_dst_sges)); i++) {
-			sbuf = src;
-			dbuf = dst;
-			ptr = NULL;
-
-			for (j = 0; j < cfg->nb_src_sges; j++) {
-				ptr = rte_pktmbuf_mtod(srcs[i * cfg->nb_src_sges + j], uint8_t *);
-				memcpy(sbuf, ptr, src_sz);
-				sbuf += src_sz;
-			}
-
-			if (src_remsz)
-				memcpy(sbuf, ptr + src_sz, src_remsz);
-
-			for (j = 0; j < cfg->nb_dst_sges; j++) {
-				ptr = rte_pktmbuf_mtod(dsts[i * cfg->nb_dst_sges + j], uint8_t *);
-				memcpy(dbuf, ptr, dst_sz);
-				dbuf += dst_sz;
+	for (k = 0; k < nb_workers; k++) {
+		struct rte_mbuf **src_buf = NULL, **dst_buf = NULL;
+		uint32_t nr_buf_pt = nr_buf / nb_workers;
+		vchan_dev = &cfg->dma_config[k].vchan_dev;
+		offset = nr_buf / nb_workers * k;
+		src_buf = srcs + offset;
+		dst_buf = dsts + offset;
+
+		if (vchan_dev->tdir == RTE_DMA_DIR_MEM_TO_MEM && !cfg->is_sg) {
+			for (i = 0; i < nr_buf_pt; i++) {
+				if (memcmp(rte_pktmbuf_mtod(src_buf[i], void *),
+							    rte_pktmbuf_mtod(dst_buf[i], void *),
+							    cfg->buf_size.cur) != 0) {
+					printf("Copy validation fails for buffer number %d\n", i);
+					ret = -1;
+					goto out;
+				}
 			}
-
-			if (dst_remsz)
-				memcpy(dbuf, ptr + dst_sz, dst_remsz);
-
-			if (memcmp(src, dst, buf_size) != 0) {
-				printf("SG Copy validation fails for buffer number %d\n",
-					i * cfg->nb_src_sges);
-				ret = -1;
-				goto out;
+		} else if (vchan_dev->tdir == RTE_DMA_DIR_MEM_TO_MEM && cfg->is_sg) {
+			size_t src_remsz = buf_size % cfg->nb_src_sges;
+			size_t dst_remsz = buf_size % cfg->nb_dst_sges;
+			size_t src_sz = buf_size / cfg->nb_src_sges;
+			size_t dst_sz = buf_size / cfg->nb_dst_sges;
+			uint8_t src[buf_size], dst[buf_size];
+			uint8_t *sbuf, *dbuf, *ptr;
+
+			for (i = 0; i < (nr_buf_pt / RTE_MAX(cfg->nb_src_sges, cfg->nb_dst_sges));
+			     i++) {
+				sbuf = src;
+				dbuf = dst;
+				ptr = NULL;
+
+				for (j = 0; j < cfg->nb_src_sges; j++) {
+					ptr = rte_pktmbuf_mtod(src_buf[i * cfg->nb_src_sges + j],
+							       uint8_t *);
+					memcpy(sbuf, ptr, src_sz);
+					sbuf += src_sz;
+				}
+
+				if (src_remsz)
+					memcpy(sbuf, ptr + src_sz, src_remsz);
+
+				for (j = 0; j < cfg->nb_dst_sges; j++) {
+					ptr = rte_pktmbuf_mtod(dst_buf[i * cfg->nb_dst_sges + j],
+							       uint8_t *);
+					memcpy(dbuf, ptr, dst_sz);
+					dbuf += dst_sz;
+				}
+
+				if (dst_remsz)
+					memcpy(dbuf, ptr + dst_sz, dst_remsz);
+
+				if (memcmp(src, dst, buf_size) != 0) {
+					printf("SG Copy validation fails for buffer number %d\n",
+							i * cfg->nb_src_sges);
+					ret = -1;
+					goto out;
+				}
 			}
 		}
 	}
@@ -775,9 +844,12 @@  mem_copy_benchmark(struct test_configure *cfg)
 	bandwidth_total = 0;
 	avg_cycles_total = 0;
 	for (i = 0; i < nb_workers; i++) {
+		vchan_dev = &cfg->dma_config[i].vchan_dev;
 		calc_result(buf_size, nr_buf, nb_workers, test_secs,
 			lcores[i]->worker_info.test_cpl,
 			&memory, &avg_cycles, &bandwidth, &mops);
+		printf("Direction: %s\n", vchan_dev->tdir == 0 ? "mem2mem" :
+			vchan_dev->tdir == 1 ? "mem2dev" : "dev2mem");
 		output_result(cfg, lcores[i], kick_batch, avg_cycles, buf_size,
 			nr_buf / nb_workers, memory, bandwidth, mops);
 		mops_total += mops;
@@ -792,17 +864,26 @@  mem_copy_benchmark(struct test_configure *cfg)
 
 out:
 
-	if (cfg->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM)
-		m = srcs;
-	else if (cfg->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV)
-		m = dsts;
+	for (k = 0; k < nb_workers; k++) {
+		struct rte_mbuf **sbuf = NULL, **dbuf = NULL;
+		vchan_dev = &cfg->dma_config[k].vchan_dev;
+		offset = nr_buf / nb_workers * k;
+		m = NULL;
+		if (vchan_dev->tdir == RTE_DMA_DIR_DEV_TO_MEM) {
+			sbuf = srcs + offset;
+			m = sbuf;
+		} else if (vchan_dev->tdir == RTE_DMA_DIR_MEM_TO_DEV) {
+			dbuf = dsts + offset;
+			m = dbuf;
+		}
 
-	if (m) {
-		for (i = 0; i < nr_buf; i++)
-			rte_pktmbuf_detach_extbuf(m[i]);
+		if (m) {
+			for (i = 0; i < (nr_buf / nb_workers); i++)
+				rte_pktmbuf_detach_extbuf(m[i]);
 
-		if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
-			rte_free(m[0]->shinfo);
+			if (m[0]->shinfo && rte_mbuf_ext_refcnt_read(m[0]->shinfo) == 0)
+				rte_free(m[0]->shinfo);
+		}
 	}
 
 	/* free mbufs used in the test */
@@ -838,8 +919,9 @@  mem_copy_benchmark(struct test_configure *cfg)
 
 	if (cfg->is_dma) {
 		for (i = 0; i < nb_workers; i++) {
-			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
-			rte_dma_stop(ldm->dma_ids[i]);
+			lcore_dma_map = &cfg->dma_config[i].lcore_dma_map;
+			printf("Stopping dmadev %d\n", lcore_dma_map->dma_id);
+			rte_dma_stop(lcore_dma_map->dma_id);
 		}
 	}
 
diff --git a/app/test-dma-perf/config.ini b/app/test-dma-perf/config.ini
index a1222ae86c..61e49dbae5 100644
--- a/app/test-dma-perf/config.ini
+++ b/app/test-dma-perf/config.ini
@@ -32,20 +32,18 @@ 
 
 ; "skip" To skip a test-case set skip to 1.
 
-; Parameters for data transfers from "mem to dev" and "dev to mem":
-;
-; "direction" denotes the direction of data transfer. It can take 3 values:
-;    mem2mem - mem to mem transfer
-;    mem2dev - mem to dev transfer
-;    dev2mem - dev to mem transfer
-; If not specified the default value is mem2mem transfer.
-
-; "vchan_dev" denotes below comma separated bus related parameters for mem2dev and dev2mem dma transfer.
+; "lcore_dma*" denotes the following comma-separated, bus-related parameters.
+;    "dev" DMA device used for the test.
+;    "dir" denotes direction of data transfer. It can take 3 values:
+;        mem2mem - mem to mem transfer
+;        mem2dev - mem to dev transfer
+;        dev2mem - dev to mem transfer
 ;    "raddr" remote iova address for mem2dev and dev2mem transfer.
 ;    "coreid" denotes PCIe core index.
 ;    "pfid" denotes PF-id to be used for data transfer
 ;    "vfid" denotes VF-id of PF-id to be used for data transfer.
-;    Example: vchan_dev=raddr=0x400000,coreid=1,pfid=2,vfid=3
+;    Example: lcore_dma0=lcore=10,dev=0000:00:04.2,dir=dev2mem,raddr=0x200000000,coreid=1,pfid=2,
+;             vfid=3
 
 ; Parameters for DMA scatter-gather memory copy:
 ;
@@ -71,7 +69,8 @@  src_numa_node=0
 dst_numa_node=0
 cache_flush=0
 test_seconds=2
-lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+lcore_dma0=lcore=10,dev=0000:00:04.1,dir=mem2mem
+lcore_dma1=lcore=11,dev=0000:00:04.2,dir=mem2mem
 eal_args=--in-memory --file-prefix=test
 
 [case2]
@@ -86,14 +85,13 @@  src_numa_node=0
 dst_numa_node=0
 cache_flush=0
 test_seconds=2
-lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+lcore_dma0=lcore=10,dev=0000:00:04.1,dir=mem2mem
+lcore_dma1=lcore=11,dev=0000:00:04.2,dir=mem2mem
 eal_args=--in-memory --file-prefix=test
 
 [case3]
 skip=1
 type=DMA_MEM_COPY
-direction=dev2mem
-vchan_dev=raddr=0x200000000,coreid=1,pfid=2,vfid=3
 mem_size=10
 buf_size=64,4096,2,MUL
 dma_ring_size=1024
@@ -102,7 +100,9 @@  src_numa_node=0
 dst_numa_node=0
 cache_flush=0
 test_seconds=2
-lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+lcore_dma0=lcore=10,dev=0000:00:04.1,dir=mem2mem
+lcore_dma1=lcore=11,dev=0000:00:04.2,dir=dev2mem,raddr=0x200000000,coreid=1,pfid=2,vfid=3
+lcore_dma2=lcore=12,dev=0000:00:04.3,dir=mem2dev,raddr=0x300000000,coreid=3,pfid=2,vfid=1
 eal_args=--in-memory --file-prefix=test
 
 [case4]
diff --git a/app/test-dma-perf/main.c b/app/test-dma-perf/main.c
index 18219918cc..fd1d2ee763 100644
--- a/app/test-dma-perf/main.c
+++ b/app/test-dma-perf/main.c
@@ -134,7 +134,7 @@  run_test(uint32_t case_id, struct test_configure *case_cfg)
 	for (i = 0; i < RTE_DIM(output_str); i++)
 		memset(output_str[i], 0, MAX_OUTPUT_STR_LEN);
 
-	if (nb_lcores <= case_cfg->lcore_dma_map.cnt) {
+	if (nb_lcores <= case_cfg->num_worker) {
 		printf("Case %u: Not enough lcores.\n", case_id);
 		return;
 	}
@@ -190,19 +190,18 @@  parse_lcore(struct test_configure *test_case, const char *value)
 	len = strlen(value);
 	input = (char *)malloc((len + 1) * sizeof(char));
 	strlcpy(input, value, len + 1);
-	lcore_dma_map = &(test_case->lcore_dma_map);
-
-	memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
 
 	char *token = strtok(input, ", ");
 	while (token != NULL) {
-		if (lcore_dma_map->cnt >= MAX_WORKER_NB) {
+		lcore_dma_map = &(test_case->dma_config[test_case->num_worker++].lcore_dma_map);
+		memset(lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
+		if (test_case->num_worker >= MAX_WORKER_NB) {
 			free(input);
 			return -1;
 		}
 
 		uint16_t lcore_id = atoi(token);
-		lcore_dma_map->lcores[lcore_dma_map->cnt++] = lcore_id;
+		lcore_dma_map->lcore = lcore_id;
 
 		token = strtok(NULL, ", ");
 	}
@@ -211,82 +210,6 @@  parse_lcore(struct test_configure *test_case, const char *value)
 	return 0;
 }
 
-static int
-parse_lcore_dma(struct test_configure *test_case, const char *value)
-{
-	struct lcore_dma_map_t *lcore_dma_map;
-	char *input, *addrs;
-	char *ptrs[2];
-	char *start, *end, *substr;
-	uint16_t lcore_id;
-	int ret = 0;
-
-	if (test_case == NULL || value == NULL)
-		return -1;
-
-	input = strndup(value, strlen(value) + 1);
-	if (input == NULL)
-		return -1;
-	addrs = input;
-
-	while (*addrs == '\0')
-		addrs++;
-	if (*addrs == '\0') {
-		fprintf(stderr, "No input DMA addresses\n");
-		ret = -1;
-		goto out;
-	}
-
-	substr = strtok(addrs, ",");
-	if (substr == NULL) {
-		fprintf(stderr, "No input DMA address\n");
-		ret = -1;
-		goto out;
-	}
-
-	memset(&test_case->lcore_dma_map, 0, sizeof(struct lcore_dma_map_t));
-
-	do {
-		if (rte_strsplit(substr, strlen(substr), ptrs, 2, '@') < 0) {
-			fprintf(stderr, "Illegal DMA address\n");
-			ret = -1;
-			break;
-		}
-
-		start = strstr(ptrs[0], "lcore");
-		if (start == NULL) {
-			fprintf(stderr, "Illegal lcore\n");
-			ret = -1;
-			break;
-		}
-
-		start += 5;
-		lcore_id = strtol(start, &end, 0);
-		if (end == start) {
-			fprintf(stderr, "No input lcore ID or ID %d is wrong\n", lcore_id);
-			ret = -1;
-			break;
-		}
-
-		lcore_dma_map = &test_case->lcore_dma_map;
-		if (lcore_dma_map->cnt >= MAX_WORKER_NB) {
-			fprintf(stderr, "lcores count error\n");
-			ret = -1;
-			break;
-		}
-
-		lcore_dma_map->lcores[lcore_dma_map->cnt] = lcore_id;
-		strlcpy(lcore_dma_map->dma_names[lcore_dma_map->cnt], ptrs[1],
-				RTE_DEV_NAME_MAX_LEN);
-		lcore_dma_map->cnt++;
-		substr = strtok(NULL, ",");
-	} while (substr != NULL);
-
-out:
-	free(input);
-	return ret;
-}
-
 static int
 parse_entry(const char *value, struct test_configure_entry *entry)
 {
@@ -331,20 +254,32 @@  parse_entry(const char *value, struct test_configure_entry *entry)
 	return args_nr;
 }
 
-static int populate_pcie_config(const char *key, const char *value, void *test)
+static int populate_dma_dev_config(const char *key, const char *value, void *test)
 {
-	struct test_configure *test_case = (struct test_configure *)test;
+	struct lcore_dma_config *dma_config = (struct lcore_dma_config *)test;
+	struct vchan_dev_config *vchan_config = &dma_config->vchan_dev;
+	struct lcore_dma_map_t *lcore_map = &dma_config->lcore_dma_map;
 	char *endptr;
 	int ret = 0;
 
-	if (strcmp(key, "raddr") == 0)
-		test_case->vchan_dev.raddr = strtoull(value, &endptr, 16);
+	if (strcmp(key, "lcore") == 0)
+		lcore_map->lcore = (uint8_t)atoi(value);
+	else if (strcmp(key, "dev") == 0)
+		strlcpy(lcore_map->dma_names, value, RTE_DEV_NAME_MAX_LEN);
+	else if (strcmp(key, "dir") == 0 && strcmp(value, "mem2mem") == 0)
+		vchan_config->tdir = RTE_DMA_DIR_MEM_TO_MEM;
+	else if (strcmp(key, "dir") == 0 && strcmp(value, "mem2dev") == 0)
+		vchan_config->tdir = RTE_DMA_DIR_MEM_TO_DEV;
+	else if (strcmp(key, "dir") == 0 && strcmp(value, "dev2mem") == 0)
+		vchan_config->tdir = RTE_DMA_DIR_DEV_TO_MEM;
+	else if (strcmp(key, "raddr") == 0)
+		vchan_config->raddr = strtoull(value, &endptr, 16);
 	else if (strcmp(key, "coreid") == 0)
-		test_case->vchan_dev.port.pcie.coreid = (uint8_t)atoi(value);
+		vchan_config->port.pcie.coreid = (uint8_t)atoi(value);
 	else if (strcmp(key, "vfid") == 0)
-		test_case->vchan_dev.port.pcie.vfid = (uint16_t)atoi(value);
+		vchan_config->port.pcie.vfid = (uint16_t)atoi(value);
 	else if (strcmp(key, "pfid") == 0)
-		test_case->vchan_dev.port.pcie.pfid = (uint16_t)atoi(value);
+		vchan_config->port.pcie.pfid = (uint16_t)atoi(value);
 	else {
 		printf("Invalid config param: %s\n", key);
 		ret = -1;
@@ -361,13 +296,11 @@  load_configs(const char *path)
 	struct test_configure *test_case;
 	char section_name[CFG_NAME_LEN];
 	const char *case_type;
-	const char *transfer_dir;
 	const char *lcore_dma;
 	const char *mem_size_str, *buf_size_str, *ring_size_str, *kick_batch_str,
 		*src_sges_str, *dst_sges_str;
 	const char *skip;
 	struct rte_kvargs *kvlist;
-	const char *vchan_dev;
 	int args_nr, nb_vp;
 	bool is_dma;
 
@@ -405,22 +338,6 @@  load_configs(const char *path)
 		if (strcmp(case_type, DMA_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_DMA_MEM_COPY;
 			test_case->test_type_str = DMA_MEM_COPY;
-
-			transfer_dir = rte_cfgfile_get_entry(cfgfile, section_name, "direction");
-			if (transfer_dir == NULL) {
-				printf("Transfer direction not configured."
-					" Defaulting it to MEM to MEM transfer.\n");
-				test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
-			} else {
-				if (strcmp(transfer_dir, "mem2dev") == 0)
-					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_DEV;
-				else if (strcmp(transfer_dir, "dev2mem") == 0)
-					test_case->transfer_dir = RTE_DMA_DIR_DEV_TO_MEM;
-				else {
-					printf("Defaulting the test to MEM to MEM transfer\n");
-					test_case->transfer_dir = RTE_DMA_DIR_MEM_TO_MEM;
-				}
-			}
 			is_dma = true;
 		} else if (strcmp(case_type, CPU_MEM_COPY) == 0) {
 			test_case->test_type = TEST_TYPE_CPU_MEM_COPY;
@@ -432,40 +349,6 @@  load_configs(const char *path)
 			continue;
 		}
 
-		if (test_case->transfer_dir == RTE_DMA_DIR_MEM_TO_DEV ||
-			test_case->transfer_dir == RTE_DMA_DIR_DEV_TO_MEM) {
-			vchan_dev = rte_cfgfile_get_entry(cfgfile, section_name, "vchan_dev");
-			if (vchan_dev == NULL) {
-				printf("Transfer direction mem2dev and dev2mem"
-				       " vhcan_dev shall be configured.\n");
-				test_case->is_valid = false;
-				continue;
-			}
-
-			kvlist = rte_kvargs_parse(vchan_dev, NULL);
-			if (kvlist == NULL) {
-				printf("rte_kvargs_parse() error");
-				test_case->is_valid = false;
-				continue;
-			}
-
-			if (rte_kvargs_process(kvlist, NULL, populate_pcie_config,
-					       (void *)test_case) < 0) {
-				printf("rte_kvargs_process() error\n");
-				rte_kvargs_free(kvlist);
-				test_case->is_valid = false;
-				continue;
-			}
-
-			if (!test_case->vchan_dev.raddr) {
-				printf("For mem2dev and dev2mem configure raddr\n");
-				rte_kvargs_free(kvlist);
-				test_case->is_valid = false;
-				continue;
-			}
-			rte_kvargs_free(kvlist);
-		}
-
 		test_case->is_dma = is_dma;
 		test_case->src_numa_node = (int)atoi(rte_cfgfile_get_entry(cfgfile,
 								section_name, "src_numa_node"));
@@ -543,11 +426,35 @@  load_configs(const char *path)
 			} else if (args_nr == 4)
 				nb_vp++;
 
-			lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, "lcore_dma");
-			int lcore_ret = parse_lcore_dma(test_case, lcore_dma);
-			if (lcore_ret < 0) {
-				printf("parse lcore dma error in case %d.\n", i + 1);
-				test_case->is_valid = false;
+			char lc_dma[RTE_DEV_NAME_MAX_LEN];
+			int i = 0;
+			while (1) {
+				snprintf(lc_dma, RTE_DEV_NAME_MAX_LEN, "lcore_dma%d", i);
+				lcore_dma = rte_cfgfile_get_entry(cfgfile, section_name, lc_dma);
+				if (lcore_dma == NULL)
+					break;
+
+				kvlist = rte_kvargs_parse(lcore_dma, NULL);
+				if (kvlist == NULL) {
+					printf("rte_kvargs_parse() error");
+					test_case->is_valid = false;
+					break;
+				}
+
+				if (rte_kvargs_process(kvlist, NULL, populate_dma_dev_config,
+						       (void *)&test_case->dma_config[i]) < 0) {
+					printf("rte_kvargs_process() error\n");
+					rte_kvargs_free(kvlist);
+					test_case->is_valid = false;
+					break;
+				}
+				i++;
+				test_case->num_worker++;
+				rte_kvargs_free(kvlist);
+			}
+
+			if (test_case->num_worker == 0) {
+				printf("Error: Parsing %s Failed\n", lc_dma);
 				continue;
 			}
 		} else {
diff --git a/app/test-dma-perf/main.h b/app/test-dma-perf/main.h
index e88d72f54f..59eb648b3d 100644
--- a/app/test-dma-perf/main.h
+++ b/app/test-dma-perf/main.h
@@ -32,33 +32,38 @@  struct test_configure_entry {
 };
 
 struct lcore_dma_map_t {
-	uint32_t lcores[MAX_WORKER_NB];
-	char dma_names[MAX_WORKER_NB][RTE_DEV_NAME_MAX_LEN];
-	int16_t dma_ids[MAX_WORKER_NB];
-	uint16_t cnt;
+	char dma_names[RTE_DEV_NAME_MAX_LEN];
+	uint32_t lcore;
+	int16_t dma_id;
 };
 
-struct test_vchan_dev_config {
+struct vchan_dev_config {
 	struct rte_dma_port_param port;
 	uintptr_t raddr;
+	uint8_t tdir;
+};
+
+struct lcore_dma_config {
+	struct lcore_dma_map_t lcore_dma_map;
+	struct vchan_dev_config vchan_dev;
 };
 
 struct test_configure {
 	bool is_valid;
 	bool is_skip;
 	uint8_t test_type;
-	uint8_t transfer_dir;
 	const char *test_type_str;
 	uint16_t src_numa_node;
 	uint16_t dst_numa_node;
 	uint16_t opcode;
 	bool is_dma;
 	bool is_sg;
-	struct lcore_dma_map_t lcore_dma_map;
+	struct lcore_dma_config dma_config[MAX_WORKER_NB];
 	struct test_configure_entry mem_size;
 	struct test_configure_entry buf_size;
 	struct test_configure_entry ring_size;
 	struct test_configure_entry kick_batch;
+	uint16_t num_worker;
 	uint8_t nb_src_sges;
 	uint8_t nb_dst_sges;
 	uint8_t cache_flush;
@@ -66,7 +71,6 @@  struct test_configure {
 	uint16_t test_secs;
 	const char *eal_args;
 	uint8_t scenario_id;
-	struct test_vchan_dev_config vchan_dev;
 };
 
 int mem_copy_benchmark(struct test_configure *cfg);
diff --git a/doc/guides/tools/dmaperf.rst b/doc/guides/tools/dmaperf.rst
index f68353b920..fc3ed05a6d 100644
--- a/doc/guides/tools/dmaperf.rst
+++ b/doc/guides/tools/dmaperf.rst
@@ -5,27 +5,23 @@  dpdk-test-dma-perf Application
 ==============================
 
 The ``dpdk-test-dma-perf`` tool is a Data Plane Development Kit (DPDK) application
-that enables testing the performance of DMA (Direct Memory Access) devices available within DPDK.
-It provides a test framework to assess the performance of CPU and DMA devices
-under various scenarios, such as varying buffer lengths.
-Doing so provides insight into the potential performance
-when using these DMA devices for acceleration in DPDK applications.
+that evaluates the performance of DMA (Direct Memory Access) devices accessible in a DPDK environment.
+It provides a benchmark framework to assess the performance of CPU and DMA devices
+under various combinations, such as varying buffer lengths, scatter-gather copies, and copies
+involving remote memory. It helps in evaluating the performance of a DMA device as a hardware
+acceleration vehicle in a DPDK application.
 
-It supports memory copy performance tests for now,
-comparing the performance of CPU and DMA automatically in various conditions
-with the help of a pre-set configuration file.
+In addition, this tool supports memory-to-memory, memory-to-device and device-to-memory copy tests,
+to compare the performance of CPU and DMA capabilities under various conditions with the help of a
+pre-set configuration file.
 
 
 Configuration
 -------------
 
-This application uses inherent DPDK EAL command-line options
-as well as custom command-line options in the application.
-An example configuration file for the application is provided
-and gives the meanings for each parameter.
-
-Here is an extracted sample from the configuration file
-(the complete sample can be found in the application source directory):
+Along with EAL command-line arguments, this application supports various benchmarking
+parameters through a configuration file. An example configuration file is provided with the
+application, and a sample is shown below to demonstrate the parameters.
 
 .. code-block:: ini
 
@@ -39,7 +35,8 @@  Here is an extracted sample from the configuration file
    dst_numa_node=0
    cache_flush=0
    test_seconds=2
-   lcore_dma=lcore10@0000:00:04.2, lcore11@0000:00:04.3
+   lcore_dma0=lcore=10,dev=0000:00:04.2,dir=mem2mem
+   lcore_dma1=lcore=11,dev=0000:00:04.3,dir=mem2mem
    eal_args=--in-memory --file-prefix=test
 
    [case2]
@@ -53,14 +50,35 @@  Here is an extracted sample from the configuration file
    lcore = 3, 4
    eal_args=--in-memory --no-pci
 
+   [case3]
+   skip=1
+   type=DMA_MEM_COPY
+   dma_src_sge=4
+   dma_dst_sge=1
+   mem_size=10
+   buf_size=64,8192,2,MUL
+   dma_ring_size=1024
+   kick_batch=32
+   src_numa_node=0
+   dst_numa_node=0
+   cache_flush=0
+   test_seconds=2
+   lcore_dma0=lcore=10,dev=0000:00:04.1,dir=mem2mem
+   lcore_dma1=lcore=11,dev=0000:00:04.2,dir=dev2mem,raddr=0x200000000,coreid=1,pfid=2,vfid=3
+   lcore_dma2=lcore=12,dev=0000:00:04.3,dir=mem2dev,raddr=0x200000000,coreid=1,pfid=2,vfid=3
+   eal_args=--in-memory --file-prefix=test
+
 The configuration file is divided into multiple sections, each section represents a test case.
-The four variables ``mem_size``, ``buf_size``, ``dma_ring_size``, and ``kick_batch``
-can vary in each test case.
-The format for this is ``variable=first,last,increment,ADD|MUL``.
-This means that the first value of the variable is 'first',
-the last value is 'last',
-'increment' is the step size,
-and 'ADD|MUL' indicates whether the change is by addition or multiplication.
+The four mandatory variables ``mem_size``, ``buf_size``, ``dma_ring_size``, and ``kick_batch``
+can vary in each test case. The format for this is ``variable=first,last,increment,ADD|MUL``.
+This means that the first value of the variable is 'first', the last value is 'last',
+'increment' is the step size, and 'ADD|MUL' indicates whether the change is by addition or
+multiplication.
+
+The variables for mem2dev and dev2mem copies are ``dir``, ``dev``, ``lcore``, ``coreid``, ``pfid``,
+``vfid``, and ``raddr``; they can vary for each device.
+
+For a scatter-gather copy test, ``dma_src_sge`` and ``dma_dst_sge`` must be configured.
 
 Each case can only have one variable change,
 and each change will generate a scenario, so each case can have multiple scenarios.
@@ -69,10 +87,19 @@  and each change will generate a scenario, so each case can have multiple scenari
 Configuration Parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+``skip``
+  Set to ``1`` to skip the test case.
+
 ``type``
   The type of the test.
   Currently supported types are ``DMA_MEM_COPY`` and ``CPU_MEM_COPY``.
 
+``dma_src_sge``
+  Number of source segments for scatter-gather.
+
+``dma_dst_sge``
+  Number of destination segments for scatter-gather.
+
 ``mem_size``
   The size of the memory footprint in megabytes (MB) for source and destination.
 
@@ -99,7 +126,35 @@  Configuration Parameters
   Controls the test time for each scenario.
 
 ``lcore_dma``
-  Specifies the lcore/DMA mapping.
+  Specifies the lcore/DMA mapping and the per-device configuration.
+
+    * ``lcore``
+        Core number mapped to a DMA device.
+
+    * ``dir``
+        The direction of data transfer.
+        Currently supported directions:
+
+          * ``mem2mem`` - memory to memory copy
+
+          * ``mem2dev`` - memory to device copy
+
+          * ``dev2mem`` - device to memory copy
+
+    * ``dev``
+        DMA device bus address.
+
+    * ``raddr``
+        Remote machine address for ``mem2dev`` and ``dev2mem`` copy.
+
+    * ``coreid``
+        Denotes PCIe core index for ``mem2dev`` and ``dev2mem`` copy.
+
+    * ``pfid``
+        Denotes PF-id to be used for ``mem2dev`` and ``dev2mem`` copy.
+
+    * ``vfid``
+        Denotes the VF-id under the given PF-id to be used for ``mem2dev`` and ``dev2mem`` copy.
 
 .. note::
 
@@ -131,6 +186,4 @@  with the same name as the configuration file with the addition of ``_result.csv`
 Limitations
 -----------
 
-Currently, this tool only supports memory copy performance tests.
-Additional enhancements are possible in the future
-to support more types of tests for DMA devices and CPUs.
+Additional enhancements are possible in the future.