[v2,3/6] app/test: add basic dmadev copy tests

Message ID 20210901163216.120087-4-bruce.richardson@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series add test suite for DMA drivers |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Bruce Richardson Sept. 1, 2021, 4:32 p.m. UTC
  For each dmadev instance, perform some basic copy tests to validate that
functionality.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 app/test/test_dmadev.c | 174 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
  

Comments

Jerin Jacob Sept. 2, 2021, 7:44 a.m. UTC | #1
On Wed, Sep 1, 2021 at 10:02 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> For each dmadev instance, perform some basic copy tests to validate that
> functionality.
>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  app/test/test_dmadev.c | 174 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 174 insertions(+)
>
> diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
> index 12f7c69629..261f45db71 100644
> --- a/app/test/test_dmadev.c
> +++ b/app/test/test_dmadev.c
> @@ -2,12 +2,15 @@
>   * Copyright(c) 2021 HiSilicon Limited.
>   * Copyright(c) 2021 Intel Corporation.
>   */
> +#include <unistd.h>
>  #include <inttypes.h>
>
>  #include <rte_common.h>
>  #include <rte_dev.h>
>  #include <rte_dmadev.h>
>  #include <rte_bus_vdev.h>
> +#include <rte_mbuf.h>
> +#include <rte_random.h>
>
>  #include "test.h"
>
> @@ -16,6 +19,11 @@ extern int test_dmadev_api(uint16_t dev_id);
>
>  #define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
>
> +#define COPY_LEN 1024
> +
> +static struct rte_mempool *pool;
> +static uint16_t id_count;
> +
>  static inline int
>  __rte_format_printf(3, 4)
>  print_err(const char *func, int lineno, const char *format, ...)
> @@ -31,6 +39,134 @@ print_err(const char *func, int lineno, const char *format, ...)
>         return ret;
>  }
>
> +static inline void
> +await_hw(int dev_id, uint16_t vchan)
> +{
> +       int idle = rte_dmadev_vchan_idle(dev_id, vchan);
> +       if (idle < 0) {
> +               /* for drivers that don't support this op, just sleep for 25 microseconds */
> +               usleep(25);
> +               return;
> +       }

Can following model eliminate the need for rte_dmadev_vchan_idle() API. Right?

static inline bool
await_hw(int dev_id, uint16_t vchan, uint16_t  nb_req, uint16_t *last_idx)
{
             const uint64_t tmo =   rte_get_timer_hz();
             bool has_error  = false;

             const uint64_t end_cycles = rte_get_timer_cycles() + tmo;
              while (rte_get_timer_cycles() < end_cycles && nb_req > 0
&& has_error  == false) {
                           rte_pause();
                           nb_req -= rte_dmadev_completed(dev_id,
nb_req, last_idx, &has_error);
              }

              return has_error ;
}



> +
> +       /* for those that do, *max* end time is one second from now, but all should be faster */
> +       const uint64_t end_cycles = rte_get_timer_cycles() + rte_get_timer_hz();
> +       while (!idle && rte_get_timer_cycles() < end_cycles) {
> +               rte_pause();
> +               idle = rte_dmadev_vchan_idle(dev_id, vchan);
> +       }
> +}
  
Bruce Richardson Sept. 2, 2021, 8:06 a.m. UTC | #2
On Thu, Sep 02, 2021 at 01:14:38PM +0530, Jerin Jacob wrote:
> On Wed, Sep 1, 2021 at 10:02 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > For each dmadev instance, perform some basic copy tests to validate that
> > functionality.
> >
> > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > ---
> >  app/test/test_dmadev.c | 174 +++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 174 insertions(+)
> >
> > diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
> > index 12f7c69629..261f45db71 100644
> > --- a/app/test/test_dmadev.c
> > +++ b/app/test/test_dmadev.c
> > @@ -2,12 +2,15 @@
> >   * Copyright(c) 2021 HiSilicon Limited.
> >   * Copyright(c) 2021 Intel Corporation.
> >   */
> > +#include <unistd.h>
> >  #include <inttypes.h>
> >
> >  #include <rte_common.h>
> >  #include <rte_dev.h>
> >  #include <rte_dmadev.h>
> >  #include <rte_bus_vdev.h>
> > +#include <rte_mbuf.h>
> > +#include <rte_random.h>
> >
> >  #include "test.h"
> >
> > @@ -16,6 +19,11 @@ extern int test_dmadev_api(uint16_t dev_id);
> >
> >  #define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> >
> > +#define COPY_LEN 1024
> > +
> > +static struct rte_mempool *pool;
> > +static uint16_t id_count;
> > +
> >  static inline int
> >  __rte_format_printf(3, 4)
> >  print_err(const char *func, int lineno, const char *format, ...)
> > @@ -31,6 +39,134 @@ print_err(const char *func, int lineno, const char *format, ...)
> >         return ret;
> >  }
> >
> > +static inline void
> > +await_hw(int dev_id, uint16_t vchan)
> > +{
> > +       int idle = rte_dmadev_vchan_idle(dev_id, vchan);
> > +       if (idle < 0) {
> > +               /* for drivers that don't support this op, just sleep for 25 microseconds */
> > +               usleep(25);
> > +               return;
> > +       }
> 
> Can following model eliminate the need for rte_dmadev_vchan_idle() API. Right?
> 
> static inline bool
> await_hw(int dev_id, uint16_t vchan, uint16_t  nb_req, uint16_t *last_idx)
> {
>              const uint64_t tmo =   rte_get_timer_hz();
>              bool has_error  = false;
> 
>              const uint64_t end_cycles = rte_get_timer_cycles() + tmo;
>               while (rte_get_timer_cycles() < end_cycles && nb_req > 0
> && has_error  == false) {
>                            rte_pause();
>                            nb_req -= rte_dmadev_completed(dev_id,
> nb_req, last_idx, &has_error);
>               }
> 
>               return has_error ;
> }
>
It would, but unfortunately it also removes the possibility of doing a
number of the tests in the set, particularly around failure handling. We
used runtime coverage tools to ensure we were hitting as many legs of code
as possible in drivers, and to cover these possibilities we need to do
various different types of completion gathering, e.g. gather multiple
bursts in one go, gathering a single burst in two halves, gathering a burst
using completion_status rather than completion, gathering completions
one-at-a-time with a call for each individually, and for error handling
gathering just the failing element alone, or gathering completions for all
remaining elements not just the failing one, etc. etc. 

These tests are useful both for finding bugs (and they did find ones in our
drivers), but also to ensure similar behaviour across different drivers
using the API. However, they really only can be done in a consistent way if
we are able to ensure that at certain points the hardware has finished
processing before we begin gathering completions. Therefore, having a way
to poll for idle is useful. As you see, I've also left in the delay as a
fallback in case drivers choose not to implement it, thereby making it an
optional API.

Beyond testing, I can see the API to poll for idleness being useful for the
device shutdown case. I was considering whether the "stop" API should also
use it to ensure that the hardware is idle before stopping. I decided
against it for now, but you could see applications making use of this -
waiting for the hardware to finish its work before stopping it.

Regards,
/Bruce
  
Jerin Jacob Sept. 2, 2021, 10:54 a.m. UTC | #3
On Thu, Sep 2, 2021 at 1:36 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Thu, Sep 02, 2021 at 01:14:38PM +0530, Jerin Jacob wrote:
> > On Wed, Sep 1, 2021 at 10:02 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > For each dmadev instance, perform some basic copy tests to validate that
> > > functionality.
> > >
> > > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > > ---
> > >  app/test/test_dmadev.c | 174 +++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 174 insertions(+)
> > >
> > > diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
> > > index 12f7c69629..261f45db71 100644
> > > --- a/app/test/test_dmadev.c
> > > +++ b/app/test/test_dmadev.c
> > > @@ -2,12 +2,15 @@
> > >   * Copyright(c) 2021 HiSilicon Limited.
> > >   * Copyright(c) 2021 Intel Corporation.
> > >   */
> > > +#include <unistd.h>
> > >  #include <inttypes.h>
> > >
> > >  #include <rte_common.h>
> > >  #include <rte_dev.h>
> > >  #include <rte_dmadev.h>
> > >  #include <rte_bus_vdev.h>
> > > +#include <rte_mbuf.h>
> > > +#include <rte_random.h>
> > >
> > >  #include "test.h"
> > >
> > > @@ -16,6 +19,11 @@ extern int test_dmadev_api(uint16_t dev_id);
> > >
> > >  #define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
> > >
> > > +#define COPY_LEN 1024
> > > +
> > > +static struct rte_mempool *pool;
> > > +static uint16_t id_count;
> > > +
> > >  static inline int
> > >  __rte_format_printf(3, 4)
> > >  print_err(const char *func, int lineno, const char *format, ...)
> > > @@ -31,6 +39,134 @@ print_err(const char *func, int lineno, const char *format, ...)
> > >         return ret;
> > >  }
> > >
> > > +static inline void
> > > +await_hw(int dev_id, uint16_t vchan)
> > > +{
> > > +       int idle = rte_dmadev_vchan_idle(dev_id, vchan);
> > > +       if (idle < 0) {
> > > +               /* for drivers that don't support this op, just sleep for 25 microseconds */
> > > +               usleep(25);
> > > +               return;
> > > +       }
> >
> > Can following model eliminate the need for rte_dmadev_vchan_idle() API. Right?
> >
> > static inline bool
> > await_hw(int dev_id, uint16_t vchan, uint16_t  nb_req, uint16_t *last_idx)
> > {
> >              const uint64_t tmo =   rte_get_timer_hz();
> >              bool has_error  = false;
> >
> >              const uint64_t end_cycles = rte_get_timer_cycles() + tmo;
> >               while (rte_get_timer_cycles() < end_cycles && nb_req > 0
> > && has_error  == false) {
> >                            rte_pause();
> >                            nb_req -= rte_dmadev_completed(dev_id,
> > nb_req, last_idx, &has_error);
> >               }
> >
> >               return has_error ;
> > }
> >
> It would, but unfortunately it also removes the possibility of doing a
> number of the tests in the set, particularly around failure handling. We
> used runtime coverage tools to ensure we were hitting as many legs of code
> as possible in drivers, and to cover these possibilities we need to do
> various different types of completion gathering, e.g. gather multiple
> bursts in one go, gathering a single burst in two halves, gathering a burst
> using completion_status rather than completion, gathering completions
> one-at-a-time with a call for each individually, and for error handling
> gathering just the failing element alone, or gathering completions for all
> remaining elements not just the failing one, etc. etc.

Agree with the rationale.


>
> These tests are useful both for finding bugs (and they did find ones in our
> drivers), but also to ensure similar behaviour across different drivers
> using the API. However, they really only can be done in a consistent way if
> we are able to ensure that at certain points the hardware has finished
> processing before we begin gathering completions. Therefore, having a way
> to poll for idle is useful. As you see, I've also left in the delay as a
> fallback in case drivers choose not to implement it, thereby making it an
> optional API.
>
> Beyond testing, I can see the API to poll for idleness being useful for the
> device shutdown case. I was considering whether the "stop" API should also
> use it to ensure that the hardware is idle before stopping. I decided
> against it for now, but you could see applications making use of this -
> waiting for the hardware to finish its work before stopping it.


I think 25us will not be enough, especially if it is a PCI-Dev to PCI-Dev
kind of test case.
Since it is a functional test case, I think we can keep it at a much
higher range to
support all cases. Maybe 50ms is a good target.


>
> Regards,
> /Bruce
  
Bruce Richardson Sept. 2, 2021, 11:43 a.m. UTC | #4
On Thu, Sep 02, 2021 at 04:24:18PM +0530, Jerin Jacob wrote:
> 
> I think 25us will not be enough, e.s.p If is PCI-Dev to PCI-Dev kind
> of test cases.
> Since it is the functional test case, I think, we can keep it a very
> higher range to
> support all cases. Maybe 50ms is a good target.
> 

Sure, no problem to push it up. If it turns out that all upstreamed drivers
implement the "idle" function we can remove the fallback option completely,
but I'll keep it for now and push the timeout up. Do you really think it needs
to be in the (tens of) millisecond range? Even for tests going across PCI
would most transactions not complete in the microsecond range, e.g. 100
usec?
  
Jerin Jacob Sept. 2, 2021, 1:05 p.m. UTC | #5
On Thu, Sep 2, 2021 at 5:13 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Thu, Sep 02, 2021 at 04:24:18PM +0530, Jerin Jacob wrote:
> >
> > I think 25us will not be enough, e.s.p If is PCI-Dev to PCI-Dev kind
> > of test cases.
> > Since it is the functional test case, I think, we can keep it a very
> > higher range to
> > support all cases. Maybe 50ms is a good target.
> >
>
> Sure, no problem to push it up. If it turns out that all upstreamed drivers
> implement the "idle" function we can remove the fallback option completely,
> but I'll keep it for now and push timeout up. Do you really think it needs
> to be in the (tens of )millisecond range? Even for tests going across PCI
> would most transactions not complete in the microsecond range, e.g. 100
> usec?

Based on bus load and the size of the buffers, the completion time can vary.
I think 1 ms could be a
good trade-off. Also, in the future, if some HW needs more than that, we
can increase it.
  
Bruce Richardson Sept. 2, 2021, 2:21 p.m. UTC | #6
On Thu, Sep 02, 2021 at 06:35:07PM +0530, Jerin Jacob wrote:
> On Thu, Sep 2, 2021 at 5:13 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Thu, Sep 02, 2021 at 04:24:18PM +0530, Jerin Jacob wrote:
> > >
> > > I think 25us will not be enough, e.s.p If is PCI-Dev to PCI-Dev kind
> > > of test cases.
> > > Since it is the functional test case, I think, we can keep it a very
> > > higher range to
> > > support all cases. Maybe 50ms is a good target.
> > >
> >
> > Sure, no problem to push it up. If it turns out that all upstreamed drivers
> > implement the "idle" function we can remove the fallback option completely,
> > but I'll keep it for now and push timeout up. Do you really think it needs
> > to be in the (tens of )millisecond range? Even for tests going across PCI
> > would most transactions not complete in the microsecond range, e.g. 100
> > usec?
> 
> Based on busload and size of buffers the completion time can vary. I
> think, 1 ms could be
> good trade-off. Also, In the future some HW needs beyond that then we
> can increase.

Ok, thanks.
  
Kevin Laatz Sept. 3, 2021, 4:05 p.m. UTC | #7
On 01/09/2021 17:32, Bruce Richardson wrote:
> For each dmadev instance, perform some basic copy tests to validate that
> functionality.
>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>   app/test/test_dmadev.c | 174 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 174 insertions(+)

<snip>

> +
> +static int
> +test_enqueue_copies(int dev_id, uint16_t vchan)
> +{
> +	unsigned int i;
> +	uint16_t id;
> +
> +	/* test doing a single copy */
> +	do {
> +		struct rte_mbuf *src, *dst;
> +		char *src_data, *dst_data;
> +
> +		src = rte_pktmbuf_alloc(pool);
> +		dst = rte_pktmbuf_alloc(pool);
> +		src_data = rte_pktmbuf_mtod(src, char *);
> +		dst_data = rte_pktmbuf_mtod(dst, char *);
> +
> +		for (i = 0; i < COPY_LEN; i++)
> +			src_data[i] = rte_rand() & 0xFF;
> +
> +		id = rte_dmadev_copy(dev_id, vchan, src->buf_iova + src->data_off,
> +				dst->buf_iova + dst->data_off, COPY_LEN, RTE_DMA_OP_FLAG_SUBMIT);

Could use the rte_mbuf APIs to get the struct members here and 
throughout the other tests in this set.

No strong opinion on this either way.


> +		if (id != id_count) {
> +			PRINT_ERR("Error with rte_dmadev_copy, got %u, expected %u\n",
> +					id, id_count);
> +			return -1;
> +		}
> +
> +		/* give time for copy to finish, then check it was done */
> +		await_hw(dev_id, vchan);
> +
> +		for (i = 0; i < COPY_LEN; i++) {
> +			if (dst_data[i] != src_data[i]) {
> +				PRINT_ERR("Data mismatch at char %u [Got %02x not %02x]\n", i,
> +						dst_data[i], src_data[i]);
> +				rte_dmadev_dump(dev_id, stderr);
> +				return -1;
> +			}
> +		}
> +
> +		/* now check completion works */
> +		if (rte_dmadev_completed(dev_id, vchan, 1, &id, NULL) != 1) {
> +			PRINT_ERR("Error with rte_dmadev_completed\n");
> +			return -1;
> +		}
> +		if (id != id_count) {
> +			PRINT_ERR("Error:incorrect job id received, %u [expected %u]\n",
> +					id, id_count);
> +			return -1;
> +		}
> +
> +		rte_pktmbuf_free(src);
> +		rte_pktmbuf_free(dst);
> +
> +		/* now check completion works */

This comment doesn't match with the check being done.


> +		if (rte_dmadev_completed(dev_id, 0, 1, NULL, NULL) != 0) {
> +			PRINT_ERR("Error with rte_dmadev_completed in empty check\n");
> +			return -1;
> +		}
> +		id_count++;
> +
> +	} while (0);
> +

<snip>

Apart from minor comments above, LGTM.

Reviewed-by: Kevin Laatz <kevin.laatz@intel.com>
  
Conor Walsh Sept. 3, 2021, 4:07 p.m. UTC | #8
> For each dmadev instance, perform some basic copy tests to validate that
> functionality.
>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
<snip>
> +static inline void
> +await_hw(int dev_id, uint16_t vchan)
> +{
> +	int idle = rte_dmadev_vchan_idle(dev_id, vchan);
> +	if (idle < 0) {
> +		/* for drivers that don't support this op, just sleep for 25 microseconds */
> +		usleep(25);
> +		return;
> +	}
> +
> +	/* for those that do, *max* end time is one second from now, but all should be faster */
> +	const uint64_t end_cycles = rte_get_timer_cycles() + rte_get_timer_hz();
> +	while (!idle && rte_get_timer_cycles() < end_cycles) {
> +		rte_pause();
> +		idle = rte_dmadev_vchan_idle(dev_id, vchan);
> +	}
> +}

The new DMA IOAT driver works fine with this function and will not be 
affected by an increase in timeout time as suggested by Jerin.

Reviewed-by: Conor Walsh <conor.walsh@intel.com>
  

Patch

diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
index 12f7c69629..261f45db71 100644
--- a/app/test/test_dmadev.c
+++ b/app/test/test_dmadev.c
@@ -2,12 +2,15 @@ 
  * Copyright(c) 2021 HiSilicon Limited.
  * Copyright(c) 2021 Intel Corporation.
  */
+#include <unistd.h>
 #include <inttypes.h>
 
 #include <rte_common.h>
 #include <rte_dev.h>
 #include <rte_dmadev.h>
 #include <rte_bus_vdev.h>
+#include <rte_mbuf.h>
+#include <rte_random.h>
 
 #include "test.h"
 
@@ -16,6 +19,11 @@  extern int test_dmadev_api(uint16_t dev_id);
 
 #define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
 
+#define COPY_LEN 1024
+
+static struct rte_mempool *pool;
+static uint16_t id_count;
+
 static inline int
 __rte_format_printf(3, 4)
 print_err(const char *func, int lineno, const char *format, ...)
@@ -31,6 +39,134 @@  print_err(const char *func, int lineno, const char *format, ...)
 	return ret;
 }
 
+static inline void
+await_hw(int dev_id, uint16_t vchan)
+{
+	int idle = rte_dmadev_vchan_idle(dev_id, vchan);
+	if (idle < 0) {
+		/* for drivers that don't support this op, just sleep for 25 microseconds */
+		usleep(25);
+		return;
+	}
+
+	/* for those that do, *max* end time is one second from now, but all should be faster */
+	const uint64_t end_cycles = rte_get_timer_cycles() + rte_get_timer_hz();
+	while (!idle && rte_get_timer_cycles() < end_cycles) {
+		rte_pause();
+		idle = rte_dmadev_vchan_idle(dev_id, vchan);
+	}
+}
+
+static int
+test_enqueue_copies(int dev_id, uint16_t vchan)
+{
+	unsigned int i;
+	uint16_t id;
+
+	/* test doing a single copy */
+	do {
+		struct rte_mbuf *src, *dst;
+		char *src_data, *dst_data;
+
+		src = rte_pktmbuf_alloc(pool);
+		dst = rte_pktmbuf_alloc(pool);
+		src_data = rte_pktmbuf_mtod(src, char *);
+		dst_data = rte_pktmbuf_mtod(dst, char *);
+
+		for (i = 0; i < COPY_LEN; i++)
+			src_data[i] = rte_rand() & 0xFF;
+
+		id = rte_dmadev_copy(dev_id, vchan, src->buf_iova + src->data_off,
+				dst->buf_iova + dst->data_off, COPY_LEN, RTE_DMA_OP_FLAG_SUBMIT);
+		if (id != id_count) {
+			PRINT_ERR("Error with rte_dmadev_copy, got %u, expected %u\n",
+					id, id_count);
+			return -1;
+		}
+
+		/* give time for copy to finish, then check it was done */
+		await_hw(dev_id, vchan);
+
+		for (i = 0; i < COPY_LEN; i++) {
+			if (dst_data[i] != src_data[i]) {
+				PRINT_ERR("Data mismatch at char %u [Got %02x not %02x]\n", i,
+						dst_data[i], src_data[i]);
+				rte_dmadev_dump(dev_id, stderr);
+				return -1;
+			}
+		}
+
+		/* now check completion works */
+		if (rte_dmadev_completed(dev_id, vchan, 1, &id, NULL) != 1) {
+			PRINT_ERR("Error with rte_dmadev_completed\n");
+			return -1;
+		}
+		if (id != id_count) {
+			PRINT_ERR("Error:incorrect job id received, %u [expected %u]\n",
+					id, id_count);
+			return -1;
+		}
+
+		rte_pktmbuf_free(src);
+		rte_pktmbuf_free(dst);
+
+		/* now check completion works */
+		if (rte_dmadev_completed(dev_id, 0, 1, NULL, NULL) != 0) {
+			PRINT_ERR("Error with rte_dmadev_completed in empty check\n");
+			return -1;
+		}
+		id_count++;
+
+	} while (0);
+
+	/* test doing a multiple single copies */
+	do {
+		const uint16_t max_ops = 4;
+		struct rte_mbuf *src, *dst;
+		char *src_data, *dst_data;
+
+		src = rte_pktmbuf_alloc(pool);
+		dst = rte_pktmbuf_alloc(pool);
+		src_data = rte_pktmbuf_mtod(src, char *);
+		dst_data = rte_pktmbuf_mtod(dst, char *);
+
+		for (i = 0; i < COPY_LEN; i++)
+			src_data[i] = rte_rand() & 0xFF;
+
+		/* perform the same copy <max_ops> times */
+		for (i = 0; i < max_ops; i++) {
+			if (rte_dmadev_copy(dev_id, vchan,
+					src->buf_iova + src->data_off,
+					dst->buf_iova + dst->data_off,
+					COPY_LEN, RTE_DMA_OP_FLAG_SUBMIT) != id_count++) {
+				PRINT_ERR("Error with rte_dmadev_copy\n");
+				return -1;
+			}
+		}
+		await_hw(dev_id, vchan);
+
+		if ((i = rte_dmadev_completed(dev_id, vchan, max_ops * 2, &id, NULL)) != max_ops) {
+			PRINT_ERR("Error with rte_dmadev_completed, got %u not %u\n", i, max_ops);
+			return -1;
+		}
+		if (id != id_count - 1) {
+			PRINT_ERR("Error, incorrect job id returned: got %u not %u\n",
+					id, id_count - 1);
+			return -1;
+		}
+		for (i = 0; i < COPY_LEN; i++) {
+			if (dst_data[i] != src_data[i]) {
+				PRINT_ERR("Data mismatch at char %u\n", i);
+				return -1;
+			}
+		}
+		rte_pktmbuf_free(src);
+		rte_pktmbuf_free(dst);
+	} while (0);
+
+	return 0;
+}
+
 static int
 test_dmadev_instance(uint16_t dev_id)
 {
@@ -43,6 +179,7 @@  test_dmadev_instance(uint16_t dev_id)
 			.nb_desc = TEST_RINGSIZE,
 	};
 	const int vchan = 0;
+	int i;
 
 	printf("\n### Test dmadev instance %u\n", dev_id);
 
@@ -79,10 +216,47 @@  test_dmadev_instance(uint16_t dev_id)
 				stats.completed, stats.submitted, stats.errors);
 		return -1;
 	}
+	id_count = 0;
+
+	/* create a mempool for running tests */
+	pool = rte_pktmbuf_pool_create("TEST_DMADEV_POOL",
+			TEST_RINGSIZE * 2, /* n == num elements */
+			32,  /* cache size */
+			0,   /* priv size */
+			2048, /* data room size */
+			info.device->numa_node);
+	if (pool == NULL) {
+		PRINT_ERR("Error with mempool creation\n");
+		return -1;
+	}
+
+	/* run the test cases, use many iterations to ensure UINT16_MAX id wraparound */
+	printf("DMA Dev: %u, Running Copy Tests\n", dev_id);
+	for (i = 0; i < 640; i++) {
+
+		if (test_enqueue_copies(dev_id, vchan) != 0) {
+			printf("Error with iteration %d\n", i);
+			rte_dmadev_dump(dev_id, stdout);
+			goto err;
+		}
 
+		rte_dmadev_stats_get(dev_id, 0, &stats);
+		printf("Ops submitted: %"PRIu64"\t", stats.submitted);
+		printf("Ops completed: %"PRIu64"\t", stats.completed);
+		printf("Errors: %"PRIu64"\r", stats.errors);
+	}
+	printf("\n");
+
+
+	rte_mempool_free(pool);
 	rte_dmadev_stop(dev_id);
 	rte_dmadev_stats_reset(dev_id, vchan);
 	return 0;
+
+err:
+	rte_mempool_free(pool);
+	rte_dmadev_stop(dev_id);
+	return -1;
 }
 
 static int