diff mbox series

dmadev: introduce DMA device library

Message ID 1625231891-2963-1-git-send-email-fengchengwen@huawei.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series dmadev: introduce DMA device library | expand

Checks

Context Check Description
ci/iol-abi-testing success Testing PASS
ci/intel-Testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-testing fail Testing issues
ci/iol-intel-Functional success Functional Testing PASS
ci/github-robot success github build: passed
ci/checkpatch warning coding style issues

Commit Message

Chengwen Feng July 2, 2021, 1:18 p.m. UTC
This patch introduces 'dmadevice' which is a generic type of DMA
device.

The APIs of dmadev library exposes some generic operations which can
enable configuration and I/O with the DMA devices.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 MAINTAINERS                  |   4 +
 config/rte_config.h          |   3 +
 lib/dmadev/meson.build       |   6 +
 lib/dmadev/rte_dmadev.c      | 438 +++++++++++++++++++++
 lib/dmadev/rte_dmadev.h      | 919 +++++++++++++++++++++++++++++++++++++++++++
 lib/dmadev/rte_dmadev_core.h |  98 +++++
 lib/dmadev/rte_dmadev_pmd.h  | 210 ++++++++++
 lib/dmadev/version.map       |  32 ++
 lib/meson.build              |   1 +
 9 files changed, 1711 insertions(+)
 create mode 100644 lib/dmadev/meson.build
 create mode 100644 lib/dmadev/rte_dmadev.c
 create mode 100644 lib/dmadev/rte_dmadev.h
 create mode 100644 lib/dmadev/rte_dmadev_core.h
 create mode 100644 lib/dmadev/rte_dmadev_pmd.h
 create mode 100644 lib/dmadev/version.map

Comments

Bruce Richardson July 2, 2021, 1:59 p.m. UTC | #1
On Fri, Jul 02, 2021 at 09:18:11PM +0800, Chengwen Feng wrote:
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
Thanks for this new revision. We will try porting our driver
implementations under this API and see how it performs. We'll send on
feedback later based on that and based on code review.

/Bruce
Jerin Jacob July 4, 2021, 9:30 a.m. UTC | #2
On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
>
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
>
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

Thanks for v1.

I would suggest finalizing  lib/dmadev/rte_dmadev.h before doing the
implementation so that you don't need
to waste time on rewoking the implementation.

Comments inline.

> ---
>  MAINTAINERS                  |   4 +
>  config/rte_config.h          |   3 +
>  lib/dmadev/meson.build       |   6 +
>  lib/dmadev/rte_dmadev.c      | 438 +++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 919 +++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h |  98 +++++
>  lib/dmadev/rte_dmadev_pmd.h  | 210 ++++++++++
>  lib/dmadev/version.map       |  32 ++

Missed to update doxygen. See doc/api/doxy-api.conf.in
Use meson  -Denable_docs=true to verify the generated doxgen doc.

>  lib/meson.build              |   1 +
>  9 files changed, 1711 insertions(+)
>  create mode 100644 lib/dmadev/meson.build
>  create mode 100644 lib/dmadev/rte_dmadev.c
>  create mode 100644 lib/dmadev/rte_dmadev.h
>  create mode 100644 lib/dmadev/rte_dmadev_core.h
>  create mode 100644 lib/dmadev/rte_dmadev_pmd.h
>  create mode 100644 lib/dmadev/version.map
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..2019783 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
>

Add EXPERIMENTAL

> +Dma device API
> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
>

> new file mode 100644
> index 0000000..a94e839
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,438 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stdint.h>
> +
> +#include <rte_log.h>
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_string_fns.h>

Sort in alphabetical order.

> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];

# Please check have you missed any multiprocess angle.
lib/regexdev/rte_regexdev.c is latest device class implemented in dpdk and
please check *rte_regexdev_shared_data scheme.


# Missing dynamic log for this library.


> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..f74fc6a
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,919 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.

It would be nice to add other companies' names who have contributed to
the specification.

> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The generic DMA device diagram:
> + *
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                  ----------------
> + *                  |dma-controller|
> + *                  ----------------
> + *
> + *   The DMA could have multiple HW-queues, each HW-queue could have multiple
> + *   capabilities, e.g. whether to support fill operation, supported DMA
> + *   transfter direction and etc.

typo

> + *
> + * The DMA framework is built on the following abstraction model:
> + *
> + *     ------------    ------------
> + *     |virt-queue|    |virt-queue|
> + *     ------------    ------------
> + *            \           /
> + *             \         /
> + *              \       /
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                     ----------
> + *                     | dmadev |
> + *                     ----------

Continuing the discussion with @Morten Brørup , I think, we need to
finalize the model.

> + *   a) The DMA operation request must be submitted to the virt queue, virt
> + *      queues must be created based on HW queues, the DMA device could have
> + *      multiple HW queues.
> + *   b) The virt queues on the same HW-queue could represent different contexts,
> + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> + *      mem-to-dev transfer scenario.
> + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> + *         scenario as long as the corresponding driver supports.
> + *
> + * The control plane APIs include configure/queue_setup/queue_release/start/
> + * stop/reset/close, in order to start device work, the call sequence must be
> + * as follows:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_queue_setup()
> + *     - rte_dmadev_start()

Please add reconfigure behaviour etc, Please check the
lib/regexdev/rte_regexdev.h
introduction. I have added similar ones so you could reuse as much as possible.


> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virt queues which have different
> + *      contexts.
> + *      The first four APIs are used to submit the operation request to the virt
> + *      queue, if the submission is successful, a cookie (as type
> + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> + * information query and self-test capabilities.
> + *
> + * About the dataplane APIs MT-safe, there are two dimensions:
> + *   a) For one virt queue, the submit/completion API could be MT-safe,
> + *      e.g. one thread do submit operation, another thread do completion
> + *      operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.

From an application PoV it may not be good to write portable
applications. Please check
latest thread with @Morten Brørup

> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_compat.h>

Sort in alphabetical order.

> +
> +/**
> + * dma_cookie_t - an opaque DMA cookie

Since we are defining the behaviour is not opaque any more.
I think, it is better to call ring_idx or so.


> +#define RTE_DMA_DEV_CAPA_MT_MVQ (1ull << 11) /**< Support MT-safe of multiple virt queues */

Please lot of @see for all symbols where it is being used. So that one
can understand the full scope of
symbols. See below example.

#define RTE_REGEXDEV_CAPA_RUNTIME_COMPILATION_F (1ULL << 0)
/**< RegEx device does support compiling the rules at runtime unlike
 * loading only the pre-built rule database using
 * struct rte_regexdev_config::rule_db in rte_regexdev_configure()
 *
 * @see struct rte_regexdev_config::rule_db, rte_regexdev_configure()
 * @see struct rte_regexdev_info::regexdev_capa
 */

> + *
> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> + * code.
> + * When using cookies, comply with the following rules:
> + * a) Cookies for each virtual queue are independent.
> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> + *    the INT_MAX, it wraps back to zero.
> + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> + *    reset, the virt queue's cookie needs to be reset to zero.
> + * Example:
> + *    step-1: start one dmadev
> + *    step-2: enqueue a copy operation, the cookie return is 0
> + *    step-3: enqueue a copy operation again, the cookie return is 1
> + *    ...
> + *    step-101: stop the dmadev
> + *    step-102: start the dmadev
> + *    step-103: enqueue a copy operation, the cookie return is 0
> + *    ...
> + */

Good explanation.

> +typedef int32_t dma_cookie_t;


> +
> +/**
> + * dma_scatterlist - can hold scatter DMA operation request
> + */
> +struct dma_scatterlist {

I prefer to change scatterlist -> sg
i.e rte_dma_sg

> +       void *src;
> +       void *dst;
> +       uint32_t length;
> +};
> +

> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +       /**
> +        * Fields filled by framewok

typo.

> +        */
> +       struct rte_device *device; /**< Generic Device information */
> +       const char *driver_name; /**< Device driver name */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +
> +       /**
> +        * Specification fields filled by driver
> +        */
> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> +       uint16_t max_vqs_per_hw_queue;
> +       /**< Maximum number of virt queues to allocate per HW queue */
> +       uint16_t max_desc;
> +       /**< Maximum allowed number of virt queue descriptors */
> +       uint16_t min_desc;
> +       /**< Minimum allowed number of virt queue descriptors */

Please add max_nb_segs. i.e maximum number of segments supported.

> +
> +       /**
> +        * Status fields filled by driver
> +        */
> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> +       uint16_t nb_vqs; /**< Number of virt queues configured */
> +};
> + i
> +
> +/**
> + * dma_address_type
> + */
> +enum dma_address_type {
> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> +};
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +       enum dma_address_type addr_type; /**< Address type to used */

I think, there are 3 kinds of limitations/capabilities.

When the system is configured as IOVA as VA
1) Device supports any VA address like memory from rte_malloc(),
rte_memzone(), malloc, stack memory
2) Device support only VA address from rte_malloc(), rte_memzone() i.e
memory backed by hugepage and added to DMA map.

When the system is configured as IOVA as PA
1) Devices support only PA addresses .

IMO, Above needs to be  advertised as capability and application needs
to align with that
and I dont think application requests the driver to work in any of the modes.



> +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> +       uint16_t max_vqs; /**< Maximum number of virt queues to use */

You need to what is max value allowed etc i.e it is based on
info_get() and mention the field
in info structure


> +
> +/**
> + * dma_transfer_direction
> + */
> +enum dma_transfer_direction {

rte_dma_transter_direction

> +       DMA_MEM_TO_MEM,
> +       DMA_MEM_TO_DEV,
> +       DMA_DEV_TO_MEM,
> +       DMA_DEV_TO_DEV,
> +};
> +
> +/**
> + * A structure used to configure a DMA virt queue.
> + */
> +struct rte_dmadev_queue_conf {
> +       enum dma_transfer_direction direction;


> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */

Use of this? Need more comments on this.
Since it is in slowpath, We can have non opaque names here based on
each driver capability.


> +       void *dev_ctx; /**< Device specific context */

Use of this ? Need more comment ont this.


Please add some good amount of reserved bits and have API to init this
structure for future ABI stability, say rte_dmadev_queue_config_init()
or so.


> +
> +/**
> + * A structure used to retrieve information of a DMA virt queue.
> + */
> +struct rte_dmadev_queue_info {
> +       enum dma_transfer_direction direction;

A queue may support all directions so I think it should be a bitfield.

> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */
> +};
> +

> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> +                  const struct dma_scatterlist *sg,
> +                  uint32_t sg_len, uint64_t flags)

I would like to change this as:
rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
rte_dma_sg *src, uint32_t nb_src,
const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
src 30 MB copy can be splitted as written as 1 MB x 30 dst.



> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the DMA virt queue
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An opaque flags for this operation.

PLEASE REMOVE opaque stuff from fastpath it will be a pain for
application writers as
they need to write multiple combinations of fastpath. flags are OK, if
we have a valid
generic flag now to control the transfer behavior.


> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Add a fence to force ordering between operations
> + *
> + * This adds a fence to a sequence of operations to enforce ordering, such that
> + * all operations enqueued before the fence must be completed before operations
> + * after the fence.
> + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> + * this API may not function correctly when called immediately after an
> + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful add fence.
> + *   - <0: Failure to add fence.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fence)(dev, vq_id);
> +}

Since HW submission is in a queue(FIFO) the ordering is always
maintained. Right?
Could you share more details and use case of fence() from
driver/application PoV?


> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->perform)(dev, vq_id);
> +}

Since we have additional function call overhead in all the
applications for this scheme, I would like to understand
the use of doing this way vs enq does the doorbell implicitly from
driver/application PoV?


> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successful completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] cookie
> + *   The last completed operation's cookie.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *
> + * @return
> + *   The number of operations that successful completed.

successfully

> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> +                    dma_cookie_t *cookie, bool *has_error)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       has_error = false;
> +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);

It may be better to have cookie/ring_idx as third argument.

> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
(> + * @param nb_status
> + *   Indicates the size  of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + * @param[out] cookie
> + *   The last failed completed operation's cookie.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> +                          const uint16_t nb_status, uint32_t *status,
> +                          dma_cookie_t *cookie)

IMO, it is better to move cookie/rind_idx at 3.
Why it would return any array of errors? since it called after
rte_dmadev_completed() has
has_error. Is it better to change

rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
*cookie,  uint32_t *status)

I also think, we may need to set status as bitmask and enumerate all
the combination of error codes
of all the driver and return string from driver existing rte_flow_error

See
struct rte_flow_error {
        enum rte_flow_error_type type; /**< Cause field and error types. */
        const void *cause; /**< Object responsible for the error. */
        const char *message; /**< Human-readable error message. */
};

> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> +}
> +
> +struct rte_dmadev_stats {
> +       uint64_t enqueue_fail_count;
> +       /**< Conut of all operations which failed enqueued */
> +       uint64_t enqueued_count;
> +       /**< Count of all operations which successful enqueued */
> +       uint64_t completed_fail_count;
> +       /**< Count of all operations which failed to complete */
> +       uint64_t completed_count;
> +       /**< Count of all operations which successful complete */
> +};

We need to have capability API to tell which items are
updated/supported by the driver.


> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..a3afea2
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types. But they are still part of the
> + * public API because they are used by inline public functions.
> + */
> +
> +struct rte_dmadev;
> +
> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +                                     void *src, void *dst,
> +                                     uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a copy operation. */

To avoid namespace conflict(as it is public API) use rte_


> +
> +/**
> + * The data structure associated with each DMA device.
> + */
> +struct rte_dmadev {
> +       /**< Enqueue a copy operation onto the DMA device. */
> +       dmadev_copy_t copy;
> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> +       dmadev_copy_sg_t copy_sg;
> +       /**< Enqueue a fill operation onto the DMA device. */
> +       dmadev_fill_t fill;
> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> +       dmadev_fill_sg_t fill_sg;
> +       /**< Add a fence to force ordering between operations. */
> +       dmadev_fence_t fence;
> +       /**< Trigger hardware to begin performing enqueued operations. */
> +       dmadev_perform_t perform;
> +       /**< Returns the number of operations that successful completed. */
> +       dmadev_completed_t completed;
> +       /**< Returns the number of operations that failed to complete. */
> +       dmadev_completed_fails_t completed_fails;

We need to limit fastpath items in 1 CL

> +
> +       void *dev_private; /**< PMD-specific private data */
> +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> +
> +       uint16_t dev_id; /**< Device ID for this instance */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +       struct rte_device *device;
> +       /**< Device info. supplied during device initialization */
> +       const char *driver_name; /**< Driver info. supplied by probing */
> +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> +
> +       RTE_STD_C11
> +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */

Add a couple of reserved fields for future ABI stability.

> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
Andrew Rybchenko July 4, 2021, 2:57 p.m. UTC | #3
On 7/2/21 4:18 PM, Chengwen Feng wrote:
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.

"This patch introduces ... " -> "Introduce ..."

> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>

[snip]

> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..2019783 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
>  
> +Dma device API

Dma -> DMA

> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
>  
>  Memory Pool Drivers
>  -------------------

[snip]

> diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
> new file mode 100644
> index 0000000..a94e839
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,438 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stdint.h>
> +
> +#include <rte_log.h>
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_string_fns.h>
> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
> +
> +uint16_t
> +rte_dmadev_count(void)
> +{
> +	uint16_t count = 0;
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (rte_dmadevices[i].attached)
> +			count++;
> +	}
> +
> +	return count;
> +}
> +
> +int
> +rte_dmadev_get_dev_id(const char *name)
> +{
> +	uint16_t i;
> +
> +	if (name == NULL)
> +		return -EINVAL;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
> +		if ((strcmp(rte_dmadevices[i].name, name) == 0) &&
> +		    (rte_dmadevices[i].attached == RTE_DMADEV_ATTACHED))
> +			return i;
> +
> +	return -ENODEV;
> +}
> +
> +int
> +rte_dmadev_socket_id(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	dev = &rte_dmadevices[dev_id];
> +
> +	return dev->socket_id;
> +}
> +
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
> +
> +	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
> +	diag = (*dev->dev_ops->dev_info_get)(dev, dev_info);
> +	if (diag != 0)
> +		return diag;
> +
> +	dev_info->device = dev->device;
> +	dev_info->driver_name = dev->driver_name;
> +	dev_info->socket_id = dev->socket_id;
> +
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
> +
> +	if (dev->started) {
> +		RTE_DMADEV_LOG(ERR,
> +		   "device %u must be stopped to allow configuration", dev_id);
> +		return -EBUSY;
> +	}
> +
> +	diag = (*dev->dev_ops->dev_configure)(dev, dev_conf);
> +	if (diag != 0)
> +		RTE_DMADEV_LOG(ERR, "device %u dev_configure failed, ret = %d",
> +			       dev_id, diag);
> +	else
> +		dev->attached = 1;
> +
> +	return diag;
> +}
> +
> +int
> +rte_dmadev_start(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +	if (dev->started != 0) {
> +		RTE_DMADEV_LOG(ERR, "device %u already started", dev_id);
> +		return 0;
> +	}
> +
> +	if (dev->dev_ops->dev_start == NULL)
> +		goto mark_started;
> +
> +	diag = (*dev->dev_ops->dev_start)(dev);
> +	if (diag != 0)
> +		return diag;
> +
> +mark_started:
> +	dev->started = 1;
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_stop(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +	int diag;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (dev->started == 0) {
> +		RTE_DMADEV_LOG(ERR, "device %u already stopped", dev_id);
> +		return 0;
> +	}
> +
> +	if (dev->dev_ops->dev_stop == NULL)
> +		goto mark_stopped;
> +
> +	diag = (*dev->dev_ops->dev_stop)(dev);
> +	if (diag != 0)
> +		return diag;
> +
> +mark_stopped:
> +	dev->started = 0;
> +	return 0;
> +}
> +
> +int
> +rte_dmadev_close(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
> +
> +	/* Device must be stopped before it can be closed */
> +	if (dev->started == 1) {
> +		RTE_DMADEV_LOG(ERR, "device %u must be stopped before closing",
> +			       dev_id);
> +		return -EBUSY;
> +	}
> +
> +	return (*dev->dev_ops->dev_close)(dev);
> +}
> +
> +int
> +rte_dmadev_reset(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
> +
> +	/* Reset is not dependent on state of the device */
> +	return (*dev->dev_ops->dev_reset)(dev);
> +}
> +
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +		       const struct rte_dmadev_queue_conf *conf)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -ENOTSUP);
> +
> +	return (*dev->dev_ops->queue_setup)(dev, conf);
> +}
> +
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP);
> +
> +	return (*dev->dev_ops->queue_release)(dev, vq_id);
> +}
> +
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +			  struct rte_dmadev_queue_info *info)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(info, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_info_get, -ENOTSUP);
> +
> +	memset(info, 0, sizeof(struct rte_dmadev_queue_info));
> +	return (*dev->dev_ops->queue_info_get)(dev, vq_id, info);
> +}
> +
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +		     struct rte_dmadev_stats *stats)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
> +
> +	return (*dev->dev_ops->stats_get)(dev, vq_id, stats);
> +}
> +
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
> +
> +	return (*dev->dev_ops->stats_reset)(dev, vq_id);
> +}
> +
> +static int
> +xstats_get_count(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
> +
> +	return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
> +}
> +
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +			    struct rte_dmadev_xstats_name *xstats_names,
> +			    uint32_t size)
> +{
> +	struct rte_dmadev *dev;
> +	int cnt_expected_entries;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	cnt_expected_entries = xstats_get_count(dev_id);
> +
> +	if (xstats_names == NULL || cnt_expected_entries < 0 ||
> +	    (int)size < cnt_expected_entries || size == 0)
> +		return cnt_expected_entries;
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
> +	return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
> +}
> +
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +		      uint64_t values[], uint32_t n)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
> +	RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -ENOTSUP);
> +
> +	return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
> +}
> +
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -ENOTSUP);
> +
> +	return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
> +}
> +
> +int
> +rte_dmadev_selftest(uint16_t dev_id)
> +{
> +	struct rte_dmadev *dev;
> +
> +	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
> +
> +	return (*dev->dev_ops->dev_selftest)(dev_id);
> +}
> +
> +static inline uint16_t
> +rte_dmadev_find_free_device_index(void)
> +{
> +	uint16_t i;
> +
> +	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +		if (rte_dmadevices[i].attached == RTE_DMADEV_DETACHED)
> +			return i;
> +	}
> +
> +	return RTE_DMADEV_MAX_DEVS;
> +}
> +
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id)
> +{
> +	struct rte_dmadev *dev;
> +	uint16_t dev_id;
> +
> +	if (rte_dmadev_get_dev_id(name) >= 0) {
> +		RTE_DMADEV_LOG(ERR,
> +			"device with name %s already allocated!", name);
> +		return NULL;
> +	}
> +
> +	dev_id = rte_dmadev_find_free_device_index();
> +	if (dev_id == RTE_DMADEV_MAX_DEVS) {
> +		RTE_DMADEV_LOG(ERR, "reached maximum number of DMA devices");
> +		return NULL;
> +	}
> +
> +	dev = &rte_dmadevices[dev_id];
> +
> +	if (dev_priv_size > 0) {
> +		dev->dev_private = rte_zmalloc_socket("dmadev private",
> +				     dev_priv_size,
> +				     RTE_CACHE_LINE_SIZE,
> +				     socket_id);
> +		if (dev->dev_private == NULL) {
> +			RTE_DMADEV_LOG(ERR,
> +				"unable to allocate memory for dmadev");
> +			return NULL;
> +		}
> +	}
> +
> +	dev->dev_id = dev_id;
> +	dev->socket_id = socket_id;
> +	dev->started = 0;
> +	strlcpy(dev->name, name, RTE_DMADEV_NAME_MAX_LEN);
> +
> +	dev->attached = RTE_DMADEV_ATTACHED;
> +
> +	return dev;
> +}
> +
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev)
> +{
> +	int ret;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	ret = rte_dmadev_close(dev->dev_id);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (dev->dev_private != NULL)
> +		rte_free(dev->dev_private);
> +
> +	memset(dev, 0, sizeof(struct rte_dmadev));
> +	dev->attached = RTE_DMADEV_DETACHED;
> +
> +	return 0;
> +}
> +
> +RTE_LOG_REGISTER(libdmadev_logtype, lib.dmadev, INFO);
> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..f74fc6a
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,919 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The generic DMA device diagram:
> + *
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                  ----------------
> + *                  |dma-controller|
> + *                  ----------------
> + *
> + *   The DMA could have multiple HW-queues, each HW-queue could have multiple
> + *   capabilities, e.g. whether to support fill operation, supported DMA
> + *   transfter direction and etc.
> + *
> + * The DMA framework is built on the following abstraction model:
> + *
> + *     ------------    ------------
> + *     |virt-queue|    |virt-queue|
> + *     ------------    ------------

Do we really need "virt" here? "virt queue" could be
incorrectly associated with virtio spec.
IMHO it would be better w/o virt or full "virtual"
everywhere in the documentation.

> + *            \           /
> + *             \         /
> + *              \       /
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                     ----------
> + *                     | dmadev |
> + *                     ----------
> + *
> + *   a) The DMA operation request must be submitted to the virt queue, virt
> + *      queues must be created based on HW queues, the DMA device could have
> + *      multiple HW queues.

What does define mapping of virtual queues to HW queues? Does
API user sees HW queues? If no, it should be kept transparent
and HW queues should be simply removed from the picture.

> + *   b) The virt queues on the same HW-queue could represent different contexts,
> + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> + *      mem-to-dev transfer scenario.
> + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> + *         scenario as long as the corresponding driver supports.
> + *
> + * The control plane APIs include configure/queue_setup/queue_release/start/
> + * stop/reset/close, in order to start device work, the call sequence must be
> + * as follows:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_queue_setup()
> + *     - rte_dmadev_start()
> + *
> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virt queues which have different
> + *      contexts.
> + *      The first four APIs are used to submit the operation request to the virt
> + *      queue, if the submission is successful, a cookie (as type
> + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> + * information query and self-test capabilities.
> + *
> + * About the dataplane APIs MT-safe, there are two dimensions:
> + *   a) For one virt queue, the submit/completion API could be MT-safe,
> + *      e.g. one thread do submit operation, another thread do completion
> + *      operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_compat.h>
> +
> +/**
> + * dma_cookie_t - an opaque DMA cookie
> + *
> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> + * code.
> + * When using cookies, comply with the following rules:
> + * a) Cookies for each virtual queue are independent.
> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> + *    the INT_MAX, it wraps back to zero.

INT32_MAX

> + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> + *    reset, the virt queue's cookie needs to be reset to zero.
> + * Example:
> + *    step-1: start one dmadev
> + *    step-2: enqueue a copy operation, the cookie return is 0
> + *    step-3: enqueue a copy operation again, the cookie return is 1
> + *    ...
> + *    step-101: stop the dmadev
> + *    step-102: start the dmadev
> + *    step-103: enqueue a copy operation, the cookie return is 0
> + *    ...
> + */
> +typedef int32_t dma_cookie_t;
> +
> +/**
> + * dma_scatterlist - can hold scatter DMA operation request
> + */
> +struct dma_scatterlist {
> +	void *src;
> +	void *dst;
> +	uint32_t length;
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the total number of DMA devices that have been successfully
> + * initialised.
> + *
> + * @return
> + *   The total number of usable DMA devices.
> + */
> +__rte_experimental
> +uint16_t
> +rte_dmadev_count(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the device identifier for the named DMA device.
> + *
> + * @param name
> + *   DMA device name to select the DMA device identifier.
> + *
> + * @return
> + *   Returns DMA device identifier on success.
> + *   - <0: Failure to find named DMA device.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_get_dev_id(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Return the NUMA socket to which a device is connected.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   The NUMA socket id to which the device is connected or
> + *   a default of zero if the socket could not be determined.
> + *   - -EINVAL: dev_id value is out of range.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_socket_id(uint16_t dev_id);

It should be rte_dmadev_numa_node(), I guess.

> +
> +/**
> + * The capabilities of a DMA device
> + */
> +#define RTE_DMA_DEV_CAPA_M2M	(1ull << 0) /**< Support mem-to-mem transfer */
> +#define RTE_DMA_DEV_CAPA_M2D	(1ull << 1) /**< Support mem-to-dev transfer */
> +#define RTE_DMA_DEV_CAPA_D2M	(1ull << 2) /**< Support dev-to-mem transfer */
> +#define RTE_DMA_DEV_CAPA_D2D	(1ull << 3) /**< Support dev-to-dev transfer */
> +#define RTE_DMA_DEV_CAPA_COPY	(1ull << 4) /**< Support copy ops */
> +#define RTE_DMA_DEV_CAPA_FILL	(1ull << 5) /**< Support fill ops */
> +#define RTE_DMA_DEV_CAPA_SG	(1ull << 6) /**< Support scatter-gather ops */
> +#define RTE_DMA_DEV_CAPA_FENCE	(1ull << 7) /**< Support fence ops */
> +#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
> +#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
> +#define RTE_DMA_DEV_CAPA_MT_VQ	(1ull << 10) /**< Support MT-safe of one virt queue */
> +#define RTE_DMA_DEV_CAPA_MT_MVQ	(1ull << 11) /**< Support MT-safe of multiple virt queues */

Above is very hard to read. Values should be aligned at the
same column.

> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +	/**
> +	 * Fields filled by framewok
> +	 */

Doxygen has a way to document groups. Please, use it.

> +	struct rte_device *device; /**< Generic Device information */
> +	const char *driver_name; /**< Device driver name */
> +	int socket_id; /**< Socket ID where memory is allocated */
> +
> +	/**
> +	 * Specification fields filled by driver
> +	 */
> +	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +	uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> +	uint16_t max_vqs_per_hw_queue;
> +	/**< Maximum number of virt queues to allocate per HW queue */
> +	uint16_t max_desc;
> +	/**< Maximum allowed number of virt queue descriptors */
> +	uint16_t min_desc;
> +	/**< Minimum allowed number of virt queue descriptors */
> +
> +	/**
> +	 * Status fields filled by driver
> +	 */
> +	uint16_t nb_hw_queues; /**< Number of HW queues configured */
> +	uint16_t nb_vqs; /**< Number of virt queues configured */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve the contextual information of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @param[out] dev_info
> + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> + *   contextual information of the device.
> + * @return
> + *   - =0: Success, driver updates the contextual information of the DMA device
> + *   - <0: Error code returned by the driver info get function.
> + *
> + */
> +__rte_experimental
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
> +
> +/**
> + * dma_address_type
> + */
> +enum dma_address_type {
> +	DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> +	DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> +};
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +	enum dma_address_type addr_type; /**< Address type to used */
> +	uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> +	uint16_t max_vqs; /**< Maximum number of virt queues to use */

Is it total or per HW queue? Please, clarify in the
documetnation.

> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Configure a DMA device.
> + *
> + * This function must be invoked first before any other function in the
> + * API. This function can also be re-invoked when a device is in the
> + * stopped state.
> + *
> + * The caller may use rte_dmadev_info_get() to get the capability of each
> + * resources available for this DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device to configure.
> + * @param dev_conf
> + *   The DMA device configuration structure encapsulated into rte_dmadev_conf
> + *   object.
> + *
> + * @return
> + *   - =0: Success, device configured.
> + *   - <0: Error code returned by the driver configuration function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Start a DMA device.
> + *
> + * The device start step is the last one and consists of setting the DMA
> + * to start accepting jobs.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device started.
> + *   - <0: Error code returned by the driver start function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_start(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Stop a DMA device.
> + *
> + * The device can be restarted with a call to rte_dmadev_start()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device stopped.
> + *   - <0: Error code returned by the driver stop function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stop(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Close a DMA device.
> + *
> + * The device cannot be restarted after this call.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *  - =0: Successfully closing device
> + *  - <0: Failure to close device
> + */
> +__rte_experimental
> +int
> +rte_dmadev_close(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset a DMA device.
> + *
> + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
> + * sense similar to hard or soft reset.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Successful reset device.
> + *   - <0: Failure to reset device.
> + *   - (-ENOTSUP): If the device doesn't support this function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_reset(uint16_t dev_id);
> +
> +/**
> + * dma_transfer_direction

Such comments must be avoided.

> + */
> +enum dma_transfer_direction {
> +	DMA_MEM_TO_MEM,
> +	DMA_MEM_TO_DEV,
> +	DMA_DEV_TO_MEM,
> +	DMA_DEV_TO_DEV,
> +};
> +
> +/**
> + * A structure used to configure a DMA virt queue.
> + */
> +struct rte_dmadev_queue_conf {
> +	enum dma_transfer_direction direction;
> +	/**< Associated transfer direction */

Please, put comments before the code if comment is in a
separate line.


> +	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */

How does caller understand which HW queue ID to use?
Which one should be chosen? May PMD should device
which HW queue to use? Queue configuration can
provide hints to make the right choice: required
capabilties, relation to another virtual queue etc.

> +	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +	uint64_t dev_flags; /**< Device specific flags */
> +	void *dev_ctx; /**< Device specific context */

Device specific flags and context sounds bad and adds vendors
specifics  in API. If so, it could be very hard to switch from
vendor to vendor. Do I misunderstand?

> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Allocate and set up a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param conf
> + *   The queue configuration structure encapsulated into rte_dmadev_queue_conf
> + *   object.
> + *
> + * @return
> + *   - >=0: Allocate virt queue success, it is virt queue id.
> + *   - <0: Error code returned by the driver queue setup function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +		       const struct rte_dmadev_queue_conf *conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue which return by queue setup.
> + *
> + * @return
> + *   - =0: Successful release the virt queue.
> + *   - <0: Error code returned by the driver queue release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id);
> +
> +/**
> + * A structure used to retrieve information of a DMA virt queue.
> + */
> +struct rte_dmadev_queue_info {
> +	enum dma_transfer_direction direction;
> +	/**< Associated transfer direction */

Please, put comments before the code if comment is in a
separate line.

> +	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> +	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +	uint64_t dev_flags; /**< Device specific flags */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve information of a DMA virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue which return by queue setup.
> + * @param[out] info
> + *   The queue info structure encapsulated into rte_dmadev_queue_info object.
> + *
> + * @return
> + *   - =0: Successful retrieve information.
> + *   - <0: Error code returned by the driver queue release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +			  struct rte_dmadev_queue_info *info);
> +
> +#include "rte_dmadev_core.h"
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a copy operation onto the DMA virt queue.
> + *
> + * This queues up a copy operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param src
> + *   The address of the source buffer.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the data to be copied.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,
> +		uint32_t length, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->copy)(dev, vq_id, src, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list copy operation onto the DMA virt queue.
> + *
> + * This queues up a scatter list copy operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> +		   const struct dma_scatterlist *sg,
> +		   uint32_t sg_len, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the DMA virt queue
> + *
> + * This queues up a fill operation to be performed by hardware, but does not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +		void *dst, uint32_t length, uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list fill operation onto the DMA virt queue
> + *
> + * This queues up a scatter list fill operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +		   const struct dma_scatterlist *sg, uint32_t sg_len,
> +		   uint64_t flags)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->fill_sg)(dev, vq_id, pattern, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Add a fence to force ordering between operations
> + *
> + * This adds a fence to a sequence of operations to enforce ordering, such that
> + * all operations enqueued before the fence must be completed before operations
> + * after the fence.
> + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> + * this API may not function correctly when called immediately after an
> + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful add fence.
> + *   - <0: Failure to add fence.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->fence)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->perform)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successful completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] cookie
> + *   The last completed operation's cookie.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *
> + * @return
> + *   The number of operations that successful completed.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> +		     dma_cookie_t *cookie, bool *has_error)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	has_error = false;
> +	return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_status
> + *   Indicates the size of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + * @param[out] cookie
> + *   The last failed completed operation's cookie.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> +			   const uint16_t nb_status, uint32_t *status,
> +			   dma_cookie_t *cookie)
> +{
> +	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +	return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> +}
> +
> +struct rte_dmadev_stats {
> +	uint64_t enqueue_fail_count;
> +	/**< Conut of all operations which failed enqueued */

Please, put comments before the code.

> +	uint64_t enqueued_count;
> +	/**< Count of all operations which successful enqueued */
> +	uint64_t completed_fail_count;
> +	/**< Count of all operations which failed to complete */
> +	uint64_t completed_count;
> +	/**< Count of all operations which successful complete */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve basic statistics of a or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + * @param[out] stats
> + *   The basic statistics structure encapsulated into rte_dmadev_stats
> + *   object.
> + *
> + * @return
> + *   - =0: Successful retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +		     struct rte_dmadev_stats *stats);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset basic statistics of a or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + *
> + * @return
> + *   - =0: Successful retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
> +
> +/** Maximum name length for extended statistics counters */
> +#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
> +
> +/**
> + * A name-key lookup element for extended statistics.
> + *
> + * This structure is used to map between names and ID numbers
> + * for extended ethdev statistics.
> + */
> +struct rte_dmadev_xstats_name {
> +	char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve names of extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param[out] xstats_names
> + *   Block of memory to insert names into. Must be at least size in capacity.
> + *   If set to NULL, function returns required capacity.
> + * @param size
> + *   Capacity of xstats_names (number of names).
> + * @return
> + *   - positive value lower or equal to size: success. The return value
> + *     is the number of entries filled in the stats table.
> + *   - positive value higher than size: error, the given statistics table
> + *     is too small. The return value corresponds to the size that should
> + *     be given to succeed. The entries in the table are not valid and
> + *     shall not be used by the caller.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +			    struct rte_dmadev_xstats_name *xstats_names,
> +			    uint32_t size);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   The id numbers of the stats to get. The ids can be got from the stat
> + *   position in the stat list from rte_dmadev_get_xstats_names().
> + * @param[out] values
> + *   The values for each stats request by ID.
> + * @param n
> + *   The number of stats requested.
> + *
> + * @return
> + *   - positive value: number of stat entries filled into the values array.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +		      uint64_t values[], uint32_t n);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset the values of the xstats of the selected component in the device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   Selects specific statistics to be reset. When NULL, all statistics
> + *   will be reset. If non-NULL, must point to array of at least
> + *   *nb_ids* size.
> + * @param nb_ids
> + *   The number of ids available from the *ids* array. Ignored when ids is NULL.
> + *
> + * @return
> + *   - zero: successfully reset the statistics to zero.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger the dmadev self test.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - 0: Selftest successful.
> + *   - -ENOTSUP if the device doesn't support selftest
> + *   - other values < 0 on failure.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_selftest(uint16_t dev_id);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_H_ */
> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..a3afea2
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types. But they are still part of the
> + * public API because they are used by inline public functions.

Do we really want it? Anyway rte_dmadev must not be here.
Some sub-structure could be, but not entire rte_dmadev.

> + */
> +
> +struct rte_dmadev;
> +
> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				      void *src, void *dst,
> +				      uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a copy operation. */

Avoid comments after the code in a seprate line. Move it to be
before the code.

> +
> +typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +					 const struct dma_scatterlist *sg,
> +					 uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list copy operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				      uint64_t pattern, void *dst,
> +				      uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a fill operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +			uint64_t pattern, const struct dma_scatterlist *sg,
> +			uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list fill operation. */
> +
> +typedef int (*dmadev_fence_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to add a fence ordering between operations. */
> +
> +typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to trigger hardware to begin performing. */
> +
> +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				       const uint16_t nb_cpls,
> +				       dma_cookie_t *cookie, bool *has_error);
> +/**< @internal Function used to return number of successful completed operations */
> +
> +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> +			uint16_t vq_id, const uint16_t nb_status,
> +			uint32_t *status, dma_cookie_t *cookie);
> +/**< @internal Function used to return number of failed completed operations */
> +
> +#define RTE_DMADEV_NAME_MAX_LEN	64 /**< Max length of name of DMA PMD */
> +
> +struct rte_dmadev_ops;
> +
> +/**
> + * The data structure associated with each DMA device.
> + */
> +struct rte_dmadev {
> +	/**< Enqueue a copy operation onto the DMA device. */

Comment before code should start from /** (not /**< ).

> +	dmadev_copy_t copy;
> +	/**< Enqueue a scatter list copy operation onto the DMA device. */
> +	dmadev_copy_sg_t copy_sg;
> +	/**< Enqueue a fill operation onto the DMA device. */
> +	dmadev_fill_t fill;
> +	/**< Enqueue a scatter list fill operation onto the DMA device. */
> +	dmadev_fill_sg_t fill_sg;
> +	/**< Add a fence to force ordering between operations. */
> +	dmadev_fence_t fence;
> +	/**< Trigger hardware to begin performing enqueued operations. */
> +	dmadev_perform_t perform;
> +	/**< Returns the number of operations that successful completed. */
> +	dmadev_completed_t completed;
> +	/**< Returns the number of operations that failed to complete. */
> +	dmadev_completed_fails_t completed_fails;
> +
> +	void *dev_private; /**< PMD-specific private data */
> +	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> +
> +	uint16_t dev_id; /**< Device ID for this instance */
> +	int socket_id; /**< Socket ID where memory is allocated */
> +	struct rte_device *device;
> +	/**< Device info. supplied during device initialization */

Please, put comments before the code if comment is in a
separate line.

> +	const char *driver_name; /**< Driver info. supplied by probing */
> +	char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> +
> +	RTE_STD_C11
> +	uint8_t attached : 1; /**< Flag indicating the device is attached */
> +	uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
> +#endif /* _RTE_DMADEV_CORE_H_ */
> diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h

Let's remove rte_ prefix from DPDK internal headers.

> new file mode 100644
> index 0000000..ef03cf7
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_pmd.h
> @@ -0,0 +1,210 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_PMD_H_
> +#define _RTE_DMADEV_PMD_H_
> +
> +/** @file
> + * RTE DMA PMD APIs
> + *
> + * @note
> + * Driver facing APIs for a DMA device. These are not to be called directly by
> + * any application.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <string.h>
> +
> +#include <rte_dev.h>
> +#include <rte_log.h>
> +#include <rte_common.h>
> +
> +#include "rte_dmadev.h"
> +
> +extern int libdmadev_logtype;
> +
> +#define RTE_DMADEV_LOG(level, fmt, args...) \

Do we need RTE_ prefix for internal API?

> +	rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
> +		__func__, ##args)
> +
> +/* Macros to check for valid device */
> +#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +		return retval; \
> +	} \
> +} while (0)
> +
> +#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +		return; \
> +	} \
> +} while (0)
> +
> +#define RTE_DMADEV_DETACHED  0
> +#define RTE_DMADEV_ATTACHED  1

Do we really need RTE_ prefix for interlal defines?

> +
> +/**
> + * Validate if the DMA device index is a valid attached DMA device.
> + *
> + * @param dev_id
> + *   DMA device index.
> + *
> + * @return
> + *   - If the device index is valid (1) or not (0).
> + */
> +static inline unsigned

'unsigned int', but it sounds like the function should return
'bool'.

> +rte_dmadev_pmd_is_valid_dev(uint16_t dev_id)

Again, do we really need rte_ prefix for internal functions?

> +{
> +	struct rte_dmadev *dev;
> +
> +	if (dev_id >= RTE_DMADEV_MAX_DEVS)
> +		return 0;
> +
> +	dev = &rte_dmadevices[dev_id];
> +	if (dev->attached != RTE_DMADEV_ATTACHED)
> +		return 0;
> +	else
> +		return 1;
> +}
> +
> +/**
> + * Definitions of control-plane functions exported by a driver through the
> + * generic structure of type *rte_dmadev_ops* supplied in the *rte_dmadev*
> + * structure associated with a device.
> + */
> +
> +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +				 struct rte_dmadev_info *dev_info);
> +/**< @internal Function used to get device information of a device. */

Let's don't use documentation after code in a separate line in
a new code.

> +
> +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> +				  const struct rte_dmadev_conf *dev_conf);
> +/**< @internal Function used to configure a device. */
> +
> +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to start a configured device. */
> +
> +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to stop a configured device. */
> +
> +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to close a configured device. */
> +
> +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to reset a configured device. */
> +
> +typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
> +				    const struct rte_dmadev_queue_conf *conf);
> +/**< @internal Function used to allocate and set up a virt queue. */
> +
> +typedef int (*dmadev_queue_release_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to release a virt queue. */
> +
> +typedef int (*dmadev_queue_info_t)(struct rte_dmadev *dev, uint16_t vq_id,
> +				   struct rte_dmadev_queue_info *info);
> +/**< @internal Function used to retrieve information of a virt queue. */
> +
> +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
> +				  struct rte_dmadev_stats *stats);
> +/**< @internal Function used to retrieve basic statistics. */
> +
> +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
> +/**< @internal Function used to reset basic statistics. */
> +
> +typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev *dev,
> +		struct rte_dmadev_xstats_name *xstats_names,
> +		uint32_t size);
> +/**< @internal Function used to get names of extended stats. */
> +
> +typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
> +		const uint32_t ids[], uint64_t values[], uint32_t n);
> +/**< @internal Function used to retrieve extended stats. */
> +
> +typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
> +				     const uint32_t ids[], uint32_t nb_ids);
> +/**< @internal Function used to reset extended stats. */

Do we really need both stats and xstats from the very
beginning? I think it is better to start from just
generic stats and add xstats when it is really required.

> +
> +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> +/**< @internal Function used to start dmadev selftest. */
> +
> +/** DMA device operations function pointer table */
> +struct rte_dmadev_ops {

Do we need rte_ prefiix for internal data types?

> +	/**< Get device info. */
> +	dmadev_info_get_t dev_info_get;
> +	/**< Configure device. */
> +	dmadev_configure_t dev_configure;
> +	/**< Start device. */
> +	dmadev_start_t dev_start;
> +	/**< Stop device. */
> +	dmadev_stop_t dev_stop;
> +	/**< Close device. */
> +	dmadev_close_t dev_close;
> +	/**< Reset device. */
> +	dmadev_reset_t dev_reset;
> +
> +	/**< Allocate and set up a virt queue. */
> +	dmadev_queue_setup_t queue_setup;
> +	/**< Release a virt queue. */
> +	dmadev_queue_release_t queue_release;
> +	/**< Retrieve information of a virt queue */
> +	dmadev_queue_info_t queue_info_get;
> +
> +	/**< Get basic statistics. */
> +	dmadev_stats_get_t stats_get;
> +	/**< Reset basic statistics. */
> +	dmadev_stats_reset_t stats_reset;
> +	/**< Get names of extended stats. */
> +	dmadev_xstats_get_names_t xstats_get_names;
> +	/**< Get extended statistics. */
> +	dmadev_xstats_get_t xstats_get;
> +	/**< Reset extended statistics values. */
> +	dmadev_xstats_reset_t xstats_reset;
> +
> +	/**< Device selftest function */
> +	dmadev_selftest_t dev_selftest;
> +};
> +
> +/**
> + * Allocates a new dmadev slot for an DMA device and returns the pointer
> + * to that slot for the driver to use.
> + *
> + * @param name
> + *   Unique identifier name for each device
> + * @param dev_private_size
> + *   Size of private data memory allocated within rte_dmadev object.
> + *   Set to 0 to disable internal memory allocation and allow for
> + *   self-allocation.
> + * @param socket_id
> + *   Socket to allocate resources on.

It should be numa_node. See recent mails from Thomas M.

> + *
> + * @return
> + *   - NULL: Failure to allocate
> + *   - Other: The rte_dmadev structure pointer for the new device
> + */
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_private_size,
> +			int socket_id);
> +
> +/**
> + * Release the specified dmadev device.

"dmadev device" sounds strange. May be "DMA device"?

> + *
> + * @param dev
> + *   The *dmadev* pointer is the address of the *rte_dmadev* structure.
> + *
> + * @return
> + *   - 0 on success, negative on error
> + */
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_PMD_H_ */

[snip]
Matan Azrad July 4, 2021, 3:21 p.m. UTC | #4
From: Chengwen Feng
> This patch introduces 'dmadevice' which is a generic type of DMA
> device.
> 
> The APIs of dmadev library exposes some generic operations which can
> enable configuration and I/O with the DMA devices.
> 
Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?

> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> ---
>  MAINTAINERS                  |   4 +
>  config/rte_config.h          |   3 +
>  lib/dmadev/meson.build       |   6 +
>  lib/dmadev/rte_dmadev.c      | 438 +++++++++++++++++++++
>  lib/dmadev/rte_dmadev.h      | 919
> +++++++++++++++++++++++++++++++++++++++++++
>  lib/dmadev/rte_dmadev_core.h |  98 +++++
>  lib/dmadev/rte_dmadev_pmd.h  | 210 ++++++++++
>  lib/dmadev/version.map       |  32 ++
>  lib/meson.build              |   1 +
>  9 files changed, 1711 insertions(+)
>  create mode 100644 lib/dmadev/meson.build
>  create mode 100644 lib/dmadev/rte_dmadev.c
>  create mode 100644 lib/dmadev/rte_dmadev.h
>  create mode 100644 lib/dmadev/rte_dmadev_core.h
>  create mode 100644 lib/dmadev/rte_dmadev_pmd.h
>  create mode 100644 lib/dmadev/version.map
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4347555..2019783 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -496,6 +496,10 @@ F: drivers/raw/skeleton/
>  F: app/test/test_rawdev.c
>  F: doc/guides/prog_guide/rawdev.rst
> 
> +Dma device API
> +M: Chengwen Feng <fengchengwen@huawei.com>
> +F: lib/dmadev/
> +
> 
>  Memory Pool Drivers
>  -------------------
> diff --git a/config/rte_config.h b/config/rte_config.h
> index 590903c..331a431 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -81,6 +81,9 @@
>  /* rawdev defines */
>  #define RTE_RAWDEV_MAX_DEVS 64
> 
> +/* dmadev defines */
> +#define RTE_DMADEV_MAX_DEVS 64
> +
>  /* ip_fragmentation defines */
>  #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
>  #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
> diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
> new file mode 100644
> index 0000000..c918dae
> --- /dev/null
> +++ b/lib/dmadev/meson.build
> @@ -0,0 +1,6 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2021 HiSilicon Limited.
> +
> +sources = files('rte_dmadev.c')
> +headers = files('rte_dmadev.h', 'rte_dmadev_pmd.h')
> +indirect_headers += files('rte_dmadev_core.h')
> diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
> new file mode 100644
> index 0000000..a94e839
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.c
> @@ -0,0 +1,438 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <stdint.h>
> +
> +#include <rte_log.h>
> +#include <rte_debug.h>
> +#include <rte_dev.h>
> +#include <rte_memory.h>
> +#include <rte_memzone.h>
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_string_fns.h>
> +
> +#include "rte_dmadev.h"
> +#include "rte_dmadev_pmd.h"
> +
> +struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
> +
> +uint16_t
> +rte_dmadev_count(void)
> +{
> +       uint16_t count = 0;
> +       uint16_t i;
> +
> +       for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +               if (rte_dmadevices[i].attached)
> +                       count++;
> +       }
> +
> +       return count;
> +}
> +
> +int
> +rte_dmadev_get_dev_id(const char *name)
> +{
> +       uint16_t i;
> +
> +       if (name == NULL)
> +               return -EINVAL;
> +
> +       for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
> +               if ((strcmp(rte_dmadevices[i].name, name) == 0) &&
> +                   (rte_dmadevices[i].attached == RTE_DMADEV_ATTACHED))
> +                       return i;
> +
> +       return -ENODEV;
> +}
> +
> +int
> +rte_dmadev_socket_id(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       dev = &rte_dmadevices[dev_id];
> +
> +       return dev->socket_id;
> +}
> +
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -
> ENOTSUP);
> +
> +       memset(dev_info, 0, sizeof(struct rte_dmadev_info));
> +       diag = (*dev->dev_ops->dev_info_get)(dev, dev_info);
> +       if (diag != 0)
> +               return diag;
> +
> +       dev_info->device = dev->device;
> +       dev_info->driver_name = dev->driver_name;
> +       dev_info->socket_id = dev->socket_id;
> +
> +       return 0;
> +}
> +
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf
> *dev_conf)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -
> ENOTSUP);
> +
> +       if (dev->started) {
> +               RTE_DMADEV_LOG(ERR,
> +                  "device %u must be stopped to allow configuration", dev_id);
> +               return -EBUSY;
> +       }
> +
> +       diag = (*dev->dev_ops->dev_configure)(dev, dev_conf);
> +       if (diag != 0)
> +               RTE_DMADEV_LOG(ERR, "device %u dev_configure failed, ret =
> %d",
> +                              dev_id, diag);
> +       else
> +               dev->attached = 1;
> +
> +       return diag;
> +}
> +
> +int
> +rte_dmadev_start(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +       if (dev->started != 0) {
> +               RTE_DMADEV_LOG(ERR, "device %u already started", dev_id);
> +               return 0;
> +       }
> +
> +       if (dev->dev_ops->dev_start == NULL)
> +               goto mark_started;
> +
> +       diag = (*dev->dev_ops->dev_start)(dev);
> +       if (diag != 0)
> +               return diag;
> +
> +mark_started:
> +       dev->started = 1;
> +       return 0;
> +}
> +
> +int
> +rte_dmadev_stop(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +       int diag;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       if (dev->started == 0) {
> +               RTE_DMADEV_LOG(ERR, "device %u already stopped", dev_id);
> +               return 0;
> +       }
> +
> +       if (dev->dev_ops->dev_stop == NULL)
> +               goto mark_stopped;
> +
> +       diag = (*dev->dev_ops->dev_stop)(dev);
> +       if (diag != 0)
> +               return diag;
> +
> +mark_stopped:
> +       dev->started = 0;
> +       return 0;
> +}
> +
> +int
> +rte_dmadev_close(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
> +
> +       /* Device must be stopped before it can be closed */
> +       if (dev->started == 1) {
> +               RTE_DMADEV_LOG(ERR, "device %u must be stopped before
> closing",
> +                              dev_id);
> +               return -EBUSY;
> +       }
> +
> +       return (*dev->dev_ops->dev_close)(dev);
> +}
> +
> +int
> +rte_dmadev_reset(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
> +
> +       /* Reset is not dependent on state of the device */
> +       return (*dev->dev_ops->dev_reset)(dev);
> +}
> +
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +                      const struct rte_dmadev_queue_conf *conf)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->queue_setup)(dev, conf);
> +}
> +
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->queue_release)(dev, vq_id);
> +}
> +
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +                         struct rte_dmadev_queue_info *info)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(info, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_info_get, -
> ENOTSUP);
> +
> +       memset(info, 0, sizeof(struct rte_dmadev_queue_info));
> +       return (*dev->dev_ops->queue_info_get)(dev, vq_id, info);
> +}
> +
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +                    struct rte_dmadev_stats *stats)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
> +
> +       return (*dev->dev_ops->stats_get)(dev, vq_id, stats);
> +}
> +
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->stats_reset)(dev, vq_id);
> +}
> +
> +static int
> +xstats_get_count(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
> +}
> +
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +                           struct rte_dmadev_xstats_name *xstats_names,
> +                           uint32_t size)
> +{
> +       struct rte_dmadev *dev;
> +       int cnt_expected_entries;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       cnt_expected_entries = xstats_get_count(dev_id);
> +
> +       if (xstats_names == NULL || cnt_expected_entries < 0 ||
> +           (int)size < cnt_expected_entries || size == 0)
> +               return cnt_expected_entries;
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -
> ENOTSUP);
> +       return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
> +}
> +
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +                     uint64_t values[], uint32_t n)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
> +       RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
> +}
> +
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t
> nb_ids)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
> +}
> +
> +int
> +rte_dmadev_selftest(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -
> ENOTSUP);
> +
> +       return (*dev->dev_ops->dev_selftest)(dev_id);
> +}
> +
> +static inline uint16_t
> +rte_dmadev_find_free_device_index(void)
> +{
> +       uint16_t i;
> +
> +       for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
> +               if (rte_dmadevices[i].attached == RTE_DMADEV_DETACHED)
> +                       return i;
> +       }
> +
> +       return RTE_DMADEV_MAX_DEVS;
> +}
> +
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int
> socket_id)
> +{
> +       struct rte_dmadev *dev;
> +       uint16_t dev_id;
> +
> +       if (rte_dmadev_get_dev_id(name) >= 0) {
> +               RTE_DMADEV_LOG(ERR,
> +                       "device with name %s already allocated!", name);
> +               return NULL;
> +       }
> +
> +       dev_id = rte_dmadev_find_free_device_index();
> +       if (dev_id == RTE_DMADEV_MAX_DEVS) {
> +               RTE_DMADEV_LOG(ERR, "reached maximum number of DMA
> devices");
> +               return NULL;
> +       }
> +
> +       dev = &rte_dmadevices[dev_id];
> +
> +       if (dev_priv_size > 0) {
> +               dev->dev_private = rte_zmalloc_socket("dmadev private",
> +                                    dev_priv_size,
> +                                    RTE_CACHE_LINE_SIZE,
> +                                    socket_id);
> +               if (dev->dev_private == NULL) {
> +                       RTE_DMADEV_LOG(ERR,
> +                               "unable to allocate memory for dmadev");
> +                       return NULL;
> +               }
> +       }
> +
> +       dev->dev_id = dev_id;
> +       dev->socket_id = socket_id;
> +       dev->started = 0;
> +       strlcpy(dev->name, name, RTE_DMADEV_NAME_MAX_LEN);
> +
> +       dev->attached = RTE_DMADEV_ATTACHED;
> +
> +       return dev;
> +}
> +
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev)
> +{
> +       int ret;
> +
> +       if (dev == NULL)
> +               return -EINVAL;
> +
> +       ret = rte_dmadev_close(dev->dev_id);
> +       if (ret != 0)
> +               return ret;
> +
> +       if (dev->dev_private != NULL)
> +               rte_free(dev->dev_private);
> +
> +       memset(dev, 0, sizeof(struct rte_dmadev));
> +       dev->attached = RTE_DMADEV_DETACHED;
> +
> +       return 0;
> +}
> +
> +RTE_LOG_REGISTER(libdmadev_logtype, lib.dmadev, INFO);
> diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
> new file mode 100644
> index 0000000..f74fc6a
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev.h
> @@ -0,0 +1,919 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_H_
> +#define _RTE_DMADEV_H_
> +
> +/**
> + * @file rte_dmadev.h
> + *
> + * RTE DMA (Direct Memory Access) device APIs.
> + *
> + * The generic DMA device diagram:
> + *
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                  ----------------
> + *                  |dma-controller|
> + *                  ----------------
> + *
> + *   The DMA could have multiple HW-queues, each HW-queue could have
> multiple
> + *   capabilities, e.g. whether to support fill operation, supported DMA
> + *   transfter direction and etc.
> + *
> + * The DMA framework is built on the following abstraction model:
> + *
> + *     ------------    ------------
> + *     |virt-queue|    |virt-queue|
> + *     ------------    ------------
> + *            \           /
> + *             \         /
> + *              \       /
> + *            ------------     ------------
> + *            | HW-queue |     | HW-queue |
> + *            ------------     ------------
> + *                   \            /
> + *                    \          /
> + *                     \        /
> + *                     ----------
> + *                     | dmadev |
> + *                     ----------
> + *
> + *   a) The DMA operation request must be submitted to the virt queue, virt
> + *      queues must be created based on HW queues, the DMA device could
> have
> + *      multiple HW queues.
> + *   b) The virt queues on the same HW-queue could represent different
> contexts,
> + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> + *      mem-to-dev transfer scenario.
> + *   NOTE: user could also create multiple virt queues for mem-to-mem
> transfer
> + *         scenario as long as the corresponding driver supports.
> + *
> + * The control plane APIs include
> configure/queue_setup/queue_release/start/
> + * stop/reset/close, in order to start device work, the call sequence must be
> + * as follows:
> + *     - rte_dmadev_configure()
> + *     - rte_dmadev_queue_setup()
> + *     - rte_dmadev_start()
> + *
> + * The dataplane APIs include two parts:
> + *   a) The first part is the submission of operation requests:
> + *        - rte_dmadev_copy()
> + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> + *        - rte_dmadev_fill()
> + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> + *        - rte_dmadev_fence()   - add a fence force ordering between
> operations
> + *        - rte_dmadev_perform() - issue doorbell to hardware
> + *      These APIs could work with different virt queues which have different
> + *      contexts.
> + *      The first four APIs are used to submit the operation request to the virt
> + *      queue, if the submission is successful, a cookie (as type
> + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> + *   b) The second part is to obtain the result of requests:
> + *        - rte_dmadev_completed()
> + *            - return the number of operation requests completed successfully.
> + *        - rte_dmadev_completed_fails()
> + *            - return the number of operation requests failed to complete.
> + *
> + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest,
> provide
> + * information query and self-test capabilities.
> + *
> + * About the dataplane APIs MT-safe, there are two dimensions:
> + *   a) For one virt queue, the submit/completion API could be MT-safe,
> + *      e.g. one thread do submit operation, another thread do completion
> + *      operation.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> + *      operation on virt-queue-0, another thread do operation on virt-queue-
> 1.
> + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> + *      If driver don't support it, it's up to the application to guarantee
> + *      MT-safe.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_compat.h>
> +
> +/**
> + * dma_cookie_t - an opaque DMA cookie
> + *
> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> + * code.
> + * When using cookies, comply with the following rules:
> + * a) Cookies for each virtual queue are independent.
> + * b) For a virt queue, the cookie are monotonically incremented, when it
> reach
> + *    the INT_MAX, it wraps back to zero.
> + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> + *    reset, the virt queue's cookie needs to be reset to zero.
> + * Example:
> + *    step-1: start one dmadev
> + *    step-2: enqueue a copy operation, the cookie return is 0
> + *    step-3: enqueue a copy operation again, the cookie return is 1
> + *    ...
> + *    step-101: stop the dmadev
> + *    step-102: start the dmadev
> + *    step-103: enqueue a copy operation, the cookie return is 0
> + *    ...
> + */
> +typedef int32_t dma_cookie_t;
> +
> +/**
> + * dma_scatterlist - can hold scatter DMA operation request
> + */
> +struct dma_scatterlist {
> +       void *src;
> +       void *dst;
> +       uint32_t length;
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the total number of DMA devices that have been successfully
> + * initialised.
> + *
> + * @return
> + *   The total number of usable DMA devices.
> + */
> +__rte_experimental
> +uint16_t
> +rte_dmadev_count(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Get the device identifier for the named DMA device.
> + *
> + * @param name
> + *   DMA device name to select the DMA device identifier.
> + *
> + * @return
> + *   Returns DMA device identifier on success.
> + *   - <0: Failure to find named DMA device.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_get_dev_id(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Return the NUMA socket to which a device is connected.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   The NUMA socket id to which the device is connected or
> + *   a default of zero if the socket could not be determined.
> + *   - -EINVAL: dev_id value is out of range.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_socket_id(uint16_t dev_id);
> +
> +/**
> + * The capabilities of a DMA device
> + */
> +#define RTE_DMA_DEV_CAPA_M2M   (1ull << 0) /**< Support mem-to-
> mem transfer */
> +#define RTE_DMA_DEV_CAPA_M2D   (1ull << 1) /**< Support mem-to-dev
> transfer */
> +#define RTE_DMA_DEV_CAPA_D2M   (1ull << 2) /**< Support dev-to-mem
> transfer */
> +#define RTE_DMA_DEV_CAPA_D2D   (1ull << 3) /**< Support dev-to-dev
> transfer */
> +#define RTE_DMA_DEV_CAPA_COPY  (1ull << 4) /**< Support copy ops */
> +#define RTE_DMA_DEV_CAPA_FILL  (1ull << 5) /**< Support fill ops */
> +#define RTE_DMA_DEV_CAPA_SG    (1ull << 6) /**< Support scatter-gather
> ops */
> +#define RTE_DMA_DEV_CAPA_FENCE (1ull << 7) /**< Support fence ops */
> +#define RTE_DMA_DEV_CAPA_IOVA  (1ull << 8) /**< Support IOVA as
> DMA address */
> +#define RTE_DMA_DEV_CAPA_VA    (1ull << 9) /**< Support VA as DMA
> address */
> +#define RTE_DMA_DEV_CAPA_MT_VQ (1ull << 10) /**< Support MT-safe
> of one virt queue */
> +#define RTE_DMA_DEV_CAPA_MT_MVQ        (1ull << 11) /**< Support MT-
> safe of multiple virt queues */
> +
> +/**
> + * A structure used to retrieve the contextual information of
> + * an DMA device
> + */
> +struct rte_dmadev_info {
> +       /**
> +        * Fields filled by framewok
> +        */
> +       struct rte_device *device; /**< Generic Device information */
> +       const char *driver_name; /**< Device driver name */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +
> +       /**
> +        * Specification fields filled by driver
> +        */
> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> +       uint16_t max_vqs_per_hw_queue;
> +       /**< Maximum number of virt queues to allocate per HW queue */
> +       uint16_t max_desc;
> +       /**< Maximum allowed number of virt queue descriptors */
> +       uint16_t min_desc;
> +       /**< Minimum allowed number of virt queue descriptors */
> +
> +       /**
> +        * Status fields filled by driver
> +        */
> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> +       uint16_t nb_vqs; /**< Number of virt queues configured */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve the contextual information of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @param[out] dev_info
> + *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
> + *   contextual information of the device.
> + * @return
> + *   - =0: Success, driver updates the contextual information of the DMA
> device
> + *   - <0: Error code returned by the driver info get function.
> + *
> + */
> +__rte_experimental
> +int
> +rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info
> *dev_info);
> +
> +/**
> + * dma_address_type
> + */
> +enum dma_address_type {
> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> +};
> +
> +/**
> + * A structure used to configure a DMA device.
> + */
> +struct rte_dmadev_conf {
> +       enum dma_address_type addr_type; /**< Address type to used */
> +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> +       uint16_t max_vqs; /**< Maximum number of virt queues to use */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Configure a DMA device.
> + *
> + * This function must be invoked first before any other function in the
> + * API. This function can also be re-invoked when a device is in the
> + * stopped state.
> + *
> + * The caller may use rte_dmadev_info_get() to get the capability of each
> + * resources available for this DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device to configure.
> + * @param dev_conf
> + *   The DMA device configuration structure encapsulated into
> rte_dmadev_conf
> + *   object.
> + *
> + * @return
> + *   - =0: Success, device configured.
> + *   - <0: Error code returned by the driver configuration function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf
> *dev_conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Start a DMA device.
> + *
> + * The device start step is the last one and consists of setting the DMA
> + * to start accepting jobs.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device started.
> + *   - <0: Error code returned by the driver start function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_start(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Stop a DMA device.
> + *
> + * The device can be restarted with a call to rte_dmadev_start()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Success, device stopped.
> + *   - <0: Error code returned by the driver stop function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stop(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Close a DMA device.
> + *
> + * The device cannot be restarted after this call.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *  - =0: Successfully closing device
> + *  - <0: Failure to close device
> + */
> +__rte_experimental
> +int
> +rte_dmadev_close(uint16_t dev_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset a DMA device.
> + *
> + * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in
> the
> + * sense similar to hard or soft reset.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - =0: Successful reset device.
> + *   - <0: Failure to reset device.
> + *   - (-ENOTSUP): If the device doesn't support this function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_reset(uint16_t dev_id);
> +
> +/**
> + * dma_transfer_direction
> + */
> +enum dma_transfer_direction {
> +       DMA_MEM_TO_MEM,
> +       DMA_MEM_TO_DEV,
> +       DMA_DEV_TO_MEM,
> +       DMA_DEV_TO_DEV,
> +};
> +
> +/**
> + * A structure used to configure a DMA virt queue.
> + */
> +struct rte_dmadev_queue_conf {
> +       enum dma_transfer_direction direction;
> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt
> queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */
> +       void *dev_ctx; /**< Device specific context */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Allocate and set up a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param conf
> + *   The queue configuration structure encapsulated into
> rte_dmadev_queue_conf
> + *   object.
> + *
> + * @return
> + *   - >=0: Allocate virt queue success, it is virt queue id.
> + *   - <0: Error code returned by the driver queue setup function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_setup(uint16_t dev_id,
> +                      const struct rte_dmadev_queue_conf *conf);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Release a virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue which return by queue setup.
> + *
> + * @return
> + *   - =0: Successful release the virt queue.
> + *   - <0: Error code returned by the driver queue release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id);
> +
> +/**
> + * A structure used to retrieve information of a DMA virt queue.
> + */
> +struct rte_dmadev_queue_info {
> +       enum dma_transfer_direction direction;
> +       /**< Associated transfer direction */
> +       uint16_t hw_queue_id; /**< The HW queue on which to create virt
> queue */
> +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> +       uint64_t dev_flags; /**< Device specific flags */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve information of a DMA virt queue.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue which return by queue setup.
> + * @param[out] info
> + *   The queue info structure encapsulated into rte_dmadev_queue_info
> object.
> + *
> + * @return
> + *   - =0: Successful retrieve information.
> + *   - <0: Error code returned by the driver queue release function.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
> +                         struct rte_dmadev_queue_info *info);
> +
> +#include "rte_dmadev_core.h"
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a copy operation onto the DMA virt queue.
> + *
> + * This queues up a copy operation to be performed by hardware, but does
> not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param src
> + *   The address of the source buffer.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the data to be copied.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,

Did you consider also mbuf API usage for memory descriptor?

> +               uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->copy)(dev, vq_id, src, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list copy operation onto the DMA virt queue.
> + *
> + * This queues up a scatter list copy operation to be performed by
> hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> +                  const struct dma_scatterlist *sg,
> +                  uint32_t sg_len, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a fill operation onto the DMA virt queue
> + *
> + * This queues up a fill operation to be performed by hardware, but does
> not
> + * trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param dst
> + *   The address of the destination buffer.
> + * @param length
> + *   The length of the destination buffer.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +               void *dst, uint32_t length, uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Enqueue a scatter list fill operation onto the DMA virt queue
> + *
> + * This queues up a scatter list fill operation to be performed by hardware,
> + * but does not trigger hardware to begin that operation.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param pattern
> + *   The pattern to populate the destination buffer with.
> + * @param sg
> + *   The pointer of scatterlist.
> + * @param sg_len
> + *   The number of scatterlist elements.
> + * @param flags
> + *   An opaque flags for this operation.
> + *
> + * @return
> + *   dma_cookie_t: please refer to the corresponding definition.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline dma_cookie_t
> +rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
> +                  const struct dma_scatterlist *sg, uint32_t sg_len,
> +                  uint64_t flags)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fill_sg)(dev, vq_id, pattern, sg, sg_len, flags);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Add a fence to force ordering between operations
> + *
> + * This adds a fence to a sequence of operations to enforce ordering, such
> that
> + * all operations enqueued before the fence must be completed before
> operations
> + * after the fence.
> + * NOTE: Since this fence may be added as a flag to the last operation
> enqueued,
> + * this API may not function correctly when called immediately after an
> + * "rte_dmadev_perform" call i.e. before any new operations are
> enqueued.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful add fence.
> + *   - <0: Failure to add fence.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->fence)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger hardware to begin performing enqueued operations
> + *
> + * This API is used to write the "doorbell" to the hardware to trigger it
> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + *
> + * @return
> + *   - =0: Successful trigger hardware.
> + *   - <0: Failure to trigger hardware.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline int
> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->perform)(dev, vq_id);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that have been successful completed.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_cpls
> + *   The maximum number of completed operations that can be processed.
> + * @param[out] cookie
> + *   The last completed operation's cookie.
> + * @param[out] has_error
> + *   Indicates if there are transfer error.
> + *
> + * @return
> + *   The number of operations that successful completed.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t
> nb_cpls,
> +                    dma_cookie_t *cookie, bool *has_error)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       has_error = false;
> +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Returns the number of operations that failed to complete.
> + * NOTE: This API was used when rte_dmadev_completed has_error was
> set.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue.
> + * @param nb_status
> + *   Indicates the size of status array.
> + * @param[out] status
> + *   The error code of operations that failed to complete.
> + * @param[out] cookie
> + *   The last failed completed operation's cookie.
> + *
> + * @return
> + *   The number of operations that failed to complete.
> + *
> + * NOTE: The caller must ensure that the input parameter is valid and the
> + *       corresponding device supports the operation.
> + */
> +__rte_experimental
> +static inline uint16_t
> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> +                          const uint16_t nb_status, uint32_t *status,
> +                          dma_cookie_t *cookie)
> +{
> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> +}
> +
> +struct rte_dmadev_stats {
> +       uint64_t enqueue_fail_count;
> +       /**< Conut of all operations which failed enqueued */
> +       uint64_t enqueued_count;
> +       /**< Count of all operations which successful enqueued */
> +       uint64_t completed_fail_count;
> +       /**< Count of all operations which failed to complete */
> +       uint64_t completed_count;
> +       /**< Count of all operations which successful complete */
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve basic statistics of a or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + * @param[out] stats
> + *   The basic statistics structure encapsulated into rte_dmadev_stats
> + *   object.
> + *
> + * @return
> + *   - =0: Successful retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
> +                    struct rte_dmadev_stats *stats);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset basic statistics of a or all DMA virt queue(s).
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param vq_id
> + *   The identifier of virt queue, -1 means all virt queues.
> + *
> + * @return
> + *   - =0: Successful retrieve stats.
> + *   - <0: Failure to retrieve stats.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
> +
> +/** Maximum name length for extended statistics counters */
> +#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
> +
> +/**
> + * A name-key lookup element for extended statistics.
> + *
> + * This structure is used to map between names and ID numbers
> + * for extended ethdev statistics.
> + */
> +struct rte_dmadev_xstats_name {
> +       char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
> +};
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve names of extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param[out] xstats_names
> + *   Block of memory to insert names into. Must be at least size in capacity.
> + *   If set to NULL, function returns required capacity.
> + * @param size
> + *   Capacity of xstats_names (number of names).
> + * @return
> + *   - positive value lower or equal to size: success. The return value
> + *     is the number of entries filled in the stats table.
> + *   - positive value higher than size: error, the given statistics table
> + *     is too small. The return value corresponds to the size that should
> + *     be given to succeed. The entries in the table are not valid and
> + *     shall not be used by the caller.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_names_get(uint16_t dev_id,
> +                           struct rte_dmadev_xstats_name *xstats_names,
> +                           uint32_t size);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Retrieve extended statistics of a DMA device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   The id numbers of the stats to get. The ids can be got from the stat
> + *   position in the stat list from rte_dmadev_get_xstats_names().
> + * @param[out] values
> + *   The values for each stats request by ID.
> + * @param n
> + *   The number of stats requested.
> + *
> + * @return
> + *   - positive value: number of stat entries filled into the values array.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
> +                     uint64_t values[], uint32_t n);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Reset the values of the xstats of the selected component in the device.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + * @param ids
> + *   Selects specific statistics to be reset. When NULL, all statistics
> + *   will be reset. If non-NULL, must point to array of at least
> + *   *nb_ids* size.
> + * @param nb_ids
> + *   The number of ids available from the *ids* array. Ignored when ids is
> NULL.
> + *
> + * @return
> + *   - zero: successfully reset the statistics to zero.
> + *   - negative value on error.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t
> nb_ids);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Trigger the dmadev self test.
> + *
> + * @param dev_id
> + *   The identifier of the device.
> + *
> + * @return
> + *   - 0: Selftest successful.
> + *   - -ENOTSUP if the device doesn't support selftest
> + *   - other values < 0 on failure.
> + */
> +__rte_experimental
> +int
> +rte_dmadev_selftest(uint16_t dev_id);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_H_ */
> diff --git a/lib/dmadev/rte_dmadev_core.h
> b/lib/dmadev/rte_dmadev_core.h
> new file mode 100644
> index 0000000..a3afea2
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_core.h
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_CORE_H_
> +#define _RTE_DMADEV_CORE_H_
> +
> +/**
> + * @file
> + *
> + * RTE DMA Device internal header.
> + *
> + * This header contains internal data types. But they are still part of the
> + * public API because they are used by inline public functions.
> + */
> +
> +struct rte_dmadev;
> +
> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                                     void *src, void *dst,
> +                                     uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a copy operation. */
> +
> +typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                                        const struct dma_scatterlist *sg,
> +                                        uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list copy operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t
> vq_id,
> +                                     uint64_t pattern, void *dst,
> +                                     uint32_t length, uint64_t flags);
> +/**< @internal Function used to enqueue a fill operation. */
> +
> +typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                       uint64_t pattern, const struct dma_scatterlist *sg,
> +                       uint32_t sg_len, uint64_t flags);
> +/**< @internal Function used to enqueue a scatter list fill operation. */
> +
> +typedef int (*dmadev_fence_t)(struct rte_dmadev *dev, uint16_t vq_id);
> +/**< @internal Function used to add a fence ordering between operations.
> */
> +
> +typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t
> vq_id);
> +/**< @internal Function used to trigger hardware to begin performing. */
> +
> +typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev,
> uint16_t vq_id,
> +                                      const uint16_t nb_cpls,
> +                                      dma_cookie_t *cookie, bool *has_error);
> +/**< @internal Function used to return number of successful completed
> operations */
> +
> +typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
> +                       uint16_t vq_id, const uint16_t nb_status,
> +                       uint32_t *status, dma_cookie_t *cookie);
> +/**< @internal Function used to return number of failed completed
> operations */
> +
> +#define RTE_DMADEV_NAME_MAX_LEN        64 /**< Max length of name
> of DMA PMD */
> +
> +struct rte_dmadev_ops;
> +
> +/**
> + * The data structure associated with each DMA device.
> + */
> +struct rte_dmadev {
> +       /**< Enqueue a copy operation onto the DMA device. */
> +       dmadev_copy_t copy;
> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> +       dmadev_copy_sg_t copy_sg;
> +       /**< Enqueue a fill operation onto the DMA device. */
> +       dmadev_fill_t fill;
> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> +       dmadev_fill_sg_t fill_sg;
> +       /**< Add a fence to force ordering between operations. */
> +       dmadev_fence_t fence;
> +       /**< Trigger hardware to begin performing enqueued operations. */
> +       dmadev_perform_t perform;
> +       /**< Returns the number of operations that successful completed. */
> +       dmadev_completed_t completed;
> +       /**< Returns the number of operations that failed to complete. */
> +       dmadev_completed_fails_t completed_fails;
> +
> +       void *dev_private; /**< PMD-specific private data */
> +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by
> PMD */
> +
> +       uint16_t dev_id; /**< Device ID for this instance */
> +       int socket_id; /**< Socket ID where memory is allocated */
> +       struct rte_device *device;
> +       /**< Device info. supplied during device initialization */
> +       const char *driver_name; /**< Driver info. supplied by probing */
> +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> +
> +       RTE_STD_C11
> +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> +
> +} __rte_cache_aligned;
> +
> +extern struct rte_dmadev rte_dmadevices[];
> +
> +#endif /* _RTE_DMADEV_CORE_H_ */
> diff --git a/lib/dmadev/rte_dmadev_pmd.h
> b/lib/dmadev/rte_dmadev_pmd.h
> new file mode 100644
> index 0000000..ef03cf7
> --- /dev/null
> +++ b/lib/dmadev/rte_dmadev_pmd.h
> @@ -0,0 +1,210 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2021 HiSilicon Limited.
> + */
> +
> +#ifndef _RTE_DMADEV_PMD_H_
> +#define _RTE_DMADEV_PMD_H_
> +
> +/** @file
> + * RTE DMA PMD APIs
> + *
> + * @note
> + * Driver facing APIs for a DMA device. These are not to be called directly by
> + * any application.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <string.h>
> +
> +#include <rte_dev.h>
> +#include <rte_log.h>
> +#include <rte_common.h>
> +
> +#include "rte_dmadev.h"
> +
> +extern int libdmadev_logtype;
> +
> +#define RTE_DMADEV_LOG(level, fmt, args...) \
> +       rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
> +               __func__, ##args)
> +
> +/* Macros to check for valid device */
> +#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
> +       if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +               RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +               return retval; \
> +       } \
> +} while (0)
> +
> +#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
> +       if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
> +               RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
> +               return; \
> +       } \
> +} while (0)
> +
> +#define RTE_DMADEV_DETACHED  0
> +#define RTE_DMADEV_ATTACHED  1
> +
> +/**
> + * Validate if the DMA device index is a valid attached DMA device.
> + *
> + * @param dev_id
> + *   DMA device index.
> + *
> + * @return
> + *   - If the device index is valid (1) or not (0).
> + */
> +static inline unsigned
> +rte_dmadev_pmd_is_valid_dev(uint16_t dev_id)
> +{
> +       struct rte_dmadev *dev;
> +
> +       if (dev_id >= RTE_DMADEV_MAX_DEVS)
> +               return 0;
> +
> +       dev = &rte_dmadevices[dev_id];
> +       if (dev->attached != RTE_DMADEV_ATTACHED)
> +               return 0;
> +       else
> +               return 1;
> +}
> +
> +/**
> + * Definitions of control-plane functions exported by a driver through the
> + * generic structure of type *rte_dmadev_ops* supplied in the
> *rte_dmadev*
> + * structure associated with a device.
> + */
> +
> +typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
> +                                struct rte_dmadev_info *dev_info);
> +/**< @internal Function used to get device information of a device. */
> +
> +typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
> +                                 const struct rte_dmadev_conf *dev_conf);
> +/**< @internal Function used to configure a device. */
> +
> +typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to start a configured device. */
> +
> +typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to stop a configured device. */
> +
> +typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to close a configured device. */
> +
> +typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
> +/**< @internal Function used to reset a configured device. */
> +
> +typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
> +                                   const struct rte_dmadev_queue_conf *conf);
> +/**< @internal Function used to allocate and set up a virt queue. */
> +
> +typedef int (*dmadev_queue_release_t)(struct rte_dmadev *dev,
> uint16_t vq_id);
> +/**< @internal Function used to release a virt queue. */
> +
> +typedef int (*dmadev_queue_info_t)(struct rte_dmadev *dev, uint16_t
> vq_id,
> +                                  struct rte_dmadev_queue_info *info);
> +/**< @internal Function used to retrieve information of a virt queue. */
> +
> +typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
> +                                 struct rte_dmadev_stats *stats);
> +/**< @internal Function used to retrieve basic statistics. */
> +
> +typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
> +/**< @internal Function used to reset basic statistics. */
> +
> +typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev
> *dev,
> +               struct rte_dmadev_xstats_name *xstats_names,
> +               uint32_t size);
> +/**< @internal Function used to get names of extended stats. */
> +
> +typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
> +               const uint32_t ids[], uint64_t values[], uint32_t n);
> +/**< @internal Function used to retrieve extended stats. */
> +
> +typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
> +                                    const uint32_t ids[], uint32_t nb_ids);
> +/**< @internal Function used to reset extended stats. */
> +
> +typedef int (*dmadev_selftest_t)(uint16_t dev_id);
> +/**< @internal Function used to start dmadev selftest. */
> +
> +/** DMA device operations function pointer table */
> +struct rte_dmadev_ops {
> +       /**< Get device info. */
> +       dmadev_info_get_t dev_info_get;
> +       /**< Configure device. */
> +       dmadev_configure_t dev_configure;
> +       /**< Start device. */
> +       dmadev_start_t dev_start;
> +       /**< Stop device. */
> +       dmadev_stop_t dev_stop;
> +       /**< Close device. */
> +       dmadev_close_t dev_close;
> +       /**< Reset device. */
> +       dmadev_reset_t dev_reset;
> +
> +       /**< Allocate and set up a virt queue. */
> +       dmadev_queue_setup_t queue_setup;
> +       /**< Release a virt queue. */
> +       dmadev_queue_release_t queue_release;
> +       /**< Retrieve information of a virt queue */
> +       dmadev_queue_info_t queue_info_get;
> +
> +       /**< Get basic statistics. */
> +       dmadev_stats_get_t stats_get;
> +       /**< Reset basic statistics. */
> +       dmadev_stats_reset_t stats_reset;
> +       /**< Get names of extended stats. */
> +       dmadev_xstats_get_names_t xstats_get_names;
> +       /**< Get extended statistics. */
> +       dmadev_xstats_get_t xstats_get;
> +       /**< Reset extended statistics values. */
> +       dmadev_xstats_reset_t xstats_reset;
> +
> +       /**< Device selftest function */
> +       dmadev_selftest_t dev_selftest;
> +};
> +
> +/**
> + * Allocates a new dmadev slot for an DMA device and returns the pointer
> + * to that slot for the driver to use.
> + *
> + * @param name
> + *   Unique identifier name for each device
> + * @param dev_private_size
> + *   Size of private data memory allocated within rte_dmadev object.
> + *   Set to 0 to disable internal memory allocation and allow for
> + *   self-allocation.
> + * @param socket_id
> + *   Socket to allocate resources on.
> + *
> + * @return
> + *   - NULL: Failure to allocate
> + *   - Other: The rte_dmadev structure pointer for the new device
> + */
> +struct rte_dmadev *
> +rte_dmadev_pmd_allocate(const char *name, size_t dev_private_size,
> +                       int socket_id);
> +
> +/**
> + * Release the specified dmadev device.
> + *
> + * @param dev
> + *   The *dmadev* pointer is the address of the *rte_dmadev* structure.
> + *
> + * @return
> + *   - 0 on success, negative on error
> + */
> +int
> +rte_dmadev_pmd_release(struct rte_dmadev *dev);
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_DMADEV_PMD_H_ */
> diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
> new file mode 100644
> index 0000000..383b3ca
> --- /dev/null
> +++ b/lib/dmadev/version.map
> @@ -0,0 +1,32 @@
> +EXPERIMENTAL {
> +       global:
> +
> +       rte_dmadev_count;
> +       rte_dmadev_get_dev_id;
> +       rte_dmadev_socket_id;
> +       rte_dmadev_info_get;
> +       rte_dmadev_configure;
> +       rte_dmadev_start;
> +       rte_dmadev_stop;
> +       rte_dmadev_close;
> +       rte_dmadev_reset;
> +       rte_dmadev_queue_setup;
> +       rte_dmadev_queue_release;
> +       rte_dmadev_queue_info_get;
> +       rte_dmadev_copy;
> +       rte_dmadev_copy_sg;
> +       rte_dmadev_fill;
> +       rte_dmadev_fill_sg;
> +       rte_dmadev_fence;
> +       rte_dmadev_perform;
> +       rte_dmadev_completed;
> +       rte_dmadev_completed_fails;
> +       rte_dmadev_stats_get;
> +       rte_dmadev_stats_reset;
> +       rte_dmadev_xstats_names_get;
> +       rte_dmadev_xstats_get;
> +       rte_dmadev_xstats_reset;
> +       rte_dmadev_selftest;
> +
> +       local: *;
> +};
> diff --git a/lib/meson.build b/lib/meson.build
> index 1673ca4..68d239f 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -60,6 +60,7 @@ libraries = [
>          'bpf',
>          'graph',
>          'node',
> +        'dmadev',
>  ]
> 
>  if is_windows
> --
> 2.8.1
Bruce Richardson July 5, 2021, 10:52 a.m. UTC | #5
On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> >
> > This patch introduces 'dmadevice' which is a generic type of DMA
> > device.
> >
> > The APIs of dmadev library exposes some generic operations which can
> > enable configuration and I/O with the DMA devices.
> >
> > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> 
> Thanks for v1.
> 
> I would suggest finalizing  lib/dmadev/rte_dmadev.h before doing the
> implementation so that you don't need
> to waste time on rewoking the implementation.
> 

I actually like having the .c file available too. Before we lock down the
.h file and the API, I want to verify the performance of our drivers with
the implementation, and having a working .c file is obviously necessary for
that. So I appreciate having it as part of the RFC.

> Comments inline.
> 
> > ---
<snip>
> > + *
> > + * The DMA framework is built on the following abstraction model:
> > + *
> > + *     ------------    ------------
> > + *     |virt-queue|    |virt-queue|
> > + *     ------------    ------------
> > + *            \           /
> > + *             \         /
> > + *              \       /
> > + *            ------------     ------------
> > + *            | HW-queue |     | HW-queue |
> > + *            ------------     ------------
> > + *                   \            /
> > + *                    \          /
> > + *                     \        /
> > + *                     ----------
> > + *                     | dmadev |
> > + *                     ----------
> 
> Continuing the discussion with @Morten Brørup , I think, we need to
> finalize the model.
> 

+1 and the terminology with regards to queues and channels. With our ioat
hardware, each HW queue was called a channel for instance.

> > + *   a) The DMA operation request must be submitted to the virt queue, virt
> > + *      queues must be created based on HW queues, the DMA device could have
> > + *      multiple HW queues.
> > + *   b) The virt queues on the same HW-queue could represent different contexts,
> > + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> > + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> > + *      mem-to-dev transfer scenario.
> > + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> > + *         scenario as long as the corresponding driver supports.
> > + *
> > + * The control plane APIs include configure/queue_setup/queue_release/start/
> > + * stop/reset/close, in order to start device work, the call sequence must be
> > + * as follows:
> > + *     - rte_dmadev_configure()
> > + *     - rte_dmadev_queue_setup()
> > + *     - rte_dmadev_start()
> 
> Please add reconfigure behaviour etc, Please check the
> lib/regexdev/rte_regexdev.h
> introduction. I have added similar ones so you could reuse as much as possible.
> 
> 
> > + * The dataplane APIs include two parts:
> > + *   a) The first part is the submission of operation requests:
> > + *        - rte_dmadev_copy()
> > + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> > + *        - rte_dmadev_fill()
> > + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> > + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> > + *        - rte_dmadev_perform() - issue doorbell to hardware
> > + *      These APIs could work with different virt queues which have different
> > + *      contexts.
> > + *      The first four APIs are used to submit the operation request to the virt
> > + *      queue, if the submission is successful, a cookie (as type
> > + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> > + *   b) The second part is to obtain the result of requests:
> > + *        - rte_dmadev_completed()
> > + *            - return the number of operation requests completed successfully.
> > + *        - rte_dmadev_completed_fails()
> > + *            - return the number of operation requests failed to complete.
> > + *
> > + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> > + * information query and self-test capabilities.
> > + *
> > + * About the dataplane APIs MT-safe, there are two dimensions:
> > + *   a) For one virt queue, the submit/completion API could be MT-safe,
> > + *      e.g. one thread do submit operation, another thread do completion
> > + *      operation.
> > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> > + *      If driver don't support it, it's up to the application to guarantee
> > + *      MT-safe.
> > + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> > + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> > + *      If driver don't support it, it's up to the application to guarantee
> > + *      MT-safe.
> 
> From an application PoV it may not be good to write portable
> applications. Please check
> latest thread with @Morten Brørup
> 
> > + */
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include <rte_common.h>
> > +#include <rte_memory.h>
> > +#include <rte_errno.h>
> > +#include <rte_compat.h>
> 
> Sort in alphabetical order.
> 
> > +
> > +/**
> > + * dma_cookie_t - an opaque DMA cookie
> 
> Since we are defining the behaviour is not opaque any more.
> I think, it is better to call ring_idx or so.
> 

+1 for ring index. We don't need a separate type for it though, just
document the index as an unsigned return value.

> 
> > +#define RTE_DMA_DEV_CAPA_MT_MVQ (1ull << 11) /**< Support MT-safe of multiple virt queues */
> 
> Please lot of @see for all symbols where it is being used. So that one
> can understand the full scope of
> symbols. See below example.
> 
> #define RTE_REGEXDEV_CAPA_RUNTIME_COMPILATION_F (1ULL << 0)
> /**< RegEx device does support compiling the rules at runtime unlike
>  * loading only the pre-built rule database using
>  * struct rte_regexdev_config::rule_db in rte_regexdev_configure()
>  *
>  * @see struct rte_regexdev_config::rule_db, rte_regexdev_configure()
>  * @see struct rte_regexdev_info::regexdev_capa
>  */
> 
> > + *
> > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > + * code.
> > + * When using cookies, comply with the following rules:
> > + * a) Cookies for each virtual queue are independent.
> > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > + *    the INT_MAX, it wraps back to zero.

I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
value, it means that we cannot use implicit wrap-around inside the CPU and
have to check for the INT_MAX value. Better to:
1. Specify that it wraps at UINT16_MAX which allows us to just use a
uint16_t internally and wrap-around automatically, or:
2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
drivers the flexibility at what value to wrap around.

> > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > + *    reset, the virt queue's cookie needs to be reset to zero.
> > + * Example:
> > + *    step-1: start one dmadev
> > + *    step-2: enqueue a copy operation, the cookie return is 0
> > + *    step-3: enqueue a copy operation again, the cookie return is 1
> > + *    ...
> > + *    step-101: stop the dmadev
> > + *    step-102: start the dmadev
> > + *    step-103: enqueue a copy operation, the cookie return is 0
> > + *    ...
> > + */
> 
> Good explanation.
> 
> > +typedef int32_t dma_cookie_t;
> 

As I mentioned before, I'd just remove this, and use regular int types,
with "ring_idx" as the name.

> 
> > +
> > +/**
> > + * dma_scatterlist - can hold scatter DMA operation request
> > + */
> > +struct dma_scatterlist {
> 
> I prefer to change scatterlist -> sg
> i.e rte_dma_sg
> 
> > +       void *src;
> > +       void *dst;
> > +       uint32_t length;
> > +};
> > +
> 
> > +
> > +/**
> > + * A structure used to retrieve the contextual information of
> > + * an DMA device
> > + */
> > +struct rte_dmadev_info {
> > +       /**
> > +        * Fields filled by framewok
> 
> typo.
> 
> > +        */
> > +       struct rte_device *device; /**< Generic Device information */
> > +       const char *driver_name; /**< Device driver name */
> > +       int socket_id; /**< Socket ID where memory is allocated */
> > +
> > +       /**
> > +        * Specification fields filled by driver
> > +        */
> > +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> > +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> > +       uint16_t max_vqs_per_hw_queue;
> > +       /**< Maximum number of virt queues to allocate per HW queue */
> > +       uint16_t max_desc;
> > +       /**< Maximum allowed number of virt queue descriptors */
> > +       uint16_t min_desc;
> > +       /**< Minimum allowed number of virt queue descriptors */
> 
> Please add max_nb_segs. i.e maximum number of segments supported.
> 
> > +
> > +       /**
> > +        * Status fields filled by driver
> > +        */
> > +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> > +       uint16_t nb_vqs; /**< Number of virt queues configured */
> > +};
> > + i
> > +
> > +/**
> > + * dma_address_type
> > + */
> > +enum dma_address_type {
> > +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> > +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> > +};
> > +
> > +/**
> > + * A structure used to configure a DMA device.
> > + */
> > +struct rte_dmadev_conf {
> > +       enum dma_address_type addr_type; /**< Address type to used */
> 
> I think, there are 3 kinds of limitations/capabilities.
> 
> When the system is configured as IOVA as VA
> 1) Device supports any VA address like memory from rte_malloc(),
> rte_memzone(), malloc, stack memory
> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> memory backed by hugepage and added to DMA map.
> 
> When the system is configured as IOVA as PA
> 1) Devices support only PA addresses .
> 
> IMO, Above needs to be  advertised as capability and application needs
> to align with that
> and I dont think application requests the driver to work in any of the modes.
> 
> 

I don't think we need this level of detail for addressing capabilities.
Unless I'm missing something, the hardware should behave exactly as other
hardware does taking in iova's.  If the user wants to check whether virtual
addresses to pinned memory can be used directly, the user can call
"rte_eal_iova_mode". We can't have a situation where some hardware uses one
type of addresses and another hardware the other.

Therefore, the only additional addressing capability we should need to
report is that the hardware can use SVM/SVA and use virtual addresses not
in hugepage memory.

> 
> > +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> > +       uint16_t max_vqs; /**< Maximum number of virt queues to use */
> 
> You need to what is max value allowed etc i.e it is based on
> info_get() and mention the field
> in info structure
> 
> 
> > +
> > +/**
> > + * dma_transfer_direction
> > + */
> > +enum dma_transfer_direction {
> 
> rte_dma_transter_direction
> 
> > +       DMA_MEM_TO_MEM,
> > +       DMA_MEM_TO_DEV,
> > +       DMA_DEV_TO_MEM,
> > +       DMA_DEV_TO_DEV,
> > +};
> > +
> > +/**
> > + * A structure used to configure a DMA virt queue.
> > + */
> > +struct rte_dmadev_queue_conf {
> > +       enum dma_transfer_direction direction;
> 
> 
> > +       /**< Associated transfer direction */
> > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > +       uint64_t dev_flags; /**< Device specific flags */
> 
> Use of this? Need more comments on this.
> Since it is in slowpath, We can have non opaque names here based on
> each driver capability.
> 
> 
> > +       void *dev_ctx; /**< Device specific context */
> 
> Use of this ? Need more comment ont this.
> 

I think this should be dropped. We should not have any opaque
device-specific info in these structs, rather if a particular device needs
parameters we should call them out. Drivers for which it's not relevant can
ignore them (and report same in capability if necessary). Since this is not
a dataplane API, we aren't concerned too much about perf and can size the
struct appropriately.

> 
> Please add some good amount of reserved bits and have API to init this
> structure for future ABI stability, say rte_dmadev_queue_config_init()
> or so.
> 

I don't think that is necessary. Since the config struct is used only as
parameter to the config function, any changes to it can be managed by
versioning that single function. Padding would only be necessary if we had
an array of these config structs somewhere.

> 
> > +
> > +/**
> > + * A structure used to retrieve information of a DMA virt queue.
> > + */
> > +struct rte_dmadev_queue_info {
> > +       enum dma_transfer_direction direction;
> 
> A queue may support all directions so I think it should be a bitfield.
> 
> > +       /**< Associated transfer direction */
> > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > +       uint64_t dev_flags; /**< Device specific flags */
> > +};
> > +
> 
> > +__rte_experimental
> > +static inline dma_cookie_t
> > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > +                  const struct dma_scatterlist *sg,
> > +                  uint32_t sg_len, uint64_t flags)
> 
> I would like to change this as:
> rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> rte_dma_sg *src, uint32_t nb_src,
> const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> 
> 
> 
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Enqueue a fill operation onto the DMA virt queue
> > + *
> > + * This queues up a fill operation to be performed by hardware, but does not
> > + * trigger hardware to begin that operation.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + * @param pattern
> > + *   The pattern to populate the destination buffer with.
> > + * @param dst
> > + *   The address of the destination buffer.
> > + * @param length
> > + *   The length of the destination buffer.
> > + * @param flags
> > + *   An opaque flags for this operation.
> 
> PLEASE REMOVE opaque stuff from fastpath it will be a pain for
> application writers as
> they need to write multiple combinations of fastpath. flags are OK, if
> we have a valid
> generic flag now to control the transfer behavior.
> 

+1. Flags need to be explicitly listed. If we don't have any flags for now,
we can specify that the value must be given as zero and it's for future
use.

> 
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Add a fence to force ordering between operations
> > + *
> > + * This adds a fence to a sequence of operations to enforce ordering, such that
> > + * all operations enqueued before the fence must be completed before operations
> > + * after the fence.
> > + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> > + * this API may not function correctly when called immediately after an
> > + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + *
> > + * @return
> > + *   - =0: Successful add fence.
> > + *   - <0: Failure to add fence.
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->fence)(dev, vq_id);
> > +}
> 
> Since HW submission is in a queue(FIFO) the ordering is always
> maintained. Right?
> Could you share more details and use case of fence() from
> driver/application PoV?
> 

There are different kinds of ordering to consider, ordering of completions
and the ordering of operations. While jobs are reported as completed to the
user in order, for performance hardware, may overlap individual jobs within
a burst (or even across bursts). Therefore, we need a fence operation to
inform hardware that one job should not be started until the other has
fully completed.

> 
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Trigger hardware to begin performing enqueued operations
> > + *
> > + * This API is used to write the "doorbell" to the hardware to trigger it
> > + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + *
> > + * @return
> > + *   - =0: Successful trigger hardware.
> > + *   - <0: Failure to trigger hardware.
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline int
> > +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->perform)(dev, vq_id);
> > +}
> 
> Since we have additional function call overhead in all the
> applications for this scheme, I would like to understand
> the use of doing this way vs enq does the doorbell implicitly from
> driver/application PoV?
> 

In our benchmarks it's just faster. When we tested it, the overhead of the
function calls was noticably less than the cost of building up the
parameter array(s) for passing the jobs in as a burst. [We don't see this
cost with things like NIC I/O since DPDK tends to already have the mbuf
fully populated before the TX call anyway.]

> 
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that have been successful completed.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> > + * @param nb_cpls
> > + *   The maximum number of completed operations that can be processed.
> > + * @param[out] cookie
> > + *   The last completed operation's cookie.
> > + * @param[out] has_error
> > + *   Indicates if there are transfer error.
> > + *
> > + * @return
> > + *   The number of operations that successful completed.
> 
> successfully
> 
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline uint16_t
> > +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> > +                    dma_cookie_t *cookie, bool *has_error)
> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       has_error = false;
> > +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> 
> It may be better to have cookie/ring_idx as third argument.
> 

No strong opinions here, but having it as in the code above means all
input parameters come before all output, which makes sense to me.

> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * Returns the number of operations that failed to complete.
> > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > + *
> > + * @param dev_id
> > + *   The identifier of the device.
> > + * @param vq_id
> > + *   The identifier of virt queue.
> (> + * @param nb_status
> > + *   Indicates the size  of status array.
> > + * @param[out] status
> > + *   The error code of operations that failed to complete.
> > + * @param[out] cookie
> > + *   The last failed completed operation's cookie.
> > + *
> > + * @return
> > + *   The number of operations that failed to complete.
> > + *
> > + * NOTE: The caller must ensure that the input parameter is valid and the
> > + *       corresponding device supports the operation.
> > + */
> > +__rte_experimental
> > +static inline uint16_t
> > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > +                          const uint16_t nb_status, uint32_t *status,
> > +                          dma_cookie_t *cookie)
> 
> IMO, it is better to move cookie/rind_idx at 3.
> Why it would return any array of errors? since it called after
> rte_dmadev_completed() has
> has_error. Is it better to change
> 
> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> *cookie,  uint32_t *status)
> 
> I also think, we may need to set status as bitmask and enumerate all
> the combination of error codes
> of all the driver and return string from driver existing rte_flow_error
> 
> See
> struct rte_flow_error {
>         enum rte_flow_error_type type; /**< Cause field and error types. */
>         const void *cause; /**< Object responsible for the error. */
>         const char *message; /**< Human-readable error message. */
> };
> 

I think we need a multi-return value API here, as we may add operations in
future which have non-error status values to return. The obvious case is
DMA engines which support "compare" operations. In that case a successful
compare (as in there were no DMA or HW errors) can return "equal" or
"not-equal" as statuses. For general "copy" operations, the faster
completion op can be used to just return successful values (and only call
this status version on error), while apps using those compare ops or a
mixture of copy and compare ops, would always use the slower one that
returns status values for each and every op..

The ioat APIs used 32-bit integer values for this status array so as to
allow e.g. 16-bits for error code and 16-bits for future status values. For
most operations there should be a fairly small set of things that can go
wrong, i.e. bad source address, bad destination address or invalid length.
Within that we may have a couple of specifics for why an address is bad,
but even so I don't think we need to start having multiple bit
combinations.

> > +{
> > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> > +}
> > +
> > +struct rte_dmadev_stats {
> > +       uint64_t enqueue_fail_count;
> > +       /**< Conut of all operations which failed enqueued */
> > +       uint64_t enqueued_count;
> > +       /**< Count of all operations which successful enqueued */
> > +       uint64_t completed_fail_count;
> > +       /**< Count of all operations which failed to complete */
> > +       uint64_t completed_count;
> > +       /**< Count of all operations which successful complete */
> > +};
> 
> We need to have capability API to tell which items are
> updated/supported by the driver.
> 

I also would remove the enqueue fail counts, since they are better counted
by the app. If a driver reports 20,000 failures we have no way of knowing
if that is 20,000 unique operations which failed to enqueue or a single
operation which failed to enqueue 20,000 times but succeeded on attempt
20,001.

> 
> > diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> > new file mode 100644
> > index 0000000..a3afea2
> > --- /dev/null
> > +++ b/lib/dmadev/rte_dmadev_core.h
> > @@ -0,0 +1,98 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright 2021 HiSilicon Limited.
> > + */
> > +
> > +#ifndef _RTE_DMADEV_CORE_H_
> > +#define _RTE_DMADEV_CORE_H_
> > +
> > +/**
> > + * @file
> > + *
> > + * RTE DMA Device internal header.
> > + *
> > + * This header contains internal data types. But they are still part of the
> > + * public API because they are used by inline public functions.
> > + */
> > +
> > +struct rte_dmadev;
> > +
> > +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> > +                                     void *src, void *dst,
> > +                                     uint32_t length, uint64_t flags);
> > +/**< @internal Function used to enqueue a copy operation. */
> 
> To avoid namespace conflict(as it is public API) use rte_
> 
> 
> > +
> > +/**
> > + * The data structure associated with each DMA device.
> > + */
> > +struct rte_dmadev {
> > +       /**< Enqueue a copy operation onto the DMA device. */
> > +       dmadev_copy_t copy;
> > +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> > +       dmadev_copy_sg_t copy_sg;
> > +       /**< Enqueue a fill operation onto the DMA device. */
> > +       dmadev_fill_t fill;
> > +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> > +       dmadev_fill_sg_t fill_sg;
> > +       /**< Add a fence to force ordering between operations. */
> > +       dmadev_fence_t fence;
> > +       /**< Trigger hardware to begin performing enqueued operations. */
> > +       dmadev_perform_t perform;
> > +       /**< Returns the number of operations that successful completed. */
> > +       dmadev_completed_t completed;
> > +       /**< Returns the number of operations that failed to complete. */
> > +       dmadev_completed_fails_t completed_fails;
> 
> We need to limit fastpath items in 1 CL
> 

I don't think that is going to be possible. I also would like to see
numbers to check if we benefit much from having these fastpath ops separate
from the regular ops.

> > +
> > +       void *dev_private; /**< PMD-specific private data */
> > +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> > +
> > +       uint16_t dev_id; /**< Device ID for this instance */
> > +       int socket_id; /**< Socket ID where memory is allocated */
> > +       struct rte_device *device;
> > +       /**< Device info. supplied during device initialization */
> > +       const char *driver_name; /**< Driver info. supplied by probing */
> > +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> > +
> > +       RTE_STD_C11
> > +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> > +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> 
> Add a couple of reserved fields for future ABI stability.
> 
> > +
> > +} __rte_cache_aligned;
> > +
> > +extern struct rte_dmadev rte_dmadevices[];
> > +
Morten Brørup July 5, 2021, 11:12 a.m. UTC | #6
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> Sent: Monday, 5 July 2021 12.53
> 
> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng
> <fengchengwen@huawei.com> wrote:
> > >
> > > +
> > > +/**
> > > + * The data structure associated with each DMA device.
> > > + */
> > > +struct rte_dmadev {
> > > +       /**< Enqueue a copy operation onto the DMA device. */
> > > +       dmadev_copy_t copy;
> > > +       /**< Enqueue a scatter list copy operation onto the DMA
> device. */
> > > +       dmadev_copy_sg_t copy_sg;
> > > +       /**< Enqueue a fill operation onto the DMA device. */
> > > +       dmadev_fill_t fill;
> > > +       /**< Enqueue a scatter list fill operation onto the DMA
> device. */
> > > +       dmadev_fill_sg_t fill_sg;
> > > +       /**< Add a fence to force ordering between operations. */
> > > +       dmadev_fence_t fence;
> > > +       /**< Trigger hardware to begin performing enqueued
> operations. */
> > > +       dmadev_perform_t perform;
> > > +       /**< Returns the number of operations that successful
> completed. */
> > > +       dmadev_completed_t completed;
> > > +       /**< Returns the number of operations that failed to
> complete. */
> > > +       dmadev_completed_fails_t completed_fails;
> >
> > We need to limit fastpath items in 1 CL
> >
> 
> I don't think that is going to be possible. I also would like to see
> numbers to check if we benefit much from having these fastpath ops
> separate
> from the regular ops.

The fastpath ops may not fit into 1 cache line, but it is a good design practice to separate hot data from cold data, and I do consider the fastpath function pointers hot and configuration function pointers cold.

The important point is keeping all the fastpath ops (of a dmadevice) together and spanning as few cache lines as possible.

Configuration ops and other slow data may follow in the same structure; that should make no performance difference. It might make a difference for memory consumption if the other data are very large and not dynamically allocated, as we are discussing regarding ethdev.
Bruce Richardson July 5, 2021, 1:44 p.m. UTC | #7
On Mon, Jul 05, 2021 at 01:12:54PM +0200, Morten Brørup wrote:
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Bruce Richardson
> > Sent: Monday, 5 July 2021 12.53
> > 
> > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng
> > <fengchengwen@huawei.com> wrote:
> > > >
> > > > +
> > > > +/**
> > > > + * The data structure associated with each DMA device.
> > > > + */
> > > > +struct rte_dmadev {
> > > > +       /**< Enqueue a copy operation onto the DMA device. */
> > > > +       dmadev_copy_t copy;
> > > > +       /**< Enqueue a scatter list copy operation onto the DMA
> > device. */
> > > > +       dmadev_copy_sg_t copy_sg;
> > > > +       /**< Enqueue a fill operation onto the DMA device. */
> > > > +       dmadev_fill_t fill;
> > > > +       /**< Enqueue a scatter list fill operation onto the DMA
> > device. */
> > > > +       dmadev_fill_sg_t fill_sg;
> > > > +       /**< Add a fence to force ordering between operations. */
> > > > +       dmadev_fence_t fence;
> > > > +       /**< Trigger hardware to begin performing enqueued
> > operations. */
> > > > +       dmadev_perform_t perform;
> > > > +       /**< Returns the number of operations that successful
> > completed. */
> > > > +       dmadev_completed_t completed;
> > > > +       /**< Returns the number of operations that failed to
> > complete. */
> > > > +       dmadev_completed_fails_t completed_fails;
> > >
> > > We need to limit fastpath items in 1 CL
> > >
> > 
> > I don't think that is going to be possible. I also would like to see
> > numbers to check if we benefit much from having these fastpath ops
> > separate
> > from the regular ops.
> 
> The fastpath ops may not fit into 1 cache line, but it is a good design practice to separate hot data from cold data, and I do consider the fastpath function pointers hot and configuration function pointers cold.
> 
> The important point is keeping all the fastpath ops (of a dmadevice) together and spanning as few cache lines as possible.
> 
> Configuration ops and other slow data may follow in the same structure; that should make no performance difference. It might make a difference for memory consumption if the other data are very large and not dynamically allocated, as we are discussing regarding ethdev.
> 

Yes, I agree if it can be done, it should be.
Jerin Jacob July 5, 2021, 3:55 p.m. UTC | #8
need

On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > >
> > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > device.
> > >
> > > The APIs of dmadev library exposes some generic operations which can
> > > enable configuration and I/O with the DMA devices.
> > >
> > > Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> >
> > Thanks for v1.
> >
> > I would suggest finalizing  lib/dmadev/rte_dmadev.h before doing the
> > implementation so that you don't need
> > to waste time on rewoking the implementation.
> >
>
> I actually like having the .c file available too. Before we lock down the
> .h file and the API, I want to verify the performance of our drivers with
> the implementation, and having a working .c file is obviously necessary for
> that. So I appreciate having it as part of the RFC.

Ack.

>
> > Comments inline.
> >
> > > ---
> <snip>
> > > + *
> > > + * The DMA framework is built on the following abstraction model:
> > > + *
> > > + *     ------------    ------------
> > > + *     |virt-queue|    |virt-queue|
> > > + *     ------------    ------------
> > > + *            \           /
> > > + *             \         /
> > > + *              \       /
> > > + *            ------------     ------------
> > > + *            | HW-queue |     | HW-queue |
> > > + *            ------------     ------------
> > > + *                   \            /
> > > + *                    \          /
> > > + *                     \        /
> > > + *                     ----------
> > > + *                     | dmadev |
> > > + *                     ----------
> >
> > Continuing the discussion with @Morten Brørup , I think, we need to
> > finalize the model.
> >
>
> +1 and the terminology with regards to queues and channels. With our ioat
> hardware, each HW queue was called a channel for instance.

Looks like <dmadev> <> <channel> can cover all the use cases, if the
HW has more than
1 queues it can be exposed as separate dmadev dev.


>
> > > + *   a) The DMA operation request must be submitted to the virt queue, virt
> > > + *      queues must be created based on HW queues, the DMA device could have
> > > + *      multiple HW queues.
> > > + *   b) The virt queues on the same HW-queue could represent different contexts,
> > > + *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
> > > + *      transfer scenario, and create virt-queue-1 on the same HW-queue for
> > > + *      mem-to-dev transfer scenario.
> > > + *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
> > > + *         scenario as long as the corresponding driver supports.
> > > + *
> > > + * The control plane APIs include configure/queue_setup/queue_release/start/
> > > + * stop/reset/close, in order to start device work, the call sequence must be
> > > + * as follows:
> > > + *     - rte_dmadev_configure()
> > > + *     - rte_dmadev_queue_setup()
> > > + *     - rte_dmadev_start()
> >
> > Please add reconfigure behaviour etc, Please check the
> > lib/regexdev/rte_regexdev.h
> > introduction. I have added similar ones so you could reuse as much as possible.
> >
> >
> > > + * The dataplane APIs include two parts:
> > > + *   a) The first part is the submission of operation requests:
> > > + *        - rte_dmadev_copy()
> > > + *        - rte_dmadev_copy_sg() - scatter-gather form of copy
> > > + *        - rte_dmadev_fill()
> > > + *        - rte_dmadev_fill_sg() - scatter-gather form of fill
> > > + *        - rte_dmadev_fence()   - add a fence force ordering between operations
> > > + *        - rte_dmadev_perform() - issue doorbell to hardware
> > > + *      These APIs could work with different virt queues which have different
> > > + *      contexts.
> > > + *      The first four APIs are used to submit the operation request to the virt
> > > + *      queue, if the submission is successful, a cookie (as type
> > > + *      'dma_cookie_t') is returned, otherwise a negative number is returned.
> > > + *   b) The second part is to obtain the result of requests:
> > > + *        - rte_dmadev_completed()
> > > + *            - return the number of operation requests completed successfully.
> > > + *        - rte_dmadev_completed_fails()
> > > + *            - return the number of operation requests failed to complete.
> > > + *
> > > + * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
> > > + * information query and self-test capabilities.
> > > + *
> > > + * About the dataplane APIs MT-safe, there are two dimensions:
> > > + *   a) For one virt queue, the submit/completion API could be MT-safe,
> > > + *      e.g. one thread do submit operation, another thread do completion
> > > + *      operation.
> > > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
> > > + *      If driver don't support it, it's up to the application to guarantee
> > > + *      MT-safe.
> > > + *   b) For multiple virt queues on the same HW queue, e.g. one thread do
> > > + *      operation on virt-queue-0, another thread do operation on virt-queue-1.
> > > + *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
> > > + *      If driver don't support it, it's up to the application to guarantee
> > > + *      MT-safe.
> >
> > From an application PoV it may not be good to write portable
> > applications. Please check
> > latest thread with @Morten Brørup
> >
> > > + */
> > > +
> > > +#ifdef __cplusplus
> > > +extern "C" {
> > > +#endif
> > > +
> > > +#include <rte_common.h>
> > > +#include <rte_memory.h>
> > > +#include <rte_errno.h>
> > > +#include <rte_compat.h>
> >
> > Sort in alphabetical order.
> >
> > > +
> > > +/**
> > > + * dma_cookie_t - an opaque DMA cookie
> >
> > Since we are defining the behaviour is not opaque any more.
> > I think, it is better to call ring_idx or so.
> >
>
> +1 for ring index. We don't need a separate type for it though, just
> document the index as an unsigned return value.
>
> >
> > > +#define RTE_DMA_DEV_CAPA_MT_MVQ (1ull << 11) /**< Support MT-safe of multiple virt queues */
> >
> > Please lot of @see for all symbols where it is being used. So that one
> > can understand the full scope of
> > symbols. See below example.
> >
> > #define RTE_REGEXDEV_CAPA_RUNTIME_COMPILATION_F (1ULL << 0)
> > /**< RegEx device does support compiling the rules at runtime unlike
> >  * loading only the pre-built rule database using
> >  * struct rte_regexdev_config::rule_db in rte_regexdev_configure()
> >  *
> >  * @see struct rte_regexdev_config::rule_db, rte_regexdev_configure()
> >  * @see struct rte_regexdev_info::regexdev_capa
> >  */
> >
> > > + *
> > > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > > + * code.
> > > + * When using cookies, comply with the following rules:
> > > + * a) Cookies for each virtual queue are independent.
> > > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > > + *    the INT_MAX, it wraps back to zero.
>
> I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> value, it means that we cannot use implicit wrap-around inside the CPU and
> have to check for the INT_MAX value. Better to:
> 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> uint16_t internally and wrap-around automatically, or:
> 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> drivers the flexibility at what value to wrap around.

I think, (2) better than 1. I think, even better to wrap around the number of
descriptors configured in dev_configure()(We cake make this as the power of 2),


>
> > > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > > + *    reset, the virt queue's cookie needs to be reset to zero.
> > > + * Example:
> > > + *    step-1: start one dmadev
> > > + *    step-2: enqueue a copy operation, the cookie return is 0
> > > + *    step-3: enqueue a copy operation again, the cookie return is 1
> > > + *    ...
> > > + *    step-101: stop the dmadev
> > > + *    step-102: start the dmadev
> > > + *    step-103: enqueue a copy operation, the cookie return is 0
> > > + *    ...
> > > + */
> >
> > Good explanation.
> >
> > > +typedef int32_t dma_cookie_t;
> >
>
> As I mentioned before, I'd just remove this, and use regular int types,
> with "ring_idx" as the name.

+1

>
> >
> > > +
> > > +/**
> > > + * dma_scatterlist - can hold scatter DMA operation request
> > > + */
> > > +struct dma_scatterlist {
> >
> > I prefer to change scatterlist -> sg
> > i.e rte_dma_sg
> >
> > > +       void *src;
> > > +       void *dst;
> > > +       uint32_t length;
> > > +};
> > > +
> >
> > > +
> > > +/**
> > > + * A structure used to retrieve the contextual information of
> > > + * an DMA device
> > > + */
> > > +struct rte_dmadev_info {
> > > +       /**
> > > +        * Fields filled by framewok
> >
> > typo.
> >
> > > +        */
> > > +       struct rte_device *device; /**< Generic Device information */
> > > +       const char *driver_name; /**< Device driver name */
> > > +       int socket_id; /**< Socket ID where memory is allocated */
> > > +
> > > +       /**
> > > +        * Specification fields filled by driver
> > > +        */
> > > +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> > > +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> > > +       uint16_t max_vqs_per_hw_queue;
> > > +       /**< Maximum number of virt queues to allocate per HW queue */
> > > +       uint16_t max_desc;
> > > +       /**< Maximum allowed number of virt queue descriptors */
> > > +       uint16_t min_desc;
> > > +       /**< Minimum allowed number of virt queue descriptors */
> >
> > Please add max_nb_segs. i.e maximum number of segments supported.
> >
> > > +
> > > +       /**
> > > +        * Status fields filled by driver
> > > +        */
> > > +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> > > +       uint16_t nb_vqs; /**< Number of virt queues configured */
> > > +};
> > > + i
> > > +
> > > +/**
> > > + * dma_address_type
> > > + */
> > > +enum dma_address_type {
> > > +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> > > +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> > > +};
> > > +
> > > +/**
> > > + * A structure used to configure a DMA device.
> > > + */
> > > +struct rte_dmadev_conf {
> > > +       enum dma_address_type addr_type; /**< Address type to used */
> >
> > I think, there are 3 kinds of limitations/capabilities.
> >
> > When the system is configured as IOVA as VA
> > 1) Device supports any VA address like memory from rte_malloc(),
> > rte_memzone(), malloc, stack memory
> > 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> > memory backed by hugepage and added to DMA map.
> >
> > When the system is configured as IOVA as PA
> > 1) Devices support only PA addresses .
> >
> > IMO, Above needs to be  advertised as capability and application needs
> > to align with that
> > and I dont think application requests the driver to work in any of the modes.
> >
> >
>
> I don't think we need this level of detail for addressing capabilities.
> Unless I'm missing something, the hardware should behave exactly as other
> hardware does taking in iova's.  If the user wants to check whether virtual
> addresses to pinned memory can be used directly, the user can call
> "rte_eal_iova_mode". We can't have a situation where some hardware uses one
> type of addresses and another hardware the other.
>
> Therefore, the only additional addressing capability we should need to
> report is that the hardware can use SVM/SVA and use virtual addresses not
> in hugepage memory.

+1.


>
> >
> > > +       uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
> > > +       uint16_t max_vqs; /**< Maximum number of virt queues to use */
> >
> > You need to what is max value allowed etc i.e it is based on
> > info_get() and mention the field
> > in info structure
> >
> >
> > > +
> > > +/**
> > > + * dma_transfer_direction
> > > + */
> > > +enum dma_transfer_direction {
> >
> > rte_dma_transter_direction
> >
> > > +       DMA_MEM_TO_MEM,
> > > +       DMA_MEM_TO_DEV,
> > > +       DMA_DEV_TO_MEM,
> > > +       DMA_DEV_TO_DEV,
> > > +};
> > > +
> > > +/**
> > > + * A structure used to configure a DMA virt queue.
> > > + */
> > > +struct rte_dmadev_queue_conf {
> > > +       enum dma_transfer_direction direction;
> >
> >
> > > +       /**< Associated transfer direction */
> > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > +       uint64_t dev_flags; /**< Device specific flags */
> >
> > Use of this? Need more comments on this.
> > Since it is in slowpath, We can have non opaque names here based on
> > each driver capability.
> >
> >
> > > +       void *dev_ctx; /**< Device specific context */
> >
> > Use of this ? Need more comment ont this.
> >
>
> I think this should be dropped. We should not have any opaque
> device-specific info in these structs, rather if a particular device needs
> parameters we should call them out. Drivers for which it's not relevant can
> ignore them (and report same in capability if necessary). Since this is not
> a dataplane API, we aren't concerned too much about perf and can size the
> struct appropriately.
>
> >
> > Please add some good amount of reserved bits and have API to init this
> > structure for future ABI stability, say rte_dmadev_queue_config_init()
> > or so.
> >
>
> I don't think that is necessary. Since the config struct is used only as
> parameter to the config function, any changes to it can be managed by
> versioning that single function. Padding would only be necessary if we had
> an array of these config structs somewhere.

OK.

For some reason, the versioning API looks ugly to me in code instead of keeping
some rsvd fields look cool to me with init function.

But I agree. function versioning works in this case. No need to find other API
if tt is not general DPDK API practice.

In other libraries, I have seen such _init or function that can use
for this as well as filling default value
in some cases implementation values is not zero).
So that application can avoid memset for param structure.
Added rte_event_queue_default_conf_get() in eventdev spec for this.

No strong opinion on this.



>
> >
> > > +
> > > +/**
> > > + * A structure used to retrieve information of a DMA virt queue.
> > > + */
> > > +struct rte_dmadev_queue_info {
> > > +       enum dma_transfer_direction direction;
> >
> > A queue may support all directions so I think it should be a bitfield.
> >
> > > +       /**< Associated transfer direction */
> > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > +       uint64_t dev_flags; /**< Device specific flags */
> > > +};
> > > +
> >
> > > +__rte_experimental
> > > +static inline dma_cookie_t
> > > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > > +                  const struct dma_scatterlist *sg,
> > > +                  uint32_t sg_len, uint64_t flags)
> >
> > I would like to change this as:
> > rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> > rte_dma_sg *src, uint32_t nb_src,
> > const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> > src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> >
> >
> >
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
> > > +}
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Enqueue a fill operation onto the DMA virt queue
> > > + *
> > > + * This queues up a fill operation to be performed by hardware, but does not
> > > + * trigger hardware to begin that operation.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + * @param pattern
> > > + *   The pattern to populate the destination buffer with.
> > > + * @param dst
> > > + *   The address of the destination buffer.
> > > + * @param length
> > > + *   The length of the destination buffer.
> > > + * @param flags
> > > + *   An opaque flags for this operation.
> >
> > PLEASE REMOVE opaque stuff from fastpath it will be a pain for
> > application writers as
> > they need to write multiple combinations of fastpath. flags are OK, if
> > we have a valid
> > generic flag now to control the transfer behavior.
> >
>
> +1. Flags need to be explicitly listed. If we don't have any flags for now,
> we can specify that the value must be given as zero and it's for future
> use.

OK.

>
> >
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Add a fence to force ordering between operations
> > > + *
> > > + * This adds a fence to a sequence of operations to enforce ordering, such that
> > > + * all operations enqueued before the fence must be completed before operations
> > > + * after the fence.
> > > + * NOTE: Since this fence may be added as a flag to the last operation enqueued,
> > > + * this API may not function correctly when called immediately after an
> > > + * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + *
> > > + * @return
> > > + *   - =0: Successful add fence.
> > > + *   - <0: Failure to add fence.
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline int
> > > +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->fence)(dev, vq_id);
> > > +}
> >
> > Since HW submission is in a queue(FIFO) the ordering is always
> > maintained. Right?
> > Could you share more details and use case of fence() from
> > driver/application PoV?
> >
>
> There are different kinds of ordering to consider, ordering of completions
> and the ordering of operations. While jobs are reported as completed to the
> user in order, for performance hardware, may overlap individual jobs within
> a burst (or even across bursts). Therefore, we need a fence operation to
> inform hardware that one job should not be started until the other has
> fully completed.

Got it. In order to save space if first CL size for fastpath(Saving 8B
for the pointer) and to avoid
function overhead, Can we use one bit of flags of op function to
enable the fence?

>
> >
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Trigger hardware to begin performing enqueued operations
> > > + *
> > > + * This API is used to write the "doorbell" to the hardware to trigger it
> > > + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + *
> > > + * @return
> > > + *   - =0: Successful trigger hardware.
> > > + *   - <0: Failure to trigger hardware.
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline int
> > > +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->perform)(dev, vq_id);
> > > +}
> >
> > Since we have additional function call overhead in all the
> > applications for this scheme, I would like to understand
> > the use of doing this way vs enq does the doorbell implicitly from
> > driver/application PoV?
> >
>
> In our benchmarks it's just faster. When we tested it, the overhead of the
> function calls was noticably less than the cost of building up the
> parameter array(s) for passing the jobs in as a burst. [We don't see this
> cost with things like NIC I/O since DPDK tends to already have the mbuf
> fully populated before the TX call anyway.]

OK. I agree with stack population.

My question was more on doing implicit doorbell update enq. Is doorbell write
costly in other HW compare to a function call? In our HW, it is just write of
the number of instructions written in a register.

Also, we need to again access the internal PMD memory structure to find
where to write etc if it is a separate function.


>
> >
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Returns the number of operations that have been successful completed.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > > + * @param nb_cpls
> > > + *   The maximum number of completed operations that can be processed.
> > > + * @param[out] cookie
> > > + *   The last completed operation's cookie.
> > > + * @param[out] has_error
> > > + *   Indicates if there are transfer error.
> > > + *
> > > + * @return
> > > + *   The number of operations that successful completed.
> >
> > successfully
> >
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline uint16_t
> > > +rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
> > > +                    dma_cookie_t *cookie, bool *has_error)
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       has_error = false;
> > > +       return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
> >
> > It may be better to have cookie/ring_idx as third argument.
> >
>
> No strong opinions here, but having it as in the code above means all
> input parameters come before all output, which makes sense to me.

+1

>
> > > +}
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > + *
> > > + * Returns the number of operations that failed to complete.
> > > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > > + *
> > > + * @param dev_id
> > > + *   The identifier of the device.
> > > + * @param vq_id
> > > + *   The identifier of virt queue.
> > (> + * @param nb_status
> > > + *   Indicates the size  of status array.
> > > + * @param[out] status
> > > + *   The error code of operations that failed to complete.
> > > + * @param[out] cookie
> > > + *   The last failed completed operation's cookie.
> > > + *
> > > + * @return
> > > + *   The number of operations that failed to complete.
> > > + *
> > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > + *       corresponding device supports the operation.
> > > + */
> > > +__rte_experimental
> > > +static inline uint16_t
> > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > > +                          const uint16_t nb_status, uint32_t *status,
> > > +                          dma_cookie_t *cookie)
> >
> > IMO, it is better to move cookie/rind_idx at 3.
> > Why it would return any array of errors? since it called after
> > rte_dmadev_completed() has
> > has_error. Is it better to change
> >
> > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> > *cookie,  uint32_t *status)
> >
> > I also think, we may need to set status as bitmask and enumerate all
> > the combination of error codes
> > of all the driver and return string from driver existing rte_flow_error
> >
> > See
> > struct rte_flow_error {
> >         enum rte_flow_error_type type; /**< Cause field and error types. */
> >         const void *cause; /**< Object responsible for the error. */
> >         const char *message; /**< Human-readable error message. */
> > };
> >
>
> I think we need a multi-return value API here, as we may add operations in
> future which have non-error status values to return. The obvious case is
> DMA engines which support "compare" operations. In that case a successful
> compare (as in there were no DMA or HW errors) can return "equal" or
> "not-equal" as statuses. For general "copy" operations, the faster
> completion op can be used to just return successful values (and only call
> this status version on error), while apps using those compare ops or a
> mixture of copy and compare ops, would always use the slower one that
> returns status values for each and every op..
>
> The ioat APIs used 32-bit integer values for this status array so as to
> allow e.g. 16-bits for error code and 16-bits for future status values. For
> most operations there should be a fairly small set of things that can go
> wrong, i.e. bad source address, bad destination address or invalid length.
> Within that we may have a couple of specifics for why an address is bad,
> but even so I don't think we need to start having multiple bit
> combinations.

OK. What is the purpose of errors status? Is it for application printing it or
Does the application need to take any action based on specific error requests?

If the former is scope, then we need to define the standard enum value
for the error right?
ie. uint32_t *status needs to change to enum rte_dma_error or so.



>
> > > +{
> > > +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> > > +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> > > +}
> > > +
> > > +struct rte_dmadev_stats {
> > > +       uint64_t enqueue_fail_count;
> > > +       /**< Conut of all operations which failed enqueued */
> > > +       uint64_t enqueued_count;
> > > +       /**< Count of all operations which successful enqueued */
> > > +       uint64_t completed_fail_count;
> > > +       /**< Count of all operations which failed to complete */
> > > +       uint64_t completed_count;
> > > +       /**< Count of all operations which successful complete */
> > > +};
> >
> > We need to have capability API to tell which items are
> > updated/supported by the driver.
> >
>
> I also would remove the enqueue fail counts, since they are better counted
> by the app. If a driver reports 20,000 failures we have no way of knowing
> if that is 20,000 unique operations which failed to enqueue or a single
> operation which failed to enqueue 20,000 times but succeeded on attempt
> 20,001.
>
> >
> > > diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> > > new file mode 100644
> > > index 0000000..a3afea2
> > > --- /dev/null
> > > +++ b/lib/dmadev/rte_dmadev_core.h
> > > @@ -0,0 +1,98 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright 2021 HiSilicon Limited.
> > > + */
> > > +
> > > +#ifndef _RTE_DMADEV_CORE_H_
> > > +#define _RTE_DMADEV_CORE_H_
> > > +
> > > +/**
> > > + * @file
> > > + *
> > > + * RTE DMA Device internal header.
> > > + *
> > > + * This header contains internal data types. But they are still part of the
> > > + * public API because they are used by inline public functions.
> > > + */
> > > +
> > > +struct rte_dmadev;
> > > +
> > > +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> > > +                                     void *src, void *dst,
> > > +                                     uint32_t length, uint64_t flags);
> > > +/**< @internal Function used to enqueue a copy operation. */
> >
> > To avoid namespace conflict(as it is public API) use rte_
> >
> >
> > > +
> > > +/**
> > > + * The data structure associated with each DMA device.
> > > + */
> > > +struct rte_dmadev {
> > > +       /**< Enqueue a copy operation onto the DMA device. */
> > > +       dmadev_copy_t copy;
> > > +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> > > +       dmadev_copy_sg_t copy_sg;
> > > +       /**< Enqueue a fill operation onto the DMA device. */
> > > +       dmadev_fill_t fill;
> > > +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> > > +       dmadev_fill_sg_t fill_sg;
> > > +       /**< Add a fence to force ordering between operations. */
> > > +       dmadev_fence_t fence;
> > > +       /**< Trigger hardware to begin performing enqueued operations. */
> > > +       dmadev_perform_t perform;
> > > +       /**< Returns the number of operations that successful completed. */
> > > +       dmadev_completed_t completed;
> > > +       /**< Returns the number of operations that failed to complete. */
> > > +       dmadev_completed_fails_t completed_fails;
> >
> > We need to limit fastpath items in 1 CL
> >
>
> I don't think that is going to be possible. I also would like to see
> numbers to check if we benefit much from having these fastpath ops separate
> from the regular ops.
>
> > > +
> > > +       void *dev_private; /**< PMD-specific private data */
> > > +       const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
> > > +
> > > +       uint16_t dev_id; /**< Device ID for this instance */
> > > +       int socket_id; /**< Socket ID where memory is allocated */
> > > +       struct rte_device *device;
> > > +       /**< Device info. supplied during device initialization */
> > > +       const char *driver_name; /**< Driver info. supplied by probing */
> > > +       char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
> > > +
> > > +       RTE_STD_C11
> > > +       uint8_t attached : 1; /**< Flag indicating the device is attached */
> > > +       uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
> >
> > Add a couple of reserved fields for future ABI stability.
> >
> > > +
> > > +} __rte_cache_aligned;
> > > +
> > > +extern struct rte_dmadev rte_dmadevices[];
> > > +
Bruce Richardson July 5, 2021, 5:16 p.m. UTC | #9
On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> 
> On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > >
> > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > device.
<snip>
> >
> > +1 and the terminology with regards to queues and channels. With our ioat
> > hardware, each HW queue was called a channel for instance.
> 
> Looks like <dmadev> <> <channel> can cover all the use cases, if the
> HW has more than
> 1 queues it can be exposed as separate dmadev dev.
> 

Fine for me.

However, just to confirm that Morten's suggestion of using a
(device-specific void *) channel pointer rather than dev_id + channel_id
pair of parameters won't work for you? You can't store a pointer or dev
index in the channel struct in the driver?

> 
<snip>
> > > > + *
> > > > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > > > + * code.
> > > > + * When using cookies, comply with the following rules:
> > > > + * a) Cookies for each virtual queue are independent.
> > > > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > > > + *    the INT_MAX, it wraps back to zero.
> >
> > I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> > value, it means that we cannot use implicit wrap-around inside the CPU and
> > have to check for the INT_MAX value. Better to:
> > 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> > uint16_t internally and wrap-around automatically, or:
> > 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> > drivers the flexibility at what value to wrap around.
> 
> I think, (2) better than 1. I think, even better to wrap around the number of
> descriptors configured in dev_configure()(We cake make this as the power of 2),
> 

Interesting, I hadn't really considered that before. My only concern
would be if an app wants to keep values in the app ring for a while after
they have been returned from dmadev. I thought it easier to have the full
16-bit counter value returned to the user to give the most flexibility,
given that going from that to any power-of-2 ring size smaller is a trivial
operation.

Overall, while my ideal situation is to always have a 0..UINT16_MAX return
value from the function, I can live with your suggestion of wrapping at
ring_size, since drivers will likely do that internally anyway.
I think wrapping at INT32_MAX is too awkward and will be error prone since
we can't rely on hardware automatically wrapping to zero, nor on the driver
having pre-masked the value.

> >
> > > > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > > > + *    reset, the virt queue's cookie needs to be reset to zero.
<snip>
> > >
> > > Please add some good amount of reserved bits and have API to init this
> > > structure for future ABI stability, say rte_dmadev_queue_config_init()
> > > or so.
> > >
> >
> > I don't think that is necessary. Since the config struct is used only as
> > parameter to the config function, any changes to it can be managed by
> > versioning that single function. Padding would only be necessary if we had
> > an array of these config structs somewhere.
> 
> OK.
> 
> For some reason, the versioning API looks ugly to me in code instead of keeping
> some rsvd fields look cool to me with init function.
> 
> But I agree. function versioning works in this case. No need to find other API
> if tt is not general DPDK API practice.
> 

The one thing I would suggest instead of the padding is for the internal
APIS, to pass the struct size through, since we can't version those - and
for padding we can't know whether any replaced padding should be used or
not. Specifically:

	typedef int (*rte_dmadev_configure_t)(struct rte_dmadev *dev, struct
			rte_dmadev_conf *cfg, size_t cfg_size);

but for the public function:

	int
	rte_dmadev_configure(struct rte_dmadev *dev, struct
			rte_dmadev_conf *cfg)
	{
		...
		ret = dev->ops.configure(dev, cfg, sizeof(*cfg));
		...
	}

Then if we change the structure and version the config API, the driver can
tell from the size what struct version it is and act accordingly. Without
that, each time the struct changed, we'd have to add a new function pointer
to the device ops.

> In other libraries, I have seen such _init or function that can use
> for this as well as filling default value
> in some cases implementation values is not zero).
> So that application can avoid memset for param structure.
> Added rte_event_queue_default_conf_get() in eventdev spec for this.
> 

I think that would largely have the same issues, unless it returned a
pointer to data inside the driver - and which therefore could not be
modified. Alternatively it would mean that the memory would have been
allocated in the driver and we would need to ensure proper cleanup
functions were called to free memory afterwards. Supporting having the
config parameter as a local variable I think makes things a lot easier.

> No strong opinion on this.
> 
> 
> 
> >
> > >
> > > > +
> > > > +/**
> > > > + * A structure used to retrieve information of a DMA virt queue.
> > > > + */
> > > > +struct rte_dmadev_queue_info {
> > > > +       enum dma_transfer_direction direction;
> > >
> > > A queue may support all directions so I think it should be a bitfield.
> > >
> > > > +       /**< Associated transfer direction */
> > > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > > +       uint64_t dev_flags; /**< Device specific flags */
> > > > +};
> > > > +
> > >
> > > > +__rte_experimental
> > > > +static inline dma_cookie_t
> > > > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > > > +                  const struct dma_scatterlist *sg,
> > > > +                  uint32_t sg_len, uint64_t flags)
> > >
> > > I would like to change this as:
> > > rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> > > rte_dma_sg *src, uint32_t nb_src,
> > > const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> > > src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> > >

Out of interest, do you see much benefit (and in what way) from having the
scatter-gather support? Unlike sending 5 buffers in one packet rather than
5 buffers in 5 packets to a NIC, copying an array of memory in one op vs
multiple is functionally identical.

> > >
> > >
<snip>
> Got it. In order to save space if first CL size for fastpath(Saving 8B
> for the pointer) and to avoid
> function overhead, Can we use one bit of flags of op function to
> enable the fence?
> 

The original ioat implementation did exactly that. However, I then
discovered that because a fence logically belongs between two operations,
does the fence flag on an operation mean "don't do any jobs after this
until this job has completed" or does it mean "don't start this job until
all previous jobs have completed". [Or theoretically does it mean both :-)]
Naturally, some hardware does it the former way (i.e. fence flag goes on
last op before fence), while other hardware the latter way (i.e. fence flag
goes on first op after the fence). Therefore, since fencing is about
ordering *between* two (sets of) jobs, I decided that it should do exactly
that and go between two jobs, so there is no ambiguity!

However, I'm happy enough to switch to having a fence flag, but I think if
we do that, it should be put in the "first job after fence" case, because
it is always easier to modify a previously written job if we need to, than
to save the flag for a future one.

Alternatively, if we keep the fence as a separate function, I'm happy
enough for it not to be on the same cacheline as the "hot" operations,
since fencing will always introduce a small penalty anyway.

> >
> > >
<snip>
> > > Since we have additional function call overhead in all the
> > > applications for this scheme, I would like to understand
> > > the use of doing this way vs enq does the doorbell implicitly from
> > > driver/application PoV?
> > >
> >
> > In our benchmarks it's just faster. When we tested it, the overhead of the
> > function calls was noticably less than the cost of building up the
> > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > fully populated before the TX call anyway.]
> 
> OK. I agree with stack population.
> 
> My question was more on doing implicit doorbell update enq. Is doorbell write
> costly in other HW compare to a function call? In our HW, it is just write of
> the number of instructions written in a register.
> 
> Also, we need to again access the internal PMD memory structure to find
> where to write etc if it is a separate function.
> 

The cost varies depending on a number of factors - even writing to a single
HW register can be very slow if that register is mapped as device
(uncacheable) memory, since (AFAIK) it will act as a full fence and wait
for the write to go all the way to hardware. For more modern HW, the cost
can be lighter. However, any cost of HW writes is going to be the same
whether its a separate function call or not.

However, the main thing about the doorbell update is that it's a
once-per-burst thing, rather than a once-per-job. Therefore, even if you
have to re-read the struct memory (which is likely still somewhere in your
cores' cache), any extra small cost of doing so is to be amortized over the
cost of a whole burst of copies.

> 
> >
> > >
<snip>
> > > > +
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > + *
> > > > + * Returns the number of operations that failed to complete.
> > > > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > > > + *
> > > > + * @param dev_id
> > > > + *   The identifier of the device.
> > > > + * @param vq_id
> > > > + *   The identifier of virt queue.
> > > (> + * @param nb_status
> > > > + *   Indicates the size  of status array.
> > > > + * @param[out] status
> > > > + *   The error code of operations that failed to complete.
> > > > + * @param[out] cookie
> > > > + *   The last failed completed operation's cookie.
> > > > + *
> > > > + * @return
> > > > + *   The number of operations that failed to complete.
> > > > + *
> > > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > > + *       corresponding device supports the operation.
> > > > + */
> > > > +__rte_experimental
> > > > +static inline uint16_t
> > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > > > +                          const uint16_t nb_status, uint32_t *status,
> > > > +                          dma_cookie_t *cookie)
> > >
> > > IMO, it is better to move cookie/rind_idx at 3.
> > > Why it would return any array of errors? since it called after
> > > rte_dmadev_completed() has
> > > has_error. Is it better to change
> > >
> > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> > > *cookie,  uint32_t *status)
> > >
> > > I also think, we may need to set status as bitmask and enumerate all
> > > the combination of error codes
> > > of all the driver and return string from driver existing rte_flow_error
> > >
> > > See
> > > struct rte_flow_error {
> > >         enum rte_flow_error_type type; /**< Cause field and error types. */
> > >         const void *cause; /**< Object responsible for the error. */
> > >         const char *message; /**< Human-readable error message. */
> > > };
> > >
> >
> > I think we need a multi-return value API here, as we may add operations in
> > future which have non-error status values to return. The obvious case is
> > DMA engines which support "compare" operations. In that case a successful
> > compare (as in there were no DMA or HW errors) can return "equal" or
> > "not-equal" as statuses. For general "copy" operations, the faster
> > completion op can be used to just return successful values (and only call
> > this status version on error), while apps using those compare ops or a
> > mixture of copy and compare ops, would always use the slower one that
> > returns status values for each and every op..
> >
> > The ioat APIs used 32-bit integer values for this status array so as to
> > allow e.g. 16-bits for error code and 16-bits for future status values. For
> > most operations there should be a fairly small set of things that can go
> > wrong, i.e. bad source address, bad destination address or invalid length.
> > Within that we may have a couple of specifics for why an address is bad,
> > but even so I don't think we need to start having multiple bit
> > combinations.
> 
> OK. What is the purpose of errors status? Is it for application printing it or
> Does the application need to take any action based on specific error requests?

It's largely for information purposes, but in the case of SVA/SVM errors
could occur due to the memory not being pinned, i.e. a page fault, in some
cases. If that happens, then it's up the app to either touch the memory and
retry the copy, or to do a SW memcpy as a fallback.

In other error cases, I think it's good to tell the application if it's
passing around bad data, or data that is beyond the scope of hardware, e.g.
a copy that is beyond what can be done in a single transaction for a HW
instance. Given that there are always things that can go wrong, I think we
need some error reporting mechanism.

> If the former is scope, then we need to define the standard enum value
> for the error right?
> ie. uint32_t *status needs to change to enum rte_dma_error or so.
> 
Sure. Perhaps an error/status structure either is an option, where we
explicitly call out error info from status info.

> 
> 
<snip to end>

/Bruce
Chengwen Feng July 6, 2021, 3:01 a.m. UTC | #10
Many thanks, mostly OK, and a few comment inline

On 2021/7/4 17:30, Jerin Jacob wrote:
> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>>
>> This patch introduces 'dmadevice' which is a generic type of DMA
>> device.
...
>> +#include <rte_compat.h>
> 
> Sort in alphabetical order.
> 
>> +
>> +/**
>> + * dma_cookie_t - an opaque DMA cookie
> 
> Since we are defining the behaviour is not opaque any more.
> I think, it is better to call ring_idx or so.
> 


This type is designed to have two meanings, return <0 on failure and return >=0 on success.

How about follwing definition:
    typedef int dma_ring_index_t;

if >= 0, it's value range is [0, 65535] = uint16_t, so driver implementation will simply.
if <0, then men enqueue failure

For driver, it could hold uint16_t ring_index, if enquer fail just return fail, else return
the current ring_index, and update it by: ring_index++;

>> +
>> +/**
>> + * A structure used to retrieve the contextual information of
>> + * an DMA device
>> + */
>> +struct rte_dmadev_info {
>> +       /**
>> +        * Fields filled by framewok
> 
> typo.
> 
>> +        */
>> +       struct rte_device *device; /**< Generic Device information */
>> +       const char *driver_name; /**< Device driver name */
>> +       int socket_id; /**< Socket ID where memory is allocated */
>> +
>> +       /**
>> +        * Specification fields filled by driver
>> +        */
>> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
>> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
>> +       uint16_t max_vqs_per_hw_queue;
>> +       /**< Maximum number of virt queues to allocate per HW queue */
>> +       uint16_t max_desc;
>> +       /**< Maximum allowed number of virt queue descriptors */
>> +       uint16_t min_desc;
>> +       /**< Minimum allowed number of virt queue descriptors */
> 
> Please add max_nb_segs. i.e maximum number of segments supported.

Do you means something like "burst_size" ?

> 
>> +
>> +       /**
>> +        * Status fields filled by driver
>> +        */
>> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
>> +       uint16_t nb_vqs; /**< Number of virt queues configured */
>> +};
>> + i
>> +
>> +/**
>> + * dma_address_type
>> + */
>> +enum dma_address_type {
>> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
>> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
>> +};
>> +
>> +/**
>> + * A structure used to configure a DMA device.
>> + */
>> +struct rte_dmadev_conf {
>> +       enum dma_address_type addr_type; /**< Address type to used */
> 
> I think, there are 3 kinds of limitations/capabilities.
> 
> When the system is configured as IOVA as VA
> 1) Device supports any VA address like memory from rte_malloc(),
> rte_memzone(), malloc, stack memory
> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> memory backed by hugepage and added to DMA map.
> 
> When the system is configured as IOVA as PA
> 1) Devices support only PA addresses .
> 
> IMO, Above needs to be  advertised as capability and application needs
> to align with that
> and I dont think application requests the driver to work in any of the modes.
> 

OK, Let's put together our ideas on address type:

There are three mode, we may define as:
	IOVA_as_VA-ALL     ---for device which may need support SVA feature
                           ---may also be a CPU memcpy 'device'
	IOVA_as_VA         ---for device which need support IOMMU
	IOVA_as_PA

There are many combination of the modes which device supports: eg. some device
may only support IOVA_as_PA, some may only support IOVA_as_VA, and some support
IOVA_as_PA and IOVA_as_VA. The specific runtime type is determined by the vfio
and drive capability(e.g RTE_PCI_DRV_NEED_IOVA_AS_VA).

So we already define two capabilities for this:
	#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
					---this cover IOVA_as_VA and IOVA_as_PA
	#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
					---this cover IOVA_as_VA-ALL
for a device which don't support SVA:
	only declare RTE_DMA_DEV_CAPA_IOVA
for a device which support SVA:
	delcare RTE_DAMA_DEV_CAPA_IOVA
	delcare RTE_DMA_DEV_CAPA_VA (only when IOMMU enabled and 'SVA flag' was set)
for a CPU memcpy device:
	only declare RTE_DMA_DEV_CAPA_VA

As application:
- if RTE_DMA_DEV_CAPA_VA support, then it could pass any va address to the DMA,
- else if RTE_DMA_DEV_CAPA_IOVA support, then it should pass iova address to the DMA
- else the DMA device should not exist.

> 
>> +__rte_experimental
>> +static inline dma_cookie_t
>> +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
>> +                  const struct dma_scatterlist *sg,
>> +                  uint32_t sg_len, uint64_t flags)
> 
> I would like to change this as:
> rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> rte_dma_sg *src, uint32_t nb_src,
> const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like
> src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> 

There are already too many arguments, and the above use case could split 30 sg-item.

>> +__rte_experimental
>> +static inline int
>> +rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
>> +{
>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
>> +       return (*dev->fence)(dev, vq_id);
>> +}
> 
> Since HW submission is in a queue(FIFO) the ordering is always
> maintained. Right?
> Could you share more details and use case of fence() from
> driver/application PoV?
> 

For Kunpeng DMA, hardware supports parallel execution of requests in the same queue,

It applies to the following scenarios: communication with the remote end is involved. driver
should ensure issure 'doorbell' after data was full written.

> 
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * Trigger hardware to begin performing enqueued operations
>> + *
>> + * This API is used to write the "doorbell" to the hardware to trigger it
>> + * to begin the operations previously enqueued by rte_dmadev_copy/fill()
>> + *
>> + * @param dev_id
>> + *   The identifier of the device.
>> + * @param vq_id
>> + *   The identifier of virt queue.
>> + *
>> + * @return
>> + *   - =0: Successful trigger hardware.
>> + *   - <0: Failure to trigger hardware.
>> + *
>> + * NOTE: The caller must ensure that the input parameter is valid and the
>> + *       corresponding device supports the operation.
>> + */
>> +__rte_experimental
>> +static inline int
>> +rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
>> +{
>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
>> +       return (*dev->perform)(dev, vq_id);
>> +}
> 
> Since we have additional function call overhead in all the
> applications for this scheme, I would like to understand
> the use of doing this way vs enq does the doorbell implicitly from
> driver/application PoV?
> 

Because we split the burst operation into multiple substeps: for each enq we
don't issue 'doorbell', and at last call perform() to issue 'doorbell'.

For ARM platform, should call mb ops when issue 'doorbell', if call mb ops
every enq, it may lead significant performance degradation.

>> +__rte_experimental
>> +static inline uint16_t
>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
>> +                          const uint16_t nb_status, uint32_t *status,
>> +                          dma_cookie_t *cookie)
> 
> IMO, it is better to move cookie/rind_idx at 3.
> Why it would return any array of errors? since it called after
> rte_dmadev_completed() has
> has_error. Is it better to change
> 
> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> *cookie,  uint32_t *status)
> 
> I also think, we may need to set status as bitmask and enumerate all
> the combination of error codes
> of all the driver and return string from driver existing rte_flow_error
> 

bitmask has limit for most 32 (or we can extend 64), and also the rte_flow_error is
heavy.

Considering that errors are a small number of scenarios, so it's OK to
pass status array, and status have 32bit it could denotes a very large number
of errcode.

>> +
>> +struct rte_dmadev_stats {
>> +       uint64_t enqueue_fail_count;
>> +       /**< Conut of all operations which failed enqueued */
>> +       uint64_t enqueued_count;
>> +       /**< Count of all operations which successful enqueued */
>> +       uint64_t completed_fail_count;
>> +       /**< Count of all operations which failed to complete */
>> +       uint64_t completed_count;
>> +       /**< Count of all operations which successful complete */
>> +};
> 
> We need to have capability API to tell which items are
> updated/supported by the driver.
> 

There are fewer fields, and I don't think it's necessary to add capability API,
for those who don't support, it could don't implement the callback.
For those support, these fields are minimum et.

> 
>> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
>> new file mode 100644
>> index 0000000..a3afea2
>> --- /dev/null
>> +++ b/lib/dmadev/rte_dmadev_core.h
>> @@ -0,0 +1,98 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright 2021 HiSilicon Limited.
>> + */
>> +
>> +#ifndef _RTE_DMADEV_CORE_H_
>> +#define _RTE_DMADEV_CORE_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * RTE DMA Device internal header.
>> + *
>> + * This header contains internal data types. But they are still part of the
>> + * public API because they are used by inline public functions.
>> + */
>> +
>> +struct rte_dmadev;
>> +
>> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
>> +                                     void *src, void *dst,
>> +                                     uint32_t length, uint64_t flags);
>> +/**< @internal Function used to enqueue a copy operation. */
> 
> To avoid namespace conflict(as it is public API) use rte_

These are internal function used by driver, not application.
and the eth/regexdev_core also defined without rte_

So I think it should remain as it is.

> 
> 
>> +
>> +/**
>> + * The data structure associated with each DMA device.
>> + */
>> +struct rte_dmadev {
>> +       /**< Enqueue a copy operation onto the DMA device. */
>> +       dmadev_copy_t copy;
>> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
>> +       dmadev_copy_sg_t copy_sg;
>> +       /**< Enqueue a fill operation onto the DMA device. */
>> +       dmadev_fill_t fill;
>> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
>> +       dmadev_fill_sg_t fill_sg;
>> +       /**< Add a fence to force ordering between operations. */
>> +       dmadev_fence_t fence;
>> +       /**< Trigger hardware to begin performing enqueued operations. */
>> +       dmadev_perform_t perform;
>> +       /**< Returns the number of operations that successful completed. */
>> +       dmadev_completed_t completed;
>> +       /**< Returns the number of operations that failed to complete. */
>> +       dmadev_completed_fails_t completed_fails;
> 
> We need to limit fastpath items in 1 CL

yes, currently there are 8 callback, which just fill one cache line.
Chengwen Feng July 6, 2021, 3:56 a.m. UTC | #11
Many thanks, mostly OK, a few comment inline

On 2021/7/4 22:57, Andrew Rybchenko wrote:
> On 7/2/21 4:18 PM, Chengwen Feng wrote:
>> This patch introduces 'dmadevice' which is a generic type of DMA
>> device.

[snip]

>> +#ifndef _RTE_DMADEV_CORE_H_
>> +#define _RTE_DMADEV_CORE_H_
>> +
>> +/**
>> + * @file
>> + *
>> + * RTE DMA Device internal header.
>> + *
>> + * This header contains internal data types. But they are still part of the
>> + * public API because they are used by inline public functions.
> 
> Do we really want it? Anyway rte_dmadev must not be here.
> Some sub-structure could be, but not entire rte_dmadev.
> 

struct rte_dmadev should expose to public for device probe and etc.
and because the public dataplane function use static inline to embellish,
should put the rte_dmadevices to public file too.

PS: it widely used in eth/regexdev...

>> +
>> +extern struct rte_dmadev rte_dmadevices[];
>> +
>> +#endif /* _RTE_DMADEV_CORE_H_ */
>> diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
> 
> Let's remove rte_ prefix from DPDK internal headers.

as above explained, it's public header file.

>> +
>> +#define RTE_DMADEV_LOG(level, fmt, args...) \
> 
> Do we need RTE_ prefix for internal API?
> 
>> +	rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
>> +		__func__, ##args)
>> +
>> +/* Macros to check for valid device */
>> +#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
>> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
>> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
>> +		return retval; \
>> +	} \
>> +} while (0)
>> +
>> +#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
>> +	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
>> +		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
>> +		return; \
>> +	} \
>> +} while (0)
>> +
>> +#define RTE_DMADEV_DETACHED  0
>> +#define RTE_DMADEV_ATTACHED  1
> 
> Do we really need RTE_ prefix for interlal defines?

with RTE_ prefix will reduce namespace conflicts.

it's same as it lib/eth or regexdev...

>> +typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
>> +				     const uint32_t ids[], uint32_t nb_ids);
>> +/**< @internal Function used to reset extended stats. */
> 
> Do we really need both stats and xstats from the very
> beginning? I think it is better to start from just
> generic stats and add xstats when it is really required.

OK, but I think we should add one dump ops, which could be useful to
find the problem.

> .
>
Chengwen Feng July 6, 2021, 6:25 a.m. UTC | #12
On 2021/7/4 23:21, Matan Azrad wrote:
> 
> 
> From: Chengwen Feng
>> This patch introduces 'dmadevice' which is a generic type of DMA
>> device.
>>
>> The APIs of dmadev library exposes some generic operations which can
>> enable configuration and I/O with the DMA devices.
>>
> Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
> 

em, I just looked at the code.

The RTE_COMP_ALGO_NULL is a small feature of the compression device.
and currently only mlx5 and isal support it.

Also the compressdev dataplane API relatively complicated to do just
DMA copy.

So I think we need a separate driver framework for the DMA device.

thanks

[snip]

> 
> 
> .
>
Matan Azrad July 6, 2021, 6:50 a.m. UTC | #13
Hi

From: fengchengwen
> On 2021/7/4 23:21, Matan Azrad wrote:
> >
> >
> > From: Chengwen Feng
> >> This patch introduces 'dmadevice' which is a generic type of DMA
> >> device.
> >>
> >> The APIs of dmadev library exposes some generic operations which can
> >> enable configuration and I/O with the DMA devices.
> >>
> > Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
> >
> 
> em, I just looked at the code.
> 
> The RTE_COMP_ALGO_NULL is a small feature of the compression device.
> and currently only mlx5 and isal support it.

Yes, but what that is mean?
If more drivers support DMA operations they can add the support there, no?


> Also the compressdev dataplane API relatively complicated to do just DMA
> copy.

You snipped more comments I wrote below 😊
Maybe it is related....

> So I think we need a separate driver framework for the DMA device.

Need to consider deprecation in compressdev if so....

> thanks
> 
> [snip]
> 
> >
> >
> > .
> >
Chengwen Feng July 6, 2021, 8:20 a.m. UTC | #14
On 2021/7/5 18:52, Bruce Richardson wrote:
> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
>> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:

[snip]

>>> + *
>>> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
>>> + * code.
>>> + * When using cookies, comply with the following rules:
>>> + * a) Cookies for each virtual queue are independent.
>>> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
>>> + *    the INT_MAX, it wraps back to zero.
> 
> I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> value, it means that we cannot use implicit wrap-around inside the CPU and
> have to check for the INT_MAX value. Better to:
> 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> uint16_t internally and wrap-around automatically, or:
> 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> drivers the flexibility at what value to wrap around.

+1 for option 1
BTW: option 2 seem a little complicated for driver and application.

>> When the system is configured as IOVA as VA
>> 1) Device supports any VA address like memory from rte_malloc(),
>> rte_memzone(), malloc, stack memory
>> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
>> memory backed by hugepage and added to DMA map.
>>
>> When the system is configured as IOVA as PA
>> 1) Devices support only PA addresses .
>>
>> IMO, Above needs to be  advertised as capability and application needs
>> to align with that
>> and I dont think application requests the driver to work in any of the modes.
>>
>>
> 
> I don't think we need this level of detail for addressing capabilities.
> Unless I'm missing something, the hardware should behave exactly as other
> hardware does taking in iova's.  If the user wants to check whether virtual
> addresses to pinned memory can be used directly, the user can call
> "rte_eal_iova_mode". We can't have a situation where some hardware uses one
> type of addresses and another hardware the other.
> 
> Therefore, the only additional addressing capability we should need to
> report is that the hardware can use SVM/SVA and use virtual addresses not
> in hugepage memory.
> 

I discuss the addressing capability in previous thread.
Indeed, we can reduce it to just one capability.

>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change without prior notice.
>>> + *
>>> + * Enqueue a fill operation onto the DMA virt queue
>>> + *
>>> + * This queues up a fill operation to be performed by hardware, but does not
>>> + * trigger hardware to begin that operation.
>>> + *
>>> + * @param dev_id
>>> + *   The identifier of the device.
>>> + * @param vq_id
>>> + *   The identifier of virt queue.
>>> + * @param pattern
>>> + *   The pattern to populate the destination buffer with.
>>> + * @param dst
>>> + *   The address of the destination buffer.
>>> + * @param length
>>> + *   The length of the destination buffer.
>>> + * @param flags
>>> + *   An opaque flags for this operation.
>>
>> PLEASE REMOVE opaque stuff from fastpath it will be a pain for
>> application writers as
>> they need to write multiple combinations of fastpath. flags are OK, if
>> we have a valid
>> generic flag now to control the transfer behavior.
>>
> 
> +1. Flags need to be explicitly listed. If we don't have any flags for now,
> we can specify that the value must be given as zero and it's for future
> use.
> 

+1, I will delete the flags parameters.

Currently we have fence which was implemented by ops, if later need more flags,
maybe we need create one new ops, this is not the way to expand.

So I think we need change fence ops to extra_flags ops:
	rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
to
	rte_dmadev_extra_flags(uint16_t dev_id, uint16_t vq_id, uint64_t flags);

So we could add fence by: rte_dmadev_extra_flags(dev_id, vq_id, RTE_DMA_FLAG_FENCE);
	
>>> +/**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change without prior notice.
>>> + *
>>> + * Returns the number of operations that failed to complete.
>>> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
>>> + *
>>> + * @param dev_id
>>> + *   The identifier of the device.
>>> + * @param vq_id
>>> + *   The identifier of virt queue.
>> (> + * @param nb_status
>>> + *   Indicates the size  of status array.
>>> + * @param[out] status
>>> + *   The error code of operations that failed to complete.
>>> + * @param[out] cookie
>>> + *   The last failed completed operation's cookie.
>>> + *
>>> + * @return
>>> + *   The number of operations that failed to complete.
>>> + *
>>> + * NOTE: The caller must ensure that the input parameter is valid and the
>>> + *       corresponding device supports the operation.
>>> + */
>>> +__rte_experimental
>>> +static inline uint16_t
>>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
>>> +                          const uint16_t nb_status, uint32_t *status,
>>> +                          dma_cookie_t *cookie)
>>
>> IMO, it is better to move cookie/rind_idx at 3.
>> Why it would return any array of errors? since it called after
>> rte_dmadev_completed() has
>> has_error. Is it better to change
>>
>> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
>> *cookie,  uint32_t *status)
>>
>> I also think, we may need to set status as bitmask and enumerate all
>> the combination of error codes
>> of all the driver and return string from driver existing rte_flow_error
>>
>> See
>> struct rte_flow_error {
>>         enum rte_flow_error_type type; /**< Cause field and error types. */
>>         const void *cause; /**< Object responsible for the error. */
>>         const char *message; /**< Human-readable error message. */
>> };
>>
> 
> I think we need a multi-return value API here, as we may add operations in
> future which have non-error status values to return. The obvious case is
> DMA engines which support "compare" operations. In that case a successful

Just curious, what the 'compare' operations's application scenario ?

> compare (as in there were no DMA or HW errors) can return "equal" or
> "not-equal" as statuses. For general "copy" operations, the faster
> completion op can be used to just return successful values (and only call
> this status version on error), while apps using those compare ops or a
> mixture of copy and compare ops, would always use the slower one that
> returns status values for each and every op..

In the current design, rte_dmadev_completed_fails applies only to failure
scenarios. Do you mean in 'compare' operations, the status always non-zero
whether or not the two are consistent ?

> 
> The ioat APIs used 32-bit integer values for this status array so as to
> allow e.g. 16-bits for error code and 16-bits for future status values. For
> most operations there should be a fairly small set of things that can go
> wrong, i.e. bad source address, bad destination address or invalid length.
> Within that we may have a couple of specifics for why an address is bad,
> but even so I don't think we need to start having multiple bit
> combinations.
> 
>>> +{
>>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
>>> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
>>> +}
>>> +
>>> +struct rte_dmadev_stats {
>>> +       uint64_t enqueue_fail_count;
>>> +       /**< Conut of all operations which failed enqueued */
>>> +       uint64_t enqueued_count;
>>> +       /**< Count of all operations which successful enqueued */
>>> +       uint64_t completed_fail_count;
>>> +       /**< Count of all operations which failed to complete */
>>> +       uint64_t completed_count;
>>> +       /**< Count of all operations which successful complete */
>>> +};
>>
>> We need to have capability API to tell which items are
>> updated/supported by the driver.
>>
> 
> I also would remove the enqueue fail counts, since they are better counted
> by the app. If a driver reports 20,000 failures we have no way of knowing
> if that is 20,000 unique operations which failed to enqueue or a single
> operation which failed to enqueue 20,000 times but succeeded on attempt
> 20,001.
> 

This does exist, The application may just show a DEBUG trace other than recording.
So I would recommend keeping at least know if it happens after a long run.

>
Chengwen Feng July 6, 2021, 9:08 a.m. UTC | #15
On 2021/7/6 14:50, Matan Azrad wrote:
> Hi
> 
> From: fengchengwen
>> On 2021/7/4 23:21, Matan Azrad wrote:
>>>
>>>
>>> From: Chengwen Feng
>>>> This patch introduces 'dmadevice' which is a generic type of DMA
>>>> device.
>>>>
>>>> The APIs of dmadev library exposes some generic operations which can
>>>> enable configuration and I/O with the DMA devices.
>>>>
>>> Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
>>>
>>
>> em, I just looked at the code.
>>
>> The RTE_COMP_ALGO_NULL is a small feature of the compression device.
>> and currently only mlx5 and isal support it.
> 
> Yes, but what that is mean?
> If more drivers support DMA operations they can add the support there, no?
> 

You mean to expand directly on compressdev ?
I think it hard to expand, and may break the compressdev concept.

> 
>> Also the compressdev dataplane API relatively complicated to do just DMA
>> copy.
> 
> You snipped more comments I wrote below 😊
> Maybe it is related....

Sorry, I just skipped.

'Did you consider also mbuf API usage for memory descriptor?'
---One scenario of the DMA is vhost-net, which src or dst could be mbuf, but the
peer were not mbuf. so here we use raw fields.

> 
>> So I think we need a separate driver framework for the DMA device.
> 
> Need to consider deprecation in compressdev if so....
> 
>> thanks
>>
>> [snip]
>>
>>>
>>>
>>> .
>>>
>
Matan Azrad July 6, 2021, 9:17 a.m. UTC | #16
From: fengchengwen
> On 2021/7/6 14:50, Matan Azrad wrote:
> > Hi
> >
> > From: fengchengwen
> >> On 2021/7/4 23:21, Matan Azrad wrote:
> >>>
> >>>
> >>> From: Chengwen Feng
> >>>> This patch introduces 'dmadevice' which is a generic type of DMA
> >>>> device.
> >>>>
> >>>> The APIs of dmadev library exposes some generic operations which
> >>>> can enable configuration and I/O with the DMA devices.
> >>>>
> >>> Did you consider RTE_COMP_ALGO_NULL xform in compressdev library?
> >>>
> >>
> >> em, I just looked at the code.
> >>
> >> The RTE_COMP_ALGO_NULL is a small feature of the compression device.
> >> and currently only mlx5 and isal support it.
> >
> > Yes, but what that is mean?
> > If more drivers support DMA operations they can add the support there,
> no?
> >
> 
> You mean to expand directly on compressdev ?
> I think it hard to expand, and may break the compressdev concept.

Maybe, what do you need to expand?
Also maybe your expansion is related also to compress, finally both are mem-to-mem offload.
 

> >
> >> Also the compressdev dataplane API relatively complicated to do just
> >> DMA copy.
> >
> > You snipped more comments I wrote below 😊
> > Maybe it is related....
> 
> Sorry, I just skipped.
> 
> 'Did you consider also mbuf API usage for memory descriptor?'
> ---One scenario of the DMA is vhost-net, which src or dst could be mbuf, but
> the peer were not mbuf. so here we use raw fields.

Did you consider using external\attached mbuf for this case?

Did you also consider raw API in cryptodev library?
 
> >
> >> So I think we need a separate driver framework for the DMA device.
> >
> > Need to consider deprecation in compressdev if so....
> >
> >> thanks
> >>
> >> [snip]
> >>
> >>>
> >>>
> >>> .
> >>>
> >
Bruce Richardson July 6, 2021, 9:27 a.m. UTC | #17
On Tue, Jul 06, 2021 at 04:20:38PM +0800, fengchengwen wrote:
> On 2021/7/5 18:52, Bruce Richardson wrote:
> > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> >> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> 
> [snip]
> 
> >>> + *
> >>> + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> >>> + * code.
> >>> + * When using cookies, comply with the following rules:
> >>> + * a) Cookies for each virtual queue are independent.
> >>> + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> >>> + *    the INT_MAX, it wraps back to zero.
> > 
> > I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> > value, it means that we cannot use implicit wrap-around inside the CPU and
> > have to check for the INT_MAX value. Better to:
> > 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> > uint16_t internally and wrap-around automatically, or:
> > 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> > drivers the flexibility at what value to wrap around.
> 
> +1 for option 1
> BTW: option 2 seem a little complicated for driver and application.
> 

I would tend to agree. I just included it in case there was a case where
you explicitly wanted more than UINT16_MAX values in your driver.

> >> When the system is configured as IOVA as VA
> >> 1) Device supports any VA address like memory from rte_malloc(),
> >> rte_memzone(), malloc, stack memory
> >> 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> >> memory backed by hugepage and added to DMA map.
> >>
> >> When the system is configured as IOVA as PA
> >> 1) Devices support only PA addresses .
> >>
> >> IMO, Above needs to be  advertised as capability and application needs
> >> to align with that
> >> and I dont think application requests the driver to work in any of the modes.
> >>
> >>
> > 
> > I don't think we need this level of detail for addressing capabilities.
> > Unless I'm missing something, the hardware should behave exactly as other
> > hardware does taking in iova's.  If the user wants to check whether virtual
> > addresses to pinned memory can be used directly, the user can call
> > "rte_eal_iova_mode". We can't have a situation where some hardware uses one
> > type of addresses and another hardware the other.
> > 
> > Therefore, the only additional addressing capability we should need to
> > report is that the hardware can use SVM/SVA and use virtual addresses not
> > in hugepage memory.
> > 
> 
> I discuss the addressing capability in previous thread.
> Indeed, we can reduce it to just one capability.
> 
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change without prior notice.
> >>> + *
> >>> + * Enqueue a fill operation onto the DMA virt queue
> >>> + *
> >>> + * This queues up a fill operation to be performed by hardware, but does not
> >>> + * trigger hardware to begin that operation.
> >>> + *
> >>> + * @param dev_id
> >>> + *   The identifier of the device.
> >>> + * @param vq_id
> >>> + *   The identifier of virt queue.
> >>> + * @param pattern
> >>> + *   The pattern to populate the destination buffer with.
> >>> + * @param dst
> >>> + *   The address of the destination buffer.
> >>> + * @param length
> >>> + *   The length of the destination buffer.
> >>> + * @param flags
> >>> + *   An opaque flags for this operation.
> >>
> >> PLEASE REMOVE opaque stuff from fastpath it will be a pain for
> >> application writers as
> >> they need to write multiple combinations of fastpath. flags are OK, if
> >> we have a valid
> >> generic flag now to control the transfer behavior.
> >>
> > 
> > +1. Flags need to be explicitly listed. If we don't have any flags for now,
> > we can specify that the value must be given as zero and it's for future
> > use.
> > 
> 
> +1, I will delete the flags parameters.
> 
> Currently we have fence which was implemented by ops, if later need more flags,
> maybe we need create one new ops, this is not the way to expand.
> 
> So I think we need change fence ops to extra_flags ops:
> 	rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
> to
> 	rte_dmadev_extra_flags(uint16_t dev_id, uint16_t vq_id, uint64_t flags);
> 
> So we could add fence by: rte_dmadev_extra_flags(dev_id, vq_id, RTE_DMA_FLAG_FENCE);
> 

I don't think this is the way to go. I think we will need the flags
parameter per op in the future, so we should keep it, even if it is always
zero for now. It gives us future expandability options.

> >>> +/**
> >>> + * @warning
> >>> + * @b EXPERIMENTAL: this API may change without prior notice.
> >>> + *
> >>> + * Returns the number of operations that failed to complete.
> >>> + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> >>> + *
> >>> + * @param dev_id
> >>> + *   The identifier of the device.
> >>> + * @param vq_id
> >>> + *   The identifier of virt queue.
> >> (> + * @param nb_status
> >>> + *   Indicates the size  of status array.
> >>> + * @param[out] status
> >>> + *   The error code of operations that failed to complete.
> >>> + * @param[out] cookie
> >>> + *   The last failed completed operation's cookie.
> >>> + *
> >>> + * @return
> >>> + *   The number of operations that failed to complete.
> >>> + *
> >>> + * NOTE: The caller must ensure that the input parameter is valid and the
> >>> + *       corresponding device supports the operation.
> >>> + */
> >>> +__rte_experimental
> >>> +static inline uint16_t
> >>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> >>> +                          const uint16_t nb_status, uint32_t *status,
> >>> +                          dma_cookie_t *cookie)
> >>
> >> IMO, it is better to move cookie/rind_idx at 3.
> >> Why it would return any array of errors? since it called after
> >> rte_dmadev_completed() has
> >> has_error. Is it better to change
> >>
> >> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> >> *cookie,  uint32_t *status)
> >>
> >> I also think, we may need to set status as bitmask and enumerate all
> >> the combination of error codes
> >> of all the driver and return string from driver existing rte_flow_error
> >>
> >> See
> >> struct rte_flow_error {
> >>         enum rte_flow_error_type type; /**< Cause field and error types. */
> >>         const void *cause; /**< Object responsible for the error. */
> >>         const char *message; /**< Human-readable error message. */
> >> };
> >>
> > 
> > I think we need a multi-return value API here, as we may add operations in
> > future which have non-error status values to return. The obvious case is
> > DMA engines which support "compare" operations. In that case a successful
> 
> Just curious, what the 'compare' operations's application scenario ?
> 

We are not looking to use this capability just now - but it's a capability
in our hardware that offers some interest possibilities so I'd like to
ensure it's possible to integrate in future. To do so, we just need to
ensure that the function which returns the "error" status - or status
generally, can be used to returns bursts of statuses, even if it's slower
compared to the regular completion return which just assumes all succeed.

> > compare (as in there were no DMA or HW errors) can return "equal" or
> > "not-equal" as statuses. For general "copy" operations, the faster
> > completion op can be used to just return successful values (and only call
> > this status version on error), while apps using those compare ops or a
> > mixture of copy and compare ops, would always use the slower one that
> > returns status values for each and every op..
> 
> In the current design, rte_dmadev_completed_fails applies only to failure
> scenarios. Do you mean in 'compare' operations, the status always non-zero
> whether or not the two are consistent ?
> 

Yes and no. There are two separate "status" values to be returned for such
operations - the actual HW status i.e. all parameters valid, and the actual
memcmp result of equal/non-equal. In our completion records these are
called "status" and "result" respectively. "Result" is only valid if
"status" is successful, and is not relevant for copy or fill or similar
ops. Therefore, to support this, we just need some bits in "status"
reserved for that result case, and the completed_status op to return an
array of status values, not just a single one.

> > 
> > The ioat APIs used 32-bit integer values for this status array so as to
> > allow e.g. 16-bits for error code and 16-bits for future status values. For
> > most operations there should be a fairly small set of things that can go
> > wrong, i.e. bad source address, bad destination address or invalid length.
> > Within that we may have a couple of specifics for why an address is bad,
> > but even so I don't think we need to start having multiple bit
> > combinations.
> > 
> >>> +{
> >>> +       struct rte_dmadev *dev = &rte_dmadevices[dev_id];
> >>> +       return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
> >>> +}
> >>> +
> >>> +struct rte_dmadev_stats {
> >>> +       uint64_t enqueue_fail_count;
> >>> +       /**< Conut of all operations which failed enqueued */
> >>> +       uint64_t enqueued_count;
> >>> +       /**< Count of all operations which successful enqueued */
> >>> +       uint64_t completed_fail_count;
> >>> +       /**< Count of all operations which failed to complete */
> >>> +       uint64_t completed_count;
> >>> +       /**< Count of all operations which successful complete */
> >>> +};
> >>
> >> We need to have capability API to tell which items are
> >> updated/supported by the driver.
> >>
> > 
> > I also would remove the enqueue fail counts, since they are better counted
> > by the app. If a driver reports 20,000 failures we have no way of knowing
> > if that is 20,000 unique operations which failed to enqueue or a single
> > operation which failed to enqueue 20,000 times but succeeded on attempt
> > 20,001.
> > 
> 
> This does exist, The application may just show a DEBUG trace other than recording.
> So I would recommend keeping at least know if it happens after a long run.
> 
I disagree here - the enqueue failure should only be tracked by the app,
because:
1. only app knows whether a particular enqueue failure is retry or not and
   how it should be counted
2. these failures cannot be counted by hardware and must be counted by
   software, so adding additional operations to our enqueue path. In the
   retry case, that could be a lot of load-update-stores that will have to
   be done in the driver, while if tracked in the app, the count would
   just be a register increment.

Operation failures can be tracked in driver stats, though, as that is
related to hardware operation.

/Bruce
Bruce Richardson July 6, 2021, 10:01 a.m. UTC | #18
On Tue, Jul 06, 2021 at 11:01:17AM +0800, fengchengwen wrote:
> Many thanks, mostly OK, and a few comment inline
> 
> On 2021/7/4 17:30, Jerin Jacob wrote:
> > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> >>
> >> This patch introduces 'dmadevice' which is a generic type of DMA
> >> device.
> ...
> >> +#include <rte_compat.h>
> > 
> > Sort in alphabetical order.
> > 
> >> +
> >> +/**
> >> + * dma_cookie_t - an opaque DMA cookie
> > 
> > Since we are defining the behaviour is not opaque any more.
> > I think, it is better to call ring_idx or so.
> > 
> 
> 
> This type is designed to have two meanings, return <0 on failure and return >=0 on success.
> 
> How about follwing definition:
>     typedef int dma_ring_index_t;
> 
> if >= 0, it's value range is [0, 65535] = uint16_t, so driver implementation will simply.
> if <0, then men enqueue failure
> 
> For driver, it could hold uint16_t ring_index, if enquer fail just return fail, else return
> the current ring_index, and update it by: ring_index++;
> 

Well, yes and no on the "two meanings". For the enqueue function, yes the
return value can have two meanings, but I don't consider them one type. On
the completion call, however, this can only be positive values <
UINT16_MAX, so having two meanings is actually confusing. Better to have

* enqueue return regular int, with doxygen comment 
	"@return 
	  Negative on error, otherwise job index between 0 and UINT16_MAX"
* for completions, take a uint16_t* parameter for the last completed index
  since no negative values are needed.

Beyond this, we generally don't use typedefs in DPDK for basic types (with
a few exceptions e.g. rte_iova_t), and save their use only for function
pointers.

> >> +
> >> +/**
> >> + * A structure used to retrieve the contextual information of
> >> + * an DMA device
> >> + */
> >> +struct rte_dmadev_info {
> >> +       /**
> >> +        * Fields filled by framewok
> > 
> > typo.
> > 
> >> +        */
> >> +       struct rte_device *device; /**< Generic Device information */
> >> +       const char *driver_name; /**< Device driver name */
> >> +       int socket_id; /**< Socket ID where memory is allocated */
> >> +
> >> +       /**
> >> +        * Specification fields filled by driver
> >> +        */
> >> +       uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
> >> +       uint16_t max_hw_queues; /**< Maximum number of HW queues. */
> >> +       uint16_t max_vqs_per_hw_queue;
> >> +       /**< Maximum number of virt queues to allocate per HW queue */
> >> +       uint16_t max_desc;
> >> +       /**< Maximum allowed number of virt queue descriptors */
> >> +       uint16_t min_desc;
> >> +       /**< Minimum allowed number of virt queue descriptors */
> > 
> > Please add max_nb_segs. i.e maximum number of segments supported.
> 
> Do you means something like "burst_size" ?
> 
> > 
> >> +
> >> +       /**
> >> +        * Status fields filled by driver
> >> +        */
> >> +       uint16_t nb_hw_queues; /**< Number of HW queues configured */
> >> +       uint16_t nb_vqs; /**< Number of virt queues configured */
> >> +};
> >> + i
> >> +
> >> +/**
> >> + * dma_address_type
> >> + */
> >> +enum dma_address_type {
> >> +       DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
> >> +       DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
> >> +};
> >> +
> >> +/**
> >> + * A structure used to configure a DMA device.
> >> + */
> >> +struct rte_dmadev_conf {
> >> +       enum dma_address_type addr_type; /**< Address type to used */
> > 
> > I think, there are 3 kinds of limitations/capabilities.
> > 
> > When the system is configured as IOVA as VA
> > 1) Device supports any VA address like memory from rte_malloc(),
> > rte_memzone(), malloc, stack memory
> > 2) Device support only VA address from rte_malloc(), rte_memzone() i.e
> > memory backed by hugepage and added to DMA map.
> > 
> > When the system is configured as IOVA as PA
> > 1) Devices support only PA addresses .
> > 
> > IMO, Above needs to be  advertised as capability and application needs
> > to align with that
> > and I dont think application requests the driver to work in any of the modes.
> > 
> 
> OK, Let's put together our ideas on address type:
> 
> There are three mode, we may define as:
> 	IOVA_as_VA-ALL     ---for device which may need support SVA feature
>                            ---may also be a CPU memcpy 'device'
> 	IOVA_as_VA         ---for device which need support IOMMU
> 	IOVA_as_PA
> 
> There are many combination of the modes which device supports: eg. some device
> may only support IOVA_as_PA, some may only support IOVA_as_VA, and some support
> IOVA_as_PA and IOVA_as_VA. The specific runtime type is determined by the vfio
> and drive capability(e.g RTE_PCI_DRV_NEED_IOVA_AS_VA).
> 
> So we already define two capabilities for this:
> 	#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
> 					---this cover IOVA_as_VA and IOVA_as_PA
> 	#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
> 					---this cover IOVA_as_VA-ALL
> for a device which don't support SVA:
> 	only declare RTE_DMA_DEV_CAPA_IOVA
> for a device which support SVA:
> 	delcare RTE_DAMA_DEV_CAPA_IOVA
> 	delcare RTE_DMA_DEV_CAPA_VA (only when IOMMU enabled and 'SVA flag' was set)
> for a CPU memcpy device:
> 	only declare RTE_DMA_DEV_CAPA_VA
> 
> As application:
> - if RTE_DMA_DEV_CAPA_VA support, then it could pass any va address to the DMA,
> - else if RTE_DMA_DEV_CAPA_IOVA support, then it should pass iova address to the DMA
> - else the DMA device should not exist.
> 

I still don't think we need all of this. DPDK already has support through
the existing bus infrastructure for determining if DPDK needs to use
physical or virtual addresses, so we should not be duplicating that as
devices *cannot* use a different addressing mode to DPDK itself.
Given that, the only flag we need is one to indicate SVA support.

> > 
<snip>
> > 
> > I also think, we may need to set status as bitmask and enumerate all
> > the combination of error codes
> > of all the driver and return string from driver existing rte_flow_error
> > 
> 
> bitmask has limit for most 32 (or we can extend 64), and also the rte_flow_error is
> heavy.
> 
> Considering that errors are a small number of scenarios, so it's OK to
> pass status array, and status have 32bit it could denotes a very large number
> of errcode.
> 

+1 to this.

> >> +
> >> +struct rte_dmadev_stats {
> >> +       uint64_t enqueue_fail_count;
> >> +       /**< Conut of all operations which failed enqueued */
> >> +       uint64_t enqueued_count;
> >> +       /**< Count of all operations which successful enqueued */
> >> +       uint64_t completed_fail_count;
> >> +       /**< Count of all operations which failed to complete */
> >> +       uint64_t completed_count;
> >> +       /**< Count of all operations which successful complete */
> >> +};
> > 
> > We need to have capability API to tell which items are
> > updated/supported by the driver.
> > 
> 
> There are fewer fields, and I don't think it's necessary to add capability API,
> for those who don't support, it could don't implement the callback.
> For those support, these fields are minimum et.
> 
> > 
> >> diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
> >> new file mode 100644
> >> index 0000000..a3afea2
> >> --- /dev/null
> >> +++ b/lib/dmadev/rte_dmadev_core.h
> >> @@ -0,0 +1,98 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright 2021 HiSilicon Limited.
> >> + */
> >> +
> >> +#ifndef _RTE_DMADEV_CORE_H_
> >> +#define _RTE_DMADEV_CORE_H_
> >> +
> >> +/**
> >> + * @file
> >> + *
> >> + * RTE DMA Device internal header.
> >> + *
> >> + * This header contains internal data types. But they are still part of the
> >> + * public API because they are used by inline public functions.
> >> + */
> >> +
> >> +struct rte_dmadev;
> >> +
> >> +typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
> >> +                                     void *src, void *dst,
> >> +                                     uint32_t length, uint64_t flags);
> >> +/**< @internal Function used to enqueue a copy operation. */
> > 
> > To avoid namespace conflict(as it is public API) use rte_
> 
> These are internal function used by driver, not application.
> and the eth/regexdev_core also defined without rte_
> 
> So I think it should remain as it is.
> 

Even if only used by a driver, APIs are exported from the .so built for
the library, which means that they become public for apps using the lib.
Even for header-only symbols for drivers, it's good practice to put the
prefix since they are for use outside the compilation unit.

> > 
> > 
> >> +
> >> +/**
> >> + * The data structure associated with each DMA device.
> >> + */
> >> +struct rte_dmadev {
> >> +       /**< Enqueue a copy operation onto the DMA device. */
> >> +       dmadev_copy_t copy;
> >> +       /**< Enqueue a scatter list copy operation onto the DMA device. */
> >> +       dmadev_copy_sg_t copy_sg;
> >> +       /**< Enqueue a fill operation onto the DMA device. */
> >> +       dmadev_fill_t fill;
> >> +       /**< Enqueue a scatter list fill operation onto the DMA device. */
> >> +       dmadev_fill_sg_t fill_sg;
> >> +       /**< Add a fence to force ordering between operations. */
> >> +       dmadev_fence_t fence;
> >> +       /**< Trigger hardware to begin performing enqueued operations. */
> >> +       dmadev_perform_t perform;
> >> +       /**< Returns the number of operations that successful completed. */
> >> +       dmadev_completed_t completed;
> >> +       /**< Returns the number of operations that failed to complete. */
> >> +       dmadev_completed_fails_t completed_fails;
> > 
> > We need to limit fastpath items in 1 CL
> 
> yes, currently there are 8 callback, which just fill one cache line.
> 

Before we get overly concerned about this, I think we should benchmark it
to see how much our "one cacheline" is giving us compared to having them in
ops. For example, the "perform" doorbell function, or the completed
function is only called once every burst, so it would be interesting to see
how much difference it really makes for that.

/Bruce
Bruce Richardson July 6, 2021, 10:02 a.m. UTC | #19
On Tue, Jul 06, 2021 at 11:56:03AM +0800, fengchengwen wrote:
> Many thanks, mostly OK, a few comment inline
> 
> On 2021/7/4 22:57, Andrew Rybchenko wrote:
> > On 7/2/21 4:18 PM, Chengwen Feng wrote:
> >> This patch introduces 'dmadevice' which is a generic type of DMA
> >> device.
<snip>
> > Do we really need both stats and xstats from the very
> > beginning? I think it is better to start from just
> > generic stats and add xstats when it is really required.
> 
> OK, but I think we should add one dump ops, which could be useful to
> find the problem.
> 
+1 to both suggestions - dropping xstats (for now) and adding dump fn.
Jerin Jacob July 7, 2021, 8:08 a.m. UTC | #20
On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> >
> > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > >
> > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > device.
> <snip>
> > >
> > > +1 and the terminology with regards to queues and channels. With our ioat
> > > hardware, each HW queue was called a channel for instance.
> >
> > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > HW has more than
> > 1 queues it can be exposed as separate dmadev dev.
> >
>
> Fine for me.
>
> However, just to confirm that Morten's suggestion of using a
> (device-specific void *) channel pointer rather than dev_id + channel_id
> pair of parameters won't work for you? You can't store a pointer or dev
> index in the channel struct in the driver?

Yes. That will work. To confirm, the suggestion is to use, void *
object instead of channel_id,
That will avoid one more indirection.(index -> pointer)


>
> >
> <snip>
> > > > > + *
> > > > > + * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
> > > > > + * code.
> > > > > + * When using cookies, comply with the following rules:
> > > > > + * a) Cookies for each virtual queue are independent.
> > > > > + * b) For a virt queue, the cookie are monotonically incremented, when it reach
> > > > > + *    the INT_MAX, it wraps back to zero.
> > >
> > > I disagree with the INT_MAX (or INT32_MAX) value here. If we use that
> > > value, it means that we cannot use implicit wrap-around inside the CPU and
> > > have to check for the INT_MAX value. Better to:
> > > 1. Specify that it wraps at UINT16_MAX which allows us to just use a
> > > uint16_t internally and wrap-around automatically, or:
> > > 2. Specify that it wraps at a power-of-2 value >= UINT16_MAX, giving
> > > drivers the flexibility at what value to wrap around.
> >
> > I think, (2) better than 1. I think, even better to wrap around the number of
> > descriptors configured in dev_configure()(We cake make this as the power of 2),
> >
>
> Interesting, I hadn't really considered that before. My only concern
> would be if an app wants to keep values in the app ring for a while after
> they have been returned from dmadev. I thought it easier to have the full
> 16-bit counter value returned to the user to give the most flexibility,
> given that going from that to any power-of-2 ring size smaller is a trivial
> operation.
>
> Overall, while my ideal situation is to always have a 0..UINT16_MAX return
> value from the function, I can live with your suggestion of wrapping at
> ring_size, since drivers will likely do that internally anyway.
> I think wrapping at INT32_MAX is too awkward and will be error prone since
> we can't rely on hardware automatically wrapping to zero, nor on the driver
> having pre-masked the value.

OK. +1 for UINT16_MAX

>
> > >
> > > > > + * c) The initial cookie of a virt queue is zero, after the device is stopped or
> > > > > + *    reset, the virt queue's cookie needs to be reset to zero.
> <snip>
> > > >
> > > > Please add some good amount of reserved bits and have API to init this
> > > > structure for future ABI stability, say rte_dmadev_queue_config_init()
> > > > or so.
> > > >
> > >
> > > I don't think that is necessary. Since the config struct is used only as
> > > parameter to the config function, any changes to it can be managed by
> > > versioning that single function. Padding would only be necessary if we had
> > > an array of these config structs somewhere.
> >
> > OK.
> >
> > For some reason, the versioning API looks ugly to me in code instead of keeping
> > some rsvd fields look cool to me with init function.
> >
> > But I agree. function versioning works in this case. No need to find other API
> > if tt is not general DPDK API practice.
> >
>
> The one thing I would suggest instead of the padding is for the internal
> APIS, to pass the struct size through, since we can't version those - and
> for padding we can't know whether any replaced padding should be used or
> not. Specifically:
>
>         typedef int (*rte_dmadev_configure_t)(struct rte_dmadev *dev, struct
>                         rte_dmadev_conf *cfg, size_t cfg_size);
>
> but for the public function:
>
>         int
>         rte_dmadev_configure(struct rte_dmadev *dev, struct
>                         rte_dmadev_conf *cfg)
>         {
>                 ...
>                 ret = dev->ops.configure(dev, cfg, sizeof(*cfg));
>                 ...
>         }

Makes sense.

>
> Then if we change the structure and version the config API, the driver can
> tell from the size what struct version it is and act accordingly. Without
> that, each time the struct changed, we'd have to add a new function pointer
> to the device ops.
>
> > In other libraries, I have seen such _init or function that can use
> > for this as well as filling default value
> > in some cases implementation values is not zero).
> > So that application can avoid memset for param structure.
> > Added rte_event_queue_default_conf_get() in eventdev spec for this.
> >
>
> I think that would largely have the same issues, unless it returned a
> pointer to data inside the driver - and which therefore could not be
> modified. Alternatively it would mean that the memory would have been
> allocated in the driver and we would need to ensure proper cleanup
> functions were called to free memory afterwards. Supporting having the
> config parameter as a local variable I think makes things a lot easier.
>
> > No strong opinion on this.
> >
> >
> >
> > >
> > > >
> > > > > +
> > > > > +/**
> > > > > + * A structure used to retrieve information of a DMA virt queue.
> > > > > + */
> > > > > +struct rte_dmadev_queue_info {
> > > > > +       enum dma_transfer_direction direction;
> > > >
> > > > A queue may support all directions so I think it should be a bitfield.
> > > >
> > > > > +       /**< Associated transfer direction */
> > > > > +       uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
> > > > > +       uint16_t nb_desc; /**< Number of descriptor for this virt queue */
> > > > > +       uint64_t dev_flags; /**< Device specific flags */
> > > > > +};
> > > > > +
> > > >
> > > > > +__rte_experimental
> > > > > +static inline dma_cookie_t
> > > > > +rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
> > > > > +                  const struct dma_scatterlist *sg,
> > > > > +                  uint32_t sg_len, uint64_t flags)
> > > >
> > > > I would like to change this as:
> > > > rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id, const struct
> > > > rte_dma_sg *src, uint32_t nb_src,
> > > > const struct rte_dma_sg *dst, uint32_t nb_dst) or so allow the use case like

In the above syntax, @Chengchang Tang
rte_dma_sg needs to contains only ptr and size.

> > > > src 30 MB copy can be splitted as written as 1 MB x 30 dst.
> > > >
>
> Out of interest, do you see much benefit (and in what way) from having the
> scatter-gather support? Unlike sending 5 buffers in one packet rather than
> 5 buffers in 5 packets to a NIC, copying an array of memory in one op vs
> multiple is functionally identical.

Knowing upfront or in shot if such segments expressed can have better
optimization
in drivers like
1) In one DMA job request HW can fill multiple segments vs multiple
DMA job requests with each segment.
2) Single completion i.e less overhead system.
3) Less latency for the job requests.


>
> > > >
> > > >
> <snip>
> > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > for the pointer) and to avoid
> > function overhead, Can we use one bit of flags of op function to
> > enable the fence?
> >
>
> The original ioat implementation did exactly that. However, I then
> discovered that because a fence logically belongs between two operations,
> does the fence flag on an operation mean "don't do any jobs after this
> until this job has completed" or does it mean "don't start this job until
> all previous jobs have completed". [Or theoretically does it mean both :-)]
> Naturally, some hardware does it the former way (i.e. fence flag goes on
> last op before fence), while other hardware the latter way (i.e. fence flag
> goes on first op after the fence). Therefore, since fencing is about
> ordering *between* two (sets of) jobs, I decided that it should do exactly
> that and go between two jobs, so there is no ambiguity!
>
> However, I'm happy enough to switch to having a fence flag, but I think if
> we do that, it should be put in the "first job after fence" case, because
> it is always easier to modify a previously written job if we need to, than
> to save the flag for a future one.
>
> Alternatively, if we keep the fence as a separate function, I'm happy
> enough for it not to be on the same cacheline as the "hot" operations,
> since fencing will always introduce a small penalty anyway.

Ack.
You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
there any use case for this or it makes sense for your HW)


For us, Fence is NOP for us as we have an implicit fence between each
HW job descriptor.


>
> > >
> > > >
> <snip>
> > > > Since we have additional function call overhead in all the
> > > > applications for this scheme, I would like to understand
> > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > driver/application PoV?
> > > >
> > >
> > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > function calls was noticably less than the cost of building up the
> > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > fully populated before the TX call anyway.]
> >
> > OK. I agree with stack population.
> >
> > My question was more on doing implicit doorbell update enq. Is doorbell write
> > costly in other HW compare to a function call? In our HW, it is just write of
> > the number of instructions written in a register.
> >
> > Also, we need to again access the internal PMD memory structure to find
> > where to write etc if it is a separate function.
> >
>
> The cost varies depending on a number of factors - even writing to a single
> HW register can be very slow if that register is mapped as device
> (uncacheable) memory, since (AFAIK) it will act as a full fence and wait

I don't know, At least in our case, writes are write-back. so core does not need
to wait.(If there is no read operation).

> for the write to go all the way to hardware. For more modern HW, the cost
> can be lighter. However, any cost of HW writes is going to be the same
> whether its a separate function call or not.
>
> However, the main thing about the doorbell update is that it's a
> once-per-burst thing, rather than a once-per-job. Therefore, even if you
> have to re-read the struct memory (which is likely still somewhere in your
> cores' cache), any extra small cost of doing so is to be amortized over the
> cost of a whole burst of copies.

Linux kernel has xmit_more flag in skb to address similar thing.
i.e enq job flag can have one more bit field to say update ring bell or not?
Rather having yet another function overhead.IMO, it is the best of both worlds.


>
> >
> > >
> > > >
> <snip>
> > > > > +
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice.
> > > > > + *
> > > > > + * Returns the number of operations that failed to complete.
> > > > > + * NOTE: This API was used when rte_dmadev_completed has_error was set.
> > > > > + *
> > > > > + * @param dev_id
> > > > > + *   The identifier of the device.
> > > > > + * @param vq_id
> > > > > + *   The identifier of virt queue.
> > > > (> + * @param nb_status
> > > > > + *   Indicates the size  of status array.
> > > > > + * @param[out] status
> > > > > + *   The error code of operations that failed to complete.
> > > > > + * @param[out] cookie
> > > > > + *   The last failed completed operation's cookie.
> > > > > + *
> > > > > + * @return
> > > > > + *   The number of operations that failed to complete.
> > > > > + *
> > > > > + * NOTE: The caller must ensure that the input parameter is valid and the
> > > > > + *       corresponding device supports the operation.
> > > > > + */
> > > > > +__rte_experimental
> > > > > +static inline uint16_t
> > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
> > > > > +                          const uint16_t nb_status, uint32_t *status,
> > > > > +                          dma_cookie_t *cookie)
> > > >
> > > > IMO, it is better to move cookie/rind_idx at 3.
> > > > Why it would return any array of errors? since it called after
> > > > rte_dmadev_completed() has
> > > > has_error. Is it better to change
> > > >
> > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id, dma_cookie_t
> > > > *cookie,  uint32_t *status)
> > > >
> > > > I also think, we may need to set status as bitmask and enumerate all
> > > > the combination of error codes
> > > > of all the driver and return string from driver existing rte_flow_error
> > > >
> > > > See
> > > > struct rte_flow_error {
> > > >         enum rte_flow_error_type type; /**< Cause field and error types. */
> > > >         const void *cause; /**< Object responsible for the error. */
> > > >         const char *message; /**< Human-readable error message. */
> > > > };
> > > >
> > >
> > > I think we need a multi-return value API here, as we may add operations in
> > > future which have non-error status values to return. The obvious case is
> > > DMA engines which support "compare" operations. In that case a successful
> > > compare (as in there were no DMA or HW errors) can return "equal" or
> > > "not-equal" as statuses. For general "copy" operations, the faster
> > > completion op can be used to just return successful values (and only call
> > > this status version on error), while apps using those compare ops or a
> > > mixture of copy and compare ops, would always use the slower one that
> > > returns status values for each and every op..
> > >
> > > The ioat APIs used 32-bit integer values for this status array so as to
> > > allow e.g. 16-bits for error code and 16-bits for future status values. For
> > > most operations there should be a fairly small set of things that can go
> > > wrong, i.e. bad source address, bad destination address or invalid length.
> > > Within that we may have a couple of specifics for why an address is bad,
> > > but even so I don't think we need to start having multiple bit
> > > combinations.
> >
> > OK. What is the purpose of errors status? Is it for application printing it or
> > Does the application need to take any action based on specific error requests?
>
> It's largely for information purposes, but in the case of SVA/SVM errors
> could occur due to the memory not being pinned, i.e. a page fault, in some
> cases. If that happens, then it's up the app to either touch the memory and
> retry the copy, or to do a SW memcpy as a fallback.
>
> In other error cases, I think it's good to tell the application if it's
> passing around bad data, or data that is beyond the scope of hardware, e.g.
> a copy that is beyond what can be done in a single transaction for a HW
> instance. Given that there are always things that can go wrong, I think we
> need some error reporting mechanism.
>
> > If the former is scope, then we need to define the standard enum value
> > for the error right?
> > ie. uint32_t *status needs to change to enum rte_dma_error or so.
> >
> Sure. Perhaps an error/status structure either is an option, where we
> explicitly call out error info from status info.

Agree. Better to have a structure with filed like,

1)  enum rte_dma_error_type
2)  memory to store, informative message on fine aspects of error.
LIke address caused issue etc.(Which will be driver-specific
information).


>
> >
> >
> <snip to end>
>
> /Bruce
Bruce Richardson July 7, 2021, 8:35 a.m. UTC | #21
On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
> On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> > >
> > > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > > <bruce.richardson@intel.com> wrote:
> > > >
> > > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > > >
> > > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > > device.
> > <snip>
> > > >
> > > > +1 and the terminology with regards to queues and channels. With our ioat
> > > > hardware, each HW queue was called a channel for instance.
> > >
> > > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > > HW has more than
> > > 1 queues it can be exposed as separate dmadev dev.
> > >
> >
> > Fine for me.
> >
> > However, just to confirm that Morten's suggestion of using a
> > (device-specific void *) channel pointer rather than dev_id + channel_id
> > pair of parameters won't work for you? You can't store a pointer or dev
> > index in the channel struct in the driver?
> 
> Yes. That will work. To confirm, the suggestion is to use, void *
> object instead of channel_id,
> That will avoid one more indirection.(index -> pointer)
> 

The proposal was to use it in place of "dev_id + channel_id", i.e.

copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)

Where the channel pointer implicitly indicates the device too. However, I
realise now that this would be something completely transparent to the
driver as it would all have to be implemented in the dmadev level, and
lead to lots of duplication of function pointers, etc. Therefore, let's
just go with original scheme. :-(

> 
> >
> > >

<snip>

> > > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > > for the pointer) and to avoid
> > > function overhead, Can we use one bit of flags of op function to
> > > enable the fence?
> > >
> >
> > The original ioat implementation did exactly that. However, I then
> > discovered that because a fence logically belongs between two operations,
> > does the fence flag on an operation mean "don't do any jobs after this
> > until this job has completed" or does it mean "don't start this job until
> > all previous jobs have completed". [Or theoretically does it mean both :-)]
> > Naturally, some hardware does it the former way (i.e. fence flag goes on
> > last op before fence), while other hardware the latter way (i.e. fence flag
> > goes on first op after the fence). Therefore, since fencing is about
> > ordering *between* two (sets of) jobs, I decided that it should do exactly
> > that and go between two jobs, so there is no ambiguity!
> >
> > However, I'm happy enough to switch to having a fence flag, but I think if
> > we do that, it should be put in the "first job after fence" case, because
> > it is always easier to modify a previously written job if we need to, than
> > to save the flag for a future one.
> >
> > Alternatively, if we keep the fence as a separate function, I'm happy
> > enough for it not to be on the same cacheline as the "hot" operations,
> > since fencing will always introduce a small penalty anyway.
> 
> Ack.
> You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
> there any use case for this or it makes sense for your HW)
> 
> 
> For us, Fence is NOP for us as we have an implicit fence between each
> HW job descriptor.
> 

I actually still think that having a separate fence function in the "ops"
section is the best approach here. It's unabiguous as to whether it's
fence-before or fence-after, and if we have it in the ops, it doesn't use a
"fast-path" slot.

However, if we *really* want to use a flag instead, I don't see the value
in having two flags, it will be really confusing.  Instead, if we do go
with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
that the fence occurs before the job in question.

> 
> >
> > > >
> > > > >
> > <snip>
> > > > > Since we have additional function call overhead in all the
> > > > > applications for this scheme, I would like to understand
> > > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > > driver/application PoV?
> > > > >
> > > >
> > > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > > function calls was noticably less than the cost of building up the
> > > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > > fully populated before the TX call anyway.]
> > >
> > > OK. I agree with stack population.
> > >
> > > My question was more on doing implicit doorbell update enq. Is doorbell write
> > > costly in other HW compare to a function call? In our HW, it is just write of
> > > the number of instructions written in a register.
> > >
> > > Also, we need to again access the internal PMD memory structure to find
> > > where to write etc if it is a separate function.
> > >
> >
> > The cost varies depending on a number of factors - even writing to a single
> > HW register can be very slow if that register is mapped as device
> > (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
> 
> I don't know, At least in our case, writes are write-back. so core does not need
> to wait.(If there is no read operation).
> 
> > for the write to go all the way to hardware. For more modern HW, the cost
> > can be lighter. However, any cost of HW writes is going to be the same
> > whether its a separate function call or not.
> >
> > However, the main thing about the doorbell update is that it's a
> > once-per-burst thing, rather than a once-per-job. Therefore, even if you
> > have to re-read the struct memory (which is likely still somewhere in your
> > cores' cache), any extra small cost of doing so is to be amortized over the
> > cost of a whole burst of copies.
> 
> Linux kernel has xmit_more flag in skb to address similar thing.
> i.e enq job flag can have one more bit field to say update ring bell or not?
> Rather having yet another function overhead.IMO, it is the best of both worlds.
> 

It's just more conditionals and branches all through the code. Inside the
user application, the user has to check whether to set the flag or not (or
special-case the last transaction outside the loop), and within the driver,
there has to be a branch whether or not to call the doorbell function. The
code on both sides is far simpler and more readable if the doorbell
function is exactly that - a separate function.

> 
> >
> > >
> > > >
> > > > >
> > <snip>
> > > > > > + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> > > > > > without prior notice.  + * + * Returns the number of operations
> > > > > > that failed to complete.  + * NOTE: This API was used when
> > > > > > rte_dmadev_completed has_error was set.  + * + * @param dev_id
> > > > > > + *   The identifier of the device.  + * @param vq_id + *   The
> > > > > > identifier of virt queue.
> > > > > (> + * @param nb_status
> > > > > > + *   Indicates the size  of status array.  + * @param[out]
> > > > > > status + *   The error code of operations that failed to
> > > > > > complete.  + * @param[out] cookie + *   The last failed
> > > > > > completed operation's cookie.  + * + * @return + *   The number
> > > > > > of operations that failed to complete.  + * + * NOTE: The
> > > > > > caller must ensure that the input parameter is valid and the +
> > > > > > *       corresponding device supports the operation.  + */
> > > > > > +__rte_experimental +static inline uint16_t
> > > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> > > > > > const uint16_t nb_status, uint32_t *status, +
> > > > > > dma_cookie_t *cookie)
> > > > >
> > > > > IMO, it is better to move cookie/rind_idx at 3.  Why it would
> > > > > return any array of errors? since it called after
> > > > > rte_dmadev_completed() has has_error. Is it better to change
> > > > >
> > > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> > > > > dma_cookie_t *cookie,  uint32_t *status)
> > > > >
> > > > > I also think, we may need to set status as bitmask and enumerate
> > > > > all the combination of error codes of all the driver and return
> > > > > string from driver existing rte_flow_error
> > > > >
> > > > > See struct rte_flow_error { enum rte_flow_error_type type; /**<
> > > > > Cause field and error types. */ const void *cause; /**< Object
> > > > > responsible for the error. */ const char *message; /**<
> > > > > Human-readable error message. */ };
> > > > >
> > > >
> > > > I think we need a multi-return value API here, as we may add
> > > > operations in future which have non-error status values to return.
> > > > The obvious case is DMA engines which support "compare" operations.
> > > > In that case a successful compare (as in there were no DMA or HW
> > > > errors) can return "equal" or "not-equal" as statuses. For general
> > > > "copy" operations, the faster completion op can be used to just
> > > > return successful values (and only call this status version on
> > > > error), while apps using those compare ops or a mixture of copy and
> > > > compare ops, would always use the slower one that returns status
> > > > values for each and every op..
> > > >
> > > > The ioat APIs used 32-bit integer values for this status array so
> > > > as to allow e.g. 16-bits for error code and 16-bits for future
> > > > status values. For most operations there should be a fairly small
> > > > set of things that can go wrong, i.e. bad source address, bad
> > > > destination address or invalid length.  Within that we may have a
> > > > couple of specifics for why an address is bad, but even so I don't
> > > > think we need to start having multiple bit combinations.
> > >
> > > OK. What is the purpose of errors status? Is it for application
> > > printing it or Does the application need to take any action based on
> > > specific error requests?
> >
> > It's largely for information purposes, but in the case of SVA/SVM
> > errors could occur due to the memory not being pinned, i.e. a page
> > fault, in some cases. If that happens, then it's up the app to either
> > touch the memory and retry the copy, or to do a SW memcpy as a
> > fallback.
> >
> > In other error cases, I think it's good to tell the application if it's
> > passing around bad data, or data that is beyond the scope of hardware,
> > e.g.  a copy that is beyond what can be done in a single transaction
> > for a HW instance. Given that there are always things that can go
> > wrong, I think we need some error reporting mechanism.
> >
> > > If the former is scope, then we need to define the standard enum
> > > value for the error right?  ie. uint32_t *status needs to change to
> > > enum rte_dma_error or so.
> > >
> > Sure. Perhaps an error/status structure either is an option, where we
> > explicitly call out error info from status info.
> 
> Agree. Better to have a structure with filed like,
> 
> 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> fine aspects of error.  LIke address caused issue etc.(Which will be
> driver-specific information).
> 
The only issue I have with that is that once we have driver specific
information it is of little use to the application, since it can't know
anything about it excepy maybe log it.  I'd much rather have a set of error
codes telling user that "source address is wrong", "dest address is wrong",
and a generic "an address is wrong" in case driver/HW cannot distinguish
source of error. Can we see how far we get with just error codes before we
start into passing string messages around and all the memory management
headaches that implies.

/Bruce
Jerin Jacob July 7, 2021, 10:34 a.m. UTC | #22
On Wed, Jul 7, 2021 at 2:05 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
> > On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
> > <bruce.richardson@intel.com> wrote:
> > >
> > > On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> > > >
> > > > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > > > <bruce.richardson@intel.com> wrote:
> > > > >
> > > > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > > > >
> > > > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > > > device.
> > > <snip>
> > > > >
> > > > > +1 and the terminology with regards to queues and channels. With our ioat
> > > > > hardware, each HW queue was called a channel for instance.
> > > >
> > > > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > > > HW has more than
> > > > 1 queues it can be exposed as separate dmadev dev.
> > > >
> > >
> > > Fine for me.
> > >
> > > However, just to confirm that Morten's suggestion of using a
> > > (device-specific void *) channel pointer rather than dev_id + channel_id
> > > pair of parameters won't work for you? You can't store a pointer or dev
> > > index in the channel struct in the driver?
> >
> > Yes. That will work. To confirm, the suggestion is to use, void *
> > object instead of channel_id,
> > That will avoid one more indirection.(index -> pointer)
> >
>
> The proposal was to use it in place of "dev_id + channel_id", i.e.
>
> copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)
>
> Where the channel pointer implicitly indicates the device too. However, I
> realise now that this would be something completely transparent to the
> driver as it would all have to be implemented in the dmadev level, and
> lead to lots of duplication of function pointers, etc. Therefore, let's
> just go with original scheme. :-(

Yes. Just go with the original scheme.

>
> >
> > >
> > > >
>
> <snip>
>
> > > > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > > > for the pointer) and to avoid
> > > > function overhead, Can we use one bit of flags of op function to
> > > > enable the fence?
> > > >
> > >
> > > The original ioat implementation did exactly that. However, I then
> > > discovered that because a fence logically belongs between two operations,
> > > does the fence flag on an operation mean "don't do any jobs after this
> > > until this job has completed" or does it mean "don't start this job until
> > > all previous jobs have completed". [Or theoretically does it mean both :-)]
> > > Naturally, some hardware does it the former way (i.e. fence flag goes on
> > > last op before fence), while other hardware the latter way (i.e. fence flag
> > > goes on first op after the fence). Therefore, since fencing is about
> > > ordering *between* two (sets of) jobs, I decided that it should do exactly
> > > that and go between two jobs, so there is no ambiguity!
> > >
> > > However, I'm happy enough to switch to having a fence flag, but I think if
> > > we do that, it should be put in the "first job after fence" case, because
> > > it is always easier to modify a previously written job if we need to, than
> > > to save the flag for a future one.
> > >
> > > Alternatively, if we keep the fence as a separate function, I'm happy
> > > enough for it not to be on the same cacheline as the "hot" operations,
> > > since fencing will always introduce a small penalty anyway.
> >
> > Ack.
> > You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
> > there any use case for this or it makes sense for your HW)
> >
> >
> > For us, Fence is NOP for us as we have an implicit fence between each
> > HW job descriptor.
> >
>
> I actually still think that having a separate fence function in the "ops"
> section is the best approach here. It's unabiguous as to whether it's
> fence-before or fence-after, and if we have it in the ops, it doesn't use a
> "fast-path" slot.
>
> However, if we *really* want to use a flag instead, I don't see the value
> in having two flags, it will be really confusing.  Instead, if we do go
> with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
> that the fence occurs before the job in question.

IMO, We need to use flags and the name can be RTE_DMA_PRE_FENCE
due to overhead of the driver implementation where the fence request
can be NOP and
to save the first cache line occupancy.

>
> >
> > >
> > > > >
> > > > > >
> > > <snip>
> > > > > > Since we have additional function call overhead in all the
> > > > > > applications for this scheme, I would like to understand
> > > > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > > > driver/application PoV?
> > > > > >
> > > > >
> > > > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > > > function calls was noticably less than the cost of building up the
> > > > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > > > fully populated before the TX call anyway.]
> > > >
> > > > OK. I agree with stack population.
> > > >
> > > > My question was more on doing implicit doorbell update enq. Is doorbell write
> > > > costly in other HW compare to a function call? In our HW, it is just write of
> > > > the number of instructions written in a register.
> > > >
> > > > Also, we need to again access the internal PMD memory structure to find
> > > > where to write etc if it is a separate function.
> > > >
> > >
> > > The cost varies depending on a number of factors - even writing to a single
> > > HW register can be very slow if that register is mapped as device
> > > (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
> >
> > I don't know, At least in our case, writes are write-back. so core does not need
> > to wait.(If there is no read operation).
> >
> > > for the write to go all the way to hardware. For more modern HW, the cost
> > > can be lighter. However, any cost of HW writes is going to be the same
> > > whether its a separate function call or not.
> > >
> > > However, the main thing about the doorbell update is that it's a
> > > once-per-burst thing, rather than a once-per-job. Therefore, even if you
> > > have to re-read the struct memory (which is likely still somewhere in your
> > > cores' cache), any extra small cost of doing so is to be amortized over the
> > > cost of a whole burst of copies.
> >
> > Linux kernel has xmit_more flag in skb to address similar thing.
> > i.e enq job flag can have one more bit field to say update ring bell or not?
> > Rather having yet another function overhead.IMO, it is the best of both worlds.
> >
>
> It's just more conditionals and branches all through the code. Inside the
> user application, the user has to check whether to set the flag or not (or
> special-case the last transaction outside the loop), and within the driver,
> there has to be a branch whether or not to call the doorbell function. The
> code on both sides is far simpler and more readable if the doorbell
> function is exactly that - a separate function.

I disagree. The reason is:

We will have two classes of applications

a) do dma copy request as and when it has data(I think, this is the
prime use case), for those,
I think, it is considerable overhead to have two function invocation
per transfer i.e
rte_dma_copy() and rte_dma_perform()

b) do dma copy when the data is reached to a logical state,  like copy
IP frame from Ethernet packets or so,
In that case, the application will have  a LOGIC to detect when to
perform it so on the end of
that rte_dma_copy() flag can be updated to fire the doorbell.

IMO, We are comparing against a branch(flag is already in register) vs
a set of instructions for
1) function pointer overhead
2) Need to use the channel context again back in another function.

IMO, a single branch is most optimal from performance PoV.


>
> >
> > >
> > > >
> > > > >
> > > > > >
> > > <snip>
> > > > > > > + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> > > > > > > without prior notice.  + * + * Returns the number of operations
> > > > > > > that failed to complete.  + * NOTE: This API was used when
> > > > > > > rte_dmadev_completed has_error was set.  + * + * @param dev_id
> > > > > > > + *   The identifier of the device.  + * @param vq_id + *   The
> > > > > > > identifier of virt queue.
> > > > > > (> + * @param nb_status
> > > > > > > + *   Indicates the size  of status array.  + * @param[out]
> > > > > > > status + *   The error code of operations that failed to
> > > > > > > complete.  + * @param[out] cookie + *   The last failed
> > > > > > > completed operation's cookie.  + * + * @return + *   The number
> > > > > > > of operations that failed to complete.  + * + * NOTE: The
> > > > > > > caller must ensure that the input parameter is valid and the +
> > > > > > > *       corresponding device supports the operation.  + */
> > > > > > > +__rte_experimental +static inline uint16_t
> > > > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> > > > > > > const uint16_t nb_status, uint32_t *status, +
> > > > > > > dma_cookie_t *cookie)
> > > > > >
> > > > > > IMO, it is better to move cookie/rind_idx at 3.  Why it would
> > > > > > return any array of errors? since it called after
> > > > > > rte_dmadev_completed() has has_error. Is it better to change
> > > > > >
> > > > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> > > > > > dma_cookie_t *cookie,  uint32_t *status)
> > > > > >
> > > > > > I also think, we may need to set status as bitmask and enumerate
> > > > > > all the combination of error codes of all the driver and return
> > > > > > string from driver existing rte_flow_error
> > > > > >
> > > > > > See struct rte_flow_error { enum rte_flow_error_type type; /**<
> > > > > > Cause field and error types. */ const void *cause; /**< Object
> > > > > > responsible for the error. */ const char *message; /**<
> > > > > > Human-readable error message. */ };
> > > > > >
> > > > >
> > > > > I think we need a multi-return value API here, as we may add
> > > > > operations in future which have non-error status values to return.
> > > > > The obvious case is DMA engines which support "compare" operations.
> > > > > In that case a successful compare (as in there were no DMA or HW
> > > > > errors) can return "equal" or "not-equal" as statuses. For general
> > > > > "copy" operations, the faster completion op can be used to just
> > > > > return successful values (and only call this status version on
> > > > > error), while apps using those compare ops or a mixture of copy and
> > > > > compare ops, would always use the slower one that returns status
> > > > > values for each and every op..
> > > > >
> > > > > The ioat APIs used 32-bit integer values for this status array so
> > > > > as to allow e.g. 16-bits for error code and 16-bits for future
> > > > > status values. For most operations there should be a fairly small
> > > > > set of things that can go wrong, i.e. bad source address, bad
> > > > > destination address or invalid length.  Within that we may have a
> > > > > couple of specifics for why an address is bad, but even so I don't
> > > > > think we need to start having multiple bit combinations.
> > > >
> > > > OK. What is the purpose of errors status? Is it for application
> > > > printing it or Does the application need to take any action based on
> > > > specific error requests?
> > >
> > > It's largely for information purposes, but in the case of SVA/SVM
> > > errors could occur due to the memory not being pinned, i.e. a page
> > > fault, in some cases. If that happens, then it's up the app to either
> > > touch the memory and retry the copy, or to do a SW memcpy as a
> > > fallback.
> > >
> > > In other error cases, I think it's good to tell the application if it's
> > > passing around bad data, or data that is beyond the scope of hardware,
> > > e.g.  a copy that is beyond what can be done in a single transaction
> > > for a HW instance. Given that there are always things that can go
> > > wrong, I think we need some error reporting mechanism.
> > >
> > > > If the former is scope, then we need to define the standard enum
> > > > value for the error right?  ie. uint32_t *status needs to change to
> > > > enum rte_dma_error or so.
> > > >
> > > Sure. Perhaps an error/status structure either is an option, where we
> > > explicitly call out error info from status info.
> >
> > Agree. Better to have a structure with filed like,
> >
> > 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> > fine aspects of error.  LIke address caused issue etc.(Which will be
> > driver-specific information).
> >
> The only issue I have with that is that once we have driver specific
> information it is of little use to the application, since it can't know
> anything about it excepy maybe log it.  I'd much rather have a set of error
> codes telling user that "source address is wrong", "dest address is wrong",
> and a generic "an address is wrong" in case driver/HW cannot distinguish
> source of error. Can we see how far we get with just error codes before we
> start into passing string messages around and all the memory management
> headaches that implies.

Works for me. It should be "enum rte_dma_error_type" then, which has a standard
error type. Which is missing in the spec now.

>
> /Bruce
Bruce Richardson July 7, 2021, 11:01 a.m. UTC | #23
On Wed, Jul 07, 2021 at 04:04:16PM +0530, Jerin Jacob wrote:
> On Wed, Jul 7, 2021 at 2:05 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
> > > On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
> > > <bruce.richardson@intel.com> wrote:
> > > >
> > > > On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
> > > > >
> > > > > On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
> > > > > <bruce.richardson@intel.com> wrote:
> > > > > >
> > > > > > On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
> > > > > > > On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
> > > > > > > >
> > > > > > > > This patch introduces 'dmadevice' which is a generic type of DMA
> > > > > > > > device.
> > > > <snip>
> > > > > >
> > > > > > +1 and the terminology with regards to queues and channels. With our ioat
> > > > > > hardware, each HW queue was called a channel for instance.
> > > > >
> > > > > Looks like <dmadev> <> <channel> can cover all the use cases, if the
> > > > > HW has more than
> > > > > 1 queues it can be exposed as separate dmadev dev.
> > > > >
> > > >
> > > > Fine for me.
> > > >
> > > > However, just to confirm that Morten's suggestion of using a
> > > > (device-specific void *) channel pointer rather than dev_id + channel_id
> > > > pair of parameters won't work for you? You can't store a pointer or dev
> > > > index in the channel struct in the driver?
> > >
> > > Yes. That will work. To confirm, the suggestion is to use, void *
> > > object instead of channel_id,
> > > That will avoid one more indirection.(index -> pointer)
> > >
> >
> > The proposal was to use it in place of "dev_id + channel_id", i.e.
> >
> > copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)
> >
> > Where the channel pointer implicitly indicates the device too. However, I
> > realise now that this would be something completely transparent to the
> > driver as it would all have to be implemented in the dmadev level, and
> > lead to lots of duplication of function pointers, etc. Therefore, let's
> > just go with original scheme. :-(
> 
> Yes. Just go with the original scheme.
>
+1

> >
> > >
> > > >
> > > > >
> >
> > <snip>
> >
> > > > > Got it. In order to save space if first CL size for fastpath(Saving 8B
> > > > > for the pointer) and to avoid
> > > > > function overhead, Can we use one bit of flags of op function to
> > > > > enable the fence?
> > > > >
> > > >
> > > > The original ioat implementation did exactly that. However, I then
> > > > discovered that because a fence logically belongs between two operations,
> > > > does the fence flag on an operation mean "don't do any jobs after this
> > > > until this job has completed" or does it mean "don't start this job until
> > > > all previous jobs have completed". [Or theoretically does it mean both :-)]
> > > > Naturally, some hardware does it the former way (i.e. fence flag goes on
> > > > last op before fence), while other hardware the latter way (i.e. fence flag
> > > > goes on first op after the fence). Therefore, since fencing is about
> > > > ordering *between* two (sets of) jobs, I decided that it should do exactly
> > > > that and go between two jobs, so there is no ambiguity!
> > > >
> > > > However, I'm happy enough to switch to having a fence flag, but I think if
> > > > we do that, it should be put in the "first job after fence" case, because
> > > > it is always easier to modify a previously written job if we need to, than
> > > > to save the flag for a future one.
> > > >
> > > > Alternatively, if we keep the fence as a separate function, I'm happy
> > > > enough for it not to be on the same cacheline as the "hot" operations,
> > > > since fencing will always introduce a small penalty anyway.
> > >
> > > Ack.
> > > You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
> > > there any use case for this or it makes sense for your HW)
> > >
> > >
> > > For us, Fence is NOP for us as we have an implicit fence between each
> > > HW job descriptor.
> > >
> >
> > I actually still think that having a separate fence function in the "ops"
> > section is the best approach here. It's unabiguous as to whether it's
> > fence-before or fence-after, and if we have it in the ops, it doesn't use a
> > "fast-path" slot.
> >
> > However, if we *really* want to use a flag instead, I don't see the value
> > in having two flags, it will be really confusing.  Instead, if we do go
> > with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
> > that the fence occurs before the job in question.
> 
> IMO, We need to use flags and the name can be RTE_DMA_PRE_FENCE
> due to overhead of the driver implementation where the fence request
> can be NOP and
> to save the first cache line occupancy.
> 
> >
> > >
> > > >
> > > > > >
> > > > > > >
> > > > <snip>
> > > > > > > Since we have additional function call overhead in all the
> > > > > > > applications for this scheme, I would like to understand
> > > > > > > the use of doing this way vs enq does the doorbell implicitly from
> > > > > > > driver/application PoV?
> > > > > > >
> > > > > >
> > > > > > In our benchmarks it's just faster. When we tested it, the overhead of the
> > > > > > function calls was noticably less than the cost of building up the
> > > > > > parameter array(s) for passing the jobs in as a burst. [We don't see this
> > > > > > cost with things like NIC I/O since DPDK tends to already have the mbuf
> > > > > > fully populated before the TX call anyway.]
> > > > >
> > > > > OK. I agree with stack population.
> > > > >
> > > > > My question was more on doing implicit doorbell update enq. Is doorbell write
> > > > > costly in other HW compare to a function call? In our HW, it is just write of
> > > > > the number of instructions written in a register.
> > > > >
> > > > > Also, we need to again access the internal PMD memory structure to find
> > > > > where to write etc if it is a separate function.
> > > > >
> > > >
> > > > The cost varies depending on a number of factors - even writing to a single
> > > > HW register can be very slow if that register is mapped as device
> > > > (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
> > >
> > > I don't know, At least in our case, writes are write-back. so core does not need
> > > to wait.(If there is no read operation).
> > >
> > > > for the write to go all the way to hardware. For more modern HW, the cost
> > > > can be lighter. However, any cost of HW writes is going to be the same
> > > > whether its a separate function call or not.
> > > >
> > > > However, the main thing about the doorbell update is that it's a
> > > > once-per-burst thing, rather than a once-per-job. Therefore, even if you
> > > > have to re-read the struct memory (which is likely still somewhere in your
> > > > cores' cache), any extra small cost of doing so is to be amortized over the
> > > > cost of a whole burst of copies.
> > >
> > > Linux kernel has xmit_more flag in skb to address similar thing.
> > > i.e enq job flag can have one more bit field to say update ring bell or not?
> > > Rather having yet another function overhead.IMO, it is the best of both worlds.
> > >
> >
> > It's just more conditionals and branches all through the code. Inside the
> > user application, the user has to check whether to set the flag or not (or
> > special-case the last transaction outside the loop), and within the driver,
> > there has to be a branch whether or not to call the doorbell function. The
> > code on both sides is far simpler and more readable if the doorbell
> > function is exactly that - a separate function.
> 
> I disagree. The reason is:
> 
> We will have two classes of applications
> 
> a) do dma copy request as and when it has data(I think, this is the
> prime use case), for those,
> I think, it is considerable overhead to have two function invocation
> per transfer i.e
> rte_dma_copy() and rte_dma_perform()
> 
> b) do dma copy when the data is reached to a logical state,  like copy
> IP frame from Ethernet packets or so,
> In that case, the application will have  a LOGIC to detect when to
> perform it so on the end of
> that rte_dma_copy() flag can be updated to fire the doorbell.
> 
> IMO, We are comparing against a branch(flag is already in register) vs
> a set of instructions for
> 1) function pointer overhead
> 2) Need to use the channel context again back in another function.
> 
> IMO, a single branch is most optimal from performance PoV.
> 
Ok, let's try it and see how it goes.

> 
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > <snip>
> > > > > > > > + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> > > > > > > > without prior notice.  + * + * Returns the number of operations
> > > > > > > > that failed to complete.  + * NOTE: This API was used when
> > > > > > > > rte_dmadev_completed has_error was set.  + * + * @param dev_id
> > > > > > > > + *   The identifier of the device.  + * @param vq_id + *   The
> > > > > > > > identifier of virt queue.
> > > > > > > (> + * @param nb_status
> > > > > > > > + *   Indicates the size  of status array.  + * @param[out]
> > > > > > > > status + *   The error code of operations that failed to
> > > > > > > > complete.  + * @param[out] cookie + *   The last failed
> > > > > > > > completed operation's cookie.  + * + * @return + *   The number
> > > > > > > > of operations that failed to complete.  + * + * NOTE: The
> > > > > > > > caller must ensure that the input parameter is valid and the +
> > > > > > > > *       corresponding device supports the operation.  + */
> > > > > > > > +__rte_experimental +static inline uint16_t
> > > > > > > > +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> > > > > > > > const uint16_t nb_status, uint32_t *status, +
> > > > > > > > dma_cookie_t *cookie)
> > > > > > >
> > > > > > > IMO, it is better to move cookie/rind_idx at 3.  Why it would
> > > > > > > return any array of errors? since it called after
> > > > > > > rte_dmadev_completed() has has_error. Is it better to change
> > > > > > >
> > > > > > > rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> > > > > > > dma_cookie_t *cookie,  uint32_t *status)
> > > > > > >
> > > > > > > I also think, we may need to set status as bitmask and enumerate
> > > > > > > all the combination of error codes of all the driver and return
> > > > > > > string from driver existing rte_flow_error
> > > > > > >
> > > > > > > See struct rte_flow_error { enum rte_flow_error_type type; /**<
> > > > > > > Cause field and error types. */ const void *cause; /**< Object
> > > > > > > responsible for the error. */ const char *message; /**<
> > > > > > > Human-readable error message. */ };
> > > > > > >
> > > > > >
> > > > > > I think we need a multi-return value API here, as we may add
> > > > > > operations in future which have non-error status values to return.
> > > > > > The obvious case is DMA engines which support "compare" operations.
> > > > > > In that case a successful compare (as in there were no DMA or HW
> > > > > > errors) can return "equal" or "not-equal" as statuses. For general
> > > > > > "copy" operations, the faster completion op can be used to just
> > > > > > return successful values (and only call this status version on
> > > > > > error), while apps using those compare ops or a mixture of copy and
> > > > > > compare ops, would always use the slower one that returns status
> > > > > > values for each and every op..
> > > > > >
> > > > > > The ioat APIs used 32-bit integer values for this status array so
> > > > > > as to allow e.g. 16-bits for error code and 16-bits for future
> > > > > > status values. For most operations there should be a fairly small
> > > > > > set of things that can go wrong, i.e. bad source address, bad
> > > > > > destination address or invalid length.  Within that we may have a
> > > > > > couple of specifics for why an address is bad, but even so I don't
> > > > > > think we need to start having multiple bit combinations.
> > > > >
> > > > > OK. What is the purpose of errors status? Is it for application
> > > > > printing it or Does the application need to take any action based on
> > > > > specific error requests?
> > > >
> > > > It's largely for information purposes, but in the case of SVA/SVM
> > > > errors could occur due to the memory not being pinned, i.e. a page
> > > > fault, in some cases. If that happens, then it's up the app to either
> > > > touch the memory and retry the copy, or to do a SW memcpy as a
> > > > fallback.
> > > >
> > > > In other error cases, I think it's good to tell the application if it's
> > > > passing around bad data, or data that is beyond the scope of hardware,
> > > > e.g.  a copy that is beyond what can be done in a single transaction
> > > > for a HW instance. Given that there are always things that can go
> > > > wrong, I think we need some error reporting mechanism.
> > > >
> > > > > If the former is scope, then we need to define the standard enum
> > > > > value for the error right?  ie. uint32_t *status needs to change to
> > > > > enum rte_dma_error or so.
> > > > >
> > > > Sure. Perhaps an error/status structure either is an option, where we
> > > > explicitly call out error info from status info.
> > >
> > > Agree. Better to have a structure with filed like,
> > >
> > > 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> > > fine aspects of error.  LIke address caused issue etc.(Which will be
> > > driver-specific information).
> > >
> > The only issue I have with that is that once we have driver specific
> > information it is of little use to the application, since it can't know
> > anything about it excepy maybe log it.  I'd much rather have a set of error
> > codes telling user that "source address is wrong", "dest address is wrong",
> > and a generic "an address is wrong" in case driver/HW cannot distinguish
> > source of error. Can we see how far we get with just error codes before we
> > start into passing string messages around and all the memory management
> > headaches that implies.
> 
> Works for me. It should be "enum rte_dma_error_type" then, which has a standard
> error type. Which is missing in the spec now.
> 
+1
Chengwen Feng July 8, 2021, 3:11 a.m. UTC | #24
On 2021/7/7 19:01, Bruce Richardson wrote:
> On Wed, Jul 07, 2021 at 04:04:16PM +0530, Jerin Jacob wrote:
>> On Wed, Jul 7, 2021 at 2:05 PM Bruce Richardson
>> <bruce.richardson@intel.com> wrote:
>>>
>>> On Wed, Jul 07, 2021 at 01:38:58PM +0530, Jerin Jacob wrote:
>>>> On Mon, Jul 5, 2021 at 10:46 PM Bruce Richardson
>>>> <bruce.richardson@intel.com> wrote:
>>>>>
>>>>> On Mon, Jul 05, 2021 at 09:25:34PM +0530, Jerin Jacob wrote:
>>>>>>
>>>>>> On Mon, Jul 5, 2021 at 4:22 PM Bruce Richardson
>>>>>> <bruce.richardson@intel.com> wrote:
>>>>>>>
>>>>>>> On Sun, Jul 04, 2021 at 03:00:30PM +0530, Jerin Jacob wrote:
>>>>>>>> On Fri, Jul 2, 2021 at 6:51 PM Chengwen Feng <fengchengwen@huawei.com> wrote:
>>>>>>>>>
>>>>>>>>> This patch introduces 'dmadevice' which is a generic type of DMA
>>>>>>>>> device.
>>>>> <snip>
>>>>>>>
>>>>>>> +1 and the terminology with regards to queues and channels. With our ioat
>>>>>>> hardware, each HW queue was called a channel for instance.
>>>>>>
>>>>>> Looks like <dmadev> <> <channel> can cover all the use cases, if the
>>>>>> HW has more than
>>>>>> 1 queues it can be exposed as separate dmadev dev.
>>>>>>
>>>>>
>>>>> Fine for me.
>>>>>
>>>>> However, just to confirm that Morten's suggestion of using a
>>>>> (device-specific void *) channel pointer rather than dev_id + channel_id
>>>>> pair of parameters won't work for you? You can't store a pointer or dev
>>>>> index in the channel struct in the driver?
>>>>
>>>> Yes. That will work. To confirm, the suggestion is to use, void *
>>>> object instead of channel_id,
>>>> That will avoid one more indirection.(index -> pointer)
>>>>
>>>
>>> The proposal was to use it in place of "dev_id + channel_id", i.e.
>>>
>>> copy(dev_id, ch_id, src, dst, len, flags) --> copy(ch_ptr, src, dst, len, flags)
>>>
>>> Where the channel pointer implicitly indicates the device too. However, I
>>> realise now that this would be something completely transparent to the
>>> driver as it would all have to be implemented in the dmadev level, and
>>> lead to lots of duplication of function pointers, etc. Therefore, let's
>>> just go with original scheme. :-(
>>
>> Yes. Just go with the original scheme.
>>
> +1
> 
>>>
>>>>
>>>>>
>>>>>>
>>>
>>> <snip>
>>>
>>>>>> Got it. In order to save space if first CL size for fastpath(Saving 8B
>>>>>> for the pointer) and to avoid
>>>>>> function overhead, Can we use one bit of flags of op function to
>>>>>> enable the fence?
>>>>>>
>>>>>
>>>>> The original ioat implementation did exactly that. However, I then
>>>>> discovered that because a fence logically belongs between two operations,
>>>>> does the fence flag on an operation mean "don't do any jobs after this
>>>>> until this job has completed" or does it mean "don't start this job until
>>>>> all previous jobs have completed". [Or theoretically does it mean both :-)]
>>>>> Naturally, some hardware does it the former way (i.e. fence flag goes on
>>>>> last op before fence), while other hardware the latter way (i.e. fence flag
>>>>> goes on first op after the fence). Therefore, since fencing is about
>>>>> ordering *between* two (sets of) jobs, I decided that it should do exactly
>>>>> that and go between two jobs, so there is no ambiguity!
>>>>>
>>>>> However, I'm happy enough to switch to having a fence flag, but I think if
>>>>> we do that, it should be put in the "first job after fence" case, because
>>>>> it is always easier to modify a previously written job if we need to, than
>>>>> to save the flag for a future one.
>>>>>
>>>>> Alternatively, if we keep the fence as a separate function, I'm happy
>>>>> enough for it not to be on the same cacheline as the "hot" operations,
>>>>> since fencing will always introduce a small penalty anyway.
>>>>
>>>> Ack.
>>>> You may consider two flags, FENCE_THEN_JOB and JOB_THEN_FENCE( If
>>>> there any use case for this or it makes sense for your HW)
>>>>
>>>>
>>>> For us, Fence is NOP for us as we have an implicit fence between each
>>>> HW job descriptor.
>>>>
>>>
>>> I actually still think that having a separate fence function in the "ops"
>>> section is the best approach here. It's unabiguous as to whether it's
>>> fence-before or fence-after, and if we have it in the ops, it doesn't use a
>>> "fast-path" slot.
>>>
>>> However, if we *really* want to use a flag instead, I don't see the value
>>> in having two flags, it will be really confusing.  Instead, if we do go
>>> with a flag, I think "RTE_DMA_PRE_FENCE" should be the name, indicating
>>> that the fence occurs before the job in question.
>>
>> IMO, We need to use flags and the name can be RTE_DMA_PRE_FENCE
>> due to overhead of the driver implementation where the fence request
>> can be NOP and
>> to save the first cache line occupancy.
>>
>>>
>>>>
>>>>>
>>>>>>>
>>>>>>>>
>>>>> <snip>
>>>>>>>> Since we have additional function call overhead in all the
>>>>>>>> applications for this scheme, I would like to understand
>>>>>>>> the use of doing this way vs enq does the doorbell implicitly from
>>>>>>>> driver/application PoV?
>>>>>>>>
>>>>>>>
>>>>>>> In our benchmarks it's just faster. When we tested it, the overhead of the
>>>>>>> function calls was noticably less than the cost of building up the
>>>>>>> parameter array(s) for passing the jobs in as a burst. [We don't see this
>>>>>>> cost with things like NIC I/O since DPDK tends to already have the mbuf
>>>>>>> fully populated before the TX call anyway.]
>>>>>>
>>>>>> OK. I agree with stack population.
>>>>>>
>>>>>> My question was more on doing implicit doorbell update enq. Is doorbell write
>>>>>> costly in other HW compare to a function call? In our HW, it is just write of
>>>>>> the number of instructions written in a register.
>>>>>>
>>>>>> Also, we need to again access the internal PMD memory structure to find
>>>>>> where to write etc if it is a separate function.
>>>>>>
>>>>>
>>>>> The cost varies depending on a number of factors - even writing to a single
>>>>> HW register can be very slow if that register is mapped as device
>>>>> (uncacheable) memory, since (AFAIK) it will act as a full fence and wait
>>>>
>>>> I don't know, At least in our case, writes are write-back. so core does not need
>>>> to wait.(If there is no read operation).
>>>>
>>>>> for the write to go all the way to hardware. For more modern HW, the cost
>>>>> can be lighter. However, any cost of HW writes is going to be the same
>>>>> whether its a separate function call or not.
>>>>>
>>>>> However, the main thing about the doorbell update is that it's a
>>>>> once-per-burst thing, rather than a once-per-job. Therefore, even if you
>>>>> have to re-read the struct memory (which is likely still somewhere in your
>>>>> cores' cache), any extra small cost of doing so is to be amortized over the
>>>>> cost of a whole burst of copies.
>>>>
>>>> Linux kernel has xmit_more flag in skb to address similar thing.
>>>> i.e enq job flag can have one more bit field to say update ring bell or not?
>>>> Rather having yet another function overhead.IMO, it is the best of both worlds.
>>>>
>>>
>>> It's just more conditionals and branches all through the code. Inside the
>>> user application, the user has to check whether to set the flag or not (or
>>> special-case the last transaction outside the loop), and within the driver,
>>> there has to be a branch whether or not to call the doorbell function. The
>>> code on both sides is far simpler and more readable if the doorbell
>>> function is exactly that - a separate function.
>>
>> I disagree. The reason is:
>>
>> We will have two classes of applications
>>
>> a) do dma copy request as and when it has data(I think, this is the
>> prime use case), for those,
>> I think, it is considerable overhead to have two function invocation
>> per transfer i.e
>> rte_dma_copy() and rte_dma_perform()
>>
>> b) do dma copy when the data is reached to a logical state,  like copy
>> IP frame from Ethernet packets or so,
>> In that case, the application will have  a LOGIC to detect when to
>> perform it so on the end of
>> that rte_dma_copy() flag can be updated to fire the doorbell.
>>
>> IMO, We are comparing against a branch(flag is already in register) vs
>> a set of instructions for
>> 1) function pointer overhead
>> 2) Need to use the channel context again back in another function.
>>
>> IMO, a single branch is most optimal from performance PoV.
>>
> Ok, let's try it and see how it goes.

Test result show:
1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
2) For Xeon E5-2690 v2 (X86) could benefit with separate function
3) Both platform could benefit with doorbell in flags if burst < 5

There is a performance gain in small bursts (<5). Given the extensive use of bursts
in DPDK applications and users are accustomed to the concept, I do not recommend
using the 'doorbell' in flags.
And also user may confuse about the doorbell operations.

Kunpeng platform test result:
    [root@SZ tmp]# ./a1 1
    burst = 1
    perform_after_multiple_enqueue: burst:1 cost:0s.554422us
    doorbell_for_every_enqueue: burst:1 cost:0s.450927us
    last_enqueue_issue_doorbell: burst:1 cost:0s.450479us
    [root@SZ tmp]#
    [root@SZ tmp]# ./a1 2
    burst = 2
    perform_after_multiple_enqueue: burst:2 cost:0s.900884us
    doorbell_for_every_enqueue: burst:2 cost:0s.866732us
    last_enqueue_issue_doorbell: burst:2 cost:0s.732469us
    [root@SZ tmp]# ./a1 5
    burst = 5
    perform_after_multiple_enqueue: burst:5 cost:1s.732410us
    doorbell_for_every_enqueue: burst:5 cost:2s.115479us
    last_enqueue_issue_doorbell: burst:5 cost:1s.759349us
    [root@SZ tmp]# ./a1 10
    burst = 10
    perform_after_multiple_enqueue: burst:10 cost:3s.490716us
    doorbell_for_every_enqueue: burst:10 cost:4s.194691us
    last_enqueue_issue_doorbell: burst:10 cost:3s.331825us
    [root@SZ tmp]# ./a1 30
    burst = 30
    perform_after_multiple_enqueue: burst:30 cost:9s.61761us
    doorbell_for_every_enqueue: burst:30 cost:12s.517082us
    last_enqueue_issue_doorbell: burst:30 cost:9s.614802us

X86 platform test result:
    fengchengwen@SZ:~/tmp$ ./a1 1
    burst = 1
    perform_after_multiple_enqueue: burst:1 cost:0s.406331us
    doorbell_for_every_enqueue: burst:1 cost:0s.331109us
    last_enqueue_issue_doorbell: burst:1 cost:0s.381782us
    fengchengwen@SZ:~/tmp$ ./a1 2
    burst = 2
    perform_after_multiple_enqueue: burst:2 cost:0s.569024us
    doorbell_for_every_enqueue: burst:2 cost:0s.643449us
    last_enqueue_issue_doorbell: burst:2 cost:0s.486639us
    fengchengwen@SZ:~/tmp$ ./a1 5
    burst = 5
    perform_after_multiple_enqueue: burst:5 cost:1s.166384us
    doorbell_for_every_enqueue: burst:5 cost:1s.602369us
    last_enqueue_issue_doorbell: burst:5 cost:1s.209392us
    fengchengwen@SZ:~/tmp$ ./a1 10
    burst = 10
    perform_after_multiple_enqueue: burst:10 cost:2s.229901us
    doorbell_for_every_enqueue: burst:10 cost:3s.754802us
    last_enqueue_issue_doorbell: burst:10 cost:2s.328705us
    fengchengwen@SZ:~/tmp$
    fengchengwen@SZ:~/tmp$ ./a1 30
    burst = 30
    perform_after_multiple_enqueue: burst:30 cost:6s.132817us
    doorbell_for_every_enqueue: burst:30 cost:9s.944619us
    last_enqueue_issue_doorbell: burst:30 cost:7s.73551us


test-code:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>
#include <sys/time.h>

struct dmadev;

unsigned int dev_reg[10240];
volatile unsigned int *ring;
volatile unsigned int *doorbell;

void init_global(void)
{
        ring = &dev_reg[100];
        doorbell = &dev_reg[10000];
}

#define rte_wmb() asm volatile("dmb oshst" : : : "memory")
//#define rte_wmb() asm volatile ("" : : : "memory")

typedef int (*enqueue_t)(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags);
typedef void (*perform_t)(struct dmadev *dev, int vchan);

struct dmadev {
        enqueue_t enqueue;
        perform_t perform;
        char rsv[512];
};

int hisi_dma_enqueue(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
{
        *ring = 1;
}

int hisi_dma_enqueue_doorbell(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
{
        *ring = 1;
        if (flags == 1) {
                rte_wmb();
                *doorbell = 1;
        }
}

void hisi_dma_perform(struct dmadev *dev, int vchan)
{
        rte_wmb();
        *doorbell = 1;
}

struct dmadev devlist[64];

void init_devlist(bool enq_doorbell)
{
        int i;
        for (i = 0; i < 64; i++) {
                devlist[i].enqueue = enq_doorbell ? hisi_dma_enqueue_doorbell : hisi_dma_enqueue;
                devlist[i].perform = hisi_dma_perform;
        }
}

static inline int dma_enqueue(int dev_id, int vchan, void *src, void *dst, int len, int flags)
{
        struct dmadev *dev = &devlist[dev_id];
        return dev->enqueue(dev, vchan, src, dst, len, flags);
}

static inline void dma_perform(int dev_id, int vchan)
{
        struct dmadev *dev = &devlist[dev_id];
        return dev->perform(dev, vchan);
}

#define MAX_LOOPS       90000000

void test_for_perform_after_multiple_enqueue(int burst)
{
        struct timeval start, end, delta;
        unsigned int i, j;
        init_devlist(false);
        gettimeofday(&start, NULL);
        for (i = 0; i < MAX_LOOPS; i++) {
                for (j = 0; j < burst; j++)
                        (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
                dma_perform(10, 0);
        }
        gettimeofday(&end, NULL);
        timersub(&end, &start, &delta);
        printf("perform_after_multiple_enqueue: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
}

void test_for_doorbell_for_every_enqueue(int burst)
{
        struct timeval start, end, delta;
        unsigned int i, j;
        init_devlist(true);
        gettimeofday(&start, NULL);
        for (i = 0; i < MAX_LOOPS; i++) {
                for (j = 0; j < burst; j++)
                        (void)dma_enqueue(10, 0, NULL, NULL, 0, 1);
        }
        gettimeofday(&end, NULL);
        timersub(&end, &start, &delta);
        printf("doorbell_for_every_enqueue: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
}

void test_for_last_enqueue_issue_doorbell(int burst)
{
        struct timeval start, end, delta;
        unsigned int i, j;
        init_devlist(true);
        gettimeofday(&start, NULL);
        for (i = 0; i < MAX_LOOPS; i++) {
                for (j = 0; j < burst - 1; j++)
                        (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
                dma_enqueue(10, 0, NULL, NULL, 0, 1);
        }
        gettimeofday(&end, NULL);
        timersub(&end, &start, &delta);
        printf("last_enqueue_issue_doorbell: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
}

void main(int argc, char *argv[])
{
        if (argc < 2) {
                printf("please input burst parameter!\n");
                return;
        }
        init_global();
        int burst = atol(argv[1]);
        printf("burst = %d \n", burst);
        test_for_perform_after_multiple_enqueue(burst);
        test_for_doorbell_for_every_enqueue(burst);
        test_for_last_enqueue_issue_doorbell(burst);
}

> 
>>
>>>
>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>>
>>>>> <snip>
>>>>>>>>> + +/** + * @warning + * @b EXPERIMENTAL: this API may change
>>>>>>>>> without prior notice.  + * + * Returns the number of operations
>>>>>>>>> that failed to complete.  + * NOTE: This API was used when
>>>>>>>>> rte_dmadev_completed has_error was set.  + * + * @param dev_id
>>>>>>>>> + *   The identifier of the device.  + * @param vq_id + *   The
>>>>>>>>> identifier of virt queue.
>>>>>>>> (> + * @param nb_status
>>>>>>>>> + *   Indicates the size  of status array.  + * @param[out]
>>>>>>>>> status + *   The error code of operations that failed to
>>>>>>>>> complete.  + * @param[out] cookie + *   The last failed
>>>>>>>>> completed operation's cookie.  + * + * @return + *   The number
>>>>>>>>> of operations that failed to complete.  + * + * NOTE: The
>>>>>>>>> caller must ensure that the input parameter is valid and the +
>>>>>>>>> *       corresponding device supports the operation.  + */
>>>>>>>>> +__rte_experimental +static inline uint16_t
>>>>>>>>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
>>>>>>>>> const uint16_t nb_status, uint32_t *status, +
>>>>>>>>> dma_cookie_t *cookie)
>>>>>>>>
>>>>>>>> IMO, it is better to move cookie/rind_idx at 3.  Why it would
>>>>>>>> return any array of errors? since it called after
>>>>>>>> rte_dmadev_completed() has has_error. Is it better to change
>>>>>>>>
>>>>>>>> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
>>>>>>>> dma_cookie_t *cookie,  uint32_t *status)
>>>>>>>>
>>>>>>>> I also think, we may need to set status as bitmask and enumerate
>>>>>>>> all the combination of error codes of all the driver and return
>>>>>>>> string from driver existing rte_flow_error
>>>>>>>>
>>>>>>>> See struct rte_flow_error { enum rte_flow_error_type type; /**<
>>>>>>>> Cause field and error types. */ const void *cause; /**< Object
>>>>>>>> responsible for the error. */ const char *message; /**<
>>>>>>>> Human-readable error message. */ };
>>>>>>>>
>>>>>>>
>>>>>>> I think we need a multi-return value API here, as we may add
>>>>>>> operations in future which have non-error status values to return.
>>>>>>> The obvious case is DMA engines which support "compare" operations.
>>>>>>> In that case a successful compare (as in there were no DMA or HW
>>>>>>> errors) can return "equal" or "not-equal" as statuses. For general
>>>>>>> "copy" operations, the faster completion op can be used to just
>>>>>>> return successful values (and only call this status version on
>>>>>>> error), while apps using those compare ops or a mixture of copy and
>>>>>>> compare ops, would always use the slower one that returns status
>>>>>>> values for each and every op..
>>>>>>>
>>>>>>> The ioat APIs used 32-bit integer values for this status array so
>>>>>>> as to allow e.g. 16-bits for error code and 16-bits for future
>>>>>>> status values. For most operations there should be a fairly small
>>>>>>> set of things that can go wrong, i.e. bad source address, bad
>>>>>>> destination address or invalid length.  Within that we may have a
>>>>>>> couple of specifics for why an address is bad, but even so I don't
>>>>>>> think we need to start having multiple bit combinations.
>>>>>>
>>>>>> OK. What is the purpose of errors status? Is it for application
>>>>>> printing it or Does the application need to take any action based on
>>>>>> specific error requests?
>>>>>
>>>>> It's largely for information purposes, but in the case of SVA/SVM
>>>>> errors could occur due to the memory not being pinned, i.e. a page
>>>>> fault, in some cases. If that happens, then it's up the app to either
>>>>> touch the memory and retry the copy, or to do a SW memcpy as a
>>>>> fallback.
>>>>>
>>>>> In other error cases, I think it's good to tell the application if it's
>>>>> passing around bad data, or data that is beyond the scope of hardware,
>>>>> e.g.  a copy that is beyond what can be done in a single transaction
>>>>> for a HW instance. Given that there are always things that can go
>>>>> wrong, I think we need some error reporting mechanism.
>>>>>
>>>>>> If the former is scope, then we need to define the standard enum
>>>>>> value for the error right?  ie. uint32_t *status needs to change to
>>>>>> enum rte_dma_error or so.
>>>>>>
>>>>> Sure. Perhaps an error/status structure either is an option, where we
>>>>> explicitly call out error info from status info.
>>>>
>>>> Agree. Better to have a structure with filed like,
>>>>
>>>> 1)  enum rte_dma_error_type 2)  memory to store, informative message on
>>>> fine aspects of error.  LIke address caused issue etc.(Which will be
>>>> driver-specific information).
>>>>
>>> The only issue I have with that is that once we have driver specific
>>> information it is of little use to the application, since it can't know
>>> anything about it excepy maybe log it.  I'd much rather have a set of error
>>> codes telling user that "source address is wrong", "dest address is wrong",
>>> and a generic "an address is wrong" in case driver/HW cannot distinguish
>>> source of error. Can we see how far we get with just error codes before we
>>> start into passing string messages around and all the memory management
>>> headaches that implies.
>>
>> Works for me. It should be "enum rte_dma_error_type" then, which has a standard
>> error type. Which is missing in the spec now.
>>
> +1
> .
>
Jerin Jacob July 8, 2021, 6:35 p.m. UTC | #25
On Thu, Jul 8, 2021 at 8:41 AM fengchengwen <fengchengwen@huawei.com> wrote:
>

> >>>
> >>> It's just more conditionals and branches all through the code. Inside the
> >>> user application, the user has to check whether to set the flag or not (or
> >>> special-case the last transaction outside the loop), and within the driver,
> >>> there has to be a branch whether or not to call the doorbell function. The
> >>> code on both sides is far simpler and more readable if the doorbell
> >>> function is exactly that - a separate function.
> >>
> >> I disagree. The reason is:
> >>
> >> We will have two classes of applications
> >>
> >> a) do dma copy request as and when it has data(I think, this is the
> >> prime use case), for those,
> >> I think, it is considerable overhead to have two function invocation
> >> per transfer i.e
> >> rte_dma_copy() and rte_dma_perform()
> >>
> >> b) do dma copy when the data is reached to a logical state,  like copy
> >> IP frame from Ethernet packets or so,
> >> In that case, the application will have  a LOGIC to detect when to
> >> perform it so on the end of
> >> that rte_dma_copy() flag can be updated to fire the doorbell.
> >>
> >> IMO, We are comparing against a branch(flag is already in register) vs
> >> a set of instructions for
> >> 1) function pointer overhead
> >> 2) Need to use the channel context again back in another function.
> >>
> >> IMO, a single branch is most optimal from performance PoV.
> >>
> > Ok, let's try it and see how it goes.
>
> Test result show:
> 1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
> 2) For Xeon E5-2690 v2 (X86) could benefit with separate function
> 3) Both platform could benefit with doorbell in flags if burst < 5
>
> There is a performance gain in small bursts (<5). Given the extensive use of bursts
 in DPDK applications and users are accustomed to the concept, I do
not recommend
> using the 'doorbell' in flags.

There is NO concept change between one option vs other option. Just
argument differnet.
Also, _perform() scheme not used anywhere in DPDK. I

Regarding performance, I have added dummy instructions to simulate the real work
load[1], now burst also has some gain in both x86 and arm64[3]

I have modified your application[2] to dpdk test application to use
cpu isolation etc.
So this is gain in flag scheme ad code is checked in to Github[2[

[1]
static inline void
delay(void)
{
        volatile int k;

        for (k = 0; k < 16; k++) {


      }

}

__rte_noinline
int
hisi_dma_enqueue(struct dmadev *dev, int vchan, void *src, void *dst,
int len, const int flags)
{
         delay();

        *ring = 1;

         return 0;
}

__rte_noinline
int
hisi_dma_enqueue_doorbell(struct dmadev *dev, int vchan, void *src,
void *dst, int len, const int flags)
{
        delay();

        *ring = 1;

        if (unlikely(flags == 1)) {
                rte_wmb();
                *doorbell = 1;
        }
      return 0;
}


[2]

https://github.com/jerinjacobk/dpdk-dmatest/commit/4fc9bc3029543bbc4caaa5183d98bac93c34f588

Update results [3]

Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz

echo "dma_perf_autotest" | ./build/app/test/dpdk-test --no-huge -c 0xf00

core=24 Timer running at 2600.00MHz
   test_for_perform_after_multiple_enqueue: burst=1 cycles=46.000000
      test_for_last_enqueue_issue_doorbell: burst=1 cycles=45.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=2 cycles=90.000000
      test_for_last_enqueue_issue_doorbell: burst=2 cycles=89.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=3 cycles=134.000000
      test_for_last_enqueue_issue_doorbell: burst=3 cycles=133.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=4 cycles=177.000000
      test_for_last_enqueue_issue_doorbell: burst=4 cycles=176.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=5 cycles=221.000000
      test_for_last_enqueue_issue_doorbell: burst=5 cycles=221.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=6 cycles=265.000000
      test_for_last_enqueue_issue_doorbell: burst=6 cycles=265.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=7 cycles=333.000000
      test_for_last_enqueue_issue_doorbell: burst=7 cycles=309.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=8 cycles=375.000000
      test_for_last_enqueue_issue_doorbell: burst=8 cycles=373.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=9 cycles=418.000000
      test_for_last_enqueue_issue_doorbell: burst=9 cycles=414.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=10 cycles=462.000000
      test_for_last_enqueue_issue_doorbell: burst=10 cycles=458.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=11 cycles=507.000000
      test_for_last_enqueue_issue_doorbell: burst=11 cycles=501.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=12 cycles=552.000000
      test_for_last_enqueue_issue_doorbell: burst=12 cycles=546.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=13 cycles=593.000000
      test_for_last_enqueue_issue_doorbell: burst=13 cycles=590.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=14 cycles=638.000000
      test_for_last_enqueue_issue_doorbell: burst=14 cycles=634.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=15 cycles=681.000000
      test_for_last_enqueue_issue_doorbell: burst=15 cycles=678.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=16 cycles=725.000000
      test_for_last_enqueue_issue_doorbell: burst=16 cycles=722.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=17 cycles=770.000000
      test_for_last_enqueue_issue_doorbell: burst=17 cycles=767.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=18 cycles=815.000000
      test_for_last_enqueue_issue_doorbell: burst=18 cycles=812.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=19 cycles=857.000000
      test_for_last_enqueue_issue_doorbell: burst=19 cycles=854.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=20 cycles=902.000000
      test_for_last_enqueue_issue_doorbell: burst=20 cycles=899.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=21 cycles=945.000000
      test_for_last_enqueue_issue_doorbell: burst=21 cycles=943.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=22 cycles=990.000000
      test_for_last_enqueue_issue_doorbell: burst=22 cycles=988.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=23 cycles=1033.000000
      test_for_last_enqueue_issue_doorbell: burst=23 cycles=1031.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=24 cycles=1077.000000
      test_for_last_enqueue_issue_doorbell: burst=24 cycles=1075.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=25 cycles=1121.000000
      test_for_last_enqueue_issue_doorbell: burst=25 cycles=1119.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=26 cycles=1166.000000
      test_for_last_enqueue_issue_doorbell: burst=26 cycles=1163.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=27 cycles=1208.000000
      test_for_last_enqueue_issue_doorbell: burst=27 cycles=1208.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=28 cycles=1252.000000
      test_for_last_enqueue_issue_doorbell: burst=28 cycles=1252.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=29 cycles=1295.000000
      test_for_last_enqueue_issue_doorbell: burst=29 cycles=1295.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=30 cycles=1342.000000
      test_for_last_enqueue_issue_doorbell: burst=30 cycles=1340.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=31 cycles=1386.000000
      test_for_last_enqueue_issue_doorbell: burst=31 cycles=1384.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=32 cycles=1429.000000
      test_for_last_enqueue_issue_doorbell: burst=32 cycles=1428.000000
-------------------------------------------------------------------------------



octeontx2:

See https://doc.dpdk.org/guides/prog_guide/profile_app.html section
62.2.3. High-resolution cycle counter


meson --cross config/arm/arm64_octeontx2_linux_gcc
-Dc_args='-DRTE_ARM_EAL_RDTSC_USE_PMU' build

 echo "dma_perf_autotest" | ./build/app/test/dpdk-test --no-huge -c 0xff0000
RTE>>dma_perf_autotest^M
lcore=16 Timer running at 2400.00MHz
   test_for_perform_after_multiple_enqueue: burst=1 cycles=105.000000
      test_for_last_enqueue_issue_doorbell: burst=1 cycles=105.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=2 cycles=207.000000
      test_for_last_enqueue_issue_doorbell: burst=2 cycles=207.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=3 cycles=309.000000
      test_for_last_enqueue_issue_doorbell: burst=3 cycles=310.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=4 cycles=411.000000
      test_for_last_enqueue_issue_doorbell: burst=4 cycles=410.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=5 cycles=513.000000
      test_for_last_enqueue_issue_doorbell: burst=5 cycles=512.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=6 cycles=615.000000
      test_for_last_enqueue_issue_doorbell: burst=6 cycles=615.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=7 cycles=717.000000
      test_for_last_enqueue_issue_doorbell: burst=7 cycles=716.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=8 cycles=819.000000
      test_for_last_enqueue_issue_doorbell: burst=8 cycles=818.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=9 cycles=921.000000
      test_for_last_enqueue_issue_doorbell: burst=9 cycles=922.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=10 cycles=1023.000000
      test_for_last_enqueue_issue_doorbell: burst=10 cycles=1022.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=11 cycles=1126.000000
      test_for_last_enqueue_issue_doorbell: burst=11 cycles=1124.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=12 cycles=1227.000000
      test_for_last_enqueue_issue_doorbell: burst=12 cycles=1227.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=13 cycles=1329.000000
      test_for_last_enqueue_issue_doorbell: burst=13 cycles=1328.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=14 cycles=1431.000000
      test_for_last_enqueue_issue_doorbell: burst=14 cycles=1430.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=15 cycles=1534.000000
      test_for_last_enqueue_issue_doorbell: burst=15 cycles=1534.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=16 cycles=1638.000000
      test_for_last_enqueue_issue_doorbell: burst=16 cycles=1640.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=17 cycles=1746.000000
      test_for_last_enqueue_issue_doorbell: burst=17 cycles=1739.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=18 cycles=1847.000000
      test_for_last_enqueue_issue_doorbell: burst=18 cycles=1841.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=19 cycles=1950.000000
      test_for_last_enqueue_issue_doorbell: burst=19 cycles=1944.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=20 cycles=2051.000000
       test_for_last_enqueue_issue_doorbell: burst=20 cycles=2045.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=21 cycles=2154.000000
      test_for_last_enqueue_issue_doorbell: burst=21 cycles=2148.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=22 cycles=2257.000000
      test_for_last_enqueue_issue_doorbell: burst=22 cycles=2249.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=23 cycles=2358.000000
      test_for_last_enqueue_issue_doorbell: burst=23 cycles=2352.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=24 cycles=2459.000000
      test_for_last_enqueue_issue_doorbell: burst=24 cycles=2454.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=25 cycles=2562.000000
      test_for_last_enqueue_issue_doorbell: burst=25 cycles=2555.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=26 cycles=2665.000000
      test_for_last_enqueue_issue_doorbell: burst=26 cycles=2657.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=27 cycles=2766.000000
      test_for_last_enqueue_issue_doorbell: burst=27 cycles=2760.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=28 cycles=2867.000000
      test_for_last_enqueue_issue_doorbell: burst=28 cycles=2861.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=29 cycles=2970.000000
      test_for_last_enqueue_issue_doorbell: burst=29 cycles=2964.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=30 cycles=3073.000000
      test_for_last_enqueue_issue_doorbell: burst=30 cycles=3065.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=31 cycles=3174.000000
      test_for_last_enqueue_issue_doorbell: burst=31 cycles=3168.000000
-------------------------------------------------------------------------------
   test_for_perform_after_multiple_enqueue: burst=32 cycles=3275.000000
      test_for_last_enqueue_issue_doorbell: burst=32 cycles=3269.000000
-------------------------------------------------------------------------------
Test OK
RTE>




> And also user may confuse about the doorbell operations.
>
> Kunpeng platform test result:
>     [root@SZ tmp]# ./a1 1
>     burst = 1
>     perform_after_multiple_enqueue: burst:1 cost:0s.554422us
>     doorbell_for_every_enqueue: burst:1 cost:0s.450927us
>     last_enqueue_issue_doorbell: burst:1 cost:0s.450479us
>     [root@SZ tmp]#
>     [root@SZ tmp]# ./a1 2
>     burst = 2
>     perform_after_multiple_enqueue: burst:2 cost:0s.900884us
>     doorbell_for_every_enqueue: burst:2 cost:0s.866732us
>     last_enqueue_issue_doorbell: burst:2 cost:0s.732469us
>     [root@SZ tmp]# ./a1 5
>     burst = 5
>     perform_after_multiple_enqueue: burst:5 cost:1s.732410us
>     doorbell_for_every_enqueue: burst:5 cost:2s.115479us
>     last_enqueue_issue_doorbell: burst:5 cost:1s.759349us
>     [root@SZ tmp]# ./a1 10
>     burst = 10
>     perform_after_multiple_enqueue: burst:10 cost:3s.490716us
>     doorbell_for_every_enqueue: burst:10 cost:4s.194691us
>     last_enqueue_issue_doorbell: burst:10 cost:3s.331825us
>     [root@SZ tmp]# ./a1 30
>     burst = 30
>     perform_after_multiple_enqueue: burst:30 cost:9s.61761us
>     doorbell_for_every_enqueue: burst:30 cost:12s.517082us
>     last_enqueue_issue_doorbell: burst:30 cost:9s.614802us
>
> X86 platform test result:
>     fengchengwen@SZ:~/tmp$ ./a1 1
>     burst = 1
>     perform_after_multiple_enqueue: burst:1 cost:0s.406331us
>     doorbell_for_every_enqueue: burst:1 cost:0s.331109us
>     last_enqueue_issue_doorbell: burst:1 cost:0s.381782us
>     fengchengwen@SZ:~/tmp$ ./a1 2
>     burst = 2
>     perform_after_multiple_enqueue: burst:2 cost:0s.569024us
>     doorbell_for_every_enqueue: burst:2 cost:0s.643449us
>     last_enqueue_issue_doorbell: burst:2 cost:0s.486639us
>     fengchengwen@SZ:~/tmp$ ./a1 5
>     burst = 5
>     perform_after_multiple_enqueue: burst:5 cost:1s.166384us
>     doorbell_for_every_enqueue: burst:5 cost:1s.602369us
>     last_enqueue_issue_doorbell: burst:5 cost:1s.209392us
>     fengchengwen@SZ:~/tmp$ ./a1 10
>     burst = 10
>     perform_after_multiple_enqueue: burst:10 cost:2s.229901us
>     doorbell_for_every_enqueue: burst:10 cost:3s.754802us
>     last_enqueue_issue_doorbell: burst:10 cost:2s.328705us
>     fengchengwen@SZ:~/tmp$
>     fengchengwen@SZ:~/tmp$ ./a1 30
>     burst = 30
>     perform_after_multiple_enqueue: burst:30 cost:6s.132817us
>     doorbell_for_every_enqueue: burst:30 cost:9s.944619us
>     last_enqueue_issue_doorbell: burst:30 cost:7s.73551us
>
>
> test-code:
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <stdbool.h>
> #include <time.h>
> #include <sys/time.h>
>
> struct dmadev;
>
> unsigned int dev_reg[10240];
> volatile unsigned int *ring;
> volatile unsigned int *doorbell;
>
> void init_global(void)
> {
>         ring = &dev_reg[100];
>         doorbell = &dev_reg[10000];
> }
>
> #define rte_wmb() asm volatile("dmb oshst" : : : "memory")
> //#define rte_wmb() asm volatile ("" : : : "memory")
>
> typedef int (*enqueue_t)(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags);
> typedef void (*perform_t)(struct dmadev *dev, int vchan);
>
> struct dmadev {
>         enqueue_t enqueue;
>         perform_t perform;
>         char rsv[512];
> };
>
> int hisi_dma_enqueue(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
> {
>         *ring = 1;
> }
>
> int hisi_dma_enqueue_doorbell(struct dmadev *dev, int vchan, void *src, void *dst, int len, int flags)
> {
>         *ring = 1;
>         if (flags == 1) {
>                 rte_wmb();
>                 *doorbell = 1;
>         }
> }
>
> void hisi_dma_perform(struct dmadev *dev, int vchan)
> {
>         rte_wmb();
>         *doorbell = 1;
> }
>
> struct dmadev devlist[64];
>
> void init_devlist(bool enq_doorbell)
> {
>         int i;
>         for (i = 0; i < 64; i++) {
>                 devlist[i].enqueue = enq_doorbell ? hisi_dma_enqueue_doorbell : hisi_dma_enqueue;
>                 devlist[i].perform = hisi_dma_perform;
>         }
> }
>
> static inline int dma_enqueue(int dev_id, int vchan, void *src, void *dst, int len, int flags)
> {
>         struct dmadev *dev = &devlist[dev_id];
>         return dev->enqueue(dev, vchan, src, dst, len, flags);
> }
>
> static inline void dma_perform(int dev_id, int vchan)
> {
>         struct dmadev *dev = &devlist[dev_id];
>         return dev->perform(dev, vchan);
> }
>
> #define MAX_LOOPS       90000000
>
> void test_for_perform_after_multiple_enqueue(int burst)
> {
>         struct timeval start, end, delta;
>         unsigned int i, j;
>         init_devlist(false);
>         gettimeofday(&start, NULL);
>         for (i = 0; i < MAX_LOOPS; i++) {
>                 for (j = 0; j < burst; j++)
>                         (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
>                 dma_perform(10, 0);
>         }
>         gettimeofday(&end, NULL);
>         timersub(&end, &start, &delta);
>         printf("perform_after_multiple_enqueue: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
> }
>
> void test_for_doorbell_for_every_enqueue(int burst)
> {
>         struct timeval start, end, delta;
>         unsigned int i, j;
>         init_devlist(true);
>         gettimeofday(&start, NULL);
>         for (i = 0; i < MAX_LOOPS; i++) {
>                 for (j = 0; j < burst; j++)
>                         (void)dma_enqueue(10, 0, NULL, NULL, 0, 1);
>         }
>         gettimeofday(&end, NULL);
>         timersub(&end, &start, &delta);
>         printf("doorbell_for_every_enqueue: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
> }
>
> void test_for_last_enqueue_issue_doorbell(int burst)
> {
>         struct timeval start, end, delta;
>         unsigned int i, j;
>         init_devlist(true);
>         gettimeofday(&start, NULL);
>         for (i = 0; i < MAX_LOOPS; i++) {
>                 for (j = 0; j < burst - 1; j++)
>                         (void)dma_enqueue(10, 0, NULL, NULL, 0, 0);
>                 dma_enqueue(10, 0, NULL, NULL, 0, 1);
>         }
>         gettimeofday(&end, NULL);
>         timersub(&end, &start, &delta);
>         printf("last_enqueue_issue_doorbell: burst:%d cost:%us.%uus \n", burst, delta.tv_sec, delta.tv_usec);
> }
>
> void main(int argc, char *argv[])
> {
>         if (argc < 2) {
>                 printf("please input burst parameter!\n");
>                 return;
>         }
>         init_global();
>         int burst = atol(argv[1]);
>         printf("burst = %d \n", burst);
>         test_for_perform_after_multiple_enqueue(burst);
>         test_for_doorbell_for_every_enqueue(burst);
>         test_for_last_enqueue_issue_doorbell(burst);
> }
>
> >
> >>
> >>>
> >>>>
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>>
> >>>>> <snip>
> >>>>>>>>> + +/** + * @warning + * @b EXPERIMENTAL: this API may change
> >>>>>>>>> without prior notice.  + * + * Returns the number of operations
> >>>>>>>>> that failed to complete.  + * NOTE: This API was used when
> >>>>>>>>> rte_dmadev_completed has_error was set.  + * + * @param dev_id
> >>>>>>>>> + *   The identifier of the device.  + * @param vq_id + *   The
> >>>>>>>>> identifier of virt queue.
> >>>>>>>> (> + * @param nb_status
> >>>>>>>>> + *   Indicates the size  of status array.  + * @param[out]
> >>>>>>>>> status + *   The error code of operations that failed to
> >>>>>>>>> complete.  + * @param[out] cookie + *   The last failed
> >>>>>>>>> completed operation's cookie.  + * + * @return + *   The number
> >>>>>>>>> of operations that failed to complete.  + * + * NOTE: The
> >>>>>>>>> caller must ensure that the input parameter is valid and the +
> >>>>>>>>> *       corresponding device supports the operation.  + */
> >>>>>>>>> +__rte_experimental +static inline uint16_t
> >>>>>>>>> +rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id, +
> >>>>>>>>> const uint16_t nb_status, uint32_t *status, +
> >>>>>>>>> dma_cookie_t *cookie)
> >>>>>>>>
> >>>>>>>> IMO, it is better to move cookie/rind_idx at 3.  Why it would
> >>>>>>>> return any array of errors? since it called after
> >>>>>>>> rte_dmadev_completed() has has_error. Is it better to change
> >>>>>>>>
> >>>>>>>> rte_dmadev_error_status((uint16_t dev_id, uint16_t vq_id,
> >>>>>>>> dma_cookie_t *cookie,  uint32_t *status)
> >>>>>>>>
> >>>>>>>> I also think, we may need to set status as bitmask and enumerate
> >>>>>>>> all the combination of error codes of all the driver and return
> >>>>>>>> string from driver existing rte_flow_error
> >>>>>>>>
> >>>>>>>> See struct rte_flow_error { enum rte_flow_error_type type; /**<
> >>>>>>>> Cause field and error types. */ const void *cause; /**< Object
> >>>>>>>> responsible for the error. */ const char *message; /**<
> >>>>>>>> Human-readable error message. */ };
> >>>>>>>>
> >>>>>>>
> >>>>>>> I think we need a multi-return value API here, as we may add
> >>>>>>> operations in future which have non-error status values to return.
> >>>>>>> The obvious case is DMA engines which support "compare" operations.
> >>>>>>> In that case a successful compare (as in there were no DMA or HW
> >>>>>>> errors) can return "equal" or "not-equal" as statuses. For general
> >>>>>>> "copy" operations, the faster completion op can be used to just
> >>>>>>> return successful values (and only call this status version on
> >>>>>>> error), while apps using those compare ops or a mixture of copy and
> >>>>>>> compare ops, would always use the slower one that returns status
> >>>>>>> values for each and every op..
> >>>>>>>
> >>>>>>> The ioat APIs used 32-bit integer values for this status array so
> >>>>>>> as to allow e.g. 16-bits for error code and 16-bits for future
> >>>>>>> status values. For most operations there should be a fairly small
> >>>>>>> set of things that can go wrong, i.e. bad source address, bad
> >>>>>>> destination address or invalid length.  Within that we may have a
> >>>>>>> couple of specifics for why an address is bad, but even so I don't
> >>>>>>> think we need to start having multiple bit combinations.
> >>>>>>
> >>>>>> OK. What is the purpose of errors status? Is it for application
> >>>>>> printing it or Does the application need to take any action based on
> >>>>>> specific error requests?
> >>>>>
> >>>>> It's largely for information purposes, but in the case of SVA/SVM
> >>>>> errors could occur due to the memory not being pinned, i.e. a page
> >>>>> fault, in some cases. If that happens, then it's up the app to either
> >>>>> touch the memory and retry the copy, or to do a SW memcpy as a
> >>>>> fallback.
> >>>>>
> >>>>> In other error cases, I think it's good to tell the application if it's
> >>>>> passing around bad data, or data that is beyond the scope of hardware,
> >>>>> e.g.  a copy that is beyond what can be done in a single transaction
> >>>>> for a HW instance. Given that there are always things that can go
> >>>>> wrong, I think we need some error reporting mechanism.
> >>>>>
> >>>>>> If the former is scope, then we need to define the standard enum
> >>>>>> value for the error right?  ie. uint32_t *status needs to change to
> >>>>>> enum rte_dma_error or so.
> >>>>>>
> >>>>> Sure. Perhaps an error/status structure either is an option, where we
> >>>>> explicitly call out error info from status info.
> >>>>
> >>>> Agree. Better to have a structure with filed like,
> >>>>
> >>>> 1)  enum rte_dma_error_type 2)  memory to store, informative message on
> >>>> fine aspects of error.  LIke address caused issue etc.(Which will be
> >>>> driver-specific information).
> >>>>
> >>> The only issue I have with that is that once we have driver specific
> >>> information it is of little use to the application, since it can't know
> >>> anything about it excepy maybe log it.  I'd much rather have a set of error
> >>> codes telling user that "source address is wrong", "dest address is wrong",
> >>> and a generic "an address is wrong" in case driver/HW cannot distinguish
> >>> source of error. Can we see how far we get with just error codes before we
> >>> start into passing string messages around and all the memory management
> >>> headaches that implies.
> >>
> >> Works for me. It should be "enum rte_dma_error_type" then, which has a standard
> >> error type. Which is missing in the spec now.
> >>
> > +1
> > .
> >
Bruce Richardson July 9, 2021, 9:14 a.m. UTC | #26
On Fri, Jul 09, 2021 at 12:05:40AM +0530, Jerin Jacob wrote:
> On Thu, Jul 8, 2021 at 8:41 AM fengchengwen <fengchengwen@huawei.com> wrote:
> >
> 
> > >>>
> > >>> It's just more conditionals and branches all through the code. Inside the
> > >>> user application, the user has to check whether to set the flag or not (or
> > >>> special-case the last transaction outside the loop), and within the driver,
> > >>> there has to be a branch whether or not to call the doorbell function. The
> > >>> code on both sides is far simpler and more readable if the doorbell
> > >>> function is exactly that - a separate function.
> > >>
> > >> I disagree. The reason is:
> > >>
> > >> We will have two classes of applications
> > >>
> > >> a) do dma copy request as and when it has data(I think, this is the
> > >> prime use case), for those,
> > >> I think, it is considerable overhead to have two function invocation
> > >> per transfer i.e
> > >> rte_dma_copy() and rte_dma_perform()
> > >>
> > >> b) do dma copy when the data is reached to a logical state,  like copy
> > >> IP frame from Ethernet packets or so,
> > >> In that case, the application will have  a LOGIC to detect when to
> > >> perform it so on the end of
> > >> that rte_dma_copy() flag can be updated to fire the doorbell.
> > >>
> > >> IMO, We are comparing against a branch(flag is already in register) vs
> > >> a set of instructions for
> > >> 1) function pointer overhead
> > >> 2) Need to use the channel context again back in another function.
> > >>
> > >> IMO, a single branch is most optimal from performance PoV.
> > >>
> > > Ok, let's try it and see how it goes.
> >
> > Test result show:
> > 1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
> > 2) For Xeon E5-2690 v2 (X86) could benefit with separate function
> > 3) Both platform could benefit with doorbell in flags if burst < 5
> >
> > There is a performance gain in small bursts (<5). Given the extensive use of bursts
>  in DPDK applications and users are accustomed to the concept, I do
> not recommend
> > using the 'doorbell' in flags.
> 
> There is NO concept change between one option vs other option. Just
> argument differnet.
> Also, _perform() scheme not used anywhere in DPDK. I
> 
> Regarding performance, I have added dummy instructions to simulate the real work
> load[1], now burst also has some gain in both x86 and arm64[3]
> 
> I have modified your application[2] to dpdk test application to use
> cpu isolation etc.
> So this is gain in flag scheme ad code is checked in to Github[2[
> 
<snip>

The benchmark numbers all seem very close between the two schemes. On my
team we pretty much have test ioat & idxd drivers ported internally to the
last dmadev draft library, and have sample apps handling traffic using
those. I'll therefore attempt to get these numbers with real traffic on
real drivers to just double check that it's the same as these
microbenchmarks.

Assuming that perf is the same, how to resolve this? Some thoughts:
* As I understand it, the main objection to the separate doorbell function
  is the use of 8-bytes in fastpath slot. Therefore I will also attempt to
  benchmark having the doorbell function not on the same cacheline and check
  perf impact, if any.
* If we don't have a impact to perf by having the doorbell function inside
  the regular "ops" rather than on fastpath cacheline, there is no reason
  we can't implement both schemes. The user can then choose themselves
  whether to doorbell using a flag on last item, or to doorbell explicitly
  using function call.

Of the two schemes, and assuming they are equal, I do have a preference for
the separate function one, primarily from a code readability point of view.
Other than that, I have no strong opinions.

/Bruce
Jerin Jacob July 11, 2021, 7:14 a.m. UTC | #27
On Fri, Jul 9, 2021 at 2:44 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Fri, Jul 09, 2021 at 12:05:40AM +0530, Jerin Jacob wrote:
> > On Thu, Jul 8, 2021 at 8:41 AM fengchengwen <fengchengwen@huawei.com> wrote:
> > >
> >
> > > >>>
> > > >>> It's just more conditionals and branches all through the code. Inside the
> > > >>> user application, the user has to check whether to set the flag or not (or
> > > >>> special-case the last transaction outside the loop), and within the driver,
> > > >>> there has to be a branch whether or not to call the doorbell function. The
> > > >>> code on both sides is far simpler and more readable if the doorbell
> > > >>> function is exactly that - a separate function.
> > > >>
> > > >> I disagree. The reason is:
> > > >>
> > > >> We will have two classes of applications
> > > >>
> > > >> a) do dma copy request as and when it has data(I think, this is the
> > > >> prime use case), for those,
> > > >> I think, it is considerable overhead to have two function invocation
> > > >> per transfer i.e
> > > >> rte_dma_copy() and rte_dma_perform()
> > > >>
> > > >> b) do dma copy when the data is reached to a logical state,  like copy
> > > >> IP frame from Ethernet packets or so,
> > > >> In that case, the application will have  a LOGIC to detect when to
> > > >> perform it so on the end of
> > > >> that rte_dma_copy() flag can be updated to fire the doorbell.
> > > >>
> > > >> IMO, We are comparing against a branch(flag is already in register) vs
> > > >> a set of instructions for
> > > >> 1) function pointer overhead
> > > >> 2) Need to use the channel context again back in another function.
> > > >>
> > > >> IMO, a single branch is most optimal from performance PoV.
> > > >>
> > > > Ok, let's try it and see how it goes.
> > >
> > > Test result show:
> > > 1) For Kunpeng platform (ARMv8) could benefit very little with doorbell in flags
> > > 2) For Xeon E5-2690 v2 (X86) could benefit with separate function
> > > 3) Both platform could benefit with doorbell in flags if burst < 5
> > >
> > > There is a performance gain in small bursts (<5). Given the extensive use of bursts
> >  in DPDK applications and users are accustomed to the concept, I do
> > not recommend
> > > using the 'doorbell' in flags.
> >
> > There is NO concept change between one option vs other option. Just
> > argument differnet.
> > Also, _perform() scheme not used anywhere in DPDK. I
> >
> > Regarding performance, I have added dummy instructions to simulate the real work
> > load[1], now burst also has some gain in both x86 and arm64[3]
> >
> > I have modified your application[2] to dpdk test application to use
> > cpu isolation etc.
> > So this is gain in flag scheme ad code is checked in to Github[2[
> >
> <snip>
>
> The benchmark numbers all seem very close between the two schemes. On my
> team we pretty much have test ioat & idxd drivers ported internally to the
> last dmadev draft library, and have sample apps handling traffic using
> those. I'll therefore attempt to get these numbers with real traffic on
> real drivers to just double check that it's the same as these
> microbenchmarks.

Thanks.

>
> Assuming that perf is the same, how to resolve this? Some thoughts:
> * As I understand it, the main objection to the separate doorbell function
>   is the use of 8-bytes in fastpath slot. Therefore I will also attempt to
>   benchmark having the doorbell function not on the same cacheline and check
>   perf impact, if any.

Probably we can remove rte_dmadev_fill_sg() variant and keep sg only for copy
to save 8B.

> * If we don't have a impact to perf by having the doorbell function inside
>   the regular "ops" rather than on fastpath cacheline, there is no reason
>   we can't implement both schemes. The user can then choose themselves
>   whether to doorbell using a flag on last item, or to doorbell explicitly
>   using function call.

Yes. I think, we can keep both.

>
> Of the two schemes, and assuming they are equal, I do have a preference for
> the separate function one, primarily from a code readability point of view.
> Other than that, I have no strong opinions.
>
> /Bruce
Morten Brørup July 12, 2021, 7:01 a.m. UTC | #28
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jerin Jacob

> Probably we can remove rte_dmadev_fill_sg() variant and keep sg only
> for copy to save 8B.

Perhaps the scatter/gather functions can be on a separate cache line, following the cache line with the simple functions?
Of course, this is only beneficial if the SG functions are not used with simple functions.
This means that we reserve space for 8 simple functions and 8 SG functions.

And if one or two functions are used with both simple and SG functions, it/they can be present in both cache lines. (This is somewhat dirty, but would be a typical implementation for a DPDK flow data structure.)

-Morten
Jerin Jacob July 12, 2021, 7:59 a.m. UTC | #29
On Mon, Jul 12, 2021 at 12:31 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jerin Jacob
>
> > Probably we can remove rte_dmadev_fill_sg() variant and keep sg only
> > for copy to save 8B.
>
> Perhaps the scatter/gather functions can be on a separate cache line, following the cache line with the simple functions?
> Of course, this is only beneficial if the SG functions are not used with simple functions.
> This means that we reserve space for 8 simple functions and 8 SG functions.

Currently, there are only two SG and normal ops functions. IMO, should
be to keep all fastpath functions in the same CL irrespective of it is
simple or not.
My suggestion was more like, if there is no HW support for
rte_dmadev_fill_sg() we can remove it from spec.

>
> And if one or two functions are used with both simple and SG functions, it/they can be present in both cache lines. (This is somewhat dirty, but would be a typical implementation for a DPDK flow data structure.)
>
> -Morten
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 4347555..2019783 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -496,6 +496,10 @@  F: drivers/raw/skeleton/
 F: app/test/test_rawdev.c
 F: doc/guides/prog_guide/rawdev.rst
 
+Dma device API
+M: Chengwen Feng <fengchengwen@huawei.com>
+F: lib/dmadev/
+
 
 Memory Pool Drivers
 -------------------
diff --git a/config/rte_config.h b/config/rte_config.h
index 590903c..331a431 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -81,6 +81,9 @@ 
 /* rawdev defines */
 #define RTE_RAWDEV_MAX_DEVS 64
 
+/* dmadev defines */
+#define RTE_DMADEV_MAX_DEVS 64
+
 /* ip_fragmentation defines */
 #define RTE_LIBRTE_IP_FRAG_MAX_FRAG 4
 #undef RTE_LIBRTE_IP_FRAG_TBL_STAT
diff --git a/lib/dmadev/meson.build b/lib/dmadev/meson.build
new file mode 100644
index 0000000..c918dae
--- /dev/null
+++ b/lib/dmadev/meson.build
@@ -0,0 +1,6 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited.
+
+sources = files('rte_dmadev.c')
+headers = files('rte_dmadev.h', 'rte_dmadev_pmd.h')
+indirect_headers += files('rte_dmadev_core.h')
diff --git a/lib/dmadev/rte_dmadev.c b/lib/dmadev/rte_dmadev.c
new file mode 100644
index 0000000..a94e839
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.c
@@ -0,0 +1,438 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_dev.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_string_fns.h>
+
+#include "rte_dmadev.h"
+#include "rte_dmadev_pmd.h"
+
+struct rte_dmadev rte_dmadevices[RTE_DMADEV_MAX_DEVS];
+
+uint16_t
+rte_dmadev_count(void)
+{
+	uint16_t count = 0;
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (rte_dmadevices[i].attached)
+			count++;
+	}
+
+	return count;
+}
+
+int
+rte_dmadev_get_dev_id(const char *name)
+{
+	uint16_t i;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++)
+		if ((strcmp(rte_dmadevices[i].name, name) == 0) &&
+		    (rte_dmadevices[i].attached == RTE_DMADEV_ATTACHED))
+			return i;
+
+	return -ENODEV;
+}
+
+int
+rte_dmadev_socket_id(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	dev = &rte_dmadevices[dev_id];
+
+	return dev->socket_id;
+}
+
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(dev_info, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
+
+	memset(dev_info, 0, sizeof(struct rte_dmadev_info));
+	diag = (*dev->dev_ops->dev_info_get)(dev, dev_info);
+	if (diag != 0)
+		return diag;
+
+	dev_info->device = dev->device;
+	dev_info->driver_name = dev->driver_name;
+	dev_info->socket_id = dev->socket_id;
+
+	return 0;
+}
+
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(dev_conf, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
+
+	if (dev->started) {
+		RTE_DMADEV_LOG(ERR,
+		   "device %u must be stopped to allow configuration", dev_id);
+		return -EBUSY;
+	}
+
+	diag = (*dev->dev_ops->dev_configure)(dev, dev_conf);
+	if (diag != 0)
+		RTE_DMADEV_LOG(ERR, "device %u dev_configure failed, ret = %d",
+			       dev_id, diag);
+	else
+		dev->attached = 1;
+
+	return diag;
+}
+
+int
+rte_dmadev_start(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+	if (dev->started != 0) {
+		RTE_DMADEV_LOG(ERR, "device %u already started", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_start == NULL)
+		goto mark_started;
+
+	diag = (*dev->dev_ops->dev_start)(dev);
+	if (diag != 0)
+		return diag;
+
+mark_started:
+	dev->started = 1;
+	return 0;
+}
+
+int
+rte_dmadev_stop(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+	int diag;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev->started == 0) {
+		RTE_DMADEV_LOG(ERR, "device %u already stopped", dev_id);
+		return 0;
+	}
+
+	if (dev->dev_ops->dev_stop == NULL)
+		goto mark_stopped;
+
+	diag = (*dev->dev_ops->dev_stop)(dev);
+	if (diag != 0)
+		return diag;
+
+mark_stopped:
+	dev->started = 0;
+	return 0;
+}
+
+int
+rte_dmadev_close(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
+
+	/* Device must be stopped before it can be closed */
+	if (dev->started == 1) {
+		RTE_DMADEV_LOG(ERR, "device %u must be stopped before closing",
+			       dev_id);
+		return -EBUSY;
+	}
+
+	return (*dev->dev_ops->dev_close)(dev);
+}
+
+int
+rte_dmadev_reset(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_reset, -ENOTSUP);
+
+	/* Reset is not dependent on state of the device */
+	return (*dev->dev_ops->dev_reset)(dev);
+}
+
+int
+rte_dmadev_queue_setup(uint16_t dev_id,
+		       const struct rte_dmadev_queue_conf *conf)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(conf, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_setup, -ENOTSUP);
+
+	return (*dev->dev_ops->queue_setup)(dev, conf);
+}
+
+int
+rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_release, -ENOTSUP);
+
+	return (*dev->dev_ops->queue_release)(dev, vq_id);
+}
+
+int
+rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
+			  struct rte_dmadev_queue_info *info)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(info, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->queue_info_get, -ENOTSUP);
+
+	memset(info, 0, sizeof(struct rte_dmadev_queue_info));
+	return (*dev->dev_ops->queue_info_get)(dev, vq_id, info);
+}
+
+int
+rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
+		     struct rte_dmadev_stats *stats)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(stats, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
+
+	return (*dev->dev_ops->stats_get)(dev, vq_id, stats);
+}
+
+int
+rte_dmadev_stats_reset(uint16_t dev_id, int vq_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
+
+	return (*dev->dev_ops->stats_reset)(dev, vq_id);
+}
+
+static int
+xstats_get_count(uint16_t dev_id)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
+
+	return (*dev->dev_ops->xstats_get_names)(dev, NULL, 0);
+}
+
+int
+rte_dmadev_xstats_names_get(uint16_t dev_id,
+			    struct rte_dmadev_xstats_name *xstats_names,
+			    uint32_t size)
+{
+	struct rte_dmadev *dev;
+	int cnt_expected_entries;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	cnt_expected_entries = xstats_get_count(dev_id);
+
+	if (xstats_names == NULL || cnt_expected_entries < 0 ||
+	    (int)size < cnt_expected_entries || size == 0)
+		return cnt_expected_entries;
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get_names, -ENOTSUP);
+	return (*dev->dev_ops->xstats_get_names)(dev, xstats_names, size);
+}
+
+int
+rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
+		      uint64_t values[], uint32_t n)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(ids, -EINVAL);
+	RTE_FUNC_PTR_OR_ERR_RET(values, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_get, -ENOTSUP);
+
+	return (*dev->dev_ops->xstats_get)(dev, ids, values, n);
+}
+
+int
+rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->xstats_reset, -ENOTSUP);
+
+	return (*dev->dev_ops->xstats_reset)(dev, ids, nb_ids);
+}
+
+int
+rte_dmadev_selftest(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, -EINVAL);
+
+	dev = &rte_dmadevices[dev_id];
+
+	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_selftest, -ENOTSUP);
+
+	return (*dev->dev_ops->dev_selftest)(dev_id);
+}
+
+static inline uint16_t
+rte_dmadev_find_free_device_index(void)
+{
+	uint16_t i;
+
+	for (i = 0; i < RTE_DMADEV_MAX_DEVS; i++) {
+		if (rte_dmadevices[i].attached == RTE_DMADEV_DETACHED)
+			return i;
+	}
+
+	return RTE_DMADEV_MAX_DEVS;
+}
+
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name, size_t dev_priv_size, int socket_id)
+{
+	struct rte_dmadev *dev;
+	uint16_t dev_id;
+
+	if (rte_dmadev_get_dev_id(name) >= 0) {
+		RTE_DMADEV_LOG(ERR,
+			"device with name %s already allocated!", name);
+		return NULL;
+	}
+
+	dev_id = rte_dmadev_find_free_device_index();
+	if (dev_id == RTE_DMADEV_MAX_DEVS) {
+		RTE_DMADEV_LOG(ERR, "reached maximum number of DMA devices");
+		return NULL;
+	}
+
+	dev = &rte_dmadevices[dev_id];
+
+	if (dev_priv_size > 0) {
+		dev->dev_private = rte_zmalloc_socket("dmadev private",
+				     dev_priv_size,
+				     RTE_CACHE_LINE_SIZE,
+				     socket_id);
+		if (dev->dev_private == NULL) {
+			RTE_DMADEV_LOG(ERR,
+				"unable to allocate memory for dmadev");
+			return NULL;
+		}
+	}
+
+	dev->dev_id = dev_id;
+	dev->socket_id = socket_id;
+	dev->started = 0;
+	strlcpy(dev->name, name, RTE_DMADEV_NAME_MAX_LEN);
+
+	dev->attached = RTE_DMADEV_ATTACHED;
+
+	return dev;
+}
+
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev)
+{
+	int ret;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	ret = rte_dmadev_close(dev->dev_id);
+	if (ret != 0)
+		return ret;
+
+	if (dev->dev_private != NULL)
+		rte_free(dev->dev_private);
+
+	memset(dev, 0, sizeof(struct rte_dmadev));
+	dev->attached = RTE_DMADEV_DETACHED;
+
+	return 0;
+}
+
+RTE_LOG_REGISTER(libdmadev_logtype, lib.dmadev, INFO);
diff --git a/lib/dmadev/rte_dmadev.h b/lib/dmadev/rte_dmadev.h
new file mode 100644
index 0000000..f74fc6a
--- /dev/null
+++ b/lib/dmadev/rte_dmadev.h
@@ -0,0 +1,919 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_H_
+#define _RTE_DMADEV_H_
+
+/**
+ * @file rte_dmadev.h
+ *
+ * RTE DMA (Direct Memory Access) device APIs.
+ *
+ * The generic DMA device diagram:
+ *
+ *            ------------     ------------
+ *            | HW-queue |     | HW-queue |
+ *            ------------     ------------
+ *                   \            /
+ *                    \          /
+ *                     \        /
+ *                  ----------------
+ *                  |dma-controller|
+ *                  ----------------
+ *
+ *   The DMA could have multiple HW-queues, each HW-queue could have multiple
+ *   capabilities, e.g. whether to support fill operation, supported DMA
+ *   transfter direction and etc.
+ *
+ * The DMA framework is built on the following abstraction model:
+ *
+ *     ------------    ------------
+ *     |virt-queue|    |virt-queue|
+ *     ------------    ------------
+ *            \           /
+ *             \         /
+ *              \       /
+ *            ------------     ------------
+ *            | HW-queue |     | HW-queue |
+ *            ------------     ------------
+ *                   \            /
+ *                    \          /
+ *                     \        /
+ *                     ----------
+ *                     | dmadev |
+ *                     ----------
+ *
+ *   a) The DMA operation request must be submitted to the virt queue, virt
+ *      queues must be created based on HW queues, the DMA device could have
+ *      multiple HW queues.
+ *   b) The virt queues on the same HW-queue could represent different contexts,
+ *      e.g. user could create virt-queue-0 on HW-queue-0 for mem-to-mem
+ *      transfer scenario, and create virt-queue-1 on the same HW-queue for
+ *      mem-to-dev transfer scenario.
+ *   NOTE: user could also create multiple virt queues for mem-to-mem transfer
+ *         scenario as long as the corresponding driver supports.
+ *
+ * The control plane APIs include configure/queue_setup/queue_release/start/
+ * stop/reset/close, in order to start device work, the call sequence must be
+ * as follows:
+ *     - rte_dmadev_configure()
+ *     - rte_dmadev_queue_setup()
+ *     - rte_dmadev_start()
+ *
+ * The dataplane APIs include two parts:
+ *   a) The first part is the submission of operation requests:
+ *        - rte_dmadev_copy()
+ *        - rte_dmadev_copy_sg() - scatter-gather form of copy
+ *        - rte_dmadev_fill()
+ *        - rte_dmadev_fill_sg() - scatter-gather form of fill
+ *        - rte_dmadev_fence()   - add a fence force ordering between operations
+ *        - rte_dmadev_perform() - issue doorbell to hardware
+ *      These APIs could work with different virt queues which have different
+ *      contexts.
+ *      The first four APIs are used to submit the operation request to the virt
+ *      queue, if the submission is successful, a cookie (as type
+ *      'dma_cookie_t') is returned, otherwise a negative number is returned.
+ *   b) The second part is to obtain the result of requests:
+ *        - rte_dmadev_completed()
+ *            - return the number of operation requests completed successfully.
+ *        - rte_dmadev_completed_fails()
+ *            - return the number of operation requests failed to complete.
+ *
+ * The misc APIs include info_get/queue_info_get/stats/xstats/selftest, provide
+ * information query and self-test capabilities.
+ *
+ * About the dataplane APIs MT-safe, there are two dimensions:
+ *   a) For one virt queue, the submit/completion API could be MT-safe,
+ *      e.g. one thread do submit operation, another thread do completion
+ *      operation.
+ *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_VQ.
+ *      If driver don't support it, it's up to the application to guarantee
+ *      MT-safe.
+ *   b) For multiple virt queues on the same HW queue, e.g. one thread do
+ *      operation on virt-queue-0, another thread do operation on virt-queue-1.
+ *      If driver support it, then declare RTE_DMA_DEV_CAPA_MT_MVQ.
+ *      If driver don't support it, it's up to the application to guarantee
+ *      MT-safe.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+#include <rte_compat.h>
+
+/**
+ * dma_cookie_t - an opaque DMA cookie
+ *
+ * If dma_cookie_t is >=0 it's a DMA operation request cookie, <0 it's a error
+ * code.
+ * When using cookies, comply with the following rules:
+ * a) Cookies for each virtual queue are independent.
+ * b) For a virt queue, the cookie are monotonically incremented, when it reach
+ *    the INT_MAX, it wraps back to zero.
+ * c) The initial cookie of a virt queue is zero, after the device is stopped or
+ *    reset, the virt queue's cookie needs to be reset to zero.
+ * Example:
+ *    step-1: start one dmadev
+ *    step-2: enqueue a copy operation, the cookie return is 0
+ *    step-3: enqueue a copy operation again, the cookie return is 1
+ *    ...
+ *    step-101: stop the dmadev
+ *    step-102: start the dmadev
+ *    step-103: enqueue a copy operation, the cookie return is 0
+ *    ...
+ */
+typedef int32_t dma_cookie_t;
+
+/**
+ * dma_scatterlist - can hold scatter DMA operation request
+ */
+struct dma_scatterlist {
+	void *src;
+	void *dst;
+	uint32_t length;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the total number of DMA devices that have been successfully
+ * initialised.
+ *
+ * @return
+ *   The total number of usable DMA devices.
+ */
+__rte_experimental
+uint16_t
+rte_dmadev_count(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the device identifier for the named DMA device.
+ *
+ * @param name
+ *   DMA device name to select the DMA device identifier.
+ *
+ * @return
+ *   Returns DMA device identifier on success.
+ *   - <0: Failure to find named DMA device.
+ */
+__rte_experimental
+int
+rte_dmadev_get_dev_id(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Return the NUMA socket to which a device is connected.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   The NUMA socket id to which the device is connected or
+ *   a default of zero if the socket could not be determined.
+ *   - -EINVAL: dev_id value is out of range.
+ */
+__rte_experimental
+int
+rte_dmadev_socket_id(uint16_t dev_id);
+
+/**
+ * The capabilities of a DMA device
+ */
+#define RTE_DMA_DEV_CAPA_M2M	(1ull << 0) /**< Support mem-to-mem transfer */
+#define RTE_DMA_DEV_CAPA_M2D	(1ull << 1) /**< Support mem-to-dev transfer */
+#define RTE_DMA_DEV_CAPA_D2M	(1ull << 2) /**< Support dev-to-mem transfer */
+#define RTE_DMA_DEV_CAPA_D2D	(1ull << 3) /**< Support dev-to-dev transfer */
+#define RTE_DMA_DEV_CAPA_COPY	(1ull << 4) /**< Support copy ops */
+#define RTE_DMA_DEV_CAPA_FILL	(1ull << 5) /**< Support fill ops */
+#define RTE_DMA_DEV_CAPA_SG	(1ull << 6) /**< Support scatter-gather ops */
+#define RTE_DMA_DEV_CAPA_FENCE	(1ull << 7) /**< Support fence ops */
+#define RTE_DMA_DEV_CAPA_IOVA	(1ull << 8) /**< Support IOVA as DMA address */
+#define RTE_DMA_DEV_CAPA_VA	(1ull << 9) /**< Support VA as DMA address */
+#define RTE_DMA_DEV_CAPA_MT_VQ	(1ull << 10) /**< Support MT-safe of one virt queue */
+#define RTE_DMA_DEV_CAPA_MT_MVQ	(1ull << 11) /**< Support MT-safe of multiple virt queues */
+
+/**
+ * A structure used to retrieve the contextual information of
+ * an DMA device
+ */
+struct rte_dmadev_info {
+	/**
+	 * Fields filled by framewok
+	 */
+	struct rte_device *device; /**< Generic Device information */
+	const char *driver_name; /**< Device driver name */
+	int socket_id; /**< Socket ID where memory is allocated */
+
+	/**
+	 * Specification fields filled by driver
+	 */
+	uint64_t dev_capa; /**< Device capabilities (RTE_DMA_DEV_CAPA_) */
+	uint16_t max_hw_queues; /**< Maximum number of HW queues. */
+	uint16_t max_vqs_per_hw_queue;
+	/**< Maximum number of virt queues to allocate per HW queue */
+	uint16_t max_desc;
+	/**< Maximum allowed number of virt queue descriptors */
+	uint16_t min_desc;
+	/**< Minimum allowed number of virt queue descriptors */
+
+	/**
+	 * Status fields filled by driver
+	 */
+	uint16_t nb_hw_queues; /**< Number of HW queues configured */
+	uint16_t nb_vqs; /**< Number of virt queues configured */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve the contextual information of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @param[out] dev_info
+ *   A pointer to a structure of type *rte_dmadev_info* to be filled with the
+ *   contextual information of the device.
+ * @return
+ *   - =0: Success, driver updates the contextual information of the DMA device
+ *   - <0: Error code returned by the driver info get function.
+ *
+ */
+__rte_experimental
+int
+rte_dmadev_info_get(uint16_t dev_id, struct rte_dmadev_info *dev_info);
+
+/**
+ * dma_address_type
+ */
+enum dma_address_type {
+	DMA_ADDRESS_TYPE_IOVA, /**< Use IOVA as dma address */
+	DMA_ADDRESS_TYPE_VA, /**< Use VA as dma address */
+};
+
+/**
+ * A structure used to configure a DMA device.
+ */
+struct rte_dmadev_conf {
+	enum dma_address_type addr_type; /**< Address type to used */
+	uint16_t nb_hw_queues; /**< Number of HW-queues enable to use */
+	uint16_t max_vqs; /**< Maximum number of virt queues to use */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Configure a DMA device.
+ *
+ * This function must be invoked first before any other function in the
+ * API. This function can also be re-invoked when a device is in the
+ * stopped state.
+ *
+ * The caller may use rte_dmadev_info_get() to get the capability of each
+ * resources available for this DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device to configure.
+ * @param dev_conf
+ *   The DMA device configuration structure encapsulated into rte_dmadev_conf
+ *   object.
+ *
+ * @return
+ *   - =0: Success, device configured.
+ *   - <0: Error code returned by the driver configuration function.
+ */
+__rte_experimental
+int
+rte_dmadev_configure(uint16_t dev_id, const struct rte_dmadev_conf *dev_conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Start a DMA device.
+ *
+ * The device start step is the last one and consists of setting the DMA
+ * to start accepting jobs.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device started.
+ *   - <0: Error code returned by the driver start function.
+ */
+__rte_experimental
+int
+rte_dmadev_start(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Stop a DMA device.
+ *
+ * The device can be restarted with a call to rte_dmadev_start()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Success, device stopped.
+ *   - <0: Error code returned by the driver stop function.
+ */
+__rte_experimental
+int
+rte_dmadev_stop(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Close a DMA device.
+ *
+ * The device cannot be restarted after this call.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *  - =0: Successfully closing device
+ *  - <0: Failure to close device
+ */
+__rte_experimental
+int
+rte_dmadev_close(uint16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset a DMA device.
+ *
+ * This is different from cycle of rte_dmadev_start->rte_dmadev_stop in the
+ * sense similar to hard or soft reset.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - =0: Successful reset device.
+ *   - <0: Failure to reset device.
+ *   - (-ENOTSUP): If the device doesn't support this function.
+ */
+__rte_experimental
+int
+rte_dmadev_reset(uint16_t dev_id);
+
+/**
+ * dma_transfer_direction
+ */
+enum dma_transfer_direction {
+	DMA_MEM_TO_MEM,
+	DMA_MEM_TO_DEV,
+	DMA_DEV_TO_MEM,
+	DMA_DEV_TO_DEV,
+};
+
+/**
+ * A structure used to configure a DMA virt queue.
+ */
+struct rte_dmadev_queue_conf {
+	enum dma_transfer_direction direction;
+	/**< Associated transfer direction */
+	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
+	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
+	uint64_t dev_flags; /**< Device specific flags */
+	void *dev_ctx; /**< Device specific context */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Allocate and set up a virt queue.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param conf
+ *   The queue configuration structure encapsulated into rte_dmadev_queue_conf
+ *   object.
+ *
+ * @return
+ *   - >=0: Allocate virt queue success, it is virt queue id.
+ *   - <0: Error code returned by the driver queue setup function.
+ */
+__rte_experimental
+int
+rte_dmadev_queue_setup(uint16_t dev_id,
+		       const struct rte_dmadev_queue_conf *conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a virt queue.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue which return by queue setup.
+ *
+ * @return
+ *   - =0: Successful release the virt queue.
+ *   - <0: Error code returned by the driver queue release function.
+ */
+__rte_experimental
+int
+rte_dmadev_queue_release(uint16_t dev_id, uint16_t vq_id);
+
+/**
+ * A structure used to retrieve information of a DMA virt queue.
+ */
+struct rte_dmadev_queue_info {
+	enum dma_transfer_direction direction;
+	/**< Associated transfer direction */
+	uint16_t hw_queue_id; /**< The HW queue on which to create virt queue */
+	uint16_t nb_desc; /**< Number of descriptor for this virt queue */
+	uint64_t dev_flags; /**< Device specific flags */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve information of a DMA virt queue.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue which return by queue setup.
+ * @param[out] info
+ *   The queue info structure encapsulated into rte_dmadev_queue_info object.
+ *
+ * @return
+ *   - =0: Successful retrieve information.
+ *   - <0: Error code returned by the driver queue release function.
+ */
+__rte_experimental
+int
+rte_dmadev_queue_info_get(uint16_t dev_id, uint16_t vq_id,
+			  struct rte_dmadev_queue_info *info);
+
+#include "rte_dmadev_core.h"
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a copy operation onto the DMA virt queue.
+ *
+ * This queues up a copy operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param src
+ *   The address of the source buffer.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the data to be copied.
+ * @param flags
+ *   An opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_copy(uint16_t dev_id, uint16_t vq_id, void *src, void *dst,
+		uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->copy)(dev, vq_id, src, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list copy operation onto the DMA virt queue.
+ *
+ * This queues up a scatter list copy operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param sg
+ *   The pointer of scatterlist.
+ * @param sg_len
+ *   The number of scatterlist elements.
+ * @param flags
+ *   An opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_copy_sg(uint16_t dev_id, uint16_t vq_id,
+		   const struct dma_scatterlist *sg,
+		   uint32_t sg_len, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->copy_sg)(dev, vq_id, sg, sg_len, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a fill operation onto the DMA virt queue
+ *
+ * This queues up a fill operation to be performed by hardware, but does not
+ * trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param dst
+ *   The address of the destination buffer.
+ * @param length
+ *   The length of the destination buffer.
+ * @param flags
+ *   An opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_fill(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
+		void *dst, uint32_t length, uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->fill)(dev, vq_id, pattern, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter list fill operation onto the DMA virt queue
+ *
+ * This queues up a scatter list fill operation to be performed by hardware,
+ * but does not trigger hardware to begin that operation.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param pattern
+ *   The pattern to populate the destination buffer with.
+ * @param sg
+ *   The pointer of scatterlist.
+ * @param sg_len
+ *   The number of scatterlist elements.
+ * @param flags
+ *   An opaque flags for this operation.
+ *
+ * @return
+ *   dma_cookie_t: please refer to the corresponding definition.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline dma_cookie_t
+rte_dmadev_fill_sg(uint16_t dev_id, uint16_t vq_id, uint64_t pattern,
+		   const struct dma_scatterlist *sg, uint32_t sg_len,
+		   uint64_t flags)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->fill_sg)(dev, vq_id, pattern, sg, sg_len, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Add a fence to force ordering between operations
+ *
+ * This adds a fence to a sequence of operations to enforce ordering, such that
+ * all operations enqueued before the fence must be completed before operations
+ * after the fence.
+ * NOTE: Since this fence may be added as a flag to the last operation enqueued,
+ * this API may not function correctly when called immediately after an
+ * "rte_dmadev_perform" call i.e. before any new operations are enqueued.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ *
+ * @return
+ *   - =0: Successful add fence.
+ *   - <0: Failure to add fence.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_fence(uint16_t dev_id, uint16_t vq_id)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->fence)(dev, vq_id);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger hardware to begin performing enqueued operations
+ *
+ * This API is used to write the "doorbell" to the hardware to trigger it
+ * to begin the operations previously enqueued by rte_dmadev_copy/fill()
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ *
+ * @return
+ *   - =0: Successful trigger hardware.
+ *   - <0: Failure to trigger hardware.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline int
+rte_dmadev_perform(uint16_t dev_id, uint16_t vq_id)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->perform)(dev, vq_id);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that have been successful completed.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param nb_cpls
+ *   The maximum number of completed operations that can be processed.
+ * @param[out] cookie
+ *   The last completed operation's cookie.
+ * @param[out] has_error
+ *   Indicates if there are transfer error.
+ *
+ * @return
+ *   The number of operations that successful completed.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed(uint16_t dev_id, uint16_t vq_id, const uint16_t nb_cpls,
+		     dma_cookie_t *cookie, bool *has_error)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	has_error = false;
+	return (*dev->completed)(dev, vq_id, nb_cpls, cookie, has_error);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Returns the number of operations that failed to complete.
+ * NOTE: This API was used when rte_dmadev_completed has_error was set.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue.
+ * @param nb_status
+ *   Indicates the size of status array.
+ * @param[out] status
+ *   The error code of operations that failed to complete.
+ * @param[out] cookie
+ *   The last failed completed operation's cookie.
+ *
+ * @return
+ *   The number of operations that failed to complete.
+ *
+ * NOTE: The caller must ensure that the input parameter is valid and the
+ *       corresponding device supports the operation.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dmadev_completed_fails(uint16_t dev_id, uint16_t vq_id,
+			   const uint16_t nb_status, uint32_t *status,
+			   dma_cookie_t *cookie)
+{
+	struct rte_dmadev *dev = &rte_dmadevices[dev_id];
+	return (*dev->completed_fails)(dev, vq_id, nb_status, status, cookie);
+}
+
+struct rte_dmadev_stats {
+	uint64_t enqueue_fail_count;
+	/**< Conut of all operations which failed enqueued */
+	uint64_t enqueued_count;
+	/**< Count of all operations which successful enqueued */
+	uint64_t completed_fail_count;
+	/**< Count of all operations which failed to complete */
+	uint64_t completed_count;
+	/**< Count of all operations which successful complete */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve basic statistics of a or all DMA virt queue(s).
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue, -1 means all virt queues.
+ * @param[out] stats
+ *   The basic statistics structure encapsulated into rte_dmadev_stats
+ *   object.
+ *
+ * @return
+ *   - =0: Successful retrieve stats.
+ *   - <0: Failure to retrieve stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_get(uint16_t dev_id, int vq_id,
+		     struct rte_dmadev_stats *stats);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset basic statistics of a or all DMA virt queue(s).
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param vq_id
+ *   The identifier of virt queue, -1 means all virt queues.
+ *
+ * @return
+ *   - =0: Successful retrieve stats.
+ *   - <0: Failure to retrieve stats.
+ */
+__rte_experimental
+int
+rte_dmadev_stats_reset(uint16_t dev_id, int vq_id);
+
+/** Maximum name length for extended statistics counters */
+#define RTE_DMA_DEV_XSTATS_NAME_SIZE 64
+
+/**
+ * A name-key lookup element for extended statistics.
+ *
+ * This structure is used to map between names and ID numbers
+ * for extended ethdev statistics.
+ */
+struct rte_dmadev_xstats_name {
+	char name[RTE_DMA_DEV_XSTATS_NAME_SIZE];
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve names of extended statistics of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param[out] xstats_names
+ *   Block of memory to insert names into. Must be at least size in capacity.
+ *   If set to NULL, function returns required capacity.
+ * @param size
+ *   Capacity of xstats_names (number of names).
+ * @return
+ *   - positive value lower or equal to size: success. The return value
+ *     is the number of entries filled in the stats table.
+ *   - positive value higher than size: error, the given statistics table
+ *     is too small. The return value corresponds to the size that should
+ *     be given to succeed. The entries in the table are not valid and
+ *     shall not be used by the caller.
+ *   - negative value on error.
+ */
+__rte_experimental
+int
+rte_dmadev_xstats_names_get(uint16_t dev_id,
+			    struct rte_dmadev_xstats_name *xstats_names,
+			    uint32_t size);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve extended statistics of a DMA device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param ids
+ *   The id numbers of the stats to get. The ids can be got from the stat
+ *   position in the stat list from rte_dmadev_get_xstats_names().
+ * @param[out] values
+ *   The values for each stats request by ID.
+ * @param n
+ *   The number of stats requested.
+ *
+ * @return
+ *   - positive value: number of stat entries filled into the values array.
+ *   - negative value on error.
+ */
+__rte_experimental
+int
+rte_dmadev_xstats_get(uint16_t dev_id, const uint32_t ids[],
+		      uint64_t values[], uint32_t n);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset the values of the xstats of the selected component in the device.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ * @param ids
+ *   Selects specific statistics to be reset. When NULL, all statistics
+ *   will be reset. If non-NULL, must point to array of at least
+ *   *nb_ids* size.
+ * @param nb_ids
+ *   The number of ids available from the *ids* array. Ignored when ids is NULL.
+ *
+ * @return
+ *   - zero: successfully reset the statistics to zero.
+ *   - negative value on error.
+ */
+__rte_experimental
+int
+rte_dmadev_xstats_reset(uint16_t dev_id, const uint32_t ids[], uint32_t nb_ids);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger the dmadev self test.
+ *
+ * @param dev_id
+ *   The identifier of the device.
+ *
+ * @return
+ *   - 0: Selftest successful.
+ *   - -ENOTSUP if the device doesn't support selftest
+ *   - other values < 0 on failure.
+ */
+__rte_experimental
+int
+rte_dmadev_selftest(uint16_t dev_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_H_ */
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
new file mode 100644
index 0000000..a3afea2
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -0,0 +1,98 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_CORE_H_
+#define _RTE_DMADEV_CORE_H_
+
+/**
+ * @file
+ *
+ * RTE DMA Device internal header.
+ *
+ * This header contains internal data types. But they are still part of the
+ * public API because they are used by inline public functions.
+ */
+
+struct rte_dmadev;
+
+typedef dma_cookie_t (*dmadev_copy_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				      void *src, void *dst,
+				      uint32_t length, uint64_t flags);
+/**< @internal Function used to enqueue a copy operation. */
+
+typedef dma_cookie_t (*dmadev_copy_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
+					 const struct dma_scatterlist *sg,
+					 uint32_t sg_len, uint64_t flags);
+/**< @internal Function used to enqueue a scatter list copy operation. */
+
+typedef dma_cookie_t (*dmadev_fill_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				      uint64_t pattern, void *dst,
+				      uint32_t length, uint64_t flags);
+/**< @internal Function used to enqueue a fill operation. */
+
+typedef dma_cookie_t (*dmadev_fill_sg_t)(struct rte_dmadev *dev, uint16_t vq_id,
+			uint64_t pattern, const struct dma_scatterlist *sg,
+			uint32_t sg_len, uint64_t flags);
+/**< @internal Function used to enqueue a scatter list fill operation. */
+
+typedef int (*dmadev_fence_t)(struct rte_dmadev *dev, uint16_t vq_id);
+/**< @internal Function used to add a fence ordering between operations. */
+
+typedef int (*dmadev_perform_t)(struct rte_dmadev *dev, uint16_t vq_id);
+/**< @internal Function used to trigger hardware to begin performing. */
+
+typedef uint16_t (*dmadev_completed_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				       const uint16_t nb_cpls,
+				       dma_cookie_t *cookie, bool *has_error);
+/**< @internal Function used to return number of successful completed operations */
+
+typedef uint16_t (*dmadev_completed_fails_t)(struct rte_dmadev *dev,
+			uint16_t vq_id, const uint16_t nb_status,
+			uint32_t *status, dma_cookie_t *cookie);
+/**< @internal Function used to return number of failed completed operations */
+
+#define RTE_DMADEV_NAME_MAX_LEN	64 /**< Max length of name of DMA PMD */
+
+struct rte_dmadev_ops;
+
+/**
+ * The data structure associated with each DMA device.
+ */
+struct rte_dmadev {
+	/**< Enqueue a copy operation onto the DMA device. */
+	dmadev_copy_t copy;
+	/**< Enqueue a scatter list copy operation onto the DMA device. */
+	dmadev_copy_sg_t copy_sg;
+	/**< Enqueue a fill operation onto the DMA device. */
+	dmadev_fill_t fill;
+	/**< Enqueue a scatter list fill operation onto the DMA device. */
+	dmadev_fill_sg_t fill_sg;
+	/**< Add a fence to force ordering between operations. */
+	dmadev_fence_t fence;
+	/**< Trigger hardware to begin performing enqueued operations. */
+	dmadev_perform_t perform;
+	/**< Returns the number of operations that successful completed. */
+	dmadev_completed_t completed;
+	/**< Returns the number of operations that failed to complete. */
+	dmadev_completed_fails_t completed_fails;
+
+	void *dev_private; /**< PMD-specific private data */
+	const struct rte_dmadev_ops *dev_ops; /**< Functions exported by PMD */
+
+	uint16_t dev_id; /**< Device ID for this instance */
+	int socket_id; /**< Socket ID where memory is allocated */
+	struct rte_device *device;
+	/**< Device info. supplied during device initialization */
+	const char *driver_name; /**< Driver info. supplied by probing */
+	char name[RTE_DMADEV_NAME_MAX_LEN]; /**< Device name */
+
+	RTE_STD_C11
+	uint8_t attached : 1; /**< Flag indicating the device is attached */
+	uint8_t started : 1; /**< Device state: STARTED(1)/STOPPED(0) */
+
+} __rte_cache_aligned;
+
+extern struct rte_dmadev rte_dmadevices[];
+
+#endif /* _RTE_DMADEV_CORE_H_ */
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
new file mode 100644
index 0000000..ef03cf7
--- /dev/null
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -0,0 +1,210 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2021 HiSilicon Limited.
+ */
+
+#ifndef _RTE_DMADEV_PMD_H_
+#define _RTE_DMADEV_PMD_H_
+
+/** @file
+ * RTE DMA PMD APIs
+ *
+ * @note
+ * Driver facing APIs for a DMA device. These are not to be called directly by
+ * any application.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include <rte_dev.h>
+#include <rte_log.h>
+#include <rte_common.h>
+
+#include "rte_dmadev.h"
+
+extern int libdmadev_logtype;
+
+#define RTE_DMADEV_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, libdmadev_logtype, "%s(): " fmt "\n", \
+		__func__, ##args)
+
+/* Macros to check for valid device */
+#define RTE_DMADEV_VALID_DEVID_OR_ERR_RET(dev_id, retval) do { \
+	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
+		return retval; \
+	} \
+} while (0)
+
+#define RTE_DMADEV_VALID_DEVID_OR_RET(dev_id) do { \
+	if (!rte_dmadev_pmd_is_valid_dev((dev_id))) { \
+		RTE_DMADEV_LOG(ERR, "Invalid dev_id=%d", dev_id); \
+		return; \
+	} \
+} while (0)
+
+#define RTE_DMADEV_DETACHED  0
+#define RTE_DMADEV_ATTACHED  1
+
+/**
+ * Validate if the DMA device index is a valid attached DMA device.
+ *
+ * @param dev_id
+ *   DMA device index.
+ *
+ * @return
+ *   - If the device index is valid (1) or not (0).
+ */
+static inline unsigned
+rte_dmadev_pmd_is_valid_dev(uint16_t dev_id)
+{
+	struct rte_dmadev *dev;
+
+	if (dev_id >= RTE_DMADEV_MAX_DEVS)
+		return 0;
+
+	dev = &rte_dmadevices[dev_id];
+	if (dev->attached != RTE_DMADEV_ATTACHED)
+		return 0;
+	else
+		return 1;
+}
+
+/**
+ * Definitions of control-plane functions exported by a driver through the
+ * generic structure of type *rte_dmadev_ops* supplied in the *rte_dmadev*
+ * structure associated with a device.
+ */
+
+typedef int (*dmadev_info_get_t)(struct rte_dmadev *dev,
+				 struct rte_dmadev_info *dev_info);
+/**< @internal Function used to get device information of a device. */
+
+typedef int (*dmadev_configure_t)(struct rte_dmadev *dev,
+				  const struct rte_dmadev_conf *dev_conf);
+/**< @internal Function used to configure a device. */
+
+typedef int (*dmadev_start_t)(struct rte_dmadev *dev);
+/**< @internal Function used to start a configured device. */
+
+typedef int (*dmadev_stop_t)(struct rte_dmadev *dev);
+/**< @internal Function used to stop a configured device. */
+
+typedef int (*dmadev_close_t)(struct rte_dmadev *dev);
+/**< @internal Function used to close a configured device. */
+
+typedef int (*dmadev_reset_t)(struct rte_dmadev *dev);
+/**< @internal Function used to reset a configured device. */
+
+typedef int (*dmadev_queue_setup_t)(struct rte_dmadev *dev,
+				    const struct rte_dmadev_queue_conf *conf);
+/**< @internal Function used to allocate and set up a virt queue. */
+
+typedef int (*dmadev_queue_release_t)(struct rte_dmadev *dev, uint16_t vq_id);
+/**< @internal Function used to release a virt queue. */
+
+typedef int (*dmadev_queue_info_t)(struct rte_dmadev *dev, uint16_t vq_id,
+				   struct rte_dmadev_queue_info *info);
+/**< @internal Function used to retrieve information of a virt queue. */
+
+typedef int (*dmadev_stats_get_t)(struct rte_dmadev *dev, int vq_id,
+				  struct rte_dmadev_stats *stats);
+/**< @internal Function used to retrieve basic statistics. */
+
+typedef int (*dmadev_stats_reset_t)(struct rte_dmadev *dev, int vq_id);
+/**< @internal Function used to reset basic statistics. */
+
+typedef int (*dmadev_xstats_get_names_t)(const struct rte_dmadev *dev,
+		struct rte_dmadev_xstats_name *xstats_names,
+		uint32_t size);
+/**< @internal Function used to get names of extended stats. */
+
+typedef int (*dmadev_xstats_get_t)(const struct rte_dmadev *dev,
+		const uint32_t ids[], uint64_t values[], uint32_t n);
+/**< @internal Function used to retrieve extended stats. */
+
+typedef int (*dmadev_xstats_reset_t)(struct rte_dmadev *dev,
+				     const uint32_t ids[], uint32_t nb_ids);
+/**< @internal Function used to reset extended stats. */
+
+typedef int (*dmadev_selftest_t)(uint16_t dev_id);
+/**< @internal Function used to start dmadev selftest. */
+
+/** DMA device operations function pointer table */
+struct rte_dmadev_ops {
+	/**< Get device info. */
+	dmadev_info_get_t dev_info_get;
+	/**< Configure device. */
+	dmadev_configure_t dev_configure;
+	/**< Start device. */
+	dmadev_start_t dev_start;
+	/**< Stop device. */
+	dmadev_stop_t dev_stop;
+	/**< Close device. */
+	dmadev_close_t dev_close;
+	/**< Reset device. */
+	dmadev_reset_t dev_reset;
+
+	/**< Allocate and set up a virt queue. */
+	dmadev_queue_setup_t queue_setup;
+	/**< Release a virt queue. */
+	dmadev_queue_release_t queue_release;
+	/**< Retrieve information of a virt queue */
+	dmadev_queue_info_t queue_info_get;
+
+	/**< Get basic statistics. */
+	dmadev_stats_get_t stats_get;
+	/**< Reset basic statistics. */
+	dmadev_stats_reset_t stats_reset;
+	/**< Get names of extended stats. */
+	dmadev_xstats_get_names_t xstats_get_names;
+	/**< Get extended statistics. */
+	dmadev_xstats_get_t xstats_get;
+	/**< Reset extended statistics values. */
+	dmadev_xstats_reset_t xstats_reset;
+
+	/**< Device selftest function */
+	dmadev_selftest_t dev_selftest;
+};
+
+/**
+ * Allocates a new dmadev slot for an DMA device and returns the pointer
+ * to that slot for the driver to use.
+ *
+ * @param name
+ *   Unique identifier name for each device
+ * @param dev_private_size
+ *   Size of private data memory allocated within rte_dmadev object.
+ *   Set to 0 to disable internal memory allocation and allow for
+ *   self-allocation.
+ * @param socket_id
+ *   Socket to allocate resources on.
+ *
+ * @return
+ *   - NULL: Failure to allocate
+ *   - Other: The rte_dmadev structure pointer for the new device
+ */
+struct rte_dmadev *
+rte_dmadev_pmd_allocate(const char *name, size_t dev_private_size,
+			int socket_id);
+
+/**
+ * Release the specified dmadev device.
+ *
+ * @param dev
+ *   The *dmadev* pointer is the address of the *rte_dmadev* structure.
+ *
+ * @return
+ *   - 0 on success, negative on error
+ */
+int
+rte_dmadev_pmd_release(struct rte_dmadev *dev);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DMADEV_PMD_H_ */
diff --git a/lib/dmadev/version.map b/lib/dmadev/version.map
new file mode 100644
index 0000000..383b3ca
--- /dev/null
+++ b/lib/dmadev/version.map
@@ -0,0 +1,32 @@ 
+EXPERIMENTAL {
+	global:
+
+	rte_dmadev_count;
+	rte_dmadev_get_dev_id;
+	rte_dmadev_socket_id;
+	rte_dmadev_info_get;
+	rte_dmadev_configure;
+	rte_dmadev_start;
+	rte_dmadev_stop;
+	rte_dmadev_close;
+	rte_dmadev_reset;
+	rte_dmadev_queue_setup;
+	rte_dmadev_queue_release;
+	rte_dmadev_queue_info_get;
+	rte_dmadev_copy;
+	rte_dmadev_copy_sg;
+	rte_dmadev_fill;
+	rte_dmadev_fill_sg;
+	rte_dmadev_fence;
+	rte_dmadev_perform;
+	rte_dmadev_completed;
+	rte_dmadev_completed_fails;
+	rte_dmadev_stats_get;
+	rte_dmadev_stats_reset;
+	rte_dmadev_xstats_names_get;
+	rte_dmadev_xstats_get;
+	rte_dmadev_xstats_reset;
+	rte_dmadev_selftest;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index 1673ca4..68d239f 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -60,6 +60,7 @@  libraries = [
         'bpf',
         'graph',
         'node',
+        'dmadev',
 ]
 
 if is_windows