[2/3] app/compress-perf: add performance measurement

Message ID 1538400427-20164-3-git-send-email-tomaszx.jozwiak@intel.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Headers
Series app: add initial version of compress-perf |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Tomasz Jozwiak Oct. 1, 2018, 1:27 p.m. UTC
  Added performance measurement part into compression perf. test.

Signed-off-by: De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
---
 app/test-compress-perf/main.c | 844 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 844 insertions(+)
  

Comments

Verma, Shally Oct. 12, 2018, 10:15 a.m. UTC | #1
Hi TomaszX,

Sorry for delay in response. Comments inline.

>-----Original Message-----
>From: dev <dev-bounces@dpdk.org> On Behalf Of Tomasz Jozwiak
>Sent: 01 October 2018 18:57
>To: dev@dpdk.org; fiona.trahe@intel.com; tomaszx.jozwiak@intel.com; akhil.goyal@nxp.com; pablo.de.lara.guarch@intel.com
>Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>Subject: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Added performance measurement part into compression perf. test.
>
>Signed-off-by: De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
>Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
>---
> app/test-compress-perf/main.c | 844 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 844 insertions(+)
>
>diff --git a/app/test-compress-perf/main.c b/app/test-compress-perf/main.c
>index f52b98d..093dfaf 100644
>--- a/app/test-compress-perf/main.c
>+++ b/app/test-compress-perf/main.c
>@@ -5,13 +5,721 @@
> #include <rte_malloc.h>
> #include <rte_eal.h>
> #include <rte_log.h>
>+#include <rte_cycles.h>
> #include <rte_compressdev.h>
>
> #include "comp_perf_options.h"
>
>+#define NUM_MAX_XFORMS 16
>+#define NUM_MAX_INFLIGHT_OPS 512
>+#define EXPANSE_RATIO 1.05
>+#define MIN_ISAL_SIZE 8
>+
>+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
>+
>+static int
>+param_range_check(uint16_t size, const struct rte_param_log2_range *range)
>+{
>+       unsigned int next_size;
>+
>+       /* Check lower/upper bounds */
>+       if (size < range->min)
>+               return -1;
>+
>+       if (size > range->max)
>+               return -1;
>+
>+       /* If range is actually only one value, size is correct */
>+       if (range->increment == 0)
>+               return 0;
>+
>+       /* Check if value is one of the supported sizes */
>+       for (next_size = range->min; next_size <= range->max;
>+                       next_size += range->increment)
>+               if (size == next_size)
>+                       return 0;
>+
>+       return -1;
>+}
>+
>+static int
>+comp_perf_check_capabilities(struct comp_test_data *test_data)
>+{
>+       const struct rte_compressdev_capabilities *cap;
>+
>+       cap = rte_compressdev_capability_get(test_data->cdev_id,
>+                                            RTE_COMP_ALGO_DEFLATE);
>+
>+       if (cap == NULL) {
>+               RTE_LOG(ERR, USER1,
>+                       "Compress device does not support DEFLATE\n");
>+               return -1;
>+       }
>+
>+       uint64_t comp_flags = cap->comp_feature_flags;
>+
>+       /* Huffman enconding */
>+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
>+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
>+               RTE_LOG(ERR, USER1,
>+                       "Compress device does not supported Fixed Huffman\n");
>+               return -1;
>+       }
>+
>+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC &&
>+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0) {
>+               RTE_LOG(ERR, USER1,
>+                       "Compress device does not supported Dynamic Huffman\n");
>+               return -1;
>+       }
>+
>+       /* Window size */
>+       if (test_data->window_sz != -1) {
>+               if (param_range_check(test_data->window_sz, &cap->window_size)
What if cap->window_size is 0 i.e. implementation default?

>+                               < 0) {
>+                       RTE_LOG(ERR, USER1,
>+                               "Compress device does not support "
>+                               "this window size\n");
>+                       return -1;
>+               }
>+       } else
>+               /* Set window size to PMD maximum if none was specified */
>+               test_data->window_sz = cap->window_size.max;
>+
>+       /* Check if chained mbufs is supported */
>+       if (test_data->max_sgl_segs > 1  &&
>+                       (comp_flags & RTE_COMP_FF_OOP_SGL_IN_SGL_OUT) == 0) {
>+               RTE_LOG(INFO, USER1, "Compress device does not support "
>+                               "chained mbufs. Max SGL segments set to 1\n");
>+               test_data->max_sgl_segs = 1;
>+       }
>+
>+       /* Level 0 support */
>+       if (test_data->level.min == 0 &&
>+                       (comp_flags & RTE_COMP_FF_NONCOMPRESSED_BLOCKS) == 0) {
>+               RTE_LOG(ERR, USER1, "Compress device does not support "
>+                               "level 0 (no compression)\n");
>+               return -1;
>+       }
>+
>+       return 0;
>+}
>+
>+static int
>+comp_perf_allocate_memory(struct comp_test_data *test_data)
>+{
>+       /* Number of segments for input and output
>+        * (compression and decompression)
>+        */
>+       uint32_t total_segs = DIV_CEIL(test_data->input_data_sz,
>+                       test_data->seg_sz);
>+       test_data->comp_buf_pool = rte_pktmbuf_pool_create("comp_buf_pool",
>+                               total_segs,
>+                               0, 0, test_data->seg_sz + RTE_PKTMBUF_HEADROOM,
>+                               rte_socket_id());
>+       if (test_data->comp_buf_pool == NULL) {
>+               RTE_LOG(ERR, USER1, "Mbuf mempool could not be created\n");
>+               return -1;
>+       }
>+
>+       test_data->decomp_buf_pool = rte_pktmbuf_pool_create("decomp_buf_pool",
>+                               total_segs,
>+                               0, 0, test_data->seg_sz + RTE_PKTMBUF_HEADROOM,
>+                               rte_socket_id());
>+       if (test_data->decomp_buf_pool == NULL) {
>+               RTE_LOG(ERR, USER1, "Mbuf mempool could not be created\n");
>+               return -1;
>+       }
>+
>+       test_data->total_bufs = DIV_CEIL(total_segs, test_data->max_sgl_segs);
>+
>+       test_data->op_pool = rte_comp_op_pool_create("op_pool",
>+                                 test_data->total_bufs,
>+                                 0, 0, rte_socket_id());
>+       if (test_data->op_pool == NULL) {
>+               RTE_LOG(ERR, USER1, "Comp op mempool could not be created\n");
>+               return -1;
>+       }
>+
>+       /*
>+        * Compressed data might be a bit larger than input data,
>+        * if data cannot be compressed
Possible only if it's zlib format right? Or deflate as well?

>+        */
>+       test_data->compressed_data = rte_zmalloc_socket(NULL,
>+                               test_data->input_data_sz * EXPANSE_RATIO
>+                                                       + MIN_ISAL_SIZE, 0,
>+                               rte_socket_id());
>+       if (test_data->compressed_data == NULL) {
>+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
>+                               "file could not be allocated\n");
>+               return -1;
>+       }
>+
>+       test_data->decompressed_data = rte_zmalloc_socket(NULL,
>+                               test_data->input_data_sz, 0,
>+                               rte_socket_id());
>+       if (test_data->decompressed_data == NULL) {
>+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
>+                               "file could not be allocated\n");
>+               return -1;
>+       }
>+
>+       test_data->comp_bufs = rte_zmalloc_socket(NULL,
>+                       test_data->total_bufs * sizeof(struct rte_mbuf *),
>+                       0, rte_socket_id());
>+       if (test_data->comp_bufs == NULL) {
>+               RTE_LOG(ERR, USER1, "Memory to hold the compression mbufs"
>+                               " could not be allocated\n");
>+               return -1;
>+       }
>+
>+       test_data->decomp_bufs = rte_zmalloc_socket(NULL,
>+                       test_data->total_bufs * sizeof(struct rte_mbuf *),
>+                       0, rte_socket_id());
>+       if (test_data->decomp_bufs == NULL) {
>+               RTE_LOG(ERR, USER1, "Memory to hold the decompression mbufs"
>+                               " could not be allocated\n");
>+               return -1;
>+       }
>+       return 0;
>+}
>+
>+static int
>+comp_perf_dump_input_data(struct comp_test_data *test_data)
>+{
>+       FILE *f = fopen(test_data->input_file, "r");
>+
>+       if (f == NULL) {
>+               RTE_LOG(ERR, USER1, "Input file could not be opened\n");
>+               return -1;
>+       }
>+
>+       if (fseek(f, 0, SEEK_END) != 0) {
>+               RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
>+               goto err;
>+       }
>+       size_t actual_file_sz = ftell(f);
>+       /* If extended input data size has not been set,
>+        * input data size = file size
>+        */
>+
>+       if (test_data->input_data_sz == 0)
>+               test_data->input_data_sz = actual_file_sz;
>+
>+       if (fseek(f, 0, SEEK_SET) != 0) {
>+               RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
>+               goto err;
>+       }
>+
>+       test_data->input_data = rte_zmalloc_socket(NULL,
>+                               test_data->input_data_sz, 0, rte_socket_id());
>+
>+       if (test_data->input_data == NULL) {
>+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
>+                               "file could not be allocated\n");
>+               goto err;
>+       }
>+
>+       size_t remaining_data = test_data->input_data_sz;
>+       uint8_t *data = test_data->input_data;
>+
>+       while (remaining_data > 0) {
>+               size_t data_to_read = RTE_MIN(remaining_data, actual_file_sz);
>+
>+               if (fread(data, data_to_read, 1, f) != 1) {
>+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
>+                       goto err;
>+               }
>+               if (fseek(f, 0, SEEK_SET) != 0) {
>+                       RTE_LOG(ERR, USER1,
>+                               "Size of input could not be calculated\n");
>+                       goto err;
>+               }
>+               remaining_data -= data_to_read;
>+               data += data_to_read;
It looks like it will run a 2nd iteration only if the input file size < input data size, in which case it will just keep filling the input buffer with repeated data.
Is that the intention here?

>+       }
>+
>+       if (test_data->input_data_sz > actual_file_sz)
>+               RTE_LOG(INFO, USER1,
>+                 "%zu bytes read from file %s, extending the file %.2f times\n",
>+                       test_data->input_data_sz, test_data->input_file,
>+                       (double)test_data->input_data_sz/actual_file_sz);
>+       else
>+               RTE_LOG(INFO, USER1,
>+                       "%zu bytes read from file %s\n",
>+                       test_data->input_data_sz, test_data->input_file);
>+
>+       fclose(f);
>+
>+       return 0;
>+
>+err:
>+       fclose(f);
>+       rte_free(test_data->input_data);
>+       test_data->input_data = NULL;
>+
>+       return -1;
>+}
>+
>+static int
>+comp_perf_initialize_compressdev(struct comp_test_data *test_data)
>+{
>+       uint8_t enabled_cdev_count;
>+       uint8_t enabled_cdevs[RTE_COMPRESS_MAX_DEVS];
>+
>+       enabled_cdev_count = rte_compressdev_devices_get(test_data->driver_name,
>+                       enabled_cdevs, RTE_COMPRESS_MAX_DEVS);
>+       if (enabled_cdev_count == 0) {
>+               RTE_LOG(ERR, USER1, "No compress devices type %s available\n",
>+                               test_data->driver_name);
>+               return -EINVAL;
>+       }
>+
>+       if (enabled_cdev_count > 1)
>+               RTE_LOG(INFO, USER1,
>+                       "Only the first compress device will be used\n");
>+
>+       test_data->cdev_id = enabled_cdevs[0];
>+
>+       if (comp_perf_check_capabilities(test_data) < 0)
>+               return -1;
>+
>+       /* Configure compressdev (one device, one queue pair) */
>+       struct rte_compressdev_config config = {
>+               .socket_id = rte_socket_id(),
>+               .nb_queue_pairs = 1,
>+               .max_nb_priv_xforms = NUM_MAX_XFORMS,
>+               .max_nb_streams = 0
>+       };
>+
>+       if (rte_compressdev_configure(test_data->cdev_id, &config) < 0) {
>+               RTE_LOG(ERR, USER1, "Device configuration failed\n");
>+               return -1;
>+       }
>+
>+       if (rte_compressdev_queue_pair_setup(test_data->cdev_id, 0,
>+                       NUM_MAX_INFLIGHT_OPS, rte_socket_id()) < 0) {
>+               RTE_LOG(ERR, USER1, "Queue pair setup failed\n");
>+               return -1;
>+       }
>+
>+       if (rte_compressdev_start(test_data->cdev_id) < 0) {
>+               RTE_LOG(ERR, USER1, "Device could not be started\n");
>+               return -1;
>+       }
>+
>+       return 0;
>+}
>+
>+static int
>+prepare_bufs(struct comp_test_data *test_data)
>+{
>+       uint32_t remaining_data = test_data->input_data_sz;
>+       uint8_t *input_data_ptr = test_data->input_data;
>+       size_t data_sz;
>+       uint8_t *data_addr;
>+       uint32_t i, j;
>+
>+       for (i = 0; i < test_data->total_bufs; i++) {
>+               /* Allocate data in input mbuf and copy data from input file */
>+               test_data->decomp_bufs[i] =
>+                       rte_pktmbuf_alloc(test_data->decomp_buf_pool);
>+               if (test_data->decomp_bufs[i] == NULL) {
>+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
>+                       return -1;
>+               }
>+
>+               data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
>+               data_addr = (uint8_t *) rte_pktmbuf_append(
>+                                       test_data->decomp_bufs[i], data_sz);
>+               if (data_addr == NULL) {
>+                       RTE_LOG(ERR, USER1, "Could not append data\n");
>+                       return -1;
>+               }
>+               rte_memcpy(data_addr, input_data_ptr, data_sz);
>+
>+               input_data_ptr += data_sz;
>+               remaining_data -= data_sz;
>+
>+               /* Already one segment in the mbuf */
>+               uint16_t segs_per_mbuf = 1;
>+
>+               /* Chain mbufs if needed for input mbufs */
>+               while (segs_per_mbuf < test_data->max_sgl_segs
>+                               && remaining_data > 0) {
>+                       struct rte_mbuf *next_seg =
>+                               rte_pktmbuf_alloc(test_data->decomp_buf_pool);
>+
>+                       if (next_seg == NULL) {
>+                               RTE_LOG(ERR, USER1,
>+                                       "Could not allocate mbuf\n");
>+                               return -1;
>+                       }
>+
>+                       data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
>+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
>+                               data_sz);
>+
>+                       if (data_addr == NULL) {
>+                               RTE_LOG(ERR, USER1, "Could not append data\n");
Since a new buffer is allocated per segment, is it possible for append to fail? I think this check is redundant here.
>+                               return -1;
>+                       }
>+
>+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
>+                       input_data_ptr += data_sz;
>+                       remaining_data -= data_sz;
>+
>+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
>+                                       next_seg) < 0) {
>+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>+                               return -1;
>+                       }
>+                       segs_per_mbuf++;
>+               }
>+
>+               /* Allocate data in output mbuf */
>+               test_data->comp_bufs[i] =
>+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
>+               if (test_data->comp_bufs[i] == NULL) {
>+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
>+                       return -1;
>+               }
>+               data_addr = (uint8_t *) rte_pktmbuf_append(
>+                                       test_data->comp_bufs[i],
>+                                       test_data->seg_sz);
>+               if (data_addr == NULL) {
>+                       RTE_LOG(ERR, USER1, "Could not append data\n");
>+                       return -1;
>+               }
>+
>+               /* Chain mbufs if needed for output mbufs */
>+               for (j = 1; j < segs_per_mbuf; j++) {
>+                       struct rte_mbuf *next_seg =
>+                               rte_pktmbuf_alloc(test_data->comp_buf_pool);
>+
>+                       if (next_seg == NULL) {
>+                               RTE_LOG(ERR, USER1,
>+                                       "Could not allocate mbuf\n");
>+                               return -1;
>+                       }
>+
>+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
>+                               test_data->seg_sz);
>+
>+                       if (data_addr == NULL) {
>+                               RTE_LOG(ERR, USER1, "Could not append data\n");
>+                               return -1;
>+                       }
>+
>+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
>+                                       next_seg) < 0) {
>+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>+                               return -1;
>+                       }
>+               }
>+       }
>+
>+       return 0;
>+}
>+
>+static void
>+free_bufs(struct comp_test_data *test_data)
>+{
>+       uint32_t i;
>+
>+       for (i = 0; i < test_data->total_bufs; i++) {
>+               rte_pktmbuf_free(test_data->comp_bufs[i]);
>+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
>+       }
>+       rte_free(test_data->comp_bufs);
>+       rte_free(test_data->decomp_bufs);
>+}
>+
>+static int
>+main_loop(struct comp_test_data *test_data, uint8_t level,
>+                       enum rte_comp_xform_type type,
>+                       uint8_t *output_data_ptr,
>+                       size_t *output_data_sz,
>+                       unsigned int benchmarking)
>+{
>+       uint8_t dev_id = test_data->cdev_id;
>+       uint32_t i, iter, num_iter;
>+       struct rte_comp_op **ops, **deq_ops;
>+       void *priv_xform = NULL;
>+       struct rte_comp_xform xform;
>+       size_t output_size = 0;
>+       struct rte_mbuf **input_bufs, **output_bufs;
>+       int res = 0;
>+       int allocated = 0;
>+
>+       if (test_data == NULL || !test_data->burst_sz) {
>+               RTE_LOG(ERR, USER1,
>+                       "Unknow burst size\n");
>+               return -1;
>+       }
>+
>+       ops = rte_zmalloc_socket(NULL,
>+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
>+               0, rte_socket_id());
>+
>+       if (ops == NULL) {
>+               RTE_LOG(ERR, USER1,
>+                       "Can't allocate memory for ops strucures\n");
>+               return -1;
>+       }
>+
>+       deq_ops = &ops[test_data->total_bufs];
>+
>+       if (type == RTE_COMP_COMPRESS) {
>+               xform = (struct rte_comp_xform) {
>+                       .type = RTE_COMP_COMPRESS,
>+                       .compress = {
>+                               .algo = RTE_COMP_ALGO_DEFLATE,
>+                               .deflate.huffman = test_data->huffman_enc,
>+                               .level = level,
>+                               .window_size = test_data->window_sz,
>+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>+                       }
>+               };
>+               input_bufs = test_data->decomp_bufs;
>+               output_bufs = test_data->comp_bufs;
>+       } else {
>+               xform = (struct rte_comp_xform) {
>+                       .type = RTE_COMP_DECOMPRESS,
>+                       .decompress = {
>+                               .algo = RTE_COMP_ALGO_DEFLATE,
>+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>+                               .window_size = test_data->window_sz,
>+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>+                       }
>+               };
>+               input_bufs = test_data->comp_bufs;
>+               output_bufs = test_data->decomp_bufs;
>+       }
>+
>+       /* Create private xform */
>+       if (rte_compressdev_private_xform_create(dev_id, &xform,
>+                       &priv_xform) < 0) {
>+               RTE_LOG(ERR, USER1, "Private xform could not be created\n");
>+               res = -1;
>+               goto end;
>+       }
>+
>+       uint64_t tsc_start, tsc_end, tsc_duration;
>+
>+       tsc_start = tsc_end = tsc_duration = 0;
>+       if (benchmarking) {
>+               tsc_start = rte_rdtsc();
>+               num_iter = test_data->num_iter;
>+       } else
>+               num_iter = 1;
Looks like in the same code we're doing both benchmarking and functional validation. It can be reorganised to keep the validation test separate, as done in crypto_perf.

>+
>+       for (iter = 0; iter < num_iter; iter++) {
>+               uint32_t total_ops = test_data->total_bufs;
>+               uint32_t remaining_ops = test_data->total_bufs;
>+               uint32_t total_deq_ops = 0;
>+               uint32_t total_enq_ops = 0;
>+               uint16_t ops_unused = 0;
>+               uint16_t num_enq = 0;
>+               uint16_t num_deq = 0;
>+
>+               output_size = 0;
>+
>+               while (remaining_ops > 0) {
>+                       uint16_t num_ops = RTE_MIN(remaining_ops,
>+                                                  test_data->burst_sz);
>+                       uint16_t ops_needed = num_ops - ops_unused;
>+
>+                       /*
>+                        * Move the unused operations from the previous
>+                        * enqueue_burst call to the front, to maintain order
>+                        */
>+                       if ((ops_unused > 0) && (num_enq > 0)) {
>+                               size_t nb_b_to_mov =
>+                                     ops_unused * sizeof(struct rte_comp_op *);
>+
>+                               memmove(ops, &ops[num_enq], nb_b_to_mov);
>+                       }
>+
>+                       /* Allocate compression operations */
>+                       if (ops_needed && !rte_comp_op_bulk_alloc(
>+                                               test_data->op_pool,
>+                                               &ops[ops_unused],
>+                                               ops_needed)) {
>+                               RTE_LOG(ERR, USER1,
>+                                     "Could not allocate enough operations\n");
>+                               res = -1;
>+                               goto end;
>+                       }
>+                       allocated += ops_needed;
>+
>+                       for (i = 0; i < ops_needed; i++) {
>+                               /*
>+                                * Calculate next buffer to attach to operation
>+                                */
>+                               uint32_t buf_id = total_enq_ops + i +
>+                                               ops_unused;
>+                               uint16_t op_id = ops_unused + i;
>+                               /* Reset all data in output buffers */
>+                               struct rte_mbuf *m = output_bufs[buf_id];
>+
>+                               m->pkt_len = test_data->seg_sz * m->nb_segs;
Isn't pkt_len set already when we call rte_pktmbuf_append() and chain()?

>+                               while (m) {
>+                                       m->data_len = m->buf_len - m->data_off;
Same question, shouldn't rte_pktmbuf_append() adjust data_len as well per each mbuf?

>+                                       m = m->next;
>+                               }
>+                               ops[op_id]->m_src = input_bufs[buf_id];
>+                               ops[op_id]->m_dst = output_bufs[buf_id];
>+                               ops[op_id]->src.offset = 0;
>+                               ops[op_id]->src.length =
>+                                       rte_pktmbuf_pkt_len(input_bufs[buf_id]);
>+                               ops[op_id]->dst.offset = 0;
>+                               ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
>+                               ops[op_id]->input_chksum = buf_id;
>+                               ops[op_id]->private_xform = priv_xform;
>+                       }
>+
>+                       num_enq = rte_compressdev_enqueue_burst(dev_id, 0, ops,
>+                                                               num_ops);
>+                       ops_unused = num_ops - num_enq;
>+                       remaining_ops -= num_enq;
>+                       total_enq_ops += num_enq;
>+
>+                       num_deq = rte_compressdev_dequeue_burst(dev_id, 0,
>+                                                          deq_ops,
>+                                                          test_data->burst_sz);
>+                       total_deq_ops += num_deq;
>+                       if (benchmarking == 0) {
>+                               for (i = 0; i < num_deq; i++) {
>+                                       struct rte_comp_op *op = deq_ops[i];
>+                                       const void *read_data_addr =
>+                                               rte_pktmbuf_read(op->m_dst, 0,
>+                                               op->produced, output_data_ptr);
>+                                       if (read_data_addr == NULL) {
>+                                               RTE_LOG(ERR, USER1,
>+                                     "Could not copy buffer in destination\n");
>+                                               res = -1;
>+                                               goto end;
>+                                       }
>+
>+                                       if (read_data_addr != output_data_ptr)
>+                                               rte_memcpy(output_data_ptr,
>+                                                       rte_pktmbuf_mtod(
>+                                                         op->m_dst, uint8_t *),
>+                                                       op->produced);
>+                                       output_data_ptr += op->produced;
>+                                       output_size += op->produced;
>+
>+                               }
>+                       }
>+
>+                       if (iter == num_iter - 1) {
>+                               for (i = 0; i < num_deq; i++) {
Why is it only for the last iteration that we are adjusting the dst mbuf data_len?
Shouldn't it be done for each dequeued op?
And, for benchmarking, do we even need to set data and pkt len on the dst mbuf?

>+                                       struct rte_comp_op *op = deq_ops[i];
>+                                       struct rte_mbuf *m = op->m_dst;
>+
>+                                       m->pkt_len = op->produced;
>+                                       uint32_t remaining_data = op->produced;
>+                                       uint16_t data_to_append;
>+
>+                                       while (remaining_data > 0) {
>+                                               data_to_append =
>+                                                       RTE_MIN(remaining_data,
>+                                                            test_data->seg_sz);
>+                                               m->data_len = data_to_append;
>+                                               remaining_data -=
>+                                                               data_to_append;
>+                                               m = m->next;
Should break if m->next == NULL
>+                                       }
>+                               }
>+                       }
>+                       rte_mempool_put_bulk(test_data->op_pool,
>+                                            (void **)deq_ops, num_deq);
>+                       allocated -= num_deq;
>+               }
>+
>+               /* Dequeue the last operations */
>+               while (total_deq_ops < total_ops) {
>+                       num_deq = rte_compressdev_dequeue_burst(dev_id, 0,
>+                                               deq_ops, test_data->burst_sz);
>+                       total_deq_ops += num_deq;
>+                       if (benchmarking == 0) {
>+                               for (i = 0; i < num_deq; i++) {
>+                                       struct rte_comp_op *op = deq_ops[i];
>+                                       const void *read_data_addr =
>+                                               rte_pktmbuf_read(op->m_dst, 0,
>+                                               op->produced, output_data_ptr);
>+                                       if (read_data_addr == NULL) {
>+                                               RTE_LOG(ERR, USER1,
>+                                     "Could not copy buffer in destination\n");
>+                                               res = -1;
>+                                               goto end;
>+                                       }
>+
>+                                       if (read_data_addr != output_data_ptr)
>+                                               rte_memcpy(output_data_ptr,
>+                                                       rte_pktmbuf_mtod(
>+                                                       op->m_dst, uint8_t *),
>+                                                       op->produced);
>+                                       output_data_ptr += op->produced;
>+                                       output_size += op->produced;
>+
>+                               }
>+                       }
>+
>+                       if (iter == num_iter - 1) {
>+                               for (i = 0; i < num_deq; i++) {
>+                                       struct rte_comp_op *op = deq_ops[i];
>+                                       struct rte_mbuf *m = op->m_dst;
>+
>+                                       m->pkt_len = op->produced;
>+                                       uint32_t remaining_data = op->produced;
>+                                       uint16_t data_to_append;
>+
>+                                       while (remaining_data > 0) {
>+                                               data_to_append =
>+                                               RTE_MIN(remaining_data,
>+                                                       test_data->seg_sz);
>+                                               m->data_len = data_to_append;
>+                                               remaining_data -=
>+                                                               data_to_append;
>+                                               m = m->next;
>+                                       }
>+                               }
>+                       }
>+                       rte_mempool_put_bulk(test_data->op_pool,
>+                                            (void **)deq_ops, num_deq);
>+                       allocated -= num_deq;
>+               }
>+       }
>+
>+       if (benchmarking) {
>+               tsc_end = rte_rdtsc();
>+               tsc_duration = tsc_end - tsc_start;
>+
>+               if (type == RTE_COMP_COMPRESS)
test looks for stateless operations only, so can we add perf test type like: test type perf, op type:STATELESS/STATEFUL
Also, why do we need --max-num-sgl-segs as an input option from user? Shouldn't input_sz and seg_sz internally decide on num-segs?
Or is it added to serve some other different purpose?

Thanks
Shally

>+                       test_data->comp_tsc_duration[level] =
>+                                       tsc_duration / num_iter;
>+               else
>+                       test_data->decomp_tsc_duration[level] =
>+                                       tsc_duration / num_iter;
>+       }
>+
>+       if (benchmarking == 0 && output_data_sz)
>+               *output_data_sz = output_size;
>+end:
>+       rte_mempool_put_bulk(test_data->op_pool, (void **)ops, allocated);
>+       rte_compressdev_private_xform_free(dev_id, priv_xform);
>+       rte_free(ops);
>+       return res;
>+}
>+
> int
> main(int argc, char **argv)
> {
>+       uint8_t level, level_idx = 0;
>+       uint8_t i;
>        int ret;
>        struct comp_test_data *test_data;
>
>@@ -43,9 +751,145 @@ main(int argc, char **argv)
>                goto err;
>        }
>
>+       if (comp_perf_initialize_compressdev(test_data) < 0) {
>+               ret = EXIT_FAILURE;
>+               goto err;
>+       }
>+
>+       if (comp_perf_dump_input_data(test_data) < 0) {
>+               ret = EXIT_FAILURE;
>+               goto err;
>+       }
>+
>+       if (comp_perf_allocate_memory(test_data) < 0) {
>+               ret = EXIT_FAILURE;
>+               goto err;
>+       }
>+
>+       if (prepare_bufs(test_data) < 0) {
>+               ret = EXIT_FAILURE;
>+               goto err;
>+       }
>+
>+       if (test_data->level.inc != 0)
>+               level = test_data->level.min;
>+       else
>+               level = test_data->level.list[0];
>+
>+       size_t comp_data_sz;
>+       size_t decomp_data_sz;
>+
>+       printf("Burst size = %u\n", test_data->burst_sz);
>+       printf("File size = %zu\n", test_data->input_data_sz);
>+
>+       printf("%6s%12s%17s%19s%21s%15s%21s%23s%16s\n",
>+               "Level", "Comp size", "Comp ratio [%]",
>+               "Comp [Cycles/it]", "Comp [Cycles/Byte]", "Comp [Gbps]",
>+               "Decomp [Cycles/it]", "Decomp [Cycles/Byte]", "Decomp [Gbps]");
>+
>+       while (level <= test_data->level.max) {
>+               /*
>+                * Run a first iteration, to verify compression and
>+                * get the compression ratio for the level
>+                */
>+               if (main_loop(test_data, level, RTE_COMP_COMPRESS,
>+                             test_data->compressed_data,
>+                             &comp_data_sz, 0) < 0) {
>+                       ret = EXIT_FAILURE;
>+                       goto err;
>+               }
>+
>+               if (main_loop(test_data, level, RTE_COMP_DECOMPRESS,
>+                             test_data->decompressed_data,
>+                             &decomp_data_sz, 0) < 0) {
>+                       ret = EXIT_FAILURE;
>+                       goto err;
>+               }
>+
>+               if (decomp_data_sz != test_data->input_data_sz) {
>+                       RTE_LOG(ERR, USER1,
>+                  "Decompressed data length not equal to input data length\n");
>+                       RTE_LOG(ERR, USER1,
>+                               "Decompressed size = %zu, expected = %zu\n",
>+                               decomp_data_sz, test_data->input_data_sz);
>+                       ret = EXIT_FAILURE;
>+                       goto err;
>+               } else {
>+                       if (memcmp(test_data->decompressed_data,
>+                                       test_data->input_data,
>+                                       test_data->input_data_sz) != 0) {
>+                               RTE_LOG(ERR, USER1,
>+                           "Decompressed data is not the same as file data\n");
>+                               ret = EXIT_FAILURE;
>+                               goto err;
>+                       }
>+               }
>+
>+               double ratio = (double) comp_data_sz /
>+                                               test_data->input_data_sz * 100;
>+
>+               /*
>+                * Run the tests twice, discarding the first performance
>+                * results, before the cache is warmed up
>+                */
>+               for (i = 0; i < 2; i++) {
>+                       if (main_loop(test_data, level, RTE_COMP_COMPRESS,
>+                                       NULL, NULL, 1) < 0) {
>+                               ret = EXIT_FAILURE;
>+                               goto err;
>+                       }
>+               }
>+
>+               for (i = 0; i < 2; i++) {
>+                       if (main_loop(test_data, level, RTE_COMP_DECOMPRESS,
>+                                       NULL, NULL, 1) < 0) {
>+                               ret = EXIT_FAILURE;
>+                               goto err;
>+                       }
>+               }
>+
>+               uint64_t comp_tsc_duration =
>+                               test_data->comp_tsc_duration[level];
>+               double comp_tsc_byte = (double)comp_tsc_duration /
>+                                               test_data->input_data_sz;
>+               double comp_gbps = rte_get_tsc_hz() / comp_tsc_byte * 8 /
>+                               1000000000;
>+               uint64_t decomp_tsc_duration =
>+                               test_data->decomp_tsc_duration[level];
>+               double decomp_tsc_byte = (double)decomp_tsc_duration /
>+                                               test_data->input_data_sz;
>+               double decomp_gbps = rte_get_tsc_hz() / decomp_tsc_byte * 8 /
>+                               1000000000;
>+
>+               printf("%6u%12zu%17.2f%19"PRIu64"%21.2f"
>+                                       "%15.2f%21"PRIu64"%23.2f%16.2f\n",
>+                      level, comp_data_sz, ratio, comp_tsc_duration,
>+                      comp_tsc_byte, comp_gbps, decomp_tsc_duration,
>+                      decomp_tsc_byte, decomp_gbps);
>+
>+               if (test_data->level.inc != 0)
>+                       level += test_data->level.inc;
>+               else {
>+                       if (++level_idx == test_data->level.count)
>+                               break;
>+                       level = test_data->level.list[level_idx];
>+               }
>+       }
>+
>        ret = EXIT_SUCCESS;
>
> err:
>+       if (test_data->cdev_id != -1)
>+               rte_compressdev_stop(test_data->cdev_id);
>+
>+       free_bufs(test_data);
>+       rte_free(test_data->compressed_data);
>+       rte_free(test_data->decompressed_data);
>+       rte_free(test_data->input_data);
>+       rte_mempool_free(test_data->comp_buf_pool);
>+       rte_mempool_free(test_data->decomp_buf_pool);
>+       rte_mempool_free(test_data->op_pool);
>+
>        rte_free(test_data);
>
>        return ret;
>--
>2.7.4
  
Daly, Lee Oct. 15, 2018, 3:10 p.m. UTC | #2
Thanks for your input, Shally — see comments below.


I will be reviewing these changes while Tomasz is out this week.

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma, Shally
> Sent: Friday, October 12, 2018 11:16 AM
> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>
> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> HI TomaszX
> 
> Sorry for delay in response. Comments inline.
> 

<...>
> >+static int
> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
> >+       const struct rte_compressdev_capabilities *cap;
> >+
> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
> >+                                            RTE_COMP_ALGO_DEFLATE);
> >+
> >+       if (cap == NULL) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Compress device does not support DEFLATE\n");
> >+               return -1;
> >+       }
> >+
> >+       uint64_t comp_flags = cap->comp_feature_flags;
> >+
> >+       /* Huffman enconding */
> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Compress device does not supported Fixed Huffman\n");
> >+               return -1;
> >+       }
> >+
> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC &&
> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Compress device does not supported Dynamic Huffman\n");
> >+               return -1;
> >+       }
> >+
> >+       /* Window size */
> >+       if (test_data->window_sz != -1) {
> >+               if (param_range_check(test_data->window_sz,
> >+ &cap->window_size)
> What if cap->window_size is 0 i.e. implementation default?
What do you mean when you say cap->window_size = 0?
Cap->window_size is the range structure here, min, max and increment, which are filled out by the driver.
Our implementation default in the perf tool will set the window size to max the driver can support.

> 
> >+                               < 0) {
> >+                       RTE_LOG(ERR, USER1,
> >+                               "Compress device does not support "
> >+                               "this window size\n");
> >+                       return -1;
> >+               }
> >+       } else
> >+               /* Set window size to PMD maximum if none was specified */
> >+               test_data->window_sz = cap->window_size.max;
> >+

<...>
> >+
> >+static int
> >+comp_perf_dump_input_data(struct comp_test_data *test_data) {
> >+       FILE *f = fopen(test_data->input_file, "r");
> >+
> >+       if (f == NULL) {
> >+               RTE_LOG(ERR, USER1, "Input file could not be opened\n");
> >+               return -1;
> >+       }
> >+
> >+       if (fseek(f, 0, SEEK_END) != 0) {
> >+               RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
> >+               goto err;
> >+       }
> >+       size_t actual_file_sz = ftell(f);
> >+       /* If extended input data size has not been set,
> >+        * input data size = file size
> >+        */
> >+
> >+       if (test_data->input_data_sz == 0)
> >+               test_data->input_data_sz = actual_file_sz;
> >+
> >+       if (fseek(f, 0, SEEK_SET) != 0) {
> >+               RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
> >+               goto err;
> >+       }
> >+
> >+       test_data->input_data = rte_zmalloc_socket(NULL,
> >+                               test_data->input_data_sz, 0,
> >+ rte_socket_id());
> >+
> >+       if (test_data->input_data == NULL) {
> >+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
> >+                               "file could not be allocated\n");
> >+               goto err;
> >+       }
> >+
> >+       size_t remaining_data = test_data->input_data_sz;
> >+       uint8_t *data = test_data->input_data;
> >+
> >+       while (remaining_data > 0) {
> >+               size_t data_to_read = RTE_MIN(remaining_data,
> >+ actual_file_sz);
> >+
> >+               if (fread(data, data_to_read, 1, f) != 1) {
> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
> >+                       goto err;
> >+               }
> >+               if (fseek(f, 0, SEEK_SET) != 0) {
> >+                       RTE_LOG(ERR, USER1,
> >+                               "Size of input could not be calculated\n");
> >+                       goto err;
> >+               }
> >+               remaining_data -= data_to_read;
> >+               data += data_to_read;
> It looks like it will run 2nd time only if input file size < input data size in which
> case it will just keep filling input buffer with repeated data.
> Is that the intention here?
From what I can see, yes, this will only enter this while loop a second time if the file is smaller than the data_size requested.
Repeating the data from your input file as much as requested. 
If we were to pad with 0's or random data it would skew the ratio a lot.
Even though I do understand the ratio may be better here in this case as well, due to the repetition of data.

> 
> >+       }
> >+
> >+       if (test_data->input_data_sz > actual_file_sz)
> >+               RTE_LOG(INFO, USER1,
> >+                 "%zu bytes read from file %s, extending the file %.2f times\n",
> >+                       test_data->input_data_sz, test_data->input_file,
> >+                       (double)test_data->input_data_sz/actual_file_sz);
> >+       else
> >+               RTE_LOG(INFO, USER1,
> >+                       "%zu bytes read from file %s\n",
> >+                       test_data->input_data_sz,
> >+ test_data->input_file);
> >+
> >+       fclose(f);
> >+
> >+       return 0;
> >+
> >+err:
> >+       fclose(f);
> >+       rte_free(test_data->input_data);
> >+       test_data->input_data = NULL;
> >+
> >+       return -1;
> >+}
> >+
> >+static int
> >+comp_perf_initialize_compressdev(struct comp_test_data *test_data) {
> >+       uint8_t enabled_cdev_count;
> >+       uint8_t enabled_cdevs[RTE_COMPRESS_MAX_DEVS];
> >+
> >+       enabled_cdev_count = rte_compressdev_devices_get(test_data-
> >driver_name,
> >+                       enabled_cdevs, RTE_COMPRESS_MAX_DEVS);
> >+       if (enabled_cdev_count == 0) {
> >+               RTE_LOG(ERR, USER1, "No compress devices type %s available\n",
> >+                               test_data->driver_name);
> >+               return -EINVAL;
> >+       }
> >+
> >+       if (enabled_cdev_count > 1)
> >+               RTE_LOG(INFO, USER1,
> >+                       "Only the first compress device will be
> >+ used\n");
> >+
> >+       test_data->cdev_id = enabled_cdevs[0];
> >+
> >+       if (comp_perf_check_capabilities(test_data) < 0)
> >+               return -1;
> >+
> >+       /* Configure compressdev (one device, one queue pair) */
> >+       struct rte_compressdev_config config = {
> >+               .socket_id = rte_socket_id(),
> >+               .nb_queue_pairs = 1,
> >+               .max_nb_priv_xforms = NUM_MAX_XFORMS,
> >+               .max_nb_streams = 0
> >+       };
> >+
> >+       if (rte_compressdev_configure(test_data->cdev_id, &config) < 0) {
> >+               RTE_LOG(ERR, USER1, "Device configuration failed\n");
> >+               return -1;
> >+       }
> >+
> >+       if (rte_compressdev_queue_pair_setup(test_data->cdev_id, 0,
> >+                       NUM_MAX_INFLIGHT_OPS, rte_socket_id()) < 0) {
> >+               RTE_LOG(ERR, USER1, "Queue pair setup failed\n");
> >+               return -1;
> >+       }
> >+
> >+       if (rte_compressdev_start(test_data->cdev_id) < 0) {
> >+               RTE_LOG(ERR, USER1, "Device could not be started\n");
> >+               return -1;
> >+       }
> >+
> >+       return 0;
> >+}
> >+
> >+static int
> >+prepare_bufs(struct comp_test_data *test_data) {
> >+       uint32_t remaining_data = test_data->input_data_sz;
> >+       uint8_t *input_data_ptr = test_data->input_data;
> >+       size_t data_sz;
> >+       uint8_t *data_addr;
> >+       uint32_t i, j;
> >+
> >+       for (i = 0; i < test_data->total_bufs; i++) {
> >+               /* Allocate data in input mbuf and copy data from input file */
> >+               test_data->decomp_bufs[i] =
> >+                       rte_pktmbuf_alloc(test_data->decomp_buf_pool);
> >+               if (test_data->decomp_bufs[i] == NULL) {
> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
> >+                       return -1;
> >+               }
> >+
> >+               data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
> >+                                       test_data->decomp_bufs[i], data_sz);
> >+               if (data_addr == NULL) {
> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
> >+                       return -1;
> >+               }
> >+               rte_memcpy(data_addr, input_data_ptr, data_sz);
> >+
> >+               input_data_ptr += data_sz;
> >+               remaining_data -= data_sz;
> >+
> >+               /* Already one segment in the mbuf */
> >+               uint16_t segs_per_mbuf = 1;
> >+
> >+               /* Chain mbufs if needed for input mbufs */
> >+               while (segs_per_mbuf < test_data->max_sgl_segs
> >+                               && remaining_data > 0) {
> >+                       struct rte_mbuf *next_seg =
> >+
> >+ rte_pktmbuf_alloc(test_data->decomp_buf_pool);
> >+
> >+                       if (next_seg == NULL) {
> >+                               RTE_LOG(ERR, USER1,
> >+                                       "Could not allocate mbuf\n");
> >+                               return -1;
> >+                       }
> >+
> >+                       data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
> >+                               data_sz);
> >+
> >+                       if (data_addr == NULL) {
> >+                               RTE_LOG(ERR, USER1, "Could not append
> >+ data\n");
> Since a new buffer per segment is allocated, so is it possible for append to
> fail? think, this check is redundant here.

True.

> >+                               return -1;
> >+                       }
> >+
> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
> >+                       input_data_ptr += data_sz;
> >+                       remaining_data -= data_sz;
> >+
> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
> >+                                       next_seg) < 0) {
> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >+                               return -1;
> >+                       }
> >+                       segs_per_mbuf++;
> >+               }
> >+

<...>
> >+
> >+       /* Create private xform */
> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
> >+                       &priv_xform) < 0) {
> >+               RTE_LOG(ERR, USER1, "Private xform could not be created\n");
> >+               res = -1;
> >+               goto end;
> >+       }
> >+
> >+       uint64_t tsc_start, tsc_end, tsc_duration;
> >+
> >+       tsc_start = tsc_end = tsc_duration = 0;
> >+       if (benchmarking) {
> >+               tsc_start = rte_rdtsc();
> >+               num_iter = test_data->num_iter;
> >+       } else
> >+               num_iter = 1;
> Looks like in same code we're doing benchmarking and functional validation.
> It can be reorganised to keep validation test separately like done in
> crypto_perf.
Good point, will keep that in mind when doing these changes.

> 
> >+
> >+       for (iter = 0; iter < num_iter; iter++) {
> >+               uint32_t total_ops = test_data->total_bufs;
> >+               uint32_t remaining_ops = test_data->total_bufs;
> >+               uint32_t total_deq_ops = 0;
> >+               uint32_t total_enq_ops = 0;
> >+               uint16_t ops_unused = 0;
> >+               uint16_t num_enq = 0;
> >+               uint16_t num_deq = 0;
> >+
> >+               output_size = 0;
> >+
> >+               while (remaining_ops > 0) {
> >+                       uint16_t num_ops = RTE_MIN(remaining_ops,
> >+                                                  test_data->burst_sz);
> >+                       uint16_t ops_needed = num_ops - ops_unused;
> >+
> >+                       /*
> >+                        * Move the unused operations from the previous
> >+                        * enqueue_burst call to the front, to maintain order
> >+                        */
> >+                       if ((ops_unused > 0) && (num_enq > 0)) {
> >+                               size_t nb_b_to_mov =
> >+                                     ops_unused * sizeof(struct
> >+ rte_comp_op *);
> >+
> >+                               memmove(ops, &ops[num_enq], nb_b_to_mov);
> >+                       }
> >+
> >+                       /* Allocate compression operations */
> >+                       if (ops_needed && !rte_comp_op_bulk_alloc(
> >+                                               test_data->op_pool,
> >+                                               &ops[ops_unused],
> >+                                               ops_needed)) {
> >+                               RTE_LOG(ERR, USER1,
> >+                                     "Could not allocate enough operations\n");
> >+                               res = -1;
> >+                               goto end;
> >+                       }
> >+                       allocated += ops_needed;
> >+
> >+                       for (i = 0; i < ops_needed; i++) {
> >+                               /*
> >+                                * Calculate next buffer to attach to operation
> >+                                */
> >+                               uint32_t buf_id = total_enq_ops + i +
> >+                                               ops_unused;
> >+                               uint16_t op_id = ops_unused + i;
> >+                               /* Reset all data in output buffers */
> >+                               struct rte_mbuf *m =
> >+ output_bufs[buf_id];
> >+
> >+                               m->pkt_len = test_data->seg_sz *
> >+ m->nb_segs;
> Isn't pkt_len set already when we call rte_pktmbuf_append() and chain()?
> 
> >+                               while (m) {
> >+                                       m->data_len = m->buf_len -
> >+ m->data_off;
> Same question, shouldn't rte_pktmbuf_append() adjust data_len as well per
> each mbuf?
Yes, you are correct.
From what I can see, the *m mbuf pointer is redundant. 

> 
> >+                                       m = m->next;
> >+                               }
> >+                               ops[op_id]->m_src = input_bufs[buf_id];
> >+                               ops[op_id]->m_dst = output_bufs[buf_id];
> >+                               ops[op_id]->src.offset = 0;
> >+                               ops[op_id]->src.length =
> >+                                       rte_pktmbuf_pkt_len(input_bufs[buf_id]);
> >+                               ops[op_id]->dst.offset = 0;
> >+                               ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
> >+                               ops[op_id]->input_chksum = buf_id;
> >+                               ops[op_id]->private_xform = priv_xform;
> >+                       }
> >+
> >+                       num_enq = rte_compressdev_enqueue_burst(dev_id, 0, ops,
> >+                                                               num_ops);
> >+                       ops_unused = num_ops - num_enq;
> >+                       remaining_ops -= num_enq;
> >+                       total_enq_ops += num_enq;
> >+
> >+                       num_deq = rte_compressdev_dequeue_burst(dev_id, 0,
> >+                                                          deq_ops,
> >+                                                          test_data->burst_sz);
> >+                       total_deq_ops += num_deq;
> >+                       if (benchmarking == 0) {
> >+                               for (i = 0; i < num_deq; i++) {
> >+                                       struct rte_comp_op *op = deq_ops[i];
> >+                                       const void *read_data_addr =
> >+                                               rte_pktmbuf_read(op->m_dst, 0,
> >+                                               op->produced, output_data_ptr);
> >+                                       if (read_data_addr == NULL) {
> >+                                               RTE_LOG(ERR, USER1,
> >+                                     "Could not copy buffer in destination\n");
> >+                                               res = -1;
> >+                                               goto end;
> >+                                       }
> >+
> >+                                       if (read_data_addr != output_data_ptr)
> >+                                               rte_memcpy(output_data_ptr,
> >+                                                       rte_pktmbuf_mtod(
> >+                                                         op->m_dst, uint8_t *),
> >+                                                       op->produced);
> >+                                       output_data_ptr += op->produced;
> >+                                       output_size += op->produced;
> >+
> >+                               }
> >+                       }
> >+
> >+                       if (iter == num_iter - 1) {
> >+                               for (i = 0; i < num_deq; i++) {
> Why is it only for last iteration, we are adjusting dst mbuf data_len.?
> Shouldn't it be done for each dequeued op?
> And, for benchmarking, do we even need to set data and pkt len on dst
> mbuf?
I assume the data_len is only changed on the last iteration for the reason you gave: benchmarking, i.e. to save cycles.
Does it need to be changed at all? Possibly not. 
> 
> >+                                       struct rte_comp_op *op = deq_ops[i];
> >+                                       struct rte_mbuf *m = op->m_dst;
> >+
> >+                                       m->pkt_len = op->produced;
> >+                                       uint32_t remaining_data = op->produced;
> >+                                       uint16_t data_to_append;
> >+
> >+                                       while (remaining_data > 0) {
> >+                                               data_to_append =
> >+                                                       RTE_MIN(remaining_data,
> >+                                                            test_data->seg_sz);
> >+                                               m->data_len = data_to_append;
> >+                                               remaining_data -=
> >+                                                               data_to_append;
> >+                                               m = m->next;
> Should break if m->next == NULL
Yup, you are correct — there should be a check here.
> >+                                       }
> >+                               }
> >+                       }
> >+                       rte_mempool_put_bulk(test_data->op_pool,
> >+                                            (void **)deq_ops, num_deq);
> >+                       allocated -= num_deq;
> >+               }
> >+
> >+               /* Dequeue the last operations */
> >+               while (total_deq_ops < total_ops) {
> >+                       num_deq = rte_compressdev_dequeue_burst(dev_id, 0,
> >+                                               deq_ops, test_data->burst_sz);
> >+                       total_deq_ops += num_deq;
> >+                       if (benchmarking == 0) {
> >+                               for (i = 0; i < num_deq; i++) {
> >+                                       struct rte_comp_op *op = deq_ops[i];
> >+                                       const void *read_data_addr =
> >+                                               rte_pktmbuf_read(op->m_dst, 0,
> >+                                               op->produced, output_data_ptr);
> >+                                       if (read_data_addr == NULL) {
> >+                                               RTE_LOG(ERR, USER1,
> >+                                     "Could not copy buffer in destination\n");
> >+                                               res = -1;
> >+                                               goto end;
> >+                                       }
> >+
> >+                                       if (read_data_addr != output_data_ptr)
> >+                                               rte_memcpy(output_data_ptr,
> >+                                                       rte_pktmbuf_mtod(
> >+                                                       op->m_dst, uint8_t *),
> >+                                                       op->produced);
> >+                                       output_data_ptr += op->produced;
> >+                                       output_size += op->produced;
> >+
> >+                               }
> >+                       }
> >+
> >+                       if (iter == num_iter - 1) {
> >+                               for (i = 0; i < num_deq; i++) {
> >+                                       struct rte_comp_op *op = deq_ops[i];
> >+                                       struct rte_mbuf *m = op->m_dst;
> >+
> >+                                       m->pkt_len = op->produced;
> >+                                       uint32_t remaining_data = op->produced;
> >+                                       uint16_t data_to_append;
> >+
> >+                                       while (remaining_data > 0) {
> >+                                               data_to_append =
> >+                                               RTE_MIN(remaining_data,
> >+                                                       test_data->seg_sz);
> >+                                               m->data_len = data_to_append;
> >+                                               remaining_data -=
> >+                                                               data_to_append;
> >+                                               m = m->next;
> >+                                       }
> >+                               }
> >+                       }
> >+                       rte_mempool_put_bulk(test_data->op_pool,
> >+                                            (void **)deq_ops, num_deq);
> >+                       allocated -= num_deq;
> >+               }
> >+       }
> >+
> >+       if (benchmarking) {
> >+               tsc_end = rte_rdtsc();
> >+               tsc_duration = tsc_end - tsc_start;
> >+
> >+               if (type == RTE_COMP_COMPRESS)
> test looks for stateless operations only, so can we add perf test type like: test
> type perf, op type:STATELESS/STATEFUL 
Are you asking for the tool to support stateful ops? Since no drivers support stateful yet,
we just wanted to ensure current driver functionality was covered in this first version.

>Also, why do we need --max-num-
> sgl-segs as an input option from user? Shouldn't input_sz and seg_sz
> internally decide on num-segs?
> Or is it added to serve some other different purpose?
I will have to get back to you on this one; it seems illogical to get this input from the user,
but I will do further investigation to find out whether there was a different purpose. 
> 
> Thanks
> Shally
> 
Thanks for the feedback — 
we hope to get v2 sent ASAP.
  
Verma, Shally Oct. 16, 2018, 5:18 a.m. UTC | #3
>-----Original Message-----
>From: Daly, Lee <lee.daly@intel.com>
>Sent: 15 October 2018 20:40
>To: Verma, Shally <Shally.Verma@cavium.com>
>Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Thanks for your input Shally see comments below.
>
>
>I will be reviewing these changes while Tomasz is out this week.
>
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma, Shally
>> Sent: Friday, October 12, 2018 11:16 AM
>> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
>> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
>> <pablo.de.lara.guarch@intel.com>
>> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> measurement
>>
>> HI TomaszX
>>
>> Sorry for delay in response. Comments inline.
>>
>
><...>
>> >+static int
>> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
>> >+       const struct rte_compressdev_capabilities *cap;
>> >+
>> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
>> >+                                            RTE_COMP_ALGO_DEFLATE);
>> >+
>> >+       if (cap == NULL) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Compress device does not support DEFLATE\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       uint64_t comp_flags = cap->comp_feature_flags;
>> >+
>> >+       /* Huffman enconding */
>> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
>> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Compress device does not supported Fixed Huffman\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC &&
>> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Compress device does not supported Dynamic Huffman\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       /* Window size */
>> >+       if (test_data->window_sz != -1) {
>> >+               if (param_range_check(test_data->window_sz,
>> >+ &cap->window_size)
>> What if cap->window_size is 0 i.e. implementation default?
>What do you mean when you say cap->window_size = 0?
>Cap->window_size is the range structure here, min, max and increment, which are filled out by the driver.
>Our implementation default in the perf tool will set the window size to max the driver can support.
If I recall correctly (and I am not mixing up my memories), I believe we added a condition in the lib where a driver can set window size min = 0 or max = 0 just to mark the implementation default. If that is not yet supported in the lib, then you can ignore this comment.
>
...

>> It looks like it will run 2nd time only if input file size < input data size in which
>> case it will just keep filling input buffer with repeated data.
>> Is that the intention here?
>From what I can see, yes, this will only enter this while loop a second time if the file is smaller than the data_size requested.
>Repeating the data from your input file as much as requested.
>If we were to pad with 0's or random data it would skew the ratio a lot.
>Even though I do understand the ratio may be better here in this case as well, due to the repetition of data.
>
Yea. So, to avoid influencing the benchmark data here, I think we should stick to the input file size the user is giving. Since performance
at a particular level will vary by content type, let's have the app choose and find out the performance for a given content type.

>>
...

>> >+       if (benchmarking) {
>> >+               tsc_end = rte_rdtsc();
>> >+               tsc_duration = tsc_end - tsc_start;
>> >+
>> >+               if (type == RTE_COMP_COMPRESS)
>> test looks for stateless operations only, so can we add perf test type like: test
>> type perf, op type:STATELESS/STATEFUL
>Are you asking for the tool to support stateful ops? Since no drivers support stateful yet
>We just wanted to ensure current driver functionality was covered with this first version.
Since it's an app, it should be generic enough to be extensible for stateful benchmarking.
So, my suggestion would be to either name the app test_comp_benchmark_stateless or make it generic enough to handle both.

Thanks
Shally
>
>>Also, why do we need --max-num-
>> sgl-segs as an input option from user? Shouldn't input_sz and seg_sz
>> internally decide on num-segs?
>> Or is it added to serve some other different purpose?
>Will have to get back to you on this one, seems illogical to get this input from user,
>But I will have to do further investigation to find if there was a different purpose.
>>
>> Thanks
>> Shally
>>
>Thanks for the feedback,
>We hope to get V2 sent asap.
>
  
Fiona Trahe Oct. 17, 2018, 2:33 p.m. UTC | #4
Hi Shally, Lee,

> -----Original Message-----
> From: Daly, Lee
> Sent: Monday, October 15, 2018 8:10 AM
> To: Verma, Shally <Shally.Verma@cavium.com>
> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe, Fiona
> <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
> 
> Thanks for your input Shally see comments below.
> 
> 
> I will be reviewing these changes while Tomasz is out this week.
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma, Shally
> > Sent: Friday, October 12, 2018 11:16 AM
> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> > Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
> > <pablo.de.lara.guarch@intel.com>
> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> > measurement
> >
///

> >Also, why do we need --max-num-
> > sgl-segs as an input option from user? Shouldn't input_sz and seg_sz
> > internally decide on num-segs?
> > Or is it added to serve some other different purpose?
> Will have to get back to you on this one, seems illogical to get this input from user,
> But I will have to do further investigation to find if there was a different purpose.

[Fiona] Some PMDs have a limit on how many links can be in an sgl chain, e.g. in QAT case the 
PMD allocates a pool of internal structures of a suitable size during device initialisation, this is not 
a hard limit but can be configured in .config to give the user control over the memory resources allocated.
This perf-tool max-num-sgl-segs is so the user can pick a value <= whatever the PMD's max is.
  
Verma, Shally Oct. 17, 2018, 3:42 p.m. UTC | #5
>-----Original Message-----
>From: Trahe, Fiona <fiona.trahe@intel.com>
>Sent: 17 October 2018 20:04
>To: Daly, Lee <lee.daly@intel.com>; Verma, Shally <Shally.Verma@cavium.com>
>Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com; Trahe, Fiona <fiona.trahe@intel.com>
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Hi Shally, Lee,
>
>> -----Original Message-----
>> From: Daly, Lee
>> Sent: Monday, October 15, 2018 8:10 AM
>> To: Verma, Shally <Shally.Verma@cavium.com>
>> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe, Fiona
>> <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>>
>> Thanks for your input Shally see comments below.
>>
>>
>> I will be reviewing these changes while Tomasz is out this week.
>>
>> > -----Original Message-----
>> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma, Shally
>> > Sent: Friday, October 12, 2018 11:16 AM
>> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
>> > Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
>> > <pablo.de.lara.guarch@intel.com>
>> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> > measurement
>> >
>///
>
>> >Also, why do we need --max-num-
>> > sgl-segs as an input option from user? Shouldn't input_sz and seg_sz
>> > internally decide on num-segs?
>> > Or is it added to serve some other different purpose?
>> Will have to get back to you on this one, seems illogical to get this input from user,
>> But I will have to do further investigation to find if there was a different purpose.
>
>[Fiona] Some PMDs have a limit on how many links can be in an sgl chain, e.g. in QAT case the
>PMD allocates a pool of internal structures of a suitable size during device initialisation, this is not
>a hard limit but can be configured in .config to give the user control over the memory resources allocated.
>This perf-tool max-num-sgl-segs is so the user can pick a value <= whatever the PMD's max is.

Then again, I believe this could be taken care of internally by the app.
The app can choose a convenient number of SGL segments as per the PMD capability and the input size and chunk size selected by the user.
Just my thoughts.
  
Fiona Trahe Oct. 17, 2018, 4:45 p.m. UTC | #6
> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Wednesday, October 17, 2018 8:43 AM
> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
> 
> 
> 
> >-----Original Message-----
> >From: Trahe, Fiona <fiona.trahe@intel.com>
> >Sent: 17 October 2018 20:04
> >To: Daly, Lee <lee.daly@intel.com>; Verma, Shally <Shally.Verma@cavium.com>
> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com; Trahe, Fiona
> <fiona.trahe@intel.com>
> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
> >
> >External Email
> >
> >Hi Shally, Lee,
> >
> >> -----Original Message-----
> >> From: Daly, Lee
> >> Sent: Monday, October 15, 2018 8:10 AM
> >> To: Verma, Shally <Shally.Verma@cavium.com>
> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe, Fiona
> >> <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
> >>
> >> Thanks for your input Shally see comments below.
> >>
> >>
> >> I will be reviewing these changes while Tomasz is out this week.
> >>
> >> > -----Original Message-----
> >> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma, Shally
> >> > Sent: Friday, October 12, 2018 11:16 AM
> >> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> >> > Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
> >> > <pablo.de.lara.guarch@intel.com>
> >> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >> > measurement
> >> >
> >///
> >
> >> >Also, why do we need --max-num-
> >> > sgl-segs as an input option from user? Shouldn't input_sz and seg_sz
> >> > internally decide on num-segs?
> >> > Or is it added to serve some other different purpose?
> >> Will have to get back to you on this one, seems illogical to get this input from user,
> >> But I will have to do further investigation to find if there was a different purpose.
> >
> >[Fiona] Some PMDs have a limit on how many links can be in an sgl chain, e.g. in QAT case the
> >PMD allocates a pool of internal structures of a suitable size during device initialisation, this is not
> >a hard limit but can be configured in .config to give the user control over the memory resources allocated.
> >This perf-tool max-num-sgl-segs is so the user can pick a value <= whatever the PMD's max is.
> 
> Then also, I believe this could be taken care internally by an app.
> App can choose convenient number of sgl segs as per PMD capability and input sz and chunk sz selected by
> user.
> Just my thoughts.
[Fiona] Then we'd need to add this capability to the API, e.g. add uint16_t max_nb_segments_per_sgl
into the rte_compressdev_info struct.
Special case 0 means no limit.
We did consider this before, I can't remember why we didn't do it, I think it's needed.
I'll push an API patch for this in 19.02 and we can remove the --max-num-sgl-segs param from 
the performance tool and hardcode it in the tool in the meantime.
Ok?
  
Verma, Shally Oct. 17, 2018, 4:47 p.m. UTC | #7
>-----Original Message-----
>From: Trahe, Fiona <fiona.trahe@intel.com>
>Sent: 17 October 2018 22:15
>To: Verma, Shally <Shally.Verma@cavium.com>; Daly, Lee <lee.daly@intel.com>
>Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com; Trahe, Fiona <fiona.trahe@intel.com>
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>> -----Original Message-----
>> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> Sent: Wednesday, October 17, 2018 8:43 AM
>> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
>> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com
>> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>>
>>
>>
>> >-----Original Message-----
>> >From: Trahe, Fiona <fiona.trahe@intel.com>
>> >Sent: 17 October 2018 20:04
>> >To: Daly, Lee <lee.daly@intel.com>; Verma, Shally <Shally.Verma@cavium.com>
>> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com; Trahe, Fiona
>> <fiona.trahe@intel.com>
>> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>> >
>> >External Email
>> >
>> >Hi Shally, Lee,
>> >
>> >> -----Original Message-----
>> >> From: Daly, Lee
>> >> Sent: Monday, October 15, 2018 8:10 AM
>> >> To: Verma, Shally <Shally.Verma@cavium.com>
>> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe, Fiona
>> >> <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>> >>
>> >> Thanks for your input Shally see comments below.
>> >>
>> >>
>> >> I will be reviewing these changes while Tomasz is out this week.
>> >>
>> >> > -----Original Message-----
>> >> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma, Shally
>> >> > Sent: Friday, October 12, 2018 11:16 AM
>> >> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
>> >> > Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
>> >> > <pablo.de.lara.guarch@intel.com>
>> >> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> >> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> >> > measurement
>> >> >
>> >///
>> >
>> >> >Also, why do we need --max-num-
>> >> > sgl-segs as an input option from user? Shouldn't input_sz and seg_sz
>> >> > internally decide on num-segs?
>> >> > Or is it added to serve some other different purpose?
>> >> Will have to get back to you on this one, seems illogical to get this input from user,
>> >> But I will have to do further investigation to find if there was a different purpose.
>> >
>> >[Fiona] Some PMDs have a limit on how many links can be in an sgl chain, e.g. in QAT case the
>> >PMD allocates a pool of internal structures of a suitable size during device initialisation, this is not
>> >a hard limit but can be configured in .config to give the user control over the memory resources allocated.
>> >This perf-tool max-num-sgl-segs is so the user can pick a value <= whatever the PMD's max is.
>>
>> Then also, I believe this could be taken care internally by an app.
>> App can choose convenient number of sgl segs as per PMD capability and input sz and chunk sz selected by
>> user.
>> Just my thoughts.
>[Fiona] Then we'd need to add this capability to the API, e.g. add uint16_t max_nb_segments_per_sgl
>into the rte_compressdev_info struct.
>Special case 0 means no limit.
>We did consider this before, I can't remember why we didn't do it, I think it's needed.
>I'll push an API patch for this in 19.02 and we can remove the --max-num-sgl-segs param from
>the performance tool and hardcode it in the tool in the meantime.
>Ok?
Yea. Sounds better.
  
Tomasz Jozwiak Nov. 2, 2018, 9:59 a.m. UTC | #8
Hi Shally,

Sorry for delay - I was on sick leave.
We had some issues with the dynamic compression test, so I have blocked this test in V2. Maybe it's too late to add this into this release, but we've decided to send this V2 to DPDK anyway.

My comment inline (not all have answer so far, still working on that)

> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Friday, October 12, 2018 12:16 PM
> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>
> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> HI TomaszX
> 
> Sorry for delay in response. Comments inline.
> 
> >-----Original Message-----
> >From: dev <dev-bounces@dpdk.org> On Behalf Of Tomasz Jozwiak
> >Sent: 01 October 2018 18:57
> >To: dev@dpdk.org; fiona.trahe@intel.com; tomaszx.jozwiak@intel.com;
> >akhil.goyal@nxp.com; pablo.de.lara.guarch@intel.com
> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >Subject: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >measurement
> >
> >External Email
> >
> >Added performance measurement part into compression perf. test.
> >
> >Signed-off-by: De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
> >---
> > app/test-compress-perf/main.c | 844
> >++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 844 insertions(+)
> >
> >diff --git a/app/test-compress-perf/main.c
> >b/app/test-compress-perf/main.c index f52b98d..093dfaf 100644
> >--- a/app/test-compress-perf/main.c
> >+++ b/app/test-compress-perf/main.c
> >@@ -5,13 +5,721 @@
> > #include <rte_malloc.h>
> > #include <rte_eal.h>
> > #include <rte_log.h>
> >+#include <rte_cycles.h>
> > #include <rte_compressdev.h>
> >
> > #include "comp_perf_options.h"
> >
> >+#define NUM_MAX_XFORMS 16
> >+#define NUM_MAX_INFLIGHT_OPS 512
> >+#define EXPANSE_RATIO 1.05
> >+#define MIN_ISAL_SIZE 8
> >+
> >+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
> >+
> >+static int
> >+param_range_check(uint16_t size, const struct rte_param_log2_range
> >+*range) {
> >+       unsigned int next_size;
> >+
> >+       /* Check lower/upper bounds */
> >+       if (size < range->min)
> >+               return -1;
> >+
> >+       if (size > range->max)
> >+               return -1;
> >+
> >+       /* If range is actually only one value, size is correct */
> >+       if (range->increment == 0)
> >+               return 0;
> >+
> >+       /* Check if value is one of the supported sizes */
> >+       for (next_size = range->min; next_size <= range->max;
> >+                       next_size += range->increment)
> >+               if (size == next_size)
> >+                       return 0;
> >+
> >+       return -1;
> >+}
> >+
> >+static int
> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
> >+       const struct rte_compressdev_capabilities *cap;
> >+
> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
> >+                                            RTE_COMP_ALGO_DEFLATE);
> >+
> >+       if (cap == NULL) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Compress device does not support DEFLATE\n");
> >+               return -1;
> >+       }
> >+
> >+       uint64_t comp_flags = cap->comp_feature_flags;
> >+
> >+       /* Huffman enconding */
> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Compress device does not supported Fixed Huffman\n");
> >+               return -1;
> >+       }
> >+
> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC &&
> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Compress device does not supported Dynamic Huffman\n");
> >+               return -1;
> >+       }
> >+
> >+       /* Window size */
> >+       if (test_data->window_sz != -1) {
> >+               if (param_range_check(test_data->window_sz,
> >+ &cap->window_size)
> What if cap->window_size is 0 i.e. implementation default?

TJ: You probably mean cap->window_size.increment = 0 (because cap->window_size is a structure). In that case we check only whether test_data->window_sz >= min and test_data->window_sz <= max, because increment = 0 means (based on the compression API) that there is only one supported value of window_size (no range is supported).



> 
> >+                               < 0) {
> >+                       RTE_LOG(ERR, USER1,
> >+                               "Compress device does not support "
> >+                               "this window size\n");
> >+                       return -1;
> >+               }
> >+       } else
> >+               /* Set window size to PMD maximum if none was specified */
> >+               test_data->window_sz = cap->window_size.max;
> >+
> >+       /* Check if chained mbufs is supported */
> >+       if (test_data->max_sgl_segs > 1  &&
> >+                       (comp_flags & RTE_COMP_FF_OOP_SGL_IN_SGL_OUT) == 0) {
> >+               RTE_LOG(INFO, USER1, "Compress device does not support "
> >+                               "chained mbufs. Max SGL segments set to 1\n");
> >+               test_data->max_sgl_segs = 1;
> >+       }
> >+
> >+       /* Level 0 support */
> >+       if (test_data->level.min == 0 &&
> >+                       (comp_flags & RTE_COMP_FF_NONCOMPRESSED_BLOCKS) ==
> 0) {
> >+               RTE_LOG(ERR, USER1, "Compress device does not support "
> >+                               "level 0 (no compression)\n");
> >+               return -1;
> >+       }
> >+
> >+       return 0;
> >+}
> >+
> >+static int
> >+comp_perf_allocate_memory(struct comp_test_data *test_data) {
> >+       /* Number of segments for input and output
> >+        * (compression and decompression)
> >+        */
> >+       uint32_t total_segs = DIV_CEIL(test_data->input_data_sz,
> >+                       test_data->seg_sz);
> >+       test_data->comp_buf_pool =
> rte_pktmbuf_pool_create("comp_buf_pool",
> >+                               total_segs,
> >+                               0, 0, test_data->seg_sz + RTE_PKTMBUF_HEADROOM,
> >+                               rte_socket_id());
> >+       if (test_data->comp_buf_pool == NULL) {
> >+               RTE_LOG(ERR, USER1, "Mbuf mempool could not be created\n");
> >+               return -1;
> >+       }
> >+
> >+       test_data->decomp_buf_pool =
> rte_pktmbuf_pool_create("decomp_buf_pool",
> >+                               total_segs,
> >+                               0, 0, test_data->seg_sz + RTE_PKTMBUF_HEADROOM,
> >+                               rte_socket_id());
> >+       if (test_data->decomp_buf_pool == NULL) {
> >+               RTE_LOG(ERR, USER1, "Mbuf mempool could not be created\n");
> >+               return -1;
> >+       }
> >+
> >+       test_data->total_bufs = DIV_CEIL(total_segs,
> >+ test_data->max_sgl_segs);
> >+
> >+       test_data->op_pool = rte_comp_op_pool_create("op_pool",
> >+                                 test_data->total_bufs,
> >+                                 0, 0, rte_socket_id());
> >+       if (test_data->op_pool == NULL) {
> >+               RTE_LOG(ERR, USER1, "Comp op mempool could not be
> created\n");
> >+               return -1;
> >+       }
> >+
> >+       /*
> >+        * Compressed data might be a bit larger than input data,
> >+        * if data cannot be compressed
> Possible only if it's zlib format right? Or deflate as well?

TJ: This is due to the possibility of incompressible data. In that case the compressed data can be bigger than the input, because of the framing which has to be added to the data. Yes, it is related to both zlib and deflate.

> 
> >+        */
> >+       test_data->compressed_data = rte_zmalloc_socket(NULL,
> >+                               test_data->input_data_sz * EXPANSE_RATIO
> >+                                                       + MIN_ISAL_SIZE, 0,
> >+                               rte_socket_id());
> >+       if (test_data->compressed_data == NULL) {
> >+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
> >+                               "file could not be allocated\n");
> >+               return -1;
> >+       }
> >+
> >+       test_data->decompressed_data = rte_zmalloc_socket(NULL,
> >+                               test_data->input_data_sz, 0,
> >+                               rte_socket_id());
> >+       if (test_data->decompressed_data == NULL) {
> >+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
> >+                               "file could not be allocated\n");
> >+               return -1;
> >+       }
> >+
> >+       test_data->comp_bufs = rte_zmalloc_socket(NULL,
> >+                       test_data->total_bufs * sizeof(struct rte_mbuf *),
> >+                       0, rte_socket_id());
> >+       if (test_data->comp_bufs == NULL) {
> >+               RTE_LOG(ERR, USER1, "Memory to hold the compression mbufs"
> >+                               " could not be allocated\n");
> >+               return -1;
> >+       }
> >+
> >+       test_data->decomp_bufs = rte_zmalloc_socket(NULL,
> >+                       test_data->total_bufs * sizeof(struct rte_mbuf *),
> >+                       0, rte_socket_id());
> >+       if (test_data->decomp_bufs == NULL) {
> >+               RTE_LOG(ERR, USER1, "Memory to hold the decompression
> mbufs"
> >+                               " could not be allocated\n");
> >+               return -1;
> >+       }
> >+       return 0;
> >+}
> >+
> >+static int
> >+comp_perf_dump_input_data(struct comp_test_data *test_data) {
> >+       FILE *f = fopen(test_data->input_file, "r");
> >+
> >+       if (f == NULL) {
> >+               RTE_LOG(ERR, USER1, "Input file could not be opened\n");
> >+               return -1;
> >+       }
> >+
> >+       if (fseek(f, 0, SEEK_END) != 0) {
> >+               RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
> >+               goto err;
> >+       }
> >+       size_t actual_file_sz = ftell(f);
> >+       /* If extended input data size has not been set,
> >+        * input data size = file size
> >+        */
> >+
> >+       if (test_data->input_data_sz == 0)
> >+               test_data->input_data_sz = actual_file_sz;
> >+
> >+       if (fseek(f, 0, SEEK_SET) != 0) {
> >+               RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
> >+               goto err;
> >+       }
> >+
> >+       test_data->input_data = rte_zmalloc_socket(NULL,
> >+                               test_data->input_data_sz, 0,
> >+ rte_socket_id());
> >+
> >+       if (test_data->input_data == NULL) {
> >+               RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
> >+                               "file could not be allocated\n");
> >+               goto err;
> >+       }
> >+
> >+       size_t remaining_data = test_data->input_data_sz;
> >+       uint8_t *data = test_data->input_data;
> >+
> >+       while (remaining_data > 0) {
> >+               size_t data_to_read = RTE_MIN(remaining_data,
> >+ actual_file_sz);
> >+
> >+               if (fread(data, data_to_read, 1, f) != 1) {
> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
> >+                       goto err;
> >+               }
> >+               if (fseek(f, 0, SEEK_SET) != 0) {
> >+                       RTE_LOG(ERR, USER1,
> >+                               "Size of input could not be calculated\n");
> >+                       goto err;
> >+               }
> >+               remaining_data -= data_to_read;
> >+               data += data_to_read;
> It looks like it will run 2nd time only if input file size < input data size in which
> case it will just keep filling input buffer with repeated data.
> Is that the intention here?

TJ: Yes exactly. If test_data->input_data_sz is bigger than  actual_file_sz then we fill the buffer with repeated data from file to fill whole buffer.

> 
> >+       }
> >+
> >+       if (test_data->input_data_sz > actual_file_sz)
> >+               RTE_LOG(INFO, USER1,
> >+                 "%zu bytes read from file %s, extending the file %.2f times\n",
> >+                       test_data->input_data_sz, test_data->input_file,
> >+                       (double)test_data->input_data_sz/actual_file_sz);
> >+       else
> >+               RTE_LOG(INFO, USER1,
> >+                       "%zu bytes read from file %s\n",
> >+                       test_data->input_data_sz,
> >+ test_data->input_file);
> >+
> >+       fclose(f);
> >+
> >+       return 0;
> >+
> >+err:
> >+       fclose(f);
> >+       rte_free(test_data->input_data);
> >+       test_data->input_data = NULL;
> >+
> >+       return -1;
> >+}
> >+
> >+static int
> >+comp_perf_initialize_compressdev(struct comp_test_data *test_data) {
> >+       uint8_t enabled_cdev_count;
> >+       uint8_t enabled_cdevs[RTE_COMPRESS_MAX_DEVS];
> >+
> >+       enabled_cdev_count = rte_compressdev_devices_get(test_data-
> >driver_name,
> >+                       enabled_cdevs, RTE_COMPRESS_MAX_DEVS);
> >+       if (enabled_cdev_count == 0) {
> >+               RTE_LOG(ERR, USER1, "No compress devices type %s available\n",
> >+                               test_data->driver_name);
> >+               return -EINVAL;
> >+       }
> >+
> >+       if (enabled_cdev_count > 1)
> >+               RTE_LOG(INFO, USER1,
> >+                       "Only the first compress device will be
> >+ used\n");
> >+
> >+       test_data->cdev_id = enabled_cdevs[0];
> >+
> >+       if (comp_perf_check_capabilities(test_data) < 0)
> >+               return -1;
> >+
> >+       /* Configure compressdev (one device, one queue pair) */
> >+       struct rte_compressdev_config config = {
> >+               .socket_id = rte_socket_id(),
> >+               .nb_queue_pairs = 1,
> >+               .max_nb_priv_xforms = NUM_MAX_XFORMS,
> >+               .max_nb_streams = 0
> >+       };
> >+
> >+       if (rte_compressdev_configure(test_data->cdev_id, &config) < 0) {
> >+               RTE_LOG(ERR, USER1, "Device configuration failed\n");
> >+               return -1;
> >+       }
> >+
> >+       if (rte_compressdev_queue_pair_setup(test_data->cdev_id, 0,
> >+                       NUM_MAX_INFLIGHT_OPS, rte_socket_id()) < 0) {
> >+               RTE_LOG(ERR, USER1, "Queue pair setup failed\n");
> >+               return -1;
> >+       }
> >+
> >+       if (rte_compressdev_start(test_data->cdev_id) < 0) {
> >+               RTE_LOG(ERR, USER1, "Device could not be started\n");
> >+               return -1;
> >+       }
> >+
> >+       return 0;
> >+}
> >+
> >+static int
> >+prepare_bufs(struct comp_test_data *test_data) {
> >+       uint32_t remaining_data = test_data->input_data_sz;
> >+       uint8_t *input_data_ptr = test_data->input_data;
> >+       size_t data_sz;
> >+       uint8_t *data_addr;
> >+       uint32_t i, j;
> >+
> >+       for (i = 0; i < test_data->total_bufs; i++) {
> >+               /* Allocate data in input mbuf and copy data from input file */
> >+               test_data->decomp_bufs[i] =
> >+                       rte_pktmbuf_alloc(test_data->decomp_buf_pool);
> >+               if (test_data->decomp_bufs[i] == NULL) {
> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
> >+                       return -1;
> >+               }
> >+
> >+               data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
> >+                                       test_data->decomp_bufs[i], data_sz);
> >+               if (data_addr == NULL) {
> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
> >+                       return -1;
> >+               }
> >+               rte_memcpy(data_addr, input_data_ptr, data_sz);
> >+
> >+               input_data_ptr += data_sz;
> >+               remaining_data -= data_sz;
> >+
> >+               /* Already one segment in the mbuf */
> >+               uint16_t segs_per_mbuf = 1;
> >+
> >+               /* Chain mbufs if needed for input mbufs */
> >+               while (segs_per_mbuf < test_data->max_sgl_segs
> >+                               && remaining_data > 0) {
> >+                       struct rte_mbuf *next_seg =
> >+
> >+ rte_pktmbuf_alloc(test_data->decomp_buf_pool);
> >+
> >+                       if (next_seg == NULL) {
> >+                               RTE_LOG(ERR, USER1,
> >+                                       "Could not allocate mbuf\n");
> >+                               return -1;
> >+                       }
> >+
> >+                       data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
> >+                               data_sz);
> >+
> >+                       if (data_addr == NULL) {
> >+                               RTE_LOG(ERR, USER1, "Could not append
> >+ data\n");
> Since a new buffer per segment is allocated, so is it possible for append to
> fail? think, this check is redundant here.

TJ: Yes, you're right, it should never fail. But I think it's good coding practice to add the check just in case.

> >+                               return -1;
> >+                       }
> >+
> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
> >+                       input_data_ptr += data_sz;
> >+                       remaining_data -= data_sz;
> >+
> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
> >+                                       next_seg) < 0) {
> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >+                               return -1;
> >+                       }
> >+                       segs_per_mbuf++;
> >+               }
> >+
> >+               /* Allocate data in output mbuf */
> >+               test_data->comp_bufs[i] =
> >+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
> >+               if (test_data->comp_bufs[i] == NULL) {
> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
> >+                       return -1;
> >+               }
> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
> >+                                       test_data->comp_bufs[i],
> >+                                       test_data->seg_sz);
> >+               if (data_addr == NULL) {
> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
> >+                       return -1;
> >+               }
> >+
> >+               /* Chain mbufs if needed for output mbufs */
> >+               for (j = 1; j < segs_per_mbuf; j++) {
> >+                       struct rte_mbuf *next_seg =
> >+
> >+ rte_pktmbuf_alloc(test_data->comp_buf_pool);
> >+
> >+                       if (next_seg == NULL) {
> >+                               RTE_LOG(ERR, USER1,
> >+                                       "Could not allocate mbuf\n");
> >+                               return -1;
> >+                       }
> >+
> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
> >+                               test_data->seg_sz);
> >+
> >+                       if (data_addr == NULL) {
> >+                               RTE_LOG(ERR, USER1, "Could not append data\n");
> >+                               return -1;
> >+                       }
> >+
> >+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
> >+                                       next_seg) < 0) {
> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >+                               return -1;
> >+                       }
> >+               }
> >+       }
> >+
> >+       return 0;
> >+}
> >+
> >+static void
> >+free_bufs(struct comp_test_data *test_data) {
> >+       uint32_t i;
> >+
> >+       for (i = 0; i < test_data->total_bufs; i++) {
> >+               rte_pktmbuf_free(test_data->comp_bufs[i]);
> >+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
> >+       }
> >+       rte_free(test_data->comp_bufs);
> >+       rte_free(test_data->decomp_bufs); }
> >+
> >+static int
> >+main_loop(struct comp_test_data *test_data, uint8_t level,
> >+                       enum rte_comp_xform_type type,
> >+                       uint8_t *output_data_ptr,
> >+                       size_t *output_data_sz,
> >+                       unsigned int benchmarking) {
> >+       uint8_t dev_id = test_data->cdev_id;
> >+       uint32_t i, iter, num_iter;
> >+       struct rte_comp_op **ops, **deq_ops;
> >+       void *priv_xform = NULL;
> >+       struct rte_comp_xform xform;
> >+       size_t output_size = 0;
> >+       struct rte_mbuf **input_bufs, **output_bufs;
> >+       int res = 0;
> >+       int allocated = 0;
> >+
> >+       if (test_data == NULL || !test_data->burst_sz) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Unknow burst size\n");
> >+               return -1;
> >+       }
> >+
> >+       ops = rte_zmalloc_socket(NULL,
> >+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
> >+               0, rte_socket_id());
> >+
> >+       if (ops == NULL) {
> >+               RTE_LOG(ERR, USER1,
> >+                       "Can't allocate memory for ops strucures\n");
> >+               return -1;
> >+       }
> >+
> >+       deq_ops = &ops[test_data->total_bufs];
> >+
> >+       if (type == RTE_COMP_COMPRESS) {
> >+               xform = (struct rte_comp_xform) {
> >+                       .type = RTE_COMP_COMPRESS,
> >+                       .compress = {
> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
> >+                               .deflate.huffman = test_data->huffman_enc,
> >+                               .level = level,
> >+                               .window_size = test_data->window_sz,
> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
> >+                       }
> >+               };
> >+               input_bufs = test_data->decomp_bufs;
> >+               output_bufs = test_data->comp_bufs;
> >+       } else {
> >+               xform = (struct rte_comp_xform) {
> >+                       .type = RTE_COMP_DECOMPRESS,
> >+                       .decompress = {
> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
> >+                               .window_size = test_data->window_sz,
> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
> >+                       }
> >+               };
> >+               input_bufs = test_data->comp_bufs;
> >+               output_bufs = test_data->decomp_bufs;
> >+       }
> >+
> >+       /* Create private xform */
> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
> >+                       &priv_xform) < 0) {
> >+               RTE_LOG(ERR, USER1, "Private xform could not be created\n");
> >+               res = -1;
> >+               goto end;
> >+       }
> >+
> >+       uint64_t tsc_start, tsc_end, tsc_duration;
> >+
> >+       tsc_start = tsc_end = tsc_duration = 0;
> >+       if (benchmarking) {
> >+               tsc_start = rte_rdtsc();
> >+               num_iter = test_data->num_iter;
> >+       } else
> >+               num_iter = 1;
> Looks like in same code we're doing benchmarking and functional validation.
> It can be reorganised to keep validation test separately like done in
> crypto_perf.

TJ: Ok, makes sense. However in the interests of getting this into the 18.11 release I'd like to
defer this refactoring and the remainder of your comments below to the next release.


Next comments - WIP


Br, Tomek
  
Verma, Shally Nov. 5, 2018, 8:34 a.m. UTC | #9
>-----Original Message-----
>From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
>Sent: 02 November 2018 15:29
>To: dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; Verma, Shally <Shally.Verma@cavium.com>; De Lara
>Guarch, Pablo <pablo.de.lara.guarch@intel.com>
>Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Hi Shally,
>
>Sorry for delay - I was on sick leave.
>We had some issues with dynamic compression test so I block this test in V2. May be there's too late to add this into this release but
>we've decided to send this V2 to DPDK.
>
>My comment inline (not all have answer so far, still working on that)
>
>> -----Original Message-----
>> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> Sent: Friday, October 12, 2018 12:16 PM
>> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
>> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
>> <pablo.de.lara.guarch@intel.com>
>> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> measurement
>>
>> HI TomaszX
>>
>> Sorry for delay in response. Comments inline.
>>
>> >-----Original Message-----
>> >From: dev <dev-bounces@dpdk.org> On Behalf Of Tomasz Jozwiak
>> >Sent: 01 October 2018 18:57
>> >To: dev@dpdk.org; fiona.trahe@intel.com; tomaszx.jozwiak@intel.com;
>> >akhil.goyal@nxp.com; pablo.de.lara.guarch@intel.com
>> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> >Subject: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> >measurement
>> >
>> >External Email
>> >
>> >Added performance measurement part into compression perf. test.
>> >
>> >Signed-off-by: De Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
>> >Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
>> >---
>> > app/test-compress-perf/main.c | 844
>> >++++++++++++++++++++++++++++++++++++++++++
>> > 1 file changed, 844 insertions(+)
>> >
>> >diff --git a/app/test-compress-perf/main.c
>> >b/app/test-compress-perf/main.c index f52b98d..093dfaf 100644
>> >--- a/app/test-compress-perf/main.c
>> >+++ b/app/test-compress-perf/main.c
>> >@@ -5,13 +5,721 @@
>> > #include <rte_malloc.h>
>> > #include <rte_eal.h>
>> > #include <rte_log.h>
>> >+#include <rte_cycles.h>
>> > #include <rte_compressdev.h>
>> >
>> > #include "comp_perf_options.h"
>> >
>> >+#define NUM_MAX_XFORMS 16
>> >+#define NUM_MAX_INFLIGHT_OPS 512
>> >+#define EXPANSE_RATIO 1.05
>> >+#define MIN_ISAL_SIZE 8
>> >+
>> >+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
>> >+
>> >+static int
>> >+param_range_check(uint16_t size, const struct rte_param_log2_range
>> >+*range) {
>> >+       unsigned int next_size;
>> >+
>> >+       /* Check lower/upper bounds */
>> >+       if (size < range->min)
>> >+               return -1;
>> >+
>> >+       if (size > range->max)
>> >+               return -1;
>> >+
>> >+       /* If range is actually only one value, size is correct */
>> >+       if (range->increment == 0)
>> >+               return 0;
>> >+
>> >+       /* Check if value is one of the supported sizes */
>> >+       for (next_size = range->min; next_size <= range->max;
>> >+                       next_size += range->increment)
>> >+               if (size == next_size)
>> >+                       return 0;
>> >+
>> >+       return -1;
>> >+}
>> >+
>> >+static int
>> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
>> >+       const struct rte_compressdev_capabilities *cap;
>> >+
>> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
>> >+                                            RTE_COMP_ALGO_DEFLATE);
>> >+
>> >+       if (cap == NULL) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Compress device does not support DEFLATE\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       uint64_t comp_flags = cap->comp_feature_flags;
>> >+
>> >+       /* Huffman enconding */
>> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
>> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Compress device does not supported Fixed Huffman\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC &&
>> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Compress device does not supported Dynamic Huffman\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       /* Window size */
>> >+       if (test_data->window_sz != -1) {
>> >+               if (param_range_check(test_data->window_sz,
>> >+ &cap->window_size)
>> What if cap->window_size is 0 i.e. implementation default?
>
>TJ: You probably mean cap->window_size.increment = 0 (because cap->window_size is a structure). In that case we check if
>test_data->window_sz >=min and test_data->window_sz <= max only, because increment = 0 means (base on compression API) we
>have only one value of windows_size (no range is supported).
But a PMD can set min and max to 0 as well in such a case.

>
>
>
....

>> >+
>> >+               if (fread(data, data_to_read, 1, f) != 1) {
>> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
>> >+                       goto err;
>> >+               }
>> >+               if (fseek(f, 0, SEEK_SET) != 0) {
>> >+                       RTE_LOG(ERR, USER1,
>> >+                               "Size of input could not be calculated\n");
>> >+                       goto err;
>> >+               }
>> >+               remaining_data -= data_to_read;
>> >+               data += data_to_read;
>> It looks like it will run 2nd time only if input file size < input data size in which
>> case it will just keep filling input buffer with repeated data.
>> Is that the intention here?
>
>TJ: Yes exactly. If test_data->input_data_sz is bigger than  actual_file_sz then we fill the buffer with repeated data from file to fill
>whole buffer.
I mentioned in one of the earlier replies: won't that then influence the compression behaviour and output? My suggestion was to run the measurement on the actual user-provided input, so that we get the real performance for the given content.

>
>>
...

>> >+                       if (data_addr == NULL) {
>> >+                               RTE_LOG(ERR, USER1, "Could not append
>> >+ data\n");
>> Since a new buffer per segment is allocated, so is it possible for append to
>> fail? think, this check is redundant here.
>
>TJ: Yes, you're right, it should never fail. But I think it's good coding practice to add the check just in case.
>
Unless it is called in the data path, where it might cost a bit of performance.

Thanks
Shally

>> >+                               return -1;
>> >+                       }
>> >+
>> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
>> >+                       input_data_ptr += data_sz;
>> >+                       remaining_data -= data_sz;
>> >+
>> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
>> >+                                       next_seg) < 0) {
>> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>> >+                               return -1;
>> >+                       }
>> >+                       segs_per_mbuf++;
>> >+               }
>> >+
>> >+               /* Allocate data in output mbuf */
>> >+               test_data->comp_bufs[i] =
>> >+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
>> >+               if (test_data->comp_bufs[i] == NULL) {
>> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
>> >+                       return -1;
>> >+               }
>> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
>> >+                                       test_data->comp_bufs[i],
>> >+                                       test_data->seg_sz);
>> >+               if (data_addr == NULL) {
>> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
>> >+                       return -1;
>> >+               }
>> >+
>> >+               /* Chain mbufs if needed for output mbufs */
>> >+               for (j = 1; j < segs_per_mbuf; j++) {
>> >+                       struct rte_mbuf *next_seg =
>> >+
>> >+ rte_pktmbuf_alloc(test_data->comp_buf_pool);
>> >+
>> >+                       if (next_seg == NULL) {
>> >+                               RTE_LOG(ERR, USER1,
>> >+                                       "Could not allocate mbuf\n");
>> >+                               return -1;
>> >+                       }
>> >+
>> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
>> >+                               test_data->seg_sz);
>> >+
>> >+                       if (data_addr == NULL) {
>> >+                               RTE_LOG(ERR, USER1, "Could not append data\n");
>> >+                               return -1;
>> >+                       }
>> >+
>> >+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
>> >+                                       next_seg) < 0) {
>> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>> >+                               return -1;
>> >+                       }
>> >+               }
>> >+       }
>> >+
>> >+       return 0;
>> >+}
>> >+
>> >+static void
>> >+free_bufs(struct comp_test_data *test_data) {
>> >+       uint32_t i;
>> >+
>> >+       for (i = 0; i < test_data->total_bufs; i++) {
>> >+               rte_pktmbuf_free(test_data->comp_bufs[i]);
>> >+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
>> >+       }
>> >+       rte_free(test_data->comp_bufs);
>> >+       rte_free(test_data->decomp_bufs); }
>> >+
>> >+static int
>> >+main_loop(struct comp_test_data *test_data, uint8_t level,
>> >+                       enum rte_comp_xform_type type,
>> >+                       uint8_t *output_data_ptr,
>> >+                       size_t *output_data_sz,
>> >+                       unsigned int benchmarking) {
>> >+       uint8_t dev_id = test_data->cdev_id;
>> >+       uint32_t i, iter, num_iter;
>> >+       struct rte_comp_op **ops, **deq_ops;
>> >+       void *priv_xform = NULL;
>> >+       struct rte_comp_xform xform;
>> >+       size_t output_size = 0;
>> >+       struct rte_mbuf **input_bufs, **output_bufs;
>> >+       int res = 0;
>> >+       int allocated = 0;
>> >+
>> >+       if (test_data == NULL || !test_data->burst_sz) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Unknow burst size\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       ops = rte_zmalloc_socket(NULL,
>> >+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
>> >+               0, rte_socket_id());
>> >+
>> >+       if (ops == NULL) {
>> >+               RTE_LOG(ERR, USER1,
>> >+                       "Can't allocate memory for ops strucures\n");
>> >+               return -1;
>> >+       }
>> >+
>> >+       deq_ops = &ops[test_data->total_bufs];
>> >+
>> >+       if (type == RTE_COMP_COMPRESS) {
>> >+               xform = (struct rte_comp_xform) {
>> >+                       .type = RTE_COMP_COMPRESS,
>> >+                       .compress = {
>> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
>> >+                               .deflate.huffman = test_data->huffman_enc,
>> >+                               .level = level,
>> >+                               .window_size = test_data->window_sz,
>> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>> >+                       }
>> >+               };
>> >+               input_bufs = test_data->decomp_bufs;
>> >+               output_bufs = test_data->comp_bufs;
>> >+       } else {
>> >+               xform = (struct rte_comp_xform) {
>> >+                       .type = RTE_COMP_DECOMPRESS,
>> >+                       .decompress = {
>> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
>> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>> >+                               .window_size = test_data->window_sz,
>> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>> >+                       }
>> >+               };
>> >+               input_bufs = test_data->comp_bufs;
>> >+               output_bufs = test_data->decomp_bufs;
>> >+       }
>> >+
>> >+       /* Create private xform */
>> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
>> >+                       &priv_xform) < 0) {
>> >+               RTE_LOG(ERR, USER1, "Private xform could not be created\n");
>> >+               res = -1;
>> >+               goto end;
>> >+       }
>> >+
>> >+       uint64_t tsc_start, tsc_end, tsc_duration;
>> >+
>> >+       tsc_start = tsc_end = tsc_duration = 0;
>> >+       if (benchmarking) {
>> >+               tsc_start = rte_rdtsc();
>> >+               num_iter = test_data->num_iter;
>> >+       } else
>> >+               num_iter = 1;
>> Looks like in same code we're doing benchmarking and functional validation.
>> It can be reorganised to keep validation test separately like done in
>> crypto_perf.
>
>TJ: Ok, makes sense. However in the interests of getting this into the 18.11 release I'd like to
>defer this refactoring and the remainder of your comments below to the next release.
>
>
>Next comments - WIP
>
>
>Br, Tomek
  
Tomasz Jozwiak Nov. 6, 2018, 8:04 a.m. UTC | #10
Hi Shally,

Please see my comment inline.

> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Monday, November 5, 2018 9:34 AM
> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>
> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> 
> 
> >-----Original Message-----
> >From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
> >Sent: 02 November 2018 15:29
> >To: dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>;
> >akhil.goyal@nxp.com; Verma, Shally <Shally.Verma@cavium.com>; De Lara
> >Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >measurement
> >
> >External Email
> >
> >Hi Shally,
> >
> >Sorry for delay - I was on sick leave.
> >We had some issues with dynamic compression test so I block this test
> >in V2. May be there's too late to add this into this release but we've decided
> to send this V2 to DPDK.
> >
> >My comment inline (not all have answer so far, still working on that)
> >
> >> -----Original Message-----
> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> Sent: Friday, October 12, 2018 12:16 PM
> >> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara
> >> Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> performance measurement
> >>
> >> HI TomaszX
> >>
> >> Sorry for delay in response. Comments inline.
> >>
> >> >-----Original Message-----
> >> >From: dev <dev-bounces@dpdk.org> On Behalf Of Tomasz Jozwiak
> >> >Sent: 01 October 2018 18:57
> >> >To: dev@dpdk.org; fiona.trahe@intel.com; tomaszx.jozwiak@intel.com;
> >> >akhil.goyal@nxp.com; pablo.de.lara.guarch@intel.com
> >> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> >Subject: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >> >measurement
> >> >
> >> >External Email
> >> >
> >> >Added performance measurement part into compression perf. test.
> >> >
> >> >Signed-off-by: De Lara Guarch, Pablo
> >> ><pablo.de.lara.guarch@intel.com>
> >> >Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
> >> >---
> >> > app/test-compress-perf/main.c | 844
> >> >++++++++++++++++++++++++++++++++++++++++++
> >> > 1 file changed, 844 insertions(+)
> >> >
> >> >diff --git a/app/test-compress-perf/main.c
> >> >b/app/test-compress-perf/main.c index f52b98d..093dfaf 100644
> >> >--- a/app/test-compress-perf/main.c
> >> >+++ b/app/test-compress-perf/main.c
> >> >@@ -5,13 +5,721 @@
> >> > #include <rte_malloc.h>
> >> > #include <rte_eal.h>
> >> > #include <rte_log.h>
> >> >+#include <rte_cycles.h>
> >> > #include <rte_compressdev.h>
> >> >
> >> > #include "comp_perf_options.h"
> >> >
> >> >+#define NUM_MAX_XFORMS 16
> >> >+#define NUM_MAX_INFLIGHT_OPS 512
> >> >+#define EXPANSE_RATIO 1.05
> >> >+#define MIN_ISAL_SIZE 8
> >> >+
> >> >+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
> >> >+
> >> >+static int
> >> >+param_range_check(uint16_t size, const struct rte_param_log2_range
> >> >+*range) {
> >> >+       unsigned int next_size;
> >> >+
> >> >+       /* Check lower/upper bounds */
> >> >+       if (size < range->min)
> >> >+               return -1;
> >> >+
> >> >+       if (size > range->max)
> >> >+               return -1;
> >> >+
> >> >+       /* If range is actually only one value, size is correct */
> >> >+       if (range->increment == 0)
> >> >+               return 0;
> >> >+
> >> >+       /* Check if value is one of the supported sizes */
> >> >+       for (next_size = range->min; next_size <= range->max;
> >> >+                       next_size += range->increment)
> >> >+               if (size == next_size)
> >> >+                       return 0;
> >> >+
> >> >+       return -1;
> >> >+}
> >> >+
> >> >+static int
> >> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
> >> >+       const struct rte_compressdev_capabilities *cap;
> >> >+
> >> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
> >> >+                                            RTE_COMP_ALGO_DEFLATE);
> >> >+
> >> >+       if (cap == NULL) {
> >> >+               RTE_LOG(ERR, USER1,
> >> >+                       "Compress device does not support DEFLATE\n");
> >> >+               return -1;
> >> >+       }
> >> >+
> >> >+       uint64_t comp_flags = cap->comp_feature_flags;
> >> >+
> >> >+       /* Huffman enconding */
> >> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
> >> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
> >> >+               RTE_LOG(ERR, USER1,
> >> >+                       "Compress device does not supported Fixed Huffman\n");
> >> >+               return -1;
> >> >+       }
> >> >+
> >> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC
> &&
> >> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0)
> {
> >> >+               RTE_LOG(ERR, USER1,
> >> >+                       "Compress device does not supported Dynamic
> Huffman\n");
> >> >+               return -1;
> >> >+       }
> >> >+
> >> >+       /* Window size */
> >> >+       if (test_data->window_sz != -1) {
> >> >+               if (param_range_check(test_data->window_sz,
> >> >+ &cap->window_size)
> >> What if cap->window_size is 0 i.e. implementation default?
> >
> >TJ: You probably mean cap->window_size.increment = 0 (because
> >cap->window_size is a structure). In that case we check if
> >test_data->window_sz >=min and test_data->window_sz <= max only,
> because increment = 0 means (base on compression API) we have only one
> value of windows_size (no range is supported).
> But PMD can set min and max too 0 for such case.

TJ: I can't see any issue in that case either. Maybe I don't understand what you mean, but the logic is as follows:
1) if you pass the '--window-sz ...' parameter on the command line, your intention is to force that window size value during the test. We check whether this value is allowed (via the param_range_check() function).
2) if you plan to use the default value, just don't pass the '--window-sz' parameter on the command line at all. In that case we take the window size from the window_size.max field, so if window_size.min = window_size.max = 0,
test_data->window_sz will be zero as well.
If you think that behavior is not good, I would be grateful for other suggestions.

> 
> >
> >
> >
> ....
> 
> >> >+
> >> >+               if (fread(data, data_to_read, 1, f) != 1) {
> >> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
> >> >+                       goto err;
> >> >+               }
> >> >+               if (fseek(f, 0, SEEK_SET) != 0) {
> >> >+                       RTE_LOG(ERR, USER1,
> >> >+                               "Size of input could not be calculated\n");
> >> >+                       goto err;
> >> >+               }
> >> >+               remaining_data -= data_to_read;
> >> >+               data += data_to_read;
> >> It looks like it will run 2nd time only if input file size < input
> >> data size in which case it will just keep filling input buffer with repeated
> data.
> >> Is that the intention here?
> >
> >TJ: Yes exactly. If test_data->input_data_sz is bigger than
> >actual_file_sz then we fill the buffer with repeated data from file to fill
> whole buffer.
> I mentioned in one of the earlier reply, wont that then influence the
> compression behaviour and o/p? my suggestion was to work on actual user
> provided input to take perf to get actual perf for given content.

TJ: You're right, but this solution is flexible. You can pass '--extended-input-sz' or not, so you can use the original input data or extend it if you want.

> 
> >
> >>
> ...
> 
> >> >+                       if (data_addr == NULL) {
> >> >+                               RTE_LOG(ERR, USER1, "Could not
> >> >+ append data\n");
> >> Since a new buffer per segment is allocated, so is it possible for
> >> append to fail? think, this check is redundant here.
> >
> >TJ: Yes, you're right, it should never fail. But I think it's good coding practice
> to add the check just in case.
> >
> Unless it is called in data path which might cost perf a bit.

TJ: prepare_bufs() is outside the performance measurement, so it shouldn't impact the measurements. The performance measurement is done inside main_loop() only.


Br, Tomek

> 
> Thanks
> Shally
> 
> >> >+                               return -1;
> >> >+                       }
> >> >+
> >> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
> >> >+                       input_data_ptr += data_sz;
> >> >+                       remaining_data -= data_sz;
> >> >+
> >> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
> >> >+                                       next_seg) < 0) {
> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >> >+                               return -1;
> >> >+                       }
> >> >+                       segs_per_mbuf++;
> >> >+               }
> >> >+
> >> >+               /* Allocate data in output mbuf */
> >> >+               test_data->comp_bufs[i] =
> >> >+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
> >> >+               if (test_data->comp_bufs[i] == NULL) {
> >> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
> >> >+                       return -1;
> >> >+               }
> >> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
> >> >+                                       test_data->comp_bufs[i],
> >> >+                                       test_data->seg_sz);
> >> >+               if (data_addr == NULL) {
> >> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
> >> >+                       return -1;
> >> >+               }
> >> >+
> >> >+               /* Chain mbufs if needed for output mbufs */
> >> >+               for (j = 1; j < segs_per_mbuf; j++) {
> >> >+                       struct rte_mbuf *next_seg =
> >> >+
> >> >+ rte_pktmbuf_alloc(test_data->comp_buf_pool);
> >> >+
> >> >+                       if (next_seg == NULL) {
> >> >+                               RTE_LOG(ERR, USER1,
> >> >+                                       "Could not allocate mbuf\n");
> >> >+                               return -1;
> >> >+                       }
> >> >+
> >> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
> >> >+                               test_data->seg_sz);
> >> >+
> >> >+                       if (data_addr == NULL) {
> >> >+                               RTE_LOG(ERR, USER1, "Could not append data\n");
> >> >+                               return -1;
> >> >+                       }
> >> >+
> >> >+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
> >> >+                                       next_seg) < 0) {
> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >> >+                               return -1;
> >> >+                       }
> >> >+               }
> >> >+       }
> >> >+
> >> >+       return 0;
> >> >+}
> >> >+
> >> >+static void
> >> >+free_bufs(struct comp_test_data *test_data) {
> >> >+       uint32_t i;
> >> >+
> >> >+       for (i = 0; i < test_data->total_bufs; i++) {
> >> >+               rte_pktmbuf_free(test_data->comp_bufs[i]);
> >> >+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
> >> >+       }
> >> >+       rte_free(test_data->comp_bufs);
> >> >+       rte_free(test_data->decomp_bufs); }
> >> >+
> >> >+static int
> >> >+main_loop(struct comp_test_data *test_data, uint8_t level,
> >> >+                       enum rte_comp_xform_type type,
> >> >+                       uint8_t *output_data_ptr,
> >> >+                       size_t *output_data_sz,
> >> >+                       unsigned int benchmarking) {
> >> >+       uint8_t dev_id = test_data->cdev_id;
> >> >+       uint32_t i, iter, num_iter;
> >> >+       struct rte_comp_op **ops, **deq_ops;
> >> >+       void *priv_xform = NULL;
> >> >+       struct rte_comp_xform xform;
> >> >+       size_t output_size = 0;
> >> >+       struct rte_mbuf **input_bufs, **output_bufs;
> >> >+       int res = 0;
> >> >+       int allocated = 0;
> >> >+
> >> >+       if (test_data == NULL || !test_data->burst_sz) {
> >> >+               RTE_LOG(ERR, USER1,
> >> >+                       "Unknow burst size\n");
> >> >+               return -1;
> >> >+       }
> >> >+
> >> >+       ops = rte_zmalloc_socket(NULL,
> >> >+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
> >> >+               0, rte_socket_id());
> >> >+
> >> >+       if (ops == NULL) {
> >> >+               RTE_LOG(ERR, USER1,
> >> >+                       "Can't allocate memory for ops strucures\n");
> >> >+               return -1;
> >> >+       }
> >> >+
> >> >+       deq_ops = &ops[test_data->total_bufs];
> >> >+
> >> >+       if (type == RTE_COMP_COMPRESS) {
> >> >+               xform = (struct rte_comp_xform) {
> >> >+                       .type = RTE_COMP_COMPRESS,
> >> >+                       .compress = {
> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
> >> >+                               .deflate.huffman = test_data->huffman_enc,
> >> >+                               .level = level,
> >> >+                               .window_size = test_data->window_sz,
> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
> >> >+                       }
> >> >+               };
> >> >+               input_bufs = test_data->decomp_bufs;
> >> >+               output_bufs = test_data->comp_bufs;
> >> >+       } else {
> >> >+               xform = (struct rte_comp_xform) {
> >> >+                       .type = RTE_COMP_DECOMPRESS,
> >> >+                       .decompress = {
> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
> >> >+                               .window_size = test_data->window_sz,
> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
> >> >+                       }
> >> >+               };
> >> >+               input_bufs = test_data->comp_bufs;
> >> >+               output_bufs = test_data->decomp_bufs;
> >> >+       }
> >> >+
> >> >+       /* Create private xform */
> >> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
> >> >+                       &priv_xform) < 0) {
> >> >+               RTE_LOG(ERR, USER1, "Private xform could not be created\n");
> >> >+               res = -1;
> >> >+               goto end;
> >> >+       }
> >> >+
> >> >+       uint64_t tsc_start, tsc_end, tsc_duration;
> >> >+
> >> >+       tsc_start = tsc_end = tsc_duration = 0;
> >> >+       if (benchmarking) {
> >> >+               tsc_start = rte_rdtsc();
> >> >+               num_iter = test_data->num_iter;
> >> >+       } else
> >> >+               num_iter = 1;
> >> Looks like in same code we're doing benchmarking and functional
> validation.
> >> It can be reorganised to keep validation test separately like done in
> >> crypto_perf.
> >
> >TJ: Ok, makes sense. However in the interests of getting this into the
> >18.11 release I'd like to defer this refactoring and the remainder of your
> comments below to the next release.
> >
> >
> >Next comments - WIP
> >
> >
> >Br, Tomek
  
Verma, Shally Nov. 6, 2018, 8:15 a.m. UTC | #11
>-----Original Message-----
>From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
>Sent: 06 November 2018 13:34
>To: Verma, Shally <Shally.Verma@cavium.com>; dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Hi Shally,
>
>Please see my comment inline.
>
>> -----Original Message-----
>> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> Sent: Monday, November 5, 2018 9:34 AM
>> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
>> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara Guarch, Pablo
>> <pablo.de.lara.guarch@intel.com>
>> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> measurement
>>
>>
>>
>> >-----Original Message-----
>> >From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
>> >Sent: 02 November 2018 15:29
>> >To: dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>;
>> >akhil.goyal@nxp.com; Verma, Shally <Shally.Verma@cavium.com>; De Lara
>> >Guarch, Pablo <pablo.de.lara.guarch@intel.com>
>> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> >measurement
>> >
>> >External Email
>> >
>> >Hi Shally,
>> >
>> >Sorry for delay - I was on sick leave.
>> >We had some issues with dynamic compression test so I block this test
>> >in V2. May be there's too late to add this into this release but we've decided
>> to send this V2 to DPDK.
>> >
>> >My comment inline (not all have answer so far, still working on that)
>> >
>> >> -----Original Message-----
>> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> >> Sent: Friday, October 12, 2018 12:16 PM
>> >> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
>> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara
>> >> Guarch, Pablo <pablo.de.lara.guarch@intel.com>
>> >> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
>> >> performance measurement
>> >>
>> >> HI TomaszX
>> >>
>> >> Sorry for delay in response. Comments inline.
>> >>
>> >> >-----Original Message-----
>> >> >From: dev <dev-bounces@dpdk.org> On Behalf Of Tomasz Jozwiak
>> >> >Sent: 01 October 2018 18:57
>> >> >To: dev@dpdk.org; fiona.trahe@intel.com; tomaszx.jozwiak@intel.com;
>> >> >akhil.goyal@nxp.com; pablo.de.lara.guarch@intel.com
>> >> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> >> >Subject: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> >> >measurement
>> >> >
>> >> >External Email
>> >> >
>> >> >Added performance measurement part into compression perf. test.
>> >> >
>> >> >Signed-off-by: De Lara Guarch, Pablo
>> >> ><pablo.de.lara.guarch@intel.com>
>> >> >Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
>> >> >---
>> >> > app/test-compress-perf/main.c | 844
>> >> >++++++++++++++++++++++++++++++++++++++++++
>> >> > 1 file changed, 844 insertions(+)
>> >> >
>> >> >diff --git a/app/test-compress-perf/main.c
>> >> >b/app/test-compress-perf/main.c index f52b98d..093dfaf 100644
>> >> >--- a/app/test-compress-perf/main.c
>> >> >+++ b/app/test-compress-perf/main.c
>> >> >@@ -5,13 +5,721 @@
>> >> > #include <rte_malloc.h>
>> >> > #include <rte_eal.h>
>> >> > #include <rte_log.h>
>> >> >+#include <rte_cycles.h>
>> >> > #include <rte_compressdev.h>
>> >> >
>> >> > #include "comp_perf_options.h"
>> >> >
>> >> >+#define NUM_MAX_XFORMS 16
>> >> >+#define NUM_MAX_INFLIGHT_OPS 512
>> >> >+#define EXPANSE_RATIO 1.05
>> >> >+#define MIN_ISAL_SIZE 8
>> >> >+
>> >> >+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
>> >> >+
>> >> >+static int
>> >> >+param_range_check(uint16_t size, const struct rte_param_log2_range
>> >> >+*range) {
>> >> >+       unsigned int next_size;
>> >> >+
>> >> >+       /* Check lower/upper bounds */
>> >> >+       if (size < range->min)
>> >> >+               return -1;
>> >> >+
>> >> >+       if (size > range->max)
>> >> >+               return -1;
>> >> >+
>> >> >+       /* If range is actually only one value, size is correct */
>> >> >+       if (range->increment == 0)
>> >> >+               return 0;
>> >> >+
>> >> >+       /* Check if value is one of the supported sizes */
>> >> >+       for (next_size = range->min; next_size <= range->max;
>> >> >+                       next_size += range->increment)
>> >> >+               if (size == next_size)
>> >> >+                       return 0;
>> >> >+
>> >> >+       return -1;
>> >> >+}
>> >> >+
>> >> >+static int
>> >> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
>> >> >+       const struct rte_compressdev_capabilities *cap;
>> >> >+
>> >> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
>> >> >+                                            RTE_COMP_ALGO_DEFLATE);
>> >> >+
>> >> >+       if (cap == NULL) {
>> >> >+               RTE_LOG(ERR, USER1,
>> >> >+                       "Compress device does not support DEFLATE\n");
>> >> >+               return -1;
>> >> >+       }
>> >> >+
>> >> >+       uint64_t comp_flags = cap->comp_feature_flags;
>> >> >+
>> >> >+       /* Huffman enconding */
>> >> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
>> >> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
>> >> >+               RTE_LOG(ERR, USER1,
>> >> >+                       "Compress device does not supported Fixed Huffman\n");
>> >> >+               return -1;
>> >> >+       }
>> >> >+
>> >> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC
>> &&
>> >> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0)
>> {
>> >> >+               RTE_LOG(ERR, USER1,
>> >> >+                       "Compress device does not supported Dynamic
>> Huffman\n");
>> >> >+               return -1;
>> >> >+       }
>> >> >+
>> >> >+       /* Window size */
>> >> >+       if (test_data->window_sz != -1) {
>> >> >+               if (param_range_check(test_data->window_sz,
>> >> >+ &cap->window_size)
>> >> What if cap->window_size is 0 i.e. implementation default?
>> >
>> >TJ: You probably mean cap->window_size.increment = 0 (because
>> >cap->window_size is a structure). In that case we check if
>> >test_data->window_sz >=min and test_data->window_sz <= max only,
>> because increment = 0 means (base on compression API) we have only one
>> value of windows_size (no range is supported).
>> But PMD can set min and max too 0 for such case.
>
>TJ: I can't see any issue in that case too. Maybe I don't understand what you mean but the logic is as follow:
>1)  if you pass '--window-sz  ...' param. into command line your intention is to force that value of window size during test. We check is
>this value is allow (by param_range_check() function).
>2) if you plan to use default value - just don't pass '--window-sz' param. in command line at all. In that case we get windows size from
>window_size.max field, so if window_size.min= window_size.max=0
>test_data->window_sz will be zero, as well.
>If you mean that behavior is not good - I will be grateful for other suggestions.

This is fine, but I am thinking of a 3rd case here:
c) user passes a window sz but the PMD reports window_sz.min = max = 0; then the user-requested window sz is not applicable, right?!

>
>>
>> >
>> >
>> >
>> ....
>>
>> >> >+
>> >> >+               if (fread(data, data_to_read, 1, f) != 1) {
>> >> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
>> >> >+                       goto err;
>> >> >+               }
>> >> >+               if (fseek(f, 0, SEEK_SET) != 0) {
>> >> >+                       RTE_LOG(ERR, USER1,
>> >> >+                               "Size of input could not be calculated\n");
>> >> >+                       goto err;
>> >> >+               }
>> >> >+               remaining_data -= data_to_read;
>> >> >+               data += data_to_read;
>> >> It looks like it will run 2nd time only if input file size < input
>> >> data size in which case it will just keep filling input buffer with repeated
>> data.
>> >> Is that the intention here?
>> >
>> >TJ: Yes exactly. If test_data->input_data_sz is bigger than
>> >actual_file_sz then we fill the buffer with repeated data from file to fill
>> whole buffer.
>> I mentioned in one of the earlier reply, wont that then influence the
>> compression behaviour and o/p? my suggestion was to work on actual user
>> provided input to take perf to get actual perf for given content.
>
>TJ: You right, but this solution is flexible. You can pass ' --extended-input-sz" or not, so you can use original input data or extend it if
>you want.
Ok, but I'm still not sure if it's really needed. Practically, most of the time it won't be exercised. No hard opinion on this though.

Thanks
Shally
>
>>
>> >
>> >>
>> ...
>>
>> >> >+                       if (data_addr == NULL) {
>> >> >+                               RTE_LOG(ERR, USER1, "Could not
>> >> >+ append data\n");
>> >> Since a new buffer per segment is allocated, so is it possible for
>> >> append to fail? think, this check is redundant here.
>> >
>> >TJ: Yes, you're right, it should never fail. But I think it's good coding practice
>> to add the check just in case.
>> >
>> Unless it is called in data path which might cost perf a bit.
>
>TJ:  prepare_bufs() is out of perf measurement, so shouldn't impact to measurements. The performance measurement is inside
>main_loop() only.
>
>
>Br, Tomek
>
>>
>> Thanks
>> Shally
>>
>> >> >+                               return -1;
>> >> >+                       }
>> >> >+
>> >> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
>> >> >+                       input_data_ptr += data_sz;
>> >> >+                       remaining_data -= data_sz;
>> >> >+
>> >> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
>> >> >+                                       next_seg) < 0) {
>> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>> >> >+                               return -1;
>> >> >+                       }
>> >> >+                       segs_per_mbuf++;
>> >> >+               }
>> >> >+
>> >> >+               /* Allocate data in output mbuf */
>> >> >+               test_data->comp_bufs[i] =
>> >> >+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
>> >> >+               if (test_data->comp_bufs[i] == NULL) {
>> >> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
>> >> >+                       return -1;
>> >> >+               }
>> >> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
>> >> >+                                       test_data->comp_bufs[i],
>> >> >+                                       test_data->seg_sz);
>> >> >+               if (data_addr == NULL) {
>> >> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
>> >> >+                       return -1;
>> >> >+               }
>> >> >+
>> >> >+               /* Chain mbufs if needed for output mbufs */
>> >> >+               for (j = 1; j < segs_per_mbuf; j++) {
>> >> >+                       struct rte_mbuf *next_seg =
>> >> >+
>> >> >+ rte_pktmbuf_alloc(test_data->comp_buf_pool);
>> >> >+
>> >> >+                       if (next_seg == NULL) {
>> >> >+                               RTE_LOG(ERR, USER1,
>> >> >+                                       "Could not allocate mbuf\n");
>> >> >+                               return -1;
>> >> >+                       }
>> >> >+
>> >> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
>> >> >+                               test_data->seg_sz);
>> >> >+
>> >> >+                       if (data_addr == NULL) {
>> >> >+                               RTE_LOG(ERR, USER1, "Could not append data\n");
>> >> >+                               return -1;
>> >> >+                       }
>> >> >+
>> >> >+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
>> >> >+                                       next_seg) < 0) {
>> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>> >> >+                               return -1;
>> >> >+                       }
>> >> >+               }
>> >> >+       }
>> >> >+
>> >> >+       return 0;
>> >> >+}
>> >> >+
>> >> >+static void
>> >> >+free_bufs(struct comp_test_data *test_data) {
>> >> >+       uint32_t i;
>> >> >+
>> >> >+       for (i = 0; i < test_data->total_bufs; i++) {
>> >> >+               rte_pktmbuf_free(test_data->comp_bufs[i]);
>> >> >+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
>> >> >+       }
>> >> >+       rte_free(test_data->comp_bufs);
>> >> >+       rte_free(test_data->decomp_bufs); }
>> >> >+
>> >> >+static int
>> >> >+main_loop(struct comp_test_data *test_data, uint8_t level,
>> >> >+                       enum rte_comp_xform_type type,
>> >> >+                       uint8_t *output_data_ptr,
>> >> >+                       size_t *output_data_sz,
>> >> >+                       unsigned int benchmarking) {
>> >> >+       uint8_t dev_id = test_data->cdev_id;
>> >> >+       uint32_t i, iter, num_iter;
>> >> >+       struct rte_comp_op **ops, **deq_ops;
>> >> >+       void *priv_xform = NULL;
>> >> >+       struct rte_comp_xform xform;
>> >> >+       size_t output_size = 0;
>> >> >+       struct rte_mbuf **input_bufs, **output_bufs;
>> >> >+       int res = 0;
>> >> >+       int allocated = 0;
>> >> >+
>> >> >+       if (test_data == NULL || !test_data->burst_sz) {
>> >> >+               RTE_LOG(ERR, USER1,
>> >> >+                       "Unknow burst size\n");
>> >> >+               return -1;
>> >> >+       }
>> >> >+
>> >> >+       ops = rte_zmalloc_socket(NULL,
>> >> >+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
>> >> >+               0, rte_socket_id());
>> >> >+
>> >> >+       if (ops == NULL) {
>> >> >+               RTE_LOG(ERR, USER1,
>> >> >+                       "Can't allocate memory for ops strucures\n");
>> >> >+               return -1;
>> >> >+       }
>> >> >+
>> >> >+       deq_ops = &ops[test_data->total_bufs];
>> >> >+
>> >> >+       if (type == RTE_COMP_COMPRESS) {
>> >> >+               xform = (struct rte_comp_xform) {
>> >> >+                       .type = RTE_COMP_COMPRESS,
>> >> >+                       .compress = {
>> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
>> >> >+                               .deflate.huffman = test_data->huffman_enc,
>> >> >+                               .level = level,
>> >> >+                               .window_size = test_data->window_sz,
>> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>> >> >+                       }
>> >> >+               };
>> >> >+               input_bufs = test_data->decomp_bufs;
>> >> >+               output_bufs = test_data->comp_bufs;
>> >> >+       } else {
>> >> >+               xform = (struct rte_comp_xform) {
>> >> >+                       .type = RTE_COMP_DECOMPRESS,
>> >> >+                       .decompress = {
>> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
>> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>> >> >+                               .window_size = test_data->window_sz,
>> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>> >> >+                       }
>> >> >+               };
>> >> >+               input_bufs = test_data->comp_bufs;
>> >> >+               output_bufs = test_data->decomp_bufs;
>> >> >+       }
>> >> >+
>> >> >+       /* Create private xform */
>> >> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
>> >> >+                       &priv_xform) < 0) {
>> >> >+               RTE_LOG(ERR, USER1, "Private xform could not be created\n");
>> >> >+               res = -1;
>> >> >+               goto end;
>> >> >+       }
>> >> >+
>> >> >+       uint64_t tsc_start, tsc_end, tsc_duration;
>> >> >+
>> >> >+       tsc_start = tsc_end = tsc_duration = 0;
>> >> >+       if (benchmarking) {
>> >> >+               tsc_start = rte_rdtsc();
>> >> >+               num_iter = test_data->num_iter;
>> >> >+       } else
>> >> >+               num_iter = 1;
>> >> Looks like in same code we're doing benchmarking and functional
>> validation.
>> >> It can be reorganised to keep validation test separately like done in
>> >> crypto_perf.
>> >
>> >TJ: Ok, makes sense. However in the interests of getting this into the
>> >18.11 release I'd like to defer this refactoring and the remainder of your
>> comments below to the next release.
>> >
>> >
>> >Next comments - WIP
>> >
>> >
>> >Br, Tomek
  
Tomasz Jozwiak Nov. 6, 2018, 9:05 a.m. UTC | #12
> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Tuesday, November 6, 2018 9:16 AM
> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> 
> 
> >-----Original Message-----
> >From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
> >Sent: 06 November 2018 13:34
> >To: Verma, Shally <Shally.Verma@cavium.com>; dev@dpdk.org; Trahe,
> Fiona
> ><fiona.trahe@intel.com>; akhil.goyal@nxp.com
> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >measurement
> >
> >External Email
> >
> >Hi Shally,
> >
> >Please see my comment inline.
> >
> >> -----Original Message-----
> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> Sent: Monday, November 5, 2018 9:34 AM
> >> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara
> >> Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> performance measurement
> >>
> >>
> >>
> >> >-----Original Message-----
> >> >From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
> >> >Sent: 02 November 2018 15:29
> >> >To: dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>;
> >> >akhil.goyal@nxp.com; Verma, Shally <Shally.Verma@cavium.com>; De
> >> >Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >performance measurement
> >> >
> >> >External Email
> >> >
> >> >Hi Shally,
> >> >
> >> >Sorry for delay - I was on sick leave.
> >> >We had some issues with dynamic compression test so I block this
> >> >test in V2. May be there's too late to add this into this release
> >> >but we've decided
> >> to send this V2 to DPDK.
> >> >
> >> >My comment inline (not all have answer so far, still working on
> >> >that)
> >> >
> >> >> -----Original Message-----
> >> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> >> Sent: Friday, October 12, 2018 12:16 PM
> >> >> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De Lara
> >> >> Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >> >> Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> performance measurement
> >> >>
> >> >> HI TomaszX
> >> >>
> >> >> Sorry for delay in response. Comments inline.
> >> >>
> >> >> >-----Original Message-----
> >> >> >From: dev <dev-bounces@dpdk.org> On Behalf Of Tomasz Jozwiak
> >> >> >Sent: 01 October 2018 18:57
> >> >> >To: dev@dpdk.org; fiona.trahe@intel.com;
> >> >> >tomaszx.jozwiak@intel.com; akhil.goyal@nxp.com;
> >> >> >pablo.de.lara.guarch@intel.com
> >> >> >Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> >> >Subject: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> >performance measurement
> >> >> >
> >> >> >External Email
> >> >> >
> >> >> >Added performance measurement part into compression perf. test.
> >> >> >
> >> >> >Signed-off-by: De Lara Guarch, Pablo
> >> >> ><pablo.de.lara.guarch@intel.com>
> >> >> >Signed-off-by: Tomasz Jozwiak <tomaszx.jozwiak@intel.com>
> >> >> >---
> >> >> > app/test-compress-perf/main.c | 844
> >> >> >++++++++++++++++++++++++++++++++++++++++++
> >> >> > 1 file changed, 844 insertions(+)
> >> >> >
> >> >> >diff --git a/app/test-compress-perf/main.c
> >> >> >b/app/test-compress-perf/main.c index f52b98d..093dfaf 100644
> >> >> >--- a/app/test-compress-perf/main.c
> >> >> >+++ b/app/test-compress-perf/main.c
> >> >> >@@ -5,13 +5,721 @@
> >> >> > #include <rte_malloc.h>
> >> >> > #include <rte_eal.h>
> >> >> > #include <rte_log.h>
> >> >> >+#include <rte_cycles.h>
> >> >> > #include <rte_compressdev.h>
> >> >> >
> >> >> > #include "comp_perf_options.h"
> >> >> >
> >> >> >+#define NUM_MAX_XFORMS 16
> >> >> >+#define NUM_MAX_INFLIGHT_OPS 512 #define EXPANSE_RATIO
> 1.05
> >> >> >+#define MIN_ISAL_SIZE 8
> >> >> >+
> >> >> >+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
> >> >> >+
> >> >> >+static int
> >> >> >+param_range_check(uint16_t size, const struct
> >> >> >+rte_param_log2_range
> >> >> >+*range) {
> >> >> >+       unsigned int next_size;
> >> >> >+
> >> >> >+       /* Check lower/upper bounds */
> >> >> >+       if (size < range->min)
> >> >> >+               return -1;
> >> >> >+
> >> >> >+       if (size > range->max)
> >> >> >+               return -1;
> >> >> >+
> >> >> >+       /* If range is actually only one value, size is correct */
> >> >> >+       if (range->increment == 0)
> >> >> >+               return 0;
> >> >> >+
> >> >> >+       /* Check if value is one of the supported sizes */
> >> >> >+       for (next_size = range->min; next_size <= range->max;
> >> >> >+                       next_size += range->increment)
> >> >> >+               if (size == next_size)
> >> >> >+                       return 0;
> >> >> >+
> >> >> >+       return -1;
> >> >> >+}
> >> >> >+
> >> >> >+static int
> >> >> >+comp_perf_check_capabilities(struct comp_test_data *test_data) {
> >> >> >+       const struct rte_compressdev_capabilities *cap;
> >> >> >+
> >> >> >+       cap = rte_compressdev_capability_get(test_data->cdev_id,
> >> >> >+
> >> >> >+ RTE_COMP_ALGO_DEFLATE);
> >> >> >+
> >> >> >+       if (cap == NULL) {
> >> >> >+               RTE_LOG(ERR, USER1,
> >> >> >+                       "Compress device does not support DEFLATE\n");
> >> >> >+               return -1;
> >> >> >+       }
> >> >> >+
> >> >> >+       uint64_t comp_flags = cap->comp_feature_flags;
> >> >> >+
> >> >> >+       /* Huffman enconding */
> >> >> >+       if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED
> &&
> >> >> >+                       (comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
> >> >> >+               RTE_LOG(ERR, USER1,
> >> >> >+                       "Compress device does not supported Fixed
> Huffman\n");
> >> >> >+               return -1;
> >> >> >+       }
> >> >> >+
> >> >> >+       if (test_data->huffman_enc ==
> RTE_COMP_HUFFMAN_DYNAMIC
> >> &&
> >> >> >+                       (comp_flags &
> >> >> >+ RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0)
> >> {
> >> >> >+               RTE_LOG(ERR, USER1,
> >> >> >+                       "Compress device does not supported
> >> >> >+ Dynamic
> >> Huffman\n");
> >> >> >+               return -1;
> >> >> >+       }
> >> >> >+
> >> >> >+       /* Window size */
> >> >> >+       if (test_data->window_sz != -1) {
> >> >> >+               if (param_range_check(test_data->window_sz,
> >> >> >+ &cap->window_size)
> >> >> What if cap->window_size is 0 i.e. implementation default?
> >> >
> >> >TJ: You probably mean cap->window_size.increment = 0 (because
> >> >cap->window_size is a structure). In that case we check if
> >> >test_data->window_sz >=min and test_data->window_sz <= max only,
> >> because increment = 0 means (base on compression API) we have only
> >> one value of windows_size (no range is supported).
> >> But PMD can set min and max too 0 for such case.
> >
> >TJ: I can't see any issue in that case too. Maybe I don't understand what you
> mean but the logic is as follow:
> >1)  if you pass '--window-sz  ...' param. into command line your
> >intention is to force that value of window size during test. We check is this
> value is allow (by param_range_check() function).
> >2) if you plan to use default value - just don't pass '--window-sz'
> >param. in command line at all. In that case we get windows size from
> >window_size.max field, so if window_size.min= window_size.max=0
> test_data->window_sz will be zero, as well.
> >If you mean that behavior is not good - I will be grateful for other
> suggestions.
> 
> This is fine. but I am thinking of 3rd case here:
> c) user pass window sz but PMD window_sz.min = max = 0, then user
> requested windowsz is not applicable right?!

In that case - true. There'll be a failure:
"Compress device does not support this window size\n");
So what is your proposal for that case?





> 
> >
> >>
> >> >
> >> >
> >> >
> >> ....
> >>
> >> >> >+
> >> >> >+               if (fread(data, data_to_read, 1, f) != 1) {
> >> >> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
> >> >> >+                       goto err;
> >> >> >+               }
> >> >> >+               if (fseek(f, 0, SEEK_SET) != 0) {
> >> >> >+                       RTE_LOG(ERR, USER1,
> >> >> >+                               "Size of input could not be calculated\n");
> >> >> >+                       goto err;
> >> >> >+               }
> >> >> >+               remaining_data -= data_to_read;
> >> >> >+               data += data_to_read;
> >> >> It looks like it will run 2nd time only if input file size < input
> >> >> data size in which case it will just keep filling input buffer
> >> >> with repeated
> >> data.
> >> >> Is that the intention here?
> >> >
> >> >TJ: Yes exactly. If test_data->input_data_sz is bigger than
> >> >actual_file_sz then we fill the buffer with repeated data from file
> >> >to fill
> >> whole buffer.
> >> I mentioned in one of the earlier reply, wont that then influence the
> >> compression behaviour and o/p? my suggestion was to work on actual
> >> user provided input to take perf to get actual perf for given content.
> >
> >TJ: You right, but this solution is flexible. You can pass '
> >--extended-input-sz" or not, so you can use original input data or extend it
> if you want.
> Ok. but still not sure if it's really needed. Might be practically most of the time
> it wont be exercised. No hard opinion on this though.
> 
> Thanks
> Shally
> >
> >>
> >> >
> >> >>
> >> ...
> >>
> >> >> >+                       if (data_addr == NULL) {
> >> >> >+                               RTE_LOG(ERR, USER1, "Could not
> >> >> >+ append data\n");
> >> >> Since a new buffer per segment is allocated, so is it possible for
> >> >> append to fail? think, this check is redundant here.
> >> >
> >> >TJ: Yes, you're right, it should never fail. But I think it's good
> >> >coding practice
> >> to add the check just in case.
> >> >
> >> Unless it is called in data path which might cost perf a bit.
> >
> >TJ:  prepare_bufs() is out of perf measurement, so shouldn't impact to
> >measurements. The performance measurement is inside
> >main_loop() only.
> >
> >
> >Br, Tomek
> >
> >>
> >> Thanks
> >> Shally
> >>
> >> >> >+                               return -1;
> >> >> >+                       }
> >> >> >+
> >> >> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
> >> >> >+                       input_data_ptr += data_sz;
> >> >> >+                       remaining_data -= data_sz;
> >> >> >+
> >> >> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
> >> >> >+                                       next_seg) < 0) {
> >> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >> >> >+                               return -1;
> >> >> >+                       }
> >> >> >+                       segs_per_mbuf++;
> >> >> >+               }
> >> >> >+
> >> >> >+               /* Allocate data in output mbuf */
> >> >> >+               test_data->comp_bufs[i] =
> >> >> >+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
> >> >> >+               if (test_data->comp_bufs[i] == NULL) {
> >> >> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
> >> >> >+                       return -1;
> >> >> >+               }
> >> >> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
> >> >> >+                                       test_data->comp_bufs[i],
> >> >> >+                                       test_data->seg_sz);
> >> >> >+               if (data_addr == NULL) {
> >> >> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
> >> >> >+                       return -1;
> >> >> >+               }
> >> >> >+
> >> >> >+               /* Chain mbufs if needed for output mbufs */
> >> >> >+               for (j = 1; j < segs_per_mbuf; j++) {
> >> >> >+                       struct rte_mbuf *next_seg =
> >> >> >+
> >> >> >+ rte_pktmbuf_alloc(test_data->comp_buf_pool);
> >> >> >+
> >> >> >+                       if (next_seg == NULL) {
> >> >> >+                               RTE_LOG(ERR, USER1,
> >> >> >+                                       "Could not allocate mbuf\n");
> >> >> >+                               return -1;
> >> >> >+                       }
> >> >> >+
> >> >> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
> >> >> >+                               test_data->seg_sz);
> >> >> >+
> >> >> >+                       if (data_addr == NULL) {
> >> >> >+                               RTE_LOG(ERR, USER1, "Could not append data\n");
> >> >> >+                               return -1;
> >> >> >+                       }
> >> >> >+
> >> >> >+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
> >> >> >+                                       next_seg) < 0) {
> >> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
> >> >> >+                               return -1;
> >> >> >+                       }
> >> >> >+               }
> >> >> >+       }
> >> >> >+
> >> >> >+       return 0;
> >> >> >+}
> >> >> >+
> >> >> >+static void
> >> >> >+free_bufs(struct comp_test_data *test_data) {
> >> >> >+       uint32_t i;
> >> >> >+
> >> >> >+       for (i = 0; i < test_data->total_bufs; i++) {
> >> >> >+               rte_pktmbuf_free(test_data->comp_bufs[i]);
> >> >> >+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
> >> >> >+       }
> >> >> >+       rte_free(test_data->comp_bufs);
> >> >> >+       rte_free(test_data->decomp_bufs); }
> >> >> >+
> >> >> >+static int
> >> >> >+main_loop(struct comp_test_data *test_data, uint8_t level,
> >> >> >+                       enum rte_comp_xform_type type,
> >> >> >+                       uint8_t *output_data_ptr,
> >> >> >+                       size_t *output_data_sz,
> >> >> >+                       unsigned int benchmarking) {
> >> >> >+       uint8_t dev_id = test_data->cdev_id;
> >> >> >+       uint32_t i, iter, num_iter;
> >> >> >+       struct rte_comp_op **ops, **deq_ops;
> >> >> >+       void *priv_xform = NULL;
> >> >> >+       struct rte_comp_xform xform;
> >> >> >+       size_t output_size = 0;
> >> >> >+       struct rte_mbuf **input_bufs, **output_bufs;
> >> >> >+       int res = 0;
> >> >> >+       int allocated = 0;
> >> >> >+
> >> >> >+       if (test_data == NULL || !test_data->burst_sz) {
> >> >> >+               RTE_LOG(ERR, USER1,
> >> >> >+                       "Unknow burst size\n");
> >> >> >+               return -1;
> >> >> >+       }
> >> >> >+
> >> >> >+       ops = rte_zmalloc_socket(NULL,
> >> >> >+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
> >> >> >+               0, rte_socket_id());
> >> >> >+
> >> >> >+       if (ops == NULL) {
> >> >> >+               RTE_LOG(ERR, USER1,
> >> >> >+                       "Can't allocate memory for ops strucures\n");
> >> >> >+               return -1;
> >> >> >+       }
> >> >> >+
> >> >> >+       deq_ops = &ops[test_data->total_bufs];
> >> >> >+
> >> >> >+       if (type == RTE_COMP_COMPRESS) {
> >> >> >+               xform = (struct rte_comp_xform) {
> >> >> >+                       .type = RTE_COMP_COMPRESS,
> >> >> >+                       .compress = {
> >> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
> >> >> >+                               .deflate.huffman = test_data->huffman_enc,
> >> >> >+                               .level = level,
> >> >> >+                               .window_size = test_data->window_sz,
> >> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
> >> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
> >> >> >+                       }
> >> >> >+               };
> >> >> >+               input_bufs = test_data->decomp_bufs;
> >> >> >+               output_bufs = test_data->comp_bufs;
> >> >> >+       } else {
> >> >> >+               xform = (struct rte_comp_xform) {
> >> >> >+                       .type = RTE_COMP_DECOMPRESS,
> >> >> >+                       .decompress = {
> >> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
> >> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
> >> >> >+                               .window_size = test_data->window_sz,
> >> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
> >> >> >+                       }
> >> >> >+               };
> >> >> >+               input_bufs = test_data->comp_bufs;
> >> >> >+               output_bufs = test_data->decomp_bufs;
> >> >> >+       }
> >> >> >+
> >> >> >+       /* Create private xform */
> >> >> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
> >> >> >+                       &priv_xform) < 0) {
> >> >> >+               RTE_LOG(ERR, USER1, "Private xform could not be
> created\n");
> >> >> >+               res = -1;
> >> >> >+               goto end;
> >> >> >+       }
> >> >> >+
> >> >> >+       uint64_t tsc_start, tsc_end, tsc_duration;
> >> >> >+
> >> >> >+       tsc_start = tsc_end = tsc_duration = 0;
> >> >> >+       if (benchmarking) {
> >> >> >+               tsc_start = rte_rdtsc();
> >> >> >+               num_iter = test_data->num_iter;
> >> >> >+       } else
> >> >> >+               num_iter = 1;
> >> >> Looks like in same code we're doing benchmarking and functional
> >> validation.
> >> >> It can be reorganised to keep validation test separately like done
> >> >> in crypto_perf.
> >> >
> >> >TJ: Ok, makes sense. However in the interests of getting this into
> >> >the
> >> >18.11 release I'd like to defer this refactoring and the remainder
> >> >of your
> >> comments below to the next release.
> >> >
> >> >
> >> >Next comments - WIP
> >> >
> >> >
> >> >Br, Tomek
  
Verma, Shally Nov. 6, 2018, 3:39 p.m. UTC | #13
>-----Original Message-----
>From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
>Sent: 06 November 2018 14:36
>To: Verma, Shally <Shally.Verma@cavium.com>; dev@dpdk.org; Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>> -----Original Message-----
>> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> Sent: Tuesday, November 6, 2018 9:16 AM
>> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
>> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> measurement
...

>> >> >> >+
>> >> >> >+       /* Window size */
>> >> >> >+       if (test_data->window_sz != -1) {
>> >> >> >+               if (param_range_check(test_data->window_sz,
>> >> >> >+ &cap->window_size)
>> >> >> What if cap->window_size is 0 i.e. implementation default?
>> >> >
>> >> >TJ: You probably mean cap->window_size.increment = 0 (because
>> >> >cap->window_size is a structure). In that case we check if
>> >> >test_data->window_sz >=min and test_data->window_sz <= max only,
>> >> because increment = 0 means (base on compression API) we have only
>> >> one value of windows_size (no range is supported).
>> >> But PMD can set min and max too 0 for such case.
>> >
>> >TJ: I can't see any issue in that case too. Maybe I don't understand what you
>> mean but the logic is as follow:
>> >1)  if you pass '--window-sz  ...' param. into command line your
>> >intention is to force that value of window size during test. We check is this
>> value is allow (by param_range_check() function).
>> >2) if you plan to use default value - just don't pass '--window-sz'
>> >param. in command line at all. In that case we get windows size from
>> >window_size.max field, so if window_size.min= window_size.max=0
>> test_data->window_sz will be zero, as well.
>> >If you mean that behavior is not good - I will be grateful for other
>> suggestions.
>>
>> This is fine. but I am thinking of 3rd case here:
>> c) user pass window sz but PMD window_sz.min = max = 0, then user
>> requested windowsz is not applicable right?!
>
>In that case - true. There'll be fail :
>"Compress device does not support this window size\n");
>So what is your proposal for  that case?
>
We can set the window size to the implementation default and add a diagnostic of the window size used for the test run.
No need to fail here, I believe.

Thanks
Shally

>
>
>
>
>>
>> >
>> >>
>> >> >
>> >> >
>> >> >
>> >> ....
>> >>
>> >> >> >+
>> >> >> >+               if (fread(data, data_to_read, 1, f) != 1) {
>> >> >> >+                       RTE_LOG(ERR, USER1, "Input file could not be read\n");
>> >> >> >+                       goto err;
>> >> >> >+               }
>> >> >> >+               if (fseek(f, 0, SEEK_SET) != 0) {
>> >> >> >+                       RTE_LOG(ERR, USER1,
>> >> >> >+                               "Size of input could not be calculated\n");
>> >> >> >+                       goto err;
>> >> >> >+               }
>> >> >> >+               remaining_data -= data_to_read;
>> >> >> >+               data += data_to_read;
>> >> >> It looks like it will run 2nd time only if input file size < input
>> >> >> data size in which case it will just keep filling input buffer
>> >> >> with repeated
>> >> data.
>> >> >> Is that the intention here?
>> >> >
>> >> >TJ: Yes exactly. If test_data->input_data_sz is bigger than
>> >> >actual_file_sz then we fill the buffer with repeated data from file
>> >> >to fill
>> >> whole buffer.
>> >> I mentioned in one of the earlier reply, wont that then influence the
>> >> compression behaviour and o/p? my suggestion was to work on actual
>> >> user provided input to take perf to get actual perf for given content.
>> >
>> >TJ: You right, but this solution is flexible. You can pass '
>> >--extended-input-sz" or not, so you can use original input data or extend it
>> if you want.
>> Ok. but still not sure if it's really needed. Might be practically most of the time
>> it wont be exercised. No hard opinion on this though.
>>
>> Thanks
>> Shally
>> >
>> >>
>> >> >
>> >> >>
>> >> ...
>> >>
>> >> >> >+                       if (data_addr == NULL) {
>> >> >> >+                               RTE_LOG(ERR, USER1, "Could not
>> >> >> >+ append data\n");
>> >> >> Since a new buffer per segment is allocated, so is it possible for
>> >> >> append to fail? think, this check is redundant here.
>> >> >
>> >> >TJ: Yes, you're right, it should never fail. But I think it's good
>> >> >coding practice
>> >> to add the check just in case.
>> >> >
>> >> Unless it is called in data path which might cost perf a bit.
>> >
>> >TJ:  prepare_bufs() is out of perf measurement, so shouldn't impact to
>> >measurements. The performance measurement is inside
>> >main_loop() only.
>> >
>> >
>> >Br, Tomek
>> >
>> >>
>> >> Thanks
>> >> Shally
>> >>
>> >> >> >+                               return -1;
>> >> >> >+                       }
>> >> >> >+
>> >> >> >+                       rte_memcpy(data_addr, input_data_ptr, data_sz);
>> >> >> >+                       input_data_ptr += data_sz;
>> >> >> >+                       remaining_data -= data_sz;
>> >> >> >+
>> >> >> >+                       if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
>> >> >> >+                                       next_seg) < 0) {
>> >> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>> >> >> >+                               return -1;
>> >> >> >+                       }
>> >> >> >+                       segs_per_mbuf++;
>> >> >> >+               }
>> >> >> >+
>> >> >> >+               /* Allocate data in output mbuf */
>> >> >> >+               test_data->comp_bufs[i] =
>> >> >> >+                       rte_pktmbuf_alloc(test_data->comp_buf_pool);
>> >> >> >+               if (test_data->comp_bufs[i] == NULL) {
>> >> >> >+                       RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
>> >> >> >+                       return -1;
>> >> >> >+               }
>> >> >> >+               data_addr = (uint8_t *) rte_pktmbuf_append(
>> >> >> >+                                       test_data->comp_bufs[i],
>> >> >> >+                                       test_data->seg_sz);
>> >> >> >+               if (data_addr == NULL) {
>> >> >> >+                       RTE_LOG(ERR, USER1, "Could not append data\n");
>> >> >> >+                       return -1;
>> >> >> >+               }
>> >> >> >+
>> >> >> >+               /* Chain mbufs if needed for output mbufs */
>> >> >> >+               for (j = 1; j < segs_per_mbuf; j++) {
>> >> >> >+                       struct rte_mbuf *next_seg =
>> >> >> >+
>> >> >> >+ rte_pktmbuf_alloc(test_data->comp_buf_pool);
>> >> >> >+
>> >> >> >+                       if (next_seg == NULL) {
>> >> >> >+                               RTE_LOG(ERR, USER1,
>> >> >> >+                                       "Could not allocate mbuf\n");
>> >> >> >+                               return -1;
>> >> >> >+                       }
>> >> >> >+
>> >> >> >+                       data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
>> >> >> >+                               test_data->seg_sz);
>> >> >> >+
>> >> >> >+                       if (data_addr == NULL) {
>> >> >> >+                               RTE_LOG(ERR, USER1, "Could not append data\n");
>> >> >> >+                               return -1;
>> >> >> >+                       }
>> >> >> >+
>> >> >> >+                       if (rte_pktmbuf_chain(test_data->comp_bufs[i],
>> >> >> >+                                       next_seg) < 0) {
>> >> >> >+                               RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
>> >> >> >+                               return -1;
>> >> >> >+                       }
>> >> >> >+               }
>> >> >> >+       }
>> >> >> >+
>> >> >> >+       return 0;
>> >> >> >+}
>> >> >> >+
>> >> >> >+static void
>> >> >> >+free_bufs(struct comp_test_data *test_data) {
>> >> >> >+       uint32_t i;
>> >> >> >+
>> >> >> >+       for (i = 0; i < test_data->total_bufs; i++) {
>> >> >> >+               rte_pktmbuf_free(test_data->comp_bufs[i]);
>> >> >> >+               rte_pktmbuf_free(test_data->decomp_bufs[i]);
>> >> >> >+       }
>> >> >> >+       rte_free(test_data->comp_bufs);
>> >> >> >+       rte_free(test_data->decomp_bufs); }
>> >> >> >+
>> >> >> >+static int
>> >> >> >+main_loop(struct comp_test_data *test_data, uint8_t level,
>> >> >> >+                       enum rte_comp_xform_type type,
>> >> >> >+                       uint8_t *output_data_ptr,
>> >> >> >+                       size_t *output_data_sz,
>> >> >> >+                       unsigned int benchmarking) {
>> >> >> >+       uint8_t dev_id = test_data->cdev_id;
>> >> >> >+       uint32_t i, iter, num_iter;
>> >> >> >+       struct rte_comp_op **ops, **deq_ops;
>> >> >> >+       void *priv_xform = NULL;
>> >> >> >+       struct rte_comp_xform xform;
>> >> >> >+       size_t output_size = 0;
>> >> >> >+       struct rte_mbuf **input_bufs, **output_bufs;
>> >> >> >+       int res = 0;
>> >> >> >+       int allocated = 0;
>> >> >> >+
>> >> >> >+       if (test_data == NULL || !test_data->burst_sz) {
>> >> >> >+               RTE_LOG(ERR, USER1,
>> >> >> >+                       "Unknow burst size\n");
>> >> >> >+               return -1;
>> >> >> >+       }
>> >> >> >+
>> >> >> >+       ops = rte_zmalloc_socket(NULL,
>> >> >> >+               2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
>> >> >> >+               0, rte_socket_id());
>> >> >> >+
>> >> >> >+       if (ops == NULL) {
>> >> >> >+               RTE_LOG(ERR, USER1,
>> >> >> >+                       "Can't allocate memory for ops strucures\n");
>> >> >> >+               return -1;
>> >> >> >+       }
>> >> >> >+
>> >> >> >+       deq_ops = &ops[test_data->total_bufs];
>> >> >> >+
>> >> >> >+       if (type == RTE_COMP_COMPRESS) {
>> >> >> >+               xform = (struct rte_comp_xform) {
>> >> >> >+                       .type = RTE_COMP_COMPRESS,
>> >> >> >+                       .compress = {
>> >> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
>> >> >> >+                               .deflate.huffman = test_data->huffman_enc,
>> >> >> >+                               .level = level,
>> >> >> >+                               .window_size = test_data->window_sz,
>> >> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>> >> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>> >> >> >+                       }
>> >> >> >+               };
>> >> >> >+               input_bufs = test_data->decomp_bufs;
>> >> >> >+               output_bufs = test_data->comp_bufs;
>> >> >> >+       } else {
>> >> >> >+               xform = (struct rte_comp_xform) {
>> >> >> >+                       .type = RTE_COMP_DECOMPRESS,
>> >> >> >+                       .decompress = {
>> >> >> >+                               .algo = RTE_COMP_ALGO_DEFLATE,
>> >> >> >+                               .chksum = RTE_COMP_CHECKSUM_NONE,
>> >> >> >+                               .window_size = test_data->window_sz,
>> >> >> >+                               .hash_algo = RTE_COMP_HASH_ALGO_NONE
>> >> >> >+                       }
>> >> >> >+               };
>> >> >> >+               input_bufs = test_data->comp_bufs;
>> >> >> >+               output_bufs = test_data->decomp_bufs;
>> >> >> >+       }
>> >> >> >+
>> >> >> >+       /* Create private xform */
>> >> >> >+       if (rte_compressdev_private_xform_create(dev_id, &xform,
>> >> >> >+                       &priv_xform) < 0) {
>> >> >> >+               RTE_LOG(ERR, USER1, "Private xform could not be
>> created\n");
>> >> >> >+               res = -1;
>> >> >> >+               goto end;
>> >> >> >+       }
>> >> >> >+
>> >> >> >+       uint64_t tsc_start, tsc_end, tsc_duration;
>> >> >> >+
>> >> >> >+       tsc_start = tsc_end = tsc_duration = 0;
>> >> >> >+       if (benchmarking) {
>> >> >> >+               tsc_start = rte_rdtsc();
>> >> >> >+               num_iter = test_data->num_iter;
>> >> >> >+       } else
>> >> >> >+               num_iter = 1;
>> >> >> Looks like in same code we're doing benchmarking and functional
>> >> validation.
>> >> >> It can be reorganised to keep validation test separately like done
>> >> >> in crypto_perf.
>> >> >
>> >> >TJ: Ok, makes sense. However in the interests of getting this into
>> >> >the
>> >> >18.11 release I'd like to defer this refactoring and the remainder
>> >> >of your
>> >> comments below to the next release.
>> >> >
>> >> >
>> >> >Next comments - WIP
>> >> >
>> >> >
>> >> >Br, Tomek
  
Tomasz Jozwiak Nov. 7, 2018, 10:18 a.m. UTC | #14
> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Tuesday, November 6, 2018 4:40 PM
> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org; Trahe,
> Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> 
> 
> >-----Original Message-----
> >From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
> >Sent: 06 November 2018 14:36
> >To: Verma, Shally <Shally.Verma@cavium.com>; dev@dpdk.org; Trahe,
> Fiona
> ><fiona.trahe@intel.com>; akhil.goyal@nxp.com
> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >measurement
> >
> >External Email
> >
> >> -----Original Message-----
> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> Sent: Tuesday, November 6, 2018 9:16 AM
> >> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> performance measurement
> ...
> 
> >> >> >> >+
> >> >> >> >+       /* Window size */
> >> >> >> >+       if (test_data->window_sz != -1) {
> >> >> >> >+               if (param_range_check(test_data->window_sz,
> >> >> >> >+ &cap->window_size)
> >> >> >> What if cap->window_size is 0 i.e. implementation default?
> >> >> >
> >> >> >TJ: You probably mean cap->window_size.increment = 0 (because
> >> >> >cap->window_size is a structure). In that case we check if
> >> >> >test_data->window_sz >=min and test_data->window_sz <= max
> only,
> >> >> because increment = 0 means (base on compression API) we have only
> >> >> one value of windows_size (no range is supported).
> >> >> But PMD can set min and max too 0 for such case.
> >> >
> >> >TJ: I can't see any issue in that case too. Maybe I don't understand
> >> >what you
> >> mean but the logic is as follow:
> >> >1)  if you pass '--window-sz  ...' param. into command line your
> >> >intention is to force that value of window size during test. We
> >> >check is this
> >> value is allow (by param_range_check() function).
> >> >2) if you plan to use default value - just don't pass '--window-sz'
> >> >param. in command line at all. In that case we get windows size from
> >> >window_size.max field, so if window_size.min= window_size.max=0
> >> test_data->window_sz will be zero, as well.
> >> >If you mean that behavior is not good - I will be grateful for other
> >> suggestions.
> >>
> >> This is fine. but I am thinking of 3rd case here:
> >> c) user pass window sz but PMD window_sz.min = max = 0, then user
> >> requested windowsz is not applicable right?!
> >
> >In that case - true. There'll be fail :
> >"Compress device does not support this window size\n"); So what is your
> >proposal for  that case?
> >
> We can set to window size to implementation default and add in diagnostic
> of used window sz for test run.
> No need to fail here I believe.
> 
> Thanks
> Shally

Ok, I'll try to implement that feature in V3


Br, Tomek
  
Fiona Trahe Nov. 10, 2018, 12:54 a.m. UTC | #15
Hi Shally, Tomasz,

> > >> >> >> >+       /* Window size */
> > >> >> >> >+       if (test_data->window_sz != -1) {
> > >> >> >> >+               if (param_range_check(test_data->window_sz,
> > >> >> >> >+ &cap->window_size)
> > >> >> >> What if cap->window_size is 0 i.e. implementation default?
> > >> >> >
> > >> >> >TJ: You probably mean cap->window_size.increment = 0 (because
> > >> >> >cap->window_size is a structure). In that case we check if
> > >> >> >test_data->window_sz >=min and test_data->window_sz <= max
> > only,
> > >> >> because increment = 0 means (base on compression API) we have only
> > >> >> one value of windows_size (no range is supported).
> > >> >> But PMD can set min and max too 0 for such case.
> > >> >
> > >> >TJ: I can't see any issue in that case too. Maybe I don't understand
> > >> >what you
> > >> mean but the logic is as follow:
> > >> >1)  if you pass '--window-sz  ...' param. into command line your
> > >> >intention is to force that value of window size during test. We
> > >> >check is this
> > >> value is allow (by param_range_check() function).
> > >> >2) if you plan to use default value - just don't pass '--window-sz'
> > >> >param. in command line at all. In that case we get windows size from
> > >> >window_size.max field, so if window_size.min= window_size.max=0
> > >> test_data->window_sz will be zero, as well.
> > >> >If you mean that behavior is not good - I will be grateful for other
> > >> suggestions.
> > >>
> > >> This is fine. but I am thinking of 3rd case here:
> > >> c) user pass window sz but PMD window_sz.min = max = 0, then user
> > >> requested windowsz is not applicable right?!
> > >
> > >In that case - true. There'll be fail :
> > >"Compress device does not support this window size\n"); So what is your
> > >proposal for  that case?
> > >
> > We can set to window size to implementation default and add in diagnostic
> > of used window sz for test run.
> > No need to fail here I believe.

[Fiona] For Window size capability reported by the PMD in the info struct 
it is not valid to report min=0, max=0. The PMD must report the range it can
handle - the API doesn't suggest otherwise. 
On the xform a specific window size is requested of the PMD; if it doesn't support
this, it's allowed to fall back to a lower size according to the API.
However that doesn't mean the PMD can pick any size if it doesn't support the
requested size, i.e. it can't pick a bigger size, just a smaller one. 
If an application requests a smaller window size
than a PMD supports, it can be that the decompression engine
will be unable to decompress if a larger window is used, so the PMD 
should only fall back to a smaller size.
Based on above, I think the perf tool behaviour is ok. 
It should pass the user requested value to the PMD if the PMD capabilities support it.
If not it should fail. If the user wants to measure with a different window size they can
pass in that parameter.
The functional test suite can be used to validate the case where the PMD
falls back - this is not what the perf tool is for.
Does this make sense?

@Shally, do you think we need an API change to support an unlimited set of window sizes?
If so can you explain why?
  
Verma, Shally Nov. 12, 2018, 4:45 a.m. UTC | #16
>-----Original Message-----
>From: Trahe, Fiona <fiona.trahe@intel.com>
>Sent: 10 November 2018 06:24
>To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; Verma, Shally <Shally.Verma@cavium.com>; dev@dpdk.org;
>akhil.goyal@nxp.com
>Cc: Trahe, Fiona <fiona.trahe@intel.com>
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Hi Shally, Tomasz,
>
>> > >> >> >> >+       /* Window size */
>> > >> >> >> >+       if (test_data->window_sz != -1) {
>> > >> >> >> >+               if (param_range_check(test_data->window_sz,
>> > >> >> >> >+ &cap->window_size)
>> > >> >> >> What if cap->window_size is 0 i.e. implementation default?
>> > >> >> >
>> > >> >> >TJ: You probably mean cap->window_size.increment = 0 (because
>> > >> >> >cap->window_size is a structure). In that case we check if
>> > >> >> >test_data->window_sz >=min and test_data->window_sz <= max
>> > only,
>> > >> >> because increment = 0 means (base on compression API) we have only
>> > >> >> one value of windows_size (no range is supported).
>> > >> >> But PMD can set min and max too 0 for such case.
>> > >> >
>> > >> >TJ: I can't see any issue in that case too. Maybe I don't understand
>> > >> >what you
>> > >> mean but the logic is as follow:
>> > >> >1)  if you pass '--window-sz  ...' param. into command line your
>> > >> >intention is to force that value of window size during test. We
>> > >> >check is this
>> > >> value is allow (by param_range_check() function).
>> > >> >2) if you plan to use default value - just don't pass '--window-sz'
>> > >> >param. in command line at all. In that case we get windows size from
>> > >> >window_size.max field, so if window_size.min= window_size.max=0
>> > >> test_data->window_sz will be zero, as well.
>> > >> >If you mean that behavior is not good - I will be grateful for other
>> > >> suggestions.
>> > >>
>> > >> This is fine. but I am thinking of 3rd case here:
>> > >> c) user pass window sz but PMD window_sz.min = max = 0, then user
>> > >> requested windowsz is not applicable right?!
>> > >
>> > >In that case - true. There'll be fail :
>> > >"Compress device does not support this window size\n"); So what is your
>> > >proposal for  that case?
>> > >
>> > We can set to window size to implementation default and add in diagnostic
>> > of used window sz for test run.
>> > No need to fail here I believe.
>
>[Fiona] For Window size capability reported by the PMD in the info struct
>it is not valid to report min=0, max=0. The PMD must report the range it can
>handle - the API doesn't suggest otherwise.
>On the xform a specific window size is requested of the PMD, if it doesn't support
>this it's allowed to fall back to a lower size according to the API.
>However that doesn't mean the PMD can pick any size if it doesn't support the
>requested size, i.e. it can't pick a bigger size, just a smaller one.
>If an application requests a smaller window size
>than a PMD supports, it can be that the decompression engine
>will be unable to decompress if a larger window is used, so the PMD
>should only fall back to a smaller size.
>Based on above, I think the perf tool behaviour is ok.
>It should pass the user requested value to the PMD if the PMD capabilities support it.
Agree to this. However my point is what if PMD just leave these window sz values as 0, meaning implementation default i.e.
internally used fixed value used by the PMD for lookup in both compression and decompression. But if we are not supporting window sz = 0 on the API then it's fine, no need to handle this special case. However, given that, we need to add a comment on the capability field: the PMD must set it to some non-zero value, and 0 is not a valid case to handle.

>If not it should fail. If the user wants to measure with a different window size they can
>pass in that parameter.
>The functional test suite can be used to validate the case where the PMD
>falls back - this is not what the perf tool is for.
>Does this make sense?
>
>@Shally, do you think we need an API change to support an unlimited set of window sizes?
>If so can you explain why?
No. I don't intend to add support for something like an unlimited window sz, as that isn't a known use-case. Also, I didn't mean window sz = 0 to be interpreted as unlimited window sz. I just meant 0 = implementation default window sz, if that's supported in the compression spec.

Thanks
Shally
>
  
Tomasz Jozwiak Nov. 30, 2018, 2:43 p.m. UTC | #17
Hi Shally,

I'm about to send V5 of the compression-perf tool.

Our performance testing shows that the number of sgls in a chain can be a factor in the performance.
So we want to keep this on the cmd line for the performance tool.
There are alternatives, like setting the input size and segment size to get the num segments desired, but I prefer
to have the option to specify the num segments explicitly.
We'll document that if the max-num-sgl-segs x seg_sz > input size then segments number in the chain will be lower ( to store all the data)
As regards adding the max_nb_segments_per_sgl into the rte_compressdev_info struct we're investigating
another workaround to this limitation in QAT, so will leave this off the API unless some other PMD needs it.
In the meantime we'll document the limitation in QAT.

Please let me know your thoughts.

--
Tomek

> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Wednesday, October 17, 2018 6:48 PM
> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> akhil.goyal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> 
> 
> >-----Original Message-----
> >From: Trahe, Fiona <fiona.trahe@intel.com>
> >Sent: 17 October 2018 22:15
> >To: Verma, Shally <Shally.Verma@cavium.com>; Daly, Lee
> ><lee.daly@intel.com>
> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >akhil.goyal@nxp.com; Trahe, Fiona <fiona.trahe@intel.com>
> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >measurement
> >
> >External Email
> >
> >> -----Original Message-----
> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> Sent: Wednesday, October 17, 2018 8:43 AM
> >> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee
> >> <lee.daly@intel.com>
> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> akhil.goyal@nxp.com
> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> performance measurement
> >>
> >>
> >>
> >> >-----Original Message-----
> >> >From: Trahe, Fiona <fiona.trahe@intel.com>
> >> >Sent: 17 October 2018 20:04
> >> >To: Daly, Lee <lee.daly@intel.com>; Verma, Shally
> >> ><Shally.Verma@cavium.com>
> >> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> >akhil.goyal@nxp.com; Trahe, Fiona
> >> <fiona.trahe@intel.com>
> >> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >performance measurement
> >> >
> >> >External Email
> >> >
> >> >Hi Shally, Lee,
> >> >
> >> >> -----Original Message-----
> >> >> From: Daly, Lee
> >> >> Sent: Monday, October 15, 2018 8:10 AM
> >> >> To: Verma, Shally <Shally.Verma@cavium.com>
> >> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> >> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> performance measurement
> >> >>
> >> >> Thanks for your input Shally see comments below.
> >> >>
> >> >>
> >> >> I will be reviewing these changes while Tomasz is out this week.
> >> >>
> >> >> > -----Original Message-----
> >> >> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma,
> >> >> > Shally
> >> >> > Sent: Friday, October 12, 2018 11:16 AM
> >> >> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>;
> dev@dpdk.org;
> >> >> > Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De
> >> >> > Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >> >> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> >> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> > performance measurement
> >> >> >
> >> >///
> >> >
> >> >> >Also, why do we need --max-num-
> >> >> > sgl-segs as an input option from user? Shouldn't input_sz and
> >> >> >seg_sz  internally decide on num-segs?
> >> >> > Or is it added to serve some other different purpose?
> >> >> Will have to get back to you on this one, seems illogical to get
> >> >> this input from user, But I will have to do further investigation to find if
> there was a different purpose.
> >> >
> >> >[Fiona] Some PMDs have a limit on how many links can be in an sgl
> >> >chain, e.g. in QAT case the PMD allocates a pool of internal
> >> >structures of a suitable size during device initialisation, this is not a hard
> limit but can be configured in .config to give the user control over the
> memory resources allocated.
> >> >This perf-tool max-num-sgl-segs is so the user can pick a value <=
> whatever the PMD's max is.
> >>
> >> Then also, I believe this could be taken care internally by an app.
> >> App can choose convenient number of sgl segs as per PMD capability
> >> and input sz and chunk sz selected by user.
> >> Just my thoughts.
> >[Fiona] Then we'd need to add this capability to the API, e.g. add
> >uint16_t max_nb_segments_per_sgl into the rte_compressdev_info struct.
> >Special case 0 means no limit.
> >We did consider this before, I can't remember why we didn't do it, I think
> it's needed.
> >I'll push an API patch for this in 19.02 and we can remove the
> >--max-num-sgl-segs param from the performance tool and hardcode it in
> the tool in the meantime.
> >Ok?
> Yea. Sounds better.
  
Verma, Shally Dec. 2, 2018, 6:39 a.m. UTC | #18
Ok. Then to keep it simple can we keep input sz and max-num-segs-sgl at cmd line input. I don't think segsz is required to input then?

Thanks
Shally 

>-----Original Message-----
>From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
>Sent: 30 November 2018 20:13
>To: Verma, Shally <Shally.Verma@cavium.com>; Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
>Cc: dev@dpdk.org; akhil.goyal@nxp.com
>Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance measurement
>
>External Email
>
>Hi Shally,
>
>I'm about of sending V5 of compression-perf tool.
>
>Our performance testing shows that the number of sgls in a chain can be a factor in the performance.
>So we want to keep this on the cmd line for the performance tool.
>There are alternatives, like setting the input size and segment size to get the num segments desired, but I prefer
>to have the option to specify the num segments explicitly.
>We'll document that if the max-num-sgl-segs x seg_sz > input size then segments number in the chain will be lower ( to store all the
>data)
>As regards adding the max_nb_segments_per_sgl into the rte_compressdev_info struct we're investigating
>another workaround to this limitation in QAT, so will leave this off the API unless some other PMD needs it.
>In the meantime we'll document the limitation in QAT.
>
>Please let me know your thoughts.
>
>--
>Tomek
>
>> -----Original Message-----
>> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> Sent: Wednesday, October 17, 2018 6:48 PM
>> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
>> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
>> akhil.goyal@nxp.com
>> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> measurement
>>
>>
>>
>> >-----Original Message-----
>> >From: Trahe, Fiona <fiona.trahe@intel.com>
>> >Sent: 17 October 2018 22:15
>> >To: Verma, Shally <Shally.Verma@cavium.com>; Daly, Lee
>> ><lee.daly@intel.com>
>> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
>> >akhil.goyal@nxp.com; Trahe, Fiona <fiona.trahe@intel.com>
>> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
>> >measurement
>> >
>> >External Email
>> >
>> >> -----Original Message-----
>> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
>> >> Sent: Wednesday, October 17, 2018 8:43 AM
>> >> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee
>> >> <lee.daly@intel.com>
>> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
>> >> akhil.goyal@nxp.com
>> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
>> >> performance measurement
>> >>
>> >>
>> >>
>> >> >-----Original Message-----
>> >> >From: Trahe, Fiona <fiona.trahe@intel.com>
>> >> >Sent: 17 October 2018 20:04
>> >> >To: Daly, Lee <lee.daly@intel.com>; Verma, Shally
>> >> ><Shally.Verma@cavium.com>
>> >> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
>> >> >akhil.goyal@nxp.com; Trahe, Fiona
>> >> <fiona.trahe@intel.com>
>> >> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
>> >> >performance measurement
>> >> >
>> >> >External Email
>> >> >
>> >> >Hi Shally, Lee,
>> >> >
>> >> >> -----Original Message-----
>> >> >> From: Daly, Lee
>> >> >> Sent: Monday, October 15, 2018 8:10 AM
>> >> >> To: Verma, Shally <Shally.Verma@cavium.com>
>> >> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
>> >> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
>> >> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
>> >> >> performance measurement
>> >> >>
>> >> >> Thanks for your input Shally see comments below.
>> >> >>
>> >> >>
>> >> >> I will be reviewing these changes while Tomasz is out this week.
>> >> >>
>> >> >> > -----Original Message-----
>> >> >> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma,
>> >> >> > Shally
>> >> >> > Sent: Friday, October 12, 2018 11:16 AM
>> >> >> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>;
>> dev@dpdk.org;
>> >> >> > Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De
>> >> >> > Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
>> >> >> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
>> >> >> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
>> >> >> > performance measurement
>> >> >> >
>> >> >///
>> >> >
>> >> >> >Also, why do we need --max-num-
>> >> >> > sgl-segs as an input option from user? Shouldn't input_sz and
>> >> >> >seg_sz  internally decide on num-segs?
>> >> >> > Or is it added to serve some other different purpose?
>> >> >> Will have to get back to you on this one, seems illogical to get
>> >> >> this input from user, But I will have to do further investigation to find if
>> there was a different purpose.
>> >> >
>> >> >[Fiona] Some PMDs have a limit on how many links can be in an sgl
>> >> >chain, e.g. in QAT case the PMD allocates a pool of internal
>> >> >structures of a suitable size during device initialisation, this is not a hard
>> limit but can be configured in .config to give the user control over the
>> memory resources allocated.
>> >> >This perf-tool max-num-sgl-segs is so the user can pick a value <=
>> whatever the PMD's max is.
>> >>
>> >> Then also, I believe this could be taken care internally by an app.
>> >> App can choose convenient number of sgl segs as per PMD capability
>> >> and input sz and chunk sz selected by user.
>> >> Just my thoughts.
>> >[Fiona] Then we'd need to add this capability to the API, e.g. add
>> >uint16_t max_nb_segments_per_sgl into the rte_compressdev_info struct.
>> >Special case 0 means no limit.
>> >We did consider this before, I can't remember why we didn't do it, I think
>> it's needed.
>> >I'll push an API patch for this in 19.02 and we can remove the
>> >--max-num-sgl-segs param from the performance tool and hardcode it in
>> the tool in the meantime.
>> >Ok?
>> Yea. Sounds better.
  
Tomasz Jozwiak Dec. 5, 2018, 8:51 a.m. UTC | #19
Hi Shally,

> I don't think segsz is required to input then?

Yes, this param. together with others like: input sz and max-num-segs-sgl
gives possibility to find the best performance values set for each PMD, so let's keep it.


Br, Tomek




> -----Original Message-----
> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> Sent: Sunday, December 2, 2018 7:40 AM
> To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; Trahe, Fiona
> <fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
> Cc: dev@dpdk.org; akhil.goyal@nxp.com
> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> measurement
> 
> Ok. Then to keep it simple can we keep input sz and max-num-segs-sgl at
> cmd line input. I don't think segsz is required to input then?
> 
> Thanks
> Shally
> 
> >-----Original Message-----
> >From: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>
> >Sent: 30 November 2018 20:13
> >To: Verma, Shally <Shally.Verma@cavium.com>; Trahe, Fiona
> ><fiona.trahe@intel.com>; Daly, Lee <lee.daly@intel.com>
> >Cc: dev@dpdk.org; akhil.goyal@nxp.com
> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add performance
> >measurement
> >
> >External Email
> >
> >Hi Shally,
> >
> >I'm about of sending V5 of compression-perf tool.
> >
> >Our performance testing shows that the number of sgls in a chain can be a
> factor in the performance.
> >So we want to keep this on the cmd line for the performance tool.
> >There are alternatives, like setting the input size and segment size to
> >get the num segments desired, but I prefer to have the option to specify
> the num segments explicitly.
> >We'll document that if the max-num-sgl-segs x seg_sz > input size then
> >segments number in the chain will be lower ( to store all the
> >data)
> >As regards adding the max_nb_segments_per_sgl into the
> >rte_compressdev_info struct we're investigating another workaround to
> this limitation in QAT, so will leave this off the API unless some other PMD
> needs it.
> >In the meantime we'll document the limitation in QAT.
> >
> >Please let me know your thoughts.
> >
> >--
> >Tomek
> >
> >> -----Original Message-----
> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> Sent: Wednesday, October 17, 2018 6:48 PM
> >> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee
> >> <lee.daly@intel.com>
> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> akhil.goyal@nxp.com
> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> performance measurement
> >>
> >>
> >>
> >> >-----Original Message-----
> >> >From: Trahe, Fiona <fiona.trahe@intel.com>
> >> >Sent: 17 October 2018 22:15
> >> >To: Verma, Shally <Shally.Verma@cavium.com>; Daly, Lee
> >> ><lee.daly@intel.com>
> >> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> >akhil.goyal@nxp.com; Trahe, Fiona <fiona.trahe@intel.com>
> >> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >performance measurement
> >> >
> >> >External Email
> >> >
> >> >> -----Original Message-----
> >> >> From: Verma, Shally [mailto:Shally.Verma@cavium.com]
> >> >> Sent: Wednesday, October 17, 2018 8:43 AM
> >> >> To: Trahe, Fiona <fiona.trahe@intel.com>; Daly, Lee
> >> >> <lee.daly@intel.com>
> >> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> >> akhil.goyal@nxp.com
> >> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> performance measurement
> >> >>
> >> >>
> >> >>
> >> >> >-----Original Message-----
> >> >> >From: Trahe, Fiona <fiona.trahe@intel.com>
> >> >> >Sent: 17 October 2018 20:04
> >> >> >To: Daly, Lee <lee.daly@intel.com>; Verma, Shally
> >> >> ><Shally.Verma@cavium.com>
> >> >> >Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>; dev@dpdk.org;
> >> >> >akhil.goyal@nxp.com; Trahe, Fiona
> >> >> <fiona.trahe@intel.com>
> >> >> >Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> >performance measurement
> >> >> >
> >> >> >External Email
> >> >> >
> >> >> >Hi Shally, Lee,
> >> >> >
> >> >> >> -----Original Message-----
> >> >> >> From: Daly, Lee
> >> >> >> Sent: Monday, October 15, 2018 8:10 AM
> >> >> >> To: Verma, Shally <Shally.Verma@cavium.com>
> >> >> >> Cc: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>;
> dev@dpdk.org;
> >> >> >> Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com
> >> >> >> Subject: RE: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> >> performance measurement
> >> >> >>
> >> >> >> Thanks for your input Shally see comments below.
> >> >> >>
> >> >> >>
> >> >> >> I will be reviewing these changes while Tomasz is out this week.
> >> >> >>
> >> >> >> > -----Original Message-----
> >> >> >> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Verma,
> >> >> >> > Shally
> >> >> >> > Sent: Friday, October 12, 2018 11:16 AM
> >> >> >> > To: Jozwiak, TomaszX <tomaszx.jozwiak@intel.com>;
> >> dev@dpdk.org;
> >> >> >> > Trahe, Fiona <fiona.trahe@intel.com>; akhil.goyal@nxp.com; De
> >> >> >> > Lara Guarch, Pablo <pablo.de.lara.guarch@intel.com>
> >> >> >> > Cc: De@dpdk.org; Lara@dpdk.org; Guarch@dpdk.org
> >> >> >> > Subject: Re: [dpdk-dev] [PATCH 2/3] app/compress-perf: add
> >> >> >> > performance measurement
> >> >> >> >
> >> >> >///
> >> >> >
> >> >> >> >Also, why do we need --max-num-  sgl-segs as an input option
> >> >> >> >from user? Shouldn't input_sz and seg_sz  internally decide on
> >> >> >> >num-segs?
> >> >> >> > Or is it added to serve some other different purpose?
> >> >> >> Will have to get back to you on this one, seems illogical to
> >> >> >> get this input from user, But I will have to do further
> >> >> >> investigation to find if
> >> there was a different purpose.
> >> >> >
> >> >> >[Fiona] Some PMDs have a limit on how many links can be in an sgl
> >> >> >chain, e.g. in QAT case the PMD allocates a pool of internal
> >> >> >structures of a suitable size during device initialisation, this
> >> >> >is not a hard
> >> limit but can be configured in .config to give the user control over
> >> the memory resources allocated.
> >> >> >This perf-tool max-num-sgl-segs is so the user can pick a value
> >> >> ><=
> >> whatever the PMD's max is.
> >> >>
> >> >> Then also, I believe this could be taken care internally by an app.
> >> >> App can choose convenient number of sgl segs as per PMD capability
> >> >> and input sz and chunk sz selected by user.
> >> >> Just my thoughts.
> >> >[Fiona] Then we'd need to add this capability to the API, e.g. add
> >> >uint16_t max_nb_segments_per_sgl into the rte_compressdev_info
> struct.
> >> >Special case 0 means no limit.
> >> >We did consider this before, I can't remember why we didn't do it, I
> >> >think
> >> it's needed.
> >> >I'll push an API patch for this in 19.02 and we can remove the
> >> >--max-num-sgl-segs param from the performance tool and hardcode it
> >> >in
> >> the tool in the meantime.
> >> >Ok?
> >> Yea. Sounds better.
  

Patch

diff --git a/app/test-compress-perf/main.c b/app/test-compress-perf/main.c
index f52b98d..093dfaf 100644
--- a/app/test-compress-perf/main.c
+++ b/app/test-compress-perf/main.c
@@ -5,13 +5,721 @@ 
 #include <rte_malloc.h>
 #include <rte_eal.h>
 #include <rte_log.h>
+#include <rte_cycles.h>
 #include <rte_compressdev.h>
 
 #include "comp_perf_options.h"
 
+#define NUM_MAX_XFORMS 16
+#define NUM_MAX_INFLIGHT_OPS 512
+#define EXPANSE_RATIO 1.05
+#define MIN_ISAL_SIZE 8
+
+#define DIV_CEIL(a, b)  ((a) / (b) + ((a) % (b) != 0))
+
+static int
+param_range_check(uint16_t size, const struct rte_param_log2_range *range)
+{
+	unsigned int next_size;
+
+	/* Check lower/upper bounds */
+	if (size < range->min)
+		return -1;
+
+	if (size > range->max)
+		return -1;
+
+	/* If range is actually only one value, size is correct */
+	if (range->increment == 0)
+		return 0;
+
+	/* Check if value is one of the supported sizes */
+	for (next_size = range->min; next_size <= range->max;
+			next_size += range->increment)
+		if (size == next_size)
+			return 0;
+
+	return -1;
+}
+
+static int
+comp_perf_check_capabilities(struct comp_test_data *test_data)
+{
+	const struct rte_compressdev_capabilities *cap;
+
+	cap = rte_compressdev_capability_get(test_data->cdev_id,
+					     RTE_COMP_ALGO_DEFLATE);
+
+	if (cap == NULL) {
+		RTE_LOG(ERR, USER1,
+			"Compress device does not support DEFLATE\n");
+		return -1;
+	}
+
+	uint64_t comp_flags = cap->comp_feature_flags;
+
+	/* Huffman enconding */
+	if (test_data->huffman_enc == RTE_COMP_HUFFMAN_FIXED &&
+			(comp_flags & RTE_COMP_FF_HUFFMAN_FIXED) == 0) {
+		RTE_LOG(ERR, USER1,
+			"Compress device does not supported Fixed Huffman\n");
+		return -1;
+	}
+
+	if (test_data->huffman_enc == RTE_COMP_HUFFMAN_DYNAMIC &&
+			(comp_flags & RTE_COMP_FF_HUFFMAN_DYNAMIC) == 0) {
+		RTE_LOG(ERR, USER1,
+			"Compress device does not supported Dynamic Huffman\n");
+		return -1;
+	}
+
+	/* Window size */
+	if (test_data->window_sz != -1) {
+		if (param_range_check(test_data->window_sz, &cap->window_size)
+				< 0) {
+			RTE_LOG(ERR, USER1,
+				"Compress device does not support "
+				"this window size\n");
+			return -1;
+		}
+	} else
+		/* Set window size to PMD maximum if none was specified */
+		test_data->window_sz = cap->window_size.max;
+
+	/* Check if chained mbufs is supported */
+	if (test_data->max_sgl_segs > 1  &&
+			(comp_flags & RTE_COMP_FF_OOP_SGL_IN_SGL_OUT) == 0) {
+		RTE_LOG(INFO, USER1, "Compress device does not support "
+				"chained mbufs. Max SGL segments set to 1\n");
+		test_data->max_sgl_segs = 1;
+	}
+
+	/* Level 0 support */
+	if (test_data->level.min == 0 &&
+			(comp_flags & RTE_COMP_FF_NONCOMPRESSED_BLOCKS) == 0) {
+		RTE_LOG(ERR, USER1, "Compress device does not support "
+				"level 0 (no compression)\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+comp_perf_allocate_memory(struct comp_test_data *test_data)
+{
+	/* Number of segments for input and output
+	 * (compression and decompression)
+	 */
+	uint32_t total_segs = DIV_CEIL(test_data->input_data_sz,
+			test_data->seg_sz);
+	test_data->comp_buf_pool = rte_pktmbuf_pool_create("comp_buf_pool",
+				total_segs,
+				0, 0, test_data->seg_sz + RTE_PKTMBUF_HEADROOM,
+				rte_socket_id());
+	if (test_data->comp_buf_pool == NULL) {
+		RTE_LOG(ERR, USER1, "Mbuf mempool could not be created\n");
+		return -1;
+	}
+
+	test_data->decomp_buf_pool = rte_pktmbuf_pool_create("decomp_buf_pool",
+				total_segs,
+				0, 0, test_data->seg_sz + RTE_PKTMBUF_HEADROOM,
+				rte_socket_id());
+	if (test_data->decomp_buf_pool == NULL) {
+		RTE_LOG(ERR, USER1, "Mbuf mempool could not be created\n");
+		return -1;
+	}
+
+	test_data->total_bufs = DIV_CEIL(total_segs, test_data->max_sgl_segs);
+
+	test_data->op_pool = rte_comp_op_pool_create("op_pool",
+				  test_data->total_bufs,
+				  0, 0, rte_socket_id());
+	if (test_data->op_pool == NULL) {
+		RTE_LOG(ERR, USER1, "Comp op mempool could not be created\n");
+		return -1;
+	}
+
+	/*
+	 * Compressed data might be a bit larger than input data,
+	 * if data cannot be compressed
+	 */
+	test_data->compressed_data = rte_zmalloc_socket(NULL,
+				test_data->input_data_sz * EXPANSE_RATIO
+							+ MIN_ISAL_SIZE, 0,
+				rte_socket_id());
+	if (test_data->compressed_data == NULL) {
+		RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
+				"file could not be allocated\n");
+		return -1;
+	}
+
+	test_data->decompressed_data = rte_zmalloc_socket(NULL,
+				test_data->input_data_sz, 0,
+				rte_socket_id());
+	if (test_data->decompressed_data == NULL) {
+		RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
+				"file could not be allocated\n");
+		return -1;
+	}
+
+	test_data->comp_bufs = rte_zmalloc_socket(NULL,
+			test_data->total_bufs * sizeof(struct rte_mbuf *),
+			0, rte_socket_id());
+	if (test_data->comp_bufs == NULL) {
+		RTE_LOG(ERR, USER1, "Memory to hold the compression mbufs"
+				" could not be allocated\n");
+		return -1;
+	}
+
+	test_data->decomp_bufs = rte_zmalloc_socket(NULL,
+			test_data->total_bufs * sizeof(struct rte_mbuf *),
+			0, rte_socket_id());
+	if (test_data->decomp_bufs == NULL) {
+		RTE_LOG(ERR, USER1, "Memory to hold the decompression mbufs"
+				" could not be allocated\n");
+		return -1;
+	}
+	return 0;
+}
+
+static int
+comp_perf_dump_input_data(struct comp_test_data *test_data)
+{
+	FILE *f = fopen(test_data->input_file, "r");
+
+	if (f == NULL) {
+		RTE_LOG(ERR, USER1, "Input file could not be opened\n");
+		return -1;
+	}
+
+	if (fseek(f, 0, SEEK_END) != 0) {
+		RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
+		goto err;
+	}
+	size_t actual_file_sz = ftell(f);
+	/* If extended input data size has not been set,
+	 * input data size = file size
+	 */
+
+	if (test_data->input_data_sz == 0)
+		test_data->input_data_sz = actual_file_sz;
+
+	if (fseek(f, 0, SEEK_SET) != 0) {
+		RTE_LOG(ERR, USER1, "Size of input could not be calculated\n");
+		goto err;
+	}
+
+	test_data->input_data = rte_zmalloc_socket(NULL,
+				test_data->input_data_sz, 0, rte_socket_id());
+
+	if (test_data->input_data == NULL) {
+		RTE_LOG(ERR, USER1, "Memory to hold the data from the input "
+				"file could not be allocated\n");
+		goto err;
+	}
+
+	size_t remaining_data = test_data->input_data_sz;
+	uint8_t *data = test_data->input_data;
+
+	while (remaining_data > 0) {
+		size_t data_to_read = RTE_MIN(remaining_data, actual_file_sz);
+
+		if (fread(data, data_to_read, 1, f) != 1) {
+			RTE_LOG(ERR, USER1, "Input file could not be read\n");
+			goto err;
+		}
+		if (fseek(f, 0, SEEK_SET) != 0) {
+			RTE_LOG(ERR, USER1,
+				"Size of input could not be calculated\n");
+			goto err;
+		}
+		remaining_data -= data_to_read;
+		data += data_to_read;
+	}
+
+	if (test_data->input_data_sz > actual_file_sz)
+		RTE_LOG(INFO, USER1,
+		  "%zu bytes read from file %s, extending the file %.2f times\n",
+			test_data->input_data_sz, test_data->input_file,
+			(double)test_data->input_data_sz/actual_file_sz);
+	else
+		RTE_LOG(INFO, USER1,
+			"%zu bytes read from file %s\n",
+			test_data->input_data_sz, test_data->input_file);
+
+	fclose(f);
+
+	return 0;
+
+err:
+	fclose(f);
+	rte_free(test_data->input_data);
+	test_data->input_data = NULL;
+
+	return -1;
+}
+
+static int
+comp_perf_initialize_compressdev(struct comp_test_data *test_data)
+{
+	uint8_t enabled_cdev_count;
+	uint8_t enabled_cdevs[RTE_COMPRESS_MAX_DEVS];
+
+	enabled_cdev_count = rte_compressdev_devices_get(test_data->driver_name,
+			enabled_cdevs, RTE_COMPRESS_MAX_DEVS);
+	if (enabled_cdev_count == 0) {
+		RTE_LOG(ERR, USER1, "No compress devices type %s available\n",
+				test_data->driver_name);
+		return -EINVAL;
+	}
+
+	if (enabled_cdev_count > 1)
+		RTE_LOG(INFO, USER1,
+			"Only the first compress device will be used\n");
+
+	test_data->cdev_id = enabled_cdevs[0];
+
+	if (comp_perf_check_capabilities(test_data) < 0)
+		return -1;
+
+	/* Configure compressdev (one device, one queue pair) */
+	struct rte_compressdev_config config = {
+		.socket_id = rte_socket_id(),
+		.nb_queue_pairs = 1,
+		.max_nb_priv_xforms = NUM_MAX_XFORMS,
+		.max_nb_streams = 0
+	};
+
+	if (rte_compressdev_configure(test_data->cdev_id, &config) < 0) {
+		RTE_LOG(ERR, USER1, "Device configuration failed\n");
+		return -1;
+	}
+
+	if (rte_compressdev_queue_pair_setup(test_data->cdev_id, 0,
+			NUM_MAX_INFLIGHT_OPS, rte_socket_id()) < 0) {
+		RTE_LOG(ERR, USER1, "Queue pair setup failed\n");
+		return -1;
+	}
+
+	if (rte_compressdev_start(test_data->cdev_id) < 0) {
+		RTE_LOG(ERR, USER1, "Device could not be started\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Distribute the input file data across the test mbufs.
+ *
+ * For each of test_data->total_bufs buffers:
+ *  - fill a decomp_bufs[i] chain with up to max_sgl_segs segments of
+ *    seg_sz bytes each, copied sequentially from test_data->input_data;
+ *  - allocate a comp_bufs[i] chain with the same number of segments
+ *    (seg_sz bytes per segment, contents left uninitialized) to act as
+ *    the destination buffer.
+ *
+ * Returns 0 on success, -1 on any allocation/append/chain failure.
+ * On failure, mbufs already allocated are left for free_bufs() to
+ * release via the comp_bufs/decomp_bufs arrays.
+ */
+static int
+prepare_bufs(struct comp_test_data *test_data)
+{
+	uint32_t remaining_data = test_data->input_data_sz;
+	uint8_t *input_data_ptr = test_data->input_data;
+	size_t data_sz;
+	uint8_t *data_addr;
+	uint32_t i, j;
+
+	for (i = 0; i < test_data->total_bufs; i++) {
+		/* Allocate data in input mbuf and copy data from input file */
+		test_data->decomp_bufs[i] =
+			rte_pktmbuf_alloc(test_data->decomp_buf_pool);
+		if (test_data->decomp_bufs[i] == NULL) {
+			RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
+			return -1;
+		}
+
+		/* Last segment of the file may be shorter than seg_sz */
+		data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
+		data_addr = (uint8_t *) rte_pktmbuf_append(
+					test_data->decomp_bufs[i], data_sz);
+		if (data_addr == NULL) {
+			RTE_LOG(ERR, USER1, "Could not append data\n");
+			return -1;
+		}
+		rte_memcpy(data_addr, input_data_ptr, data_sz);
+
+		input_data_ptr += data_sz;
+		remaining_data -= data_sz;
+
+		/* Already one segment in the mbuf */
+		uint16_t segs_per_mbuf = 1;
+
+		/* Chain mbufs if needed for input mbufs */
+		while (segs_per_mbuf < test_data->max_sgl_segs
+				&& remaining_data > 0) {
+			struct rte_mbuf *next_seg =
+				rte_pktmbuf_alloc(test_data->decomp_buf_pool);
+
+			if (next_seg == NULL) {
+				RTE_LOG(ERR, USER1,
+					"Could not allocate mbuf\n");
+				return -1;
+			}
+
+			data_sz = RTE_MIN(remaining_data, test_data->seg_sz);
+			data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
+				data_sz);
+
+			if (data_addr == NULL) {
+				RTE_LOG(ERR, USER1, "Could not append data\n");
+				return -1;
+			}
+
+			rte_memcpy(data_addr, input_data_ptr, data_sz);
+			input_data_ptr += data_sz;
+			remaining_data -= data_sz;
+
+			/* Links next_seg at the tail of decomp_bufs[i] */
+			if (rte_pktmbuf_chain(test_data->decomp_bufs[i],
+					next_seg) < 0) {
+				RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
+				return -1;
+			}
+			segs_per_mbuf++;
+		}
+
+		/* Allocate data in output mbuf */
+		test_data->comp_bufs[i] =
+			rte_pktmbuf_alloc(test_data->comp_buf_pool);
+		if (test_data->comp_bufs[i] == NULL) {
+			RTE_LOG(ERR, USER1, "Could not allocate mbuf\n");
+			return -1;
+		}
+		/*
+		 * Output segments are always a full seg_sz: the produced
+		 * size is not known in advance, so the destination chain
+		 * mirrors the input chain's segment count and capacity.
+		 */
+		data_addr = (uint8_t *) rte_pktmbuf_append(
+					test_data->comp_bufs[i],
+					test_data->seg_sz);
+		if (data_addr == NULL) {
+			RTE_LOG(ERR, USER1, "Could not append data\n");
+			return -1;
+		}
+
+		/* Chain mbufs if needed for output mbufs */
+		for (j = 1; j < segs_per_mbuf; j++) {
+			struct rte_mbuf *next_seg =
+				rte_pktmbuf_alloc(test_data->comp_buf_pool);
+
+			if (next_seg == NULL) {
+				RTE_LOG(ERR, USER1,
+					"Could not allocate mbuf\n");
+				return -1;
+			}
+
+			data_addr = (uint8_t *)rte_pktmbuf_append(next_seg,
+				test_data->seg_sz);
+
+			if (data_addr == NULL) {
+				RTE_LOG(ERR, USER1, "Could not append data\n");
+				return -1;
+			}
+
+			if (rte_pktmbuf_chain(test_data->comp_bufs[i],
+					next_seg) < 0) {
+				RTE_LOG(ERR, USER1, "Could not chain mbufs\n");
+				return -1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release every source/destination mbuf chain allocated by
+ * prepare_bufs() and then free the two pointer arrays themselves.
+ * rte_pktmbuf_free()/rte_free() both accept NULL, so partially
+ * initialized arrays are handled safely.
+ */
+static void
+free_bufs(struct comp_test_data *test_data)
+{
+	uint32_t buf_idx;
+	const uint32_t nb_bufs = test_data->total_bufs;
+
+	for (buf_idx = 0; buf_idx < nb_bufs; buf_idx++) {
+		rte_pktmbuf_free(test_data->decomp_bufs[buf_idx]);
+		rte_pktmbuf_free(test_data->comp_bufs[buf_idx]);
+	}
+
+	rte_free(test_data->decomp_bufs);
+	rte_free(test_data->comp_bufs);
+}
+
+/*
+ * Post-process one burst of dequeued operations.
+ *
+ * In verification mode (benchmarking == 0) the produced output of each
+ * op is copied into the flat output buffer (*output_data_ptr advances,
+ * *output_size accumulates).  On the last iteration the destination
+ * mbuf chains are trimmed to exactly op->produced bytes so they can be
+ * used as input for the opposite direction.
+ *
+ * Returns 0 on success, -1 if the output could not be copied.
+ */
+static int
+handle_deq_ops(struct comp_test_data *test_data,
+		struct rte_comp_op **deq_ops, uint16_t num_deq,
+		uint8_t **output_data_ptr, size_t *output_size,
+		unsigned int benchmarking, unsigned int last_iter)
+{
+	uint16_t i;
+
+	for (i = 0; i < num_deq; i++) {
+		struct rte_comp_op *op = deq_ops[i];
+
+		if (benchmarking == 0) {
+			const void *read_data_addr =
+				rte_pktmbuf_read(op->m_dst, 0,
+					op->produced, *output_data_ptr);
+			if (read_data_addr == NULL) {
+				RTE_LOG(ERR, USER1,
+				      "Could not copy buffer in destination\n");
+				return -1;
+			}
+
+			/*
+			 * rte_pktmbuf_read() copies into the supplied
+			 * buffer only when the data spans segments;
+			 * otherwise it returns a pointer into the mbuf
+			 * and the copy must be done here.
+			 */
+			if (read_data_addr != *output_data_ptr)
+				rte_memcpy(*output_data_ptr, read_data_addr,
+						op->produced);
+			*output_data_ptr += op->produced;
+			*output_size += op->produced;
+		}
+
+		if (last_iter) {
+			/* Trim the dst chain to the produced length */
+			struct rte_mbuf *m = op->m_dst;
+			uint32_t remaining_data = op->produced;
+			uint16_t data_to_append;
+
+			m->pkt_len = op->produced;
+			while (remaining_data > 0) {
+				data_to_append = RTE_MIN(remaining_data,
+						test_data->seg_sz);
+				m->data_len = data_to_append;
+				remaining_data -= data_to_append;
+				m = m->next;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Run one complete pass of compression or decompression (selected by
+ * 'type') over all prepared buffers at the given 'level'.
+ *
+ * benchmarking != 0: repeat test_data->num_iter times, measuring TSC
+ * cycles into test_data->{comp,decomp}_tsc_duration[level]; no output
+ * is copied.  benchmarking == 0: run once, copying the produced data
+ * into output_data_ptr and reporting its size via *output_data_sz so
+ * the caller can verify correctness.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+main_loop(struct comp_test_data *test_data, uint8_t level,
+			enum rte_comp_xform_type type,
+			uint8_t *output_data_ptr,
+			size_t *output_data_sz,
+			unsigned int benchmarking)
+{
+	uint8_t dev_id = test_data->cdev_id;
+	uint32_t i, iter, num_iter;
+	struct rte_comp_op **ops, **deq_ops;
+	void *priv_xform = NULL;
+	struct rte_comp_xform xform;
+	size_t output_size = 0;
+	struct rte_mbuf **input_bufs, **output_bufs;
+	int res = 0;
+	int allocated = 0;
+
+	if (test_data == NULL || !test_data->burst_sz) {
+		RTE_LOG(ERR, USER1,
+			"Unknown burst size\n");
+		return -1;
+	}
+
+	/* Single allocation: first half for enqueue, second for dequeue */
+	ops = rte_zmalloc_socket(NULL,
+		2 * test_data->total_bufs * sizeof(struct rte_comp_op *),
+		0, rte_socket_id());
+
+	if (ops == NULL) {
+		RTE_LOG(ERR, USER1,
+			"Can't allocate memory for ops structures\n");
+		return -1;
+	}
+
+	deq_ops = &ops[test_data->total_bufs];
+
+	if (type == RTE_COMP_COMPRESS) {
+		xform = (struct rte_comp_xform) {
+			.type = RTE_COMP_COMPRESS,
+			.compress = {
+				.algo = RTE_COMP_ALGO_DEFLATE,
+				.deflate.huffman = test_data->huffman_enc,
+				.level = level,
+				.window_size = test_data->window_sz,
+				.chksum = RTE_COMP_CHECKSUM_NONE,
+				.hash_algo = RTE_COMP_HASH_ALGO_NONE
+			}
+		};
+		input_bufs = test_data->decomp_bufs;
+		output_bufs = test_data->comp_bufs;
+	} else {
+		xform = (struct rte_comp_xform) {
+			.type = RTE_COMP_DECOMPRESS,
+			.decompress = {
+				.algo = RTE_COMP_ALGO_DEFLATE,
+				.chksum = RTE_COMP_CHECKSUM_NONE,
+				.window_size = test_data->window_sz,
+				.hash_algo = RTE_COMP_HASH_ALGO_NONE
+			}
+		};
+		input_bufs = test_data->comp_bufs;
+		output_bufs = test_data->decomp_bufs;
+	}
+
+	/* Create private xform */
+	if (rte_compressdev_private_xform_create(dev_id, &xform,
+			&priv_xform) < 0) {
+		RTE_LOG(ERR, USER1, "Private xform could not be created\n");
+		res = -1;
+		goto end;
+	}
+
+	uint64_t tsc_start, tsc_end, tsc_duration;
+
+	tsc_start = tsc_end = tsc_duration = 0;
+	if (benchmarking) {
+		tsc_start = rte_rdtsc();
+		num_iter = test_data->num_iter;
+	} else
+		num_iter = 1;
+
+	for (iter = 0; iter < num_iter; iter++) {
+		uint32_t total_ops = test_data->total_bufs;
+		uint32_t remaining_ops = test_data->total_bufs;
+		uint32_t total_deq_ops = 0;
+		uint32_t total_enq_ops = 0;
+		uint16_t ops_unused = 0;
+		uint16_t num_enq = 0;
+		uint16_t num_deq = 0;
+		unsigned int last_iter = (iter == num_iter - 1);
+
+		output_size = 0;
+
+		while (remaining_ops > 0) {
+			uint16_t num_ops = RTE_MIN(remaining_ops,
+						   test_data->burst_sz);
+			uint16_t ops_needed = num_ops - ops_unused;
+
+			/*
+			 * Move the unused operations from the previous
+			 * enqueue_burst call to the front, to maintain order
+			 */
+			if ((ops_unused > 0) && (num_enq > 0)) {
+				size_t nb_b_to_mov =
+				      ops_unused * sizeof(struct rte_comp_op *);
+
+				memmove(ops, &ops[num_enq], nb_b_to_mov);
+			}
+
+			/* Allocate compression operations */
+			if (ops_needed && !rte_comp_op_bulk_alloc(
+						test_data->op_pool,
+						&ops[ops_unused],
+						ops_needed)) {
+				RTE_LOG(ERR, USER1,
+				      "Could not allocate enough operations\n");
+				res = -1;
+				goto end;
+			}
+			allocated += ops_needed;
+
+			for (i = 0; i < ops_needed; i++) {
+				/*
+				 * Calculate next buffer to attach to operation
+				 */
+				uint32_t buf_id = total_enq_ops + i +
+						ops_unused;
+				uint16_t op_id = ops_unused + i;
+				/* Reset all data in output buffers */
+				struct rte_mbuf *m = output_bufs[buf_id];
+
+				m->pkt_len = test_data->seg_sz * m->nb_segs;
+				while (m) {
+					m->data_len = m->buf_len - m->data_off;
+					m = m->next;
+				}
+				ops[op_id]->m_src = input_bufs[buf_id];
+				ops[op_id]->m_dst = output_bufs[buf_id];
+				ops[op_id]->src.offset = 0;
+				ops[op_id]->src.length =
+					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
+				ops[op_id]->dst.offset = 0;
+				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
+				/*
+				 * Checksums are disabled; the field is
+				 * borrowed here to stash the buffer index.
+				 */
+				ops[op_id]->input_chksum = buf_id;
+				ops[op_id]->private_xform = priv_xform;
+			}
+
+			num_enq = rte_compressdev_enqueue_burst(dev_id, 0, ops,
+								num_ops);
+			ops_unused = num_ops - num_enq;
+			remaining_ops -= num_enq;
+			total_enq_ops += num_enq;
+
+			num_deq = rte_compressdev_dequeue_burst(dev_id, 0,
+							   deq_ops,
+							   test_data->burst_sz);
+			total_deq_ops += num_deq;
+
+			if (handle_deq_ops(test_data, deq_ops, num_deq,
+					&output_data_ptr, &output_size,
+					benchmarking, last_iter) < 0) {
+				res = -1;
+				goto end;
+			}
+
+			rte_mempool_put_bulk(test_data->op_pool,
+					     (void **)deq_ops, num_deq);
+			allocated -= num_deq;
+		}
+
+		/* Dequeue the operations still in flight */
+		while (total_deq_ops < total_ops) {
+			num_deq = rte_compressdev_dequeue_burst(dev_id, 0,
+						deq_ops, test_data->burst_sz);
+			total_deq_ops += num_deq;
+
+			if (handle_deq_ops(test_data, deq_ops, num_deq,
+					&output_data_ptr, &output_size,
+					benchmarking, last_iter) < 0) {
+				res = -1;
+				goto end;
+			}
+
+			rte_mempool_put_bulk(test_data->op_pool,
+					     (void **)deq_ops, num_deq);
+			allocated -= num_deq;
+		}
+	}
+
+	if (benchmarking) {
+		tsc_end = rte_rdtsc();
+		tsc_duration = tsc_end - tsc_start;
+
+		/* Store the average cycles per iteration for this level */
+		if (type == RTE_COMP_COMPRESS)
+			test_data->comp_tsc_duration[level] =
+					tsc_duration / num_iter;
+		else
+			test_data->decomp_tsc_duration[level] =
+					tsc_duration / num_iter;
+	}
+
+	if (benchmarking == 0 && output_data_sz)
+		*output_data_sz = output_size;
+end:
+	/* Return any ops still owned by this function to the mempool */
+	rte_mempool_put_bulk(test_data->op_pool, (void **)ops, allocated);
+	rte_compressdev_private_xform_free(dev_id, priv_xform);
+	rte_free(ops);
+	return res;
+}
+
 int
 main(int argc, char **argv)
 {
+	uint8_t level, level_idx = 0;
+	uint8_t i;
 	int ret;
 	struct comp_test_data *test_data;
 
@@ -43,9 +751,145 @@  main(int argc, char **argv)
 		goto err;
 	}
 
+	if (comp_perf_initialize_compressdev(test_data) < 0) {
+		ret = EXIT_FAILURE;
+		goto err;
+	}
+
+	if (comp_perf_dump_input_data(test_data) < 0) {
+		ret = EXIT_FAILURE;
+		goto err;
+	}
+
+	if (comp_perf_allocate_memory(test_data) < 0) {
+		ret = EXIT_FAILURE;
+		goto err;
+	}
+
+	if (prepare_bufs(test_data) < 0) {
+		ret = EXIT_FAILURE;
+		goto err;
+	}
+
+	if (test_data->level.inc != 0)
+		level = test_data->level.min;
+	else
+		level = test_data->level.list[0];
+
+	size_t comp_data_sz;
+	size_t decomp_data_sz;
+
+	printf("Burst size = %u\n", test_data->burst_sz);
+	printf("File size = %zu\n", test_data->input_data_sz);
+
+	printf("%6s%12s%17s%19s%21s%15s%21s%23s%16s\n",
+		"Level", "Comp size", "Comp ratio [%]",
+		"Comp [Cycles/it]", "Comp [Cycles/Byte]", "Comp [Gbps]",
+		"Decomp [Cycles/it]", "Decomp [Cycles/Byte]", "Decomp [Gbps]");
+
+	while (level <= test_data->level.max) {
+		/*
+		 * Run a first iteration, to verify compression and
+		 * get the compression ratio for the level
+		 */
+		if (main_loop(test_data, level, RTE_COMP_COMPRESS,
+			      test_data->compressed_data,
+			      &comp_data_sz, 0) < 0) {
+			ret = EXIT_FAILURE;
+			goto err;
+		}
+
+		if (main_loop(test_data, level, RTE_COMP_DECOMPRESS,
+			      test_data->decompressed_data,
+			      &decomp_data_sz, 0) < 0) {
+			ret = EXIT_FAILURE;
+			goto err;
+		}
+
+		if (decomp_data_sz != test_data->input_data_sz) {
+			RTE_LOG(ERR, USER1,
+		   "Decompressed data length not equal to input data length\n");
+			RTE_LOG(ERR, USER1,
+				"Decompressed size = %zu, expected = %zu\n",
+				decomp_data_sz, test_data->input_data_sz);
+			ret = EXIT_FAILURE;
+			goto err;
+		} else {
+			if (memcmp(test_data->decompressed_data,
+					test_data->input_data,
+					test_data->input_data_sz) != 0) {
+				RTE_LOG(ERR, USER1,
+			    "Decompressed data is not the same as file data\n");
+				ret = EXIT_FAILURE;
+				goto err;
+			}
+		}
+
+		double ratio = (double) comp_data_sz /
+						test_data->input_data_sz * 100;
+
+		/*
+		 * Run the tests twice, discarding the first performance
+		 * results, before the cache is warmed up
+		 */
+		for (i = 0; i < 2; i++) {
+			if (main_loop(test_data, level, RTE_COMP_COMPRESS,
+					NULL, NULL, 1) < 0) {
+				ret = EXIT_FAILURE;
+				goto err;
+			}
+		}
+
+		for (i = 0; i < 2; i++) {
+			if (main_loop(test_data, level, RTE_COMP_DECOMPRESS,
+					NULL, NULL, 1) < 0) {
+				ret = EXIT_FAILURE;
+				goto err;
+			}
+		}
+
+		uint64_t comp_tsc_duration =
+				test_data->comp_tsc_duration[level];
+		double comp_tsc_byte = (double)comp_tsc_duration /
+						test_data->input_data_sz;
+		double comp_gbps = rte_get_tsc_hz() / comp_tsc_byte * 8 /
+				1000000000;
+		uint64_t decomp_tsc_duration =
+				test_data->decomp_tsc_duration[level];
+		double decomp_tsc_byte = (double)decomp_tsc_duration /
+						test_data->input_data_sz;
+		double decomp_gbps = rte_get_tsc_hz() / decomp_tsc_byte * 8 /
+				1000000000;
+
+		printf("%6u%12zu%17.2f%19"PRIu64"%21.2f"
+					"%15.2f%21"PRIu64"%23.2f%16.2f\n",
+		       level, comp_data_sz, ratio, comp_tsc_duration,
+		       comp_tsc_byte, comp_gbps, decomp_tsc_duration,
+		       decomp_tsc_byte, decomp_gbps);
+
+		if (test_data->level.inc != 0)
+			level += test_data->level.inc;
+		else {
+			if (++level_idx == test_data->level.count)
+				break;
+			level = test_data->level.list[level_idx];
+		}
+	}
+
 	ret = EXIT_SUCCESS;
 
 err:
+	if (test_data->cdev_id != -1)
+		rte_compressdev_stop(test_data->cdev_id);
+
+	free_bufs(test_data);
+	rte_free(test_data->compressed_data);
+	rte_free(test_data->decompressed_data);
+	rte_free(test_data->input_data);
+	rte_mempool_free(test_data->comp_buf_pool);
+	rte_mempool_free(test_data->decomp_buf_pool);
+	rte_mempool_free(test_data->op_pool);
+
 	rte_free(test_data);
 
 	return ret;