From patchwork Thu Dec 6 12:03:17 2018
X-Patchwork-Submitter: Kamil Chalupnik <kamilx.chalupnik@intel.com>
X-Patchwork-Id: 48561
From: Kamil Chalupnik <kamilx.chalupnik@intel.com>
To: dev@dpdk.org
Cc: amr.mokhtar@intel.com, akhil.goyal@nxp.com,
 Kamil Chalupnik <kamilx.chalupnik@intel.com>
Date: Thu, 6 Dec 2018 13:03:17 +0100
Message-Id: <20181206120317.16156-1-kamilx.chalupnik@intel.com>
X-Mailer: git-send-email 2.9.0.windows.1
Subject: [dpdk-dev] [PATCH] baseband: enhancement of offload cost test

Improve the offload cost test so that it collects more accurate results:

- Measure the accelerator offload time around each SDK processing
  function (CRC attach, turbo encode/decode, rate matching,
  deinterleave, decoder adapter) instead of a single coarse window.
- Rename the ambiguous "offload_time"/"turbo" statistics to
  "acc_offload_cycles"/"accelerator" and document what the field
  means for HW and SW devices.
- In the throughput test, free dequeued operations in the dequeue
  thread rather than immediately after enqueue.
- Enable CONFIG_RTE_BBDEV_OFFLOAD_COST in the default configuration.

Signed-off-by: Kamil Chalupnik <kamilx.chalupnik@intel.com>
Acked-by: Amr Mokhtar <amr.mokhtar@intel.com>
---
 app/test-bbdev/test_bbdev_perf.c                 | 154 +++++++++++------------
 config/common_base                               |   2 +-
 drivers/baseband/turbo_sw/bbdev_turbo_software.c |  70 ++++++++---
 lib/librte_bbdev/rte_bbdev.h                     |   9 +-
 4 files changed, 136 insertions(+), 99 deletions(-)
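
As a note for reviewers, the sketch below summarizes the measurement
pattern the patch applies around every SDK processing stage and how the
test then separates driver cost from accelerator cost. It is an
illustration only: process_stage() stands in for a bblib_* call, and
run_stage()/report() are invented helper names, not code from this
patch.

#include <stdint.h>
#include <stdio.h>

#include <rte_cycles.h>	/* rte_rdtsc_precise(), rte_get_tsc_hz() */

/* Hypothetical stand-in for one SDK stage, e.g. bblib_turbo_encoder() */
static void
process_stage(void)
{
}

/* Driver side: time only the SDK call itself and accumulate the cycles
 * into the per-queue counter (the PMD guards this with
 * RTE_BBDEV_OFFLOAD_COST).
 */
static void
run_stage(uint64_t *acc_offload_cycles)
{
	uint64_t start_time = rte_rdtsc_precise();

	process_stage();
	*acc_offload_cycles += rte_rdtsc_precise() - start_time;
}

/* Test side: the driver-only enqueue cost is the wall-clock enqueue
 * time minus the accumulated accelerator cycles; cycles convert to
 * microseconds via the TSC frequency.
 */
static void
report(uint64_t enq_start_time, uint64_t acc_offload_cycles)
{
	uint64_t enq_sw_time = rte_rdtsc_precise() - enq_start_time -
			acc_offload_cycles;

	printf("driver: %lg us, accelerator: %lg us\n",
			(double)(enq_sw_time * 1000000) /
			(double)rte_get_tsc_hz(),
			(double)(acc_offload_cycles * 1000000) /
			(double)rte_get_tsc_hz());
}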
diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index fbe6cc9..21be574 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -88,19 +88,19 @@ struct thread_params {
 /* Stores time statistics */
 struct test_time_stats {
 	/* Stores software enqueue total working time */
-	uint64_t enq_sw_tot_time;
+	uint64_t enq_sw_total_time;
 	/* Stores minimum value of software enqueue working time */
 	uint64_t enq_sw_min_time;
 	/* Stores maximum value of software enqueue working time */
 	uint64_t enq_sw_max_time;
 	/* Stores turbo enqueue total working time */
-	uint64_t enq_tur_tot_time;
-	/* Stores minimum value of turbo enqueue working time */
-	uint64_t enq_tur_min_time;
-	/* Stores maximum value of turbo enqueue working time */
-	uint64_t enq_tur_max_time;
+	uint64_t enq_acc_total_time;
+	/* Stores minimum value of accelerator enqueue working time */
+	uint64_t enq_acc_min_time;
+	/* Stores maximum value of accelerator enqueue working time */
+	uint64_t enq_acc_max_time;
 	/* Stores dequeue total working time */
-	uint64_t deq_tot_time;
+	uint64_t deq_total_time;
 	/* Stores minimum value of dequeue working time */
 	uint64_t deq_min_time;
 	/* Stores maximum value of dequeue working time */
@@ -1200,12 +1200,15 @@ typedef int (test_case_function)(struct active_device *ad,
 	burst_sz = tp->op_params->burst_sz;
 	num_to_process = tp->op_params->num_to_process;
 
-	if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
+	if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC) {
 		deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id, dec_ops,
 				burst_sz);
-	else
+		rte_bbdev_dec_op_free_bulk(dec_ops, deq);
+	} else {
 		deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id, enc_ops,
 				burst_sz);
+		rte_bbdev_enc_op_free_bulk(enc_ops, deq);
+	}
 
 	if (deq < burst_sz) {
 		printf(
@@ -1316,8 +1319,6 @@ typedef int (test_case_function)(struct active_device *ad,
 
 		enqueued += rte_bbdev_enqueue_dec_ops(tp->dev_id, queue_id, ops,
 				num_to_enq);
-
-		rte_bbdev_dec_op_free_bulk(ops, num_to_enq);
 	}
 
 	if (allocs_failed > 0)
@@ -1380,8 +1381,6 @@ typedef int (test_case_function)(struct active_device *ad,
 
 		enqueued += rte_bbdev_enqueue_enc_ops(tp->dev_id, queue_id, ops,
 				num_to_enq);
-
-		rte_bbdev_enc_op_free_bulk(ops, num_to_enq);
 	}
 
 	if (allocs_failed > 0)
@@ -1575,13 +1574,14 @@ typedef int (test_case_function)(struct active_device *ad,
 	RTE_LCORE_FOREACH(lcore_id) {
 		if (iter++ >= used_cores)
 			break;
-		printf("\tlcore_id: %u, throughput: %.8lg MOPS, %.8lg Mbps\n",
-			lcore_id, t_params[lcore_id].mops, t_params[lcore_id].mbps);
+		printf("Throughput for core (%u): %.8lg MOPS, %.8lg Mbps\n",
+				lcore_id, t_params[lcore_id].mops,
+				t_params[lcore_id].mbps);
 		total_mops += t_params[lcore_id].mops;
 		total_mbps += t_params[lcore_id].mbps;
 	}
 	printf(
-		"\n\tTotal stats for %u cores: throughput: %.8lg MOPS, %.8lg Mbps\n",
+		"\nTotal throughput for %u cores: %.8lg MOPS, %.8lg Mbps\n",
 		used_cores, total_mops, total_mbps);
 }
@@ -1609,7 +1609,7 @@ typedef int (test_case_function)(struct active_device *ad,
 			test_vector.op_type);
 
 	printf(
-		"Throughput test: dev: %s, nb_queues: %u, burst size: %u, num ops: %u, num_lcores: %u, op type: %s, int mode: %s, GHz: %lg\n",
+		"\nThroughput test: dev: %s, nb_queues: %u, burst size: %u, num ops: %u, num_lcores: %u, op type: %s, int mode: %s, GHz: %lg\n",
 		info.dev_name, ad->nb_queues, op_params->burst_sz,
 		op_params->num_to_process, op_params->num_lcores,
 		op_type_str,
@@ -1882,7 +1882,7 @@ typedef int (test_case_function)(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf(
-		"Validation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+		"\nValidation/Latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
 		info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -1899,10 +1899,10 @@ typedef int (test_case_function)(struct active_device *ad,
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\toperation latency:\n"
-			"\t\tavg latency: %lg cycles, %lg us\n"
-			"\t\tmin latency: %lg cycles, %lg us\n"
-			"\t\tmax latency: %lg cycles, %lg us\n",
+	printf("Operation latency:\n"
+			"\tavg latency: %lg cycles, %lg us\n"
+			"\tmin latency: %lg cycles, %lg us\n"
+			"\tmax latency: %lg cycles, %lg us\n",
 			(double)total_time / (double)iter,
 			(double)(total_time * 1000000) / (double)iter /
 			(double)rte_get_tsc_hz(), (double)min_time,
@@ -1930,7 +1930,7 @@ typedef int (test_case_function)(struct active_device *ad,
 	stats->dequeued_count = q_stats->dequeued_count;
 	stats->enqueue_err_count = q_stats->enqueue_err_count;
 	stats->dequeue_err_count = q_stats->dequeue_err_count;
-	stats->offload_time = q_stats->offload_time;
+	stats->acc_offload_cycles = q_stats->acc_offload_cycles;
 
 	return 0;
 }
@@ -1974,18 +1974,18 @@ typedef int (test_case_function)(struct active_device *ad,
 				queue_id, dev_id);
 
 		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
-				stats.offload_time;
+				stats.acc_offload_cycles;
 		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
 				enq_sw_last_time);
 		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
 				enq_sw_last_time);
-		time_st->enq_sw_tot_time += enq_sw_last_time;
+		time_st->enq_sw_total_time += enq_sw_last_time;
 
-		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
-				stats.offload_time);
-		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
-				stats.offload_time);
-		time_st->enq_tur_tot_time += stats.offload_time;
+		time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
@@ -2003,7 +2003,7 @@ typedef int (test_case_function)(struct active_device *ad,
 				deq_last_time);
 		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
 				deq_last_time);
-		time_st->deq_tot_time += deq_last_time;
+		time_st->deq_total_time += deq_last_time;
 
 		/* Dequeue remaining operations if needed*/
 		while (burst_sz != deq)
@@ -2055,18 +2055,18 @@ typedef int (test_case_function)(struct active_device *ad,
 				queue_id, dev_id);
 
 		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
-				stats.offload_time;
+				stats.acc_offload_cycles;
 		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
 				enq_sw_last_time);
 		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
 				enq_sw_last_time);
-		time_st->enq_sw_tot_time += enq_sw_last_time;
+		time_st->enq_sw_total_time += enq_sw_last_time;
 
-		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
-				stats.offload_time);
-		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
-				stats.offload_time);
-		time_st->enq_tur_tot_time += stats.offload_time;
+		time_st->enq_acc_max_time = RTE_MAX(time_st->enq_acc_max_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_min_time = RTE_MIN(time_st->enq_acc_min_time,
+				stats.acc_offload_cycles);
+		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
@@ -2084,7 +2084,7 @@ typedef int (test_case_function)(struct active_device *ad,
 				deq_last_time);
 		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
 				deq_last_time);
-		time_st->deq_tot_time += deq_last_time;
+		time_st->deq_total_time += deq_last_time;
 
 		while (burst_sz != deq)
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
@@ -2121,7 +2121,7 @@ typedef int (test_case_function)(struct active_device *ad,
 
 	memset(&time_st, 0, sizeof(struct test_time_stats));
 	time_st.enq_sw_min_time = UINT64_MAX;
-	time_st.enq_tur_min_time = UINT64_MAX;
+	time_st.enq_acc_min_time = UINT64_MAX;
 	time_st.deq_min_time = UINT64_MAX;
 
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2134,7 +2134,7 @@ typedef int (test_case_function)(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf(
-		"Offload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+		"\nOffload latency test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
 		info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -2149,36 +2149,36 @@ typedef int (test_case_function)(struct active_device *ad,
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tenq offload cost latency:\n"
-			"\t\tsoftware avg %lg cycles, %lg us\n"
-			"\t\tsoftware min %lg cycles, %lg us\n"
-			"\t\tsoftware max %lg cycles, %lg us\n"
-			"\t\tturbo avg %lg cycles, %lg us\n"
-			"\t\tturbo min %lg cycles, %lg us\n"
-			"\t\tturbo max %lg cycles, %lg us\n",
-			(double)time_st.enq_sw_tot_time / (double)iter,
-			(double)(time_st.enq_sw_tot_time * 1000000) /
+	printf("Enqueue offload cost latency:\n"
+			"\tDriver offload avg %lg cycles, %lg us\n"
+			"\tDriver offload min %lg cycles, %lg us\n"
+			"\tDriver offload max %lg cycles, %lg us\n"
+			"\tAccelerator offload avg %lg cycles, %lg us\n"
+			"\tAccelerator offload min %lg cycles, %lg us\n"
+			"\tAccelerator offload max %lg cycles, %lg us\n",
+			(double)time_st.enq_sw_total_time / (double)iter,
+			(double)(time_st.enq_sw_total_time * 1000000) /
			(double)iter / (double)rte_get_tsc_hz(),
 			(double)time_st.enq_sw_min_time,
 			(double)(time_st.enq_sw_min_time * 1000000) /
 			rte_get_tsc_hz(), (double)time_st.enq_sw_max_time,
 			(double)(time_st.enq_sw_max_time * 1000000) /
-			rte_get_tsc_hz(), (double)time_st.enq_tur_tot_time /
+			rte_get_tsc_hz(), (double)time_st.enq_acc_total_time /
 			(double)iter,
-			(double)(time_st.enq_tur_tot_time * 1000000) /
+			(double)(time_st.enq_acc_total_time * 1000000) /
 			(double)iter / (double)rte_get_tsc_hz(),
-			(double)time_st.enq_tur_min_time,
-			(double)(time_st.enq_tur_min_time * 1000000) /
-			rte_get_tsc_hz(), (double)time_st.enq_tur_max_time,
-			(double)(time_st.enq_tur_max_time * 1000000) /
+			(double)time_st.enq_acc_min_time,
+			(double)(time_st.enq_acc_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_acc_max_time,
+			(double)(time_st.enq_acc_max_time * 1000000) /
 			rte_get_tsc_hz());
 
-	printf("\tdeq offload cost latency - one op:\n"
-			"\t\tavg %lg cycles, %lg us\n"
-			"\t\tmin %lg cycles, %lg us\n"
-			"\t\tmax %lg cycles, %lg us\n",
-			(double)time_st.deq_tot_time / (double)iter,
-			(double)(time_st.deq_tot_time * 1000000) /
+	printf("Dequeue offload cost latency - one op:\n"
+			"\tavg %lg cycles, %lg us\n"
+			"\tmin %lg cycles, %lg us\n"
+			"\tmax %lg cycles, %lg us\n",
+			(double)time_st.deq_total_time / (double)iter,
+			(double)(time_st.deq_total_time * 1000000) /
 			(double)iter / (double)rte_get_tsc_hz(),
 			(double)time_st.deq_min_time,
 			(double)(time_st.deq_min_time * 1000000) /
@@ -2194,7 +2194,7 @@ typedef int (test_case_function)(struct active_device *ad,
 static int
 offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_total_time, uint64_t *deq_min_time,
 		uint64_t *deq_max_time)
 {
 	int i, deq_total;
@@ -2214,7 +2214,7 @@ typedef int (test_case_function)(struct active_device *ad,
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
 		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
-		*deq_tot_time += deq_last_time;
+		*deq_total_time += deq_last_time;
 	}
 
 	return i;
@@ -2223,7 +2223,7 @@ typedef int (test_case_function)(struct active_device *ad,
 static int
 offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_total_time, uint64_t *deq_min_time,
 		uint64_t *deq_max_time)
 {
 	int i, deq_total;
@@ -2242,7 +2242,7 @@ typedef int (test_case_function)(struct active_device *ad,
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
 		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
-		*deq_tot_time += deq_last_time;
+		*deq_total_time += deq_last_time;
 	}
 
 	return i;
@@ -2261,7 +2261,7 @@ typedef int (test_case_function)(struct active_device *ad,
 	return TEST_SKIPPED;
 #else
 	int iter;
-	uint64_t deq_tot_time, deq_min_time, deq_max_time;
+	uint64_t deq_total_time, deq_min_time, deq_max_time;
 	uint16_t burst_sz = op_params->burst_sz;
 	const uint16_t num_to_process = op_params->num_to_process;
 	const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -2269,7 +2269,7 @@ typedef int (test_case_function)(struct active_device *ad,
 	struct rte_bbdev_info info;
 	const char *op_type_str;
 
-	deq_tot_time = deq_max_time = 0;
+	deq_total_time = deq_max_time = 0;
 	deq_min_time = UINT64_MAX;
 
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
@@ -2281,27 +2281,27 @@ typedef int (test_case_function)(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf(
-		"Offload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
+		"\nOffload latency empty dequeue test: dev: %s, burst size: %u, num ops: %u, op type: %s\n",
 		info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
 		iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_tot_time,
+				num_to_process, burst_sz, &deq_total_time,
 				&deq_min_time, &deq_max_time);
 	else
 		iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_tot_time,
+				num_to_process, burst_sz, &deq_total_time,
 				&deq_min_time, &deq_max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tempty deq offload\n"
-			"\t\tavg. latency: %lg cycles, %lg us\n"
-			"\t\tmin. latency: %lg cycles, %lg us\n"
-			"\t\tmax. latency: %lg cycles, %lg us\n",
-			(double)deq_tot_time / (double)iter,
-			(double)(deq_tot_time * 1000000) / (double)iter /
+	printf("Empty dequeue offload\n"
+			"\tavg. latency: %lg cycles, %lg us\n"
+			"\tmin. latency: %lg cycles, %lg us\n"
+			"\tmax. latency: %lg cycles, %lg us\n",
+			(double)deq_total_time / (double)iter,
+			(double)(deq_total_time * 1000000) / (double)iter /
 			(double)rte_get_tsc_hz(), (double)deq_min_time,
 			(double)(deq_min_time * 1000000) / rte_get_tsc_hz(),
 			(double)deq_max_time, (double)(deq_max_time * 1000000) /
diff --git a/config/common_base b/config/common_base
index d12ae98..3ff98bb 100644
--- a/config/common_base
+++ b/config/common_base
@@ -481,7 +481,7 @@ CONFIG_RTE_PMD_PACKET_PREFETCH=y
 #
 CONFIG_RTE_LIBRTE_BBDEV=y
 CONFIG_RTE_BBDEV_MAX_DEVS=128
-CONFIG_RTE_BBDEV_OFFLOAD_COST=n
+CONFIG_RTE_BBDEV_OFFLOAD_COST=y
 
 #
 # Compile PMD for NULL bbdev device
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 8ceb276..57f6ba1 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -510,9 +510,10 @@ struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 		start_time = rte_rdtsc_precise();
 #endif
+		/* CRC24A generation */
 		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
 #ifdef RTE_BBDEV_OFFLOAD_COST
-		q_stats->offload_time += rte_rdtsc_precise() - start_time;
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 	} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
 		/* CRC24B */
@@ -542,9 +543,10 @@ struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 		start_time = rte_rdtsc_precise();
 #endif
+		/* CRC24B generation */
 		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
 #ifdef RTE_BBDEV_OFFLOAD_COST
-		q_stats->offload_time += rte_rdtsc_precise() - start_time;
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 	} else {
 		ret = is_enc_input_valid(k, k_idx, total_left);
@@ -596,15 +598,14 @@ struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 	start_time = rte_rdtsc_precise();
 #endif
-
+	/* Turbo encoding */
 	if (bblib_turbo_encoder(&turbo_req, &turbo_resp) != 0) {
 		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 		rte_bbdev_log(ERR, "Turbo Encoder failed");
 		return;
 	}
-
 #ifdef RTE_BBDEV_OFFLOAD_COST
-	q_stats->offload_time += rte_rdtsc_precise() - start_time;
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 
 	/* Restore 3 first bytes of next CB if they were overwritten by CRC*/
@@ -671,23 +672,21 @@ struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 	start_time = rte_rdtsc_precise();
 #endif
-
+	/* Rate-Matching */
 	if (bblib_rate_match_dl(&rm_req, &rm_resp) != 0) {
 		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 		rte_bbdev_log(ERR, "Rate matching failed");
 		return;
 	}
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 
 	/* SW fills an entire last byte even if E%8 != 0. Clear the
 	 * superfluous data bits for consistency with HW device.
 	 */
 	mask_id = (e & 7) >> 1;
 	rm_out[out_len - 1] &= mask_out[mask_id];
-
-#ifdef RTE_BBDEV_OFFLOAD_COST
-	q_stats->offload_time += rte_rdtsc_precise() - start_time;
-#endif
-
 	enc->output.length += rm_resp.OutputLen;
 } else {
 	/* Rate matching is bypassed */
@@ -798,7 +797,7 @@ struct turbo_sw_queue {
 {
 	uint16_t i;
 #ifdef RTE_BBDEV_OFFLOAD_COST
-	queue_stats->offload_time = 0;
+	queue_stats->acc_offload_cycles = 0;
 #endif
 
 	for (i = 0; i < nb_ops; ++i)
@@ -905,7 +904,8 @@ struct turbo_sw_queue {
 process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
 		uint8_t c, uint16_t k, uint16_t kw, struct rte_mbuf *m_in,
 		struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
-		bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left)
+		bool check_crc_24b, uint16_t crc24_overlap, uint16_t total_left,
+		struct rte_bbdev_stats *q_stats)
 {
 	int ret;
 	int32_t k_idx;
@@ -917,6 +917,11 @@ struct turbo_sw_queue {
 	struct bblib_turbo_decoder_request turbo_req;
 	struct bblib_turbo_decoder_response turbo_resp;
 	struct rte_bbdev_op_turbo_dec *dec = &op->turbo_dec;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	uint64_t start_time;
+#else
+	RTE_SET_USED(q_stats);
+#endif
 
 	k_idx = compute_idx(k);
@@ -942,7 +947,14 @@ struct turbo_sw_queue {
 		deint_req.pharqbuffer = q->deint_input;
 		deint_req.ncb = ncb_without_null;
 		deint_resp.pinteleavebuffer = q->deint_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+		start_time = rte_rdtsc_precise();
+#endif
 		bblib_deinterleave_ul(&deint_req, &deint_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 	} else
 		move_padding_bytes(in, q->deint_output, k, ncb);
@@ -961,7 +973,15 @@ struct turbo_sw_queue {
 	adapter_req.ncb = ncb_without_null;
 	adapter_req.pinteleavebuffer = adapter_input;
 	adapter_resp.pharqout = q->adapter_output;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	start_time = rte_rdtsc_precise();
+#endif
+	/* Turbo decode adaptation */
 	bblib_turbo_adapter_ul(&adapter_req, &adapter_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 
 	out = (uint8_t *)rte_pktmbuf_append(m_out, ((k - crc24_overlap) >> 3));
 	if (out == NULL) {
@@ -986,12 +1006,20 @@ struct turbo_sw_queue {
 	turbo_resp.ag_buf = q->ag;
 	turbo_resp.cb_buf = q->code_block;
 	turbo_resp.output = out;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	start_time = rte_rdtsc_precise();
+#endif
+	/* Turbo decode */
 	iter_cnt = bblib_turbo_decoder(&turbo_req, &turbo_resp);
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
 	dec->hard_output.length += (k >> 3);
 
 	if (iter_cnt > 0) {
 		/* Temporary solution for returned iter_count from SDK */
-		iter_cnt = (iter_cnt - 1) / 2;
+		iter_cnt = (iter_cnt - 1) >> 1;
 		dec->iter_count = RTE_MAX(iter_cnt, dec->iter_count);
 	} else {
 		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
@@ -1001,7 +1029,8 @@ struct turbo_sw_queue {
 }
 
 static inline void
-enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op)
+enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
+		struct rte_bbdev_stats *queue_stats)
 {
 	uint8_t c, r = 0;
 	uint16_t kw, k = 0;
@@ -1053,7 +1082,7 @@ struct turbo_sw_queue {
 		process_dec_cb(q, op, c, k, kw, m_in, m_out, in_offset,
 				out_offset, check_bit(dec->op_flags,
 				RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
-				total_left);
+				total_left, queue_stats);
 		/* To keep CRC24 attached to end of Code block, use
 		 * RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag as it
 		 * removed by default once verified.
@@ -1075,12 +1104,15 @@ struct turbo_sw_queue {
 
 static inline uint16_t
 enqueue_dec_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_dec_op **ops,
-		uint16_t nb_ops)
+		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
 {
 	uint16_t i;
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	queue_stats->acc_offload_cycles = 0;
+#endif
 
 	for (i = 0; i < nb_ops; ++i)
-		enqueue_dec_one_op(q, ops[i]);
+		enqueue_dec_one_op(q, ops[i], queue_stats);
 
 	return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops,
 			nb_ops, NULL);
@@ -1112,7 +1144,7 @@ struct turbo_sw_queue {
 	struct turbo_sw_queue *q = queue;
 	uint16_t nb_enqueued = 0;
 
-	nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops);
+	nb_enqueued = enqueue_dec_all_ops(q, ops, nb_ops, &q_data->queue_stats);
 
 	q_data->queue_stats.enqueue_err_count += nb_ops - nb_enqueued;
 	q_data->queue_stats.enqueued_count += nb_enqueued;
diff --git a/lib/librte_bbdev/rte_bbdev.h b/lib/librte_bbdev/rte_bbdev.h
index 25ef409..da8cf07 100644
--- a/lib/librte_bbdev/rte_bbdev.h
+++ b/lib/librte_bbdev/rte_bbdev.h
@@ -239,8 +239,13 @@ struct rte_bbdev_stats {
 	uint64_t enqueue_err_count;
 	/** Total error count on operations dequeued */
 	uint64_t dequeue_err_count;
-	/** Offload time */
-	uint64_t offload_time;
+	/** CPU cycles consumed by the (HW/SW) accelerator device to offload
+	 *  the enqueue request to its internal queues.
+	 *  - For a HW device this is the cycles consumed in MMIO write
+	 *  - For a SW (vdev) device, this is the processing time of the
+	 *    bbdev operation
+	 */
+	uint64_t acc_offload_cycles;
 };
 
 /**
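
As a usage note, the sketch below shows how an application could read
the renamed counter through the existing rte_bbdev_stats_get() API and
convert cycles to wall-clock time. It is a minimal illustration,
assuming dev_id names a configured bbdev device on which operations
were already enqueued; print_acc_offload_cost() is an invented helper,
not part of this patch.

#include <inttypes.h>
#include <stdio.h>

#include <rte_bbdev.h>	/* rte_bbdev_stats_get(), struct rte_bbdev_stats */
#include <rte_cycles.h>	/* rte_get_tsc_hz() */

/* Fetch per-device stats and print the accumulated accelerator offload
 * cost in cycles and microseconds.
 */
static int
print_acc_offload_cost(uint16_t dev_id)
{
	struct rte_bbdev_stats stats;
	int ret = rte_bbdev_stats_get(dev_id, &stats);

	if (ret != 0)
		return ret;

	printf("acc offload: %" PRIu64 " cycles, %lg us\n",
			stats.acc_offload_cycles,
			(double)(stats.acc_offload_cycles * 1000000) /
			(double)rte_get_tsc_hz());
	return 0;
}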