get:
Show a patch.

patch:
Update a patch (partial update: only the fields supplied in the request are changed).

put:
Update a patch (full update).
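
As an illustration of the read-only "get" operation, here is a minimal Python sketch (assuming the requests library; the patch ID, base URL, and field names are taken from the example response below) that fetches this patch as JSON and downloads its raw mbox:

    import requests

    BASE = "https://patches.dpdk.org/api"

    # Fetch the patch metadata as JSON; no authentication is needed for reads.
    resp = requests.get(f"{BASE}/patches/120599/", params={"format": "json"})
    resp.raise_for_status()
    patch = resp.json()

    print(patch["name"])                  # "[v1,4/4] common/ml: add Arm NEON type conversion routines"
    print(patch["state"])                 # "superseded"
    print(patch["submitter"]["email"])    # "syalavarthi@marvell.com"

    # The raw patch can be downloaded from the URL given in the "mbox" field.
    mbox = requests.get(patch["mbox"])
    mbox.raise_for_status()
    with open("patch-120599.mbox", "wb") as f:
        f.write(mbox.content)
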

GET /api/patches/120599/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 120599,
    "url": "https://patches.dpdk.org/api/patches/120599/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/20221208193532.16718-5-syalavarthi@marvell.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20221208193532.16718-5-syalavarthi@marvell.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20221208193532.16718-5-syalavarthi@marvell.com",
    "date": "2022-12-08T19:35:32",
    "name": "[v1,4/4] common/ml: add Arm NEON type conversion routines",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "10644d478d7531b205529993662f36b7b222aca6",
    "submitter": {
        "id": 2480,
        "url": "https://patches.dpdk.org/api/people/2480/?format=api",
        "name": "Srikanth Yalavarthi",
        "email": "syalavarthi@marvell.com"
    },
    "delegate": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/users/1/?format=api",
        "username": "tmonjalo",
        "first_name": "Thomas",
        "last_name": "Monjalon",
        "email": "thomas@monjalon.net"
    },
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/20221208193532.16718-5-syalavarthi@marvell.com/mbox/",
    "series": [
        {
            "id": 26047,
            "url": "https://patches.dpdk.org/api/series/26047/?format=api",
            "web_url": "https://patches.dpdk.org/project/dpdk/list/?series=26047",
            "date": "2022-12-08T19:35:28",
            "name": "implementation of ML common code",
            "version": 1,
            "mbox": "https://patches.dpdk.org/series/26047/mbox/"
        }
    ],
    "comments": "https://patches.dpdk.org/api/patches/120599/comments/",
    "check": "success",
    "checks": "https://patches.dpdk.org/api/patches/120599/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 6D5EDA0032;\n\tThu,  8 Dec 2022 20:36:01 +0100 (CET)",
            "from mails.dpdk.org (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id 0B1FF42D18;\n\tThu,  8 Dec 2022 20:35:44 +0100 (CET)",
            "from mx0b-0016f401.pphosted.com (mx0a-0016f401.pphosted.com\n [67.231.148.174])\n by mails.dpdk.org (Postfix) with ESMTP id 0394A42D30\n for <dev@dpdk.org>; Thu,  8 Dec 2022 20:35:41 +0100 (CET)",
            "from pps.filterd (m0045849.ppops.net [127.0.0.1])\n by mx0a-0016f401.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id\n 2B8J8KCP001352; Thu, 8 Dec 2022 11:35:38 -0800",
            "from dc5-exch02.marvell.com ([199.233.59.182])\n by mx0a-0016f401.pphosted.com (PPS) with ESMTPS id 3mb22svkjm-1\n (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384 bits=256 verify=NOT);\n Thu, 08 Dec 2022 11:35:38 -0800",
            "from DC5-EXCH02.marvell.com (10.69.176.39) by DC5-EXCH02.marvell.com\n (10.69.176.39) with Microsoft SMTP Server (TLS) id 15.0.1497.18;\n Thu, 8 Dec 2022 11:35:36 -0800",
            "from maili.marvell.com (10.69.176.80) by DC5-EXCH02.marvell.com\n (10.69.176.39) with Microsoft SMTP Server id 15.0.1497.18 via Frontend\n Transport; Thu, 8 Dec 2022 11:35:36 -0800",
            "from ml-host-33.caveonetworks.com (unknown [10.110.143.233])\n by maili.marvell.com (Postfix) with ESMTP id 2AE9B3F706F;\n Thu,  8 Dec 2022 11:35:36 -0800 (PST)"
        ],
        "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=marvell.com;\n h=from : to : cc :\n subject : date : message-id : in-reply-to : references : mime-version :\n content-type; s=pfpt0220; bh=KH6ecjgraHiq5NtgnhM+/G6TWws3ixdRMI0yw8LXPfM=;\n b=TDNlZ5zEhbozH84Z2vyzXw59uRcNaKoUq1fXZvd0evu7Dcpjgn3bx9QCYoe6JH5ltjDK\n ve+hmdUhEA7A2LSFazeduLsNIgNSC4AggSOt+UG3f8MFoSoYemEJFJ43am+ROvIBiHd7\n 3sdVL4reC92e1wmb31mBwTpGnx++Uf1UEvoemuQnNrra839K+z1HL0XR28ZWL4scVV4q\n Y/1YbnK7Qpe5sZ+/0blxaia+hiEBtzcGFFi6xczEpN1v6UqJNW6TUl08o+sVomJFS3MW\n vXT8qSZA8EpYhoTmzNDpfPtzmMaMZHxq9Bn2Wsv8MzGiXgqCiDuxB3aHNOueAOi8h/O/ /Q==",
        "From": "Srikanth Yalavarthi <syalavarthi@marvell.com>",
        "To": "Srikanth Yalavarthi <syalavarthi@marvell.com>, Ruifeng Wang\n <ruifeng.wang@arm.com>",
        "CC": "<dev@dpdk.org>, <sshankarnara@marvell.com>, <jerinj@marvell.com>,\n <aprabhu@marvell.com>",
        "Subject": "[PATCH v1 4/4] common/ml: add Arm NEON type conversion routines",
        "Date": "Thu, 8 Dec 2022 11:35:32 -0800",
        "Message-ID": "<20221208193532.16718-5-syalavarthi@marvell.com>",
        "X-Mailer": "git-send-email 2.17.1",
        "In-Reply-To": "<20221208193532.16718-1-syalavarthi@marvell.com>",
        "References": "<20221208193532.16718-1-syalavarthi@marvell.com>",
        "MIME-Version": "1.0",
        "Content-Type": "text/plain",
        "X-Proofpoint-ORIG-GUID": "3iE-h1Ss98HQxxLtoQdhZ4kjJawZjPii",
        "X-Proofpoint-GUID": "3iE-h1Ss98HQxxLtoQdhZ4kjJawZjPii",
        "X-Proofpoint-Virus-Version": "vendor=baseguard\n engine=ICAP:2.0.205,Aquarius:18.0.923,Hydra:6.0.545,FMLib:17.11.122.1\n definitions=2022-12-08_11,2022-12-08_01,2022-06-22_01",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.29",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org"
    },
    "content": "Added ARM NEON intrinsic based implementations to support conversion\nof data types. Support is enabled to handle int8, uint8, int16, uint16,\nfloat16, float32 and bfloat16 types.\n\nSigned-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>\n---\n drivers/common/ml/meson.build     |   5 +\n drivers/common/ml/ml_utils.c      |  48 ++\n drivers/common/ml/ml_utils_neon.c | 950 ++++++++++++++++++++++++++++++\n drivers/common/ml/ml_utils_neon.h |  23 +\n 4 files changed, 1026 insertions(+)\n create mode 100644 drivers/common/ml/ml_utils_neon.c\n create mode 100644 drivers/common/ml/ml_utils_neon.h",
    "diff": "diff --git a/drivers/common/ml/meson.build b/drivers/common/ml/meson.build\nindex 84ae84ee4e..f7ce19b4b4 100644\n--- a/drivers/common/ml/meson.build\n+++ b/drivers/common/ml/meson.build\n@@ -17,6 +17,11 @@ sources = files(\n         'ml_utils_generic.c',\n )\n \n+if arch_subdir == 'arm'\n+    headers += files('ml_utils_neon.h')\n+    sources += files('ml_utils_neon.c')\n+endif\n+\n deps += ['mldev']\n \n pmd_supports_disable_iova_as_pa = true\ndiff --git a/drivers/common/ml/ml_utils.c b/drivers/common/ml/ml_utils.c\nindex e2edef0904..3edcf09fde 100644\n--- a/drivers/common/ml/ml_utils.c\n+++ b/drivers/common/ml/ml_utils.c\n@@ -120,71 +120,119 @@ ml_io_format_to_str(enum rte_ml_io_format format, char *str, int len)\n int\n ml_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_float32_to_int8_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_float32_to_int8_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_int8_to_float32_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_int8_to_float32_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_float32_to_uint8_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_float32_to_uint8_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_uint8_to_float32_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_uint8_to_float32_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_float32_to_int16_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_float32_to_int16_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_int16_to_float32_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_int16_to_float32_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_float32_to_uint16_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_float32_to_uint16_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_uint16_to_float32_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_uint16_to_float32_generic(scale, nb_elements, input, output);\n+#endif\n }\n \n int\n ml_float32_to_float16(uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_float32_to_float16_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_float32_to_float16_generic(nb_elements, input, output);\n+#endif\n }\n \n int\n ml_float16_to_float32(uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_NEON__)\n+\treturn ml_float16_to_float32_neon(scale, nb_elements, input, output);\n+#else\n 
\treturn ml_float16_to_float32_generic(nb_elements, input, output);\n+#endif\n }\n \n int\n ml_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_FEATURE_BF16)\n+\treturn ml_float32_to_bfloat16_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_float32_to_bfloat16_generic(nb_elements, input, output);\n+#endif\n }\n \n int\n ml_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)\n {\n+#if defined(__ARM_FEATURE_BF16)\n+\treturn ml_bfloat16_to_float32_neon(scale, nb_elements, input, output);\n+#else\n \treturn ml_bfloat16_to_float32_generic(nb_elements, input, output);\n+#endif\n }\ndiff --git a/drivers/common/ml/ml_utils_neon.c b/drivers/common/ml/ml_utils_neon.c\nnew file mode 100644\nindex 0000000000..b660de07ec\n--- /dev/null\n+++ b/drivers/common/ml/ml_utils_neon.c\n@@ -0,0 +1,950 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright (c) 2022 Marvell.\n+ */\n+\n+#include <errno.h>\n+#include <math.h>\n+#include <stdint.h>\n+\n+#include <rte_common.h>\n+#include <rte_vect.h>\n+\n+#include \"ml_utils.h\"\n+#include \"ml_utils_neon.h\"\n+\n+#include <arm_neon.h>\n+\n+static void\n+__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)\n+{\n+\tint16x4_t s16x4_l;\n+\tint16x4_t s16x4_h;\n+\tfloat32x4_t f32x4;\n+\tint16x8_t s16x8;\n+\tint32x4_t s32x4;\n+\tint32x4_t vmin;\n+\tint32x4_t vmax;\n+\tint8x8_t s8x8;\n+\n+\t/* set constants */\n+\tvmin = vdupq_n_s32(INT8_MIN);\n+\tvmax = vdupq_n_s32(INT8_MAX);\n+\n+\t/* load 4 float32 elements, scale, convert, update ranges and narrow to int16.\n+\t * Use round to nearest with ties away rounding mode.\n+\t */\n+\tf32x4 = vld1q_f32(input);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\ts32x4 = vcvtaq_s32_f32(f32x4);\n+\ts32x4 = vminq_s32(s32x4, vmax);\n+\ts32x4 = vmaxq_s32(s32x4, vmin);\n+\ts16x4_l = vmovn_s32(s32x4);\n+\n+\t/* load next 4 float32 elements, scale, convert, update ranges and narrow to int16.\n+\t * Use round to nearest with ties away rounding mode.\n+\t */\n+\tf32x4 = vld1q_f32(input + 4);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\ts32x4 = vcvtaq_s32_f32(f32x4);\n+\ts32x4 = vminq_s32(s32x4, vmax);\n+\ts32x4 = vmaxq_s32(s32x4, vmin);\n+\ts16x4_h = vmovn_s32(s32x4);\n+\n+\t/* combine lower and higher int16x4_t to int16x8_t */\n+\ts16x8 = vcombine_s16(s16x4_l, s16x4_h);\n+\n+\t/* narrow to int8_t */\n+\ts8x8 = vmovn_s16(s16x8);\n+\n+\t/* store 8 elements */\n+\tvst1_s8(output, s8x8);\n+}\n+\n+static void\n+__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)\n+{\n+\tfloat32x2_t f32x2;\n+\tint32x2_t s32x2;\n+\tint32x2_t vmin;\n+\tint32x2_t vmax;\n+\tint8x8_t s8x8;\n+\n+\t/* set constants */\n+\tvmin = vdup_n_s32(INT8_MIN);\n+\tvmax = vdup_n_s32(INT8_MAX);\n+\n+\t/* load element to 2 lanes */\n+\tf32x2 = vld1_dup_f32(input);\n+\n+\t/* scale */\n+\tf32x2 = vmul_n_f32(f32x2, scale);\n+\n+\t/* convert with use round to nearest with ties away rounding mode */\n+\ts32x2 = vcvta_s32_f32(f32x2);\n+\n+\t/* update range [INT8_MIN:INT8_MAX] */\n+\ts32x2 = vmin_s32(s32x2, vmax);\n+\ts32x2 = vmax_s32(s32x2, vmin);\n+\n+\t/* convert to int8_t */\n+\ts8x8 = vreinterpret_s8_s32(s32x2);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_s8(output, s8x8, 0);\n+}\n+\n+int\n+ml_float32_to_int8_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat *input_buffer;\n+\tint8_t *output_buffer;\n+\tuint32_t batch_size;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn 
-EINVAL;\n+\n+\tinput_buffer = (float *)input;\n+\toutput_buffer = (int8_t *)output;\n+\tbatch_size = 2 * sizeof(float) / sizeof(int8_t);\n+\n+\t/* convert batch_size elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / batch_size); i++) {\n+\t\t__float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += batch_size;\n+\t\toutput_buffer += batch_size;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * batch_size;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tint16x8_t s16x8;\n+\tint16x4_t s16x4;\n+\tint32x4_t s32x4;\n+\tint8x8_t s8x8;\n+\n+\t/* load 8 x int8_t elements */\n+\ts8x8 = vld1_s8(input);\n+\n+\t/* widen int8_t to int16_t */\n+\ts16x8 = vmovl_s8(s8x8);\n+\n+\t/* convert lower 4 elements: widen to int32_t, convert to float, scale and store */\n+\ts16x4 = vget_low_s16(s16x8);\n+\ts32x4 = vmovl_s16(s16x4);\n+\tf32x4 = vcvtq_f32_s32(s32x4);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\tvst1q_f32(output, f32x4);\n+\n+\t/* convert higher 4 elements: widen to int32_t, convert to float, scale and store */\n+\ts16x4 = vget_high_s16(s16x8);\n+\ts32x4 = vmovl_s16(s16x4);\n+\tf32x4 = vcvtq_f32_s32(s32x4);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\tvst1q_f32(output + 4, f32x4);\n+}\n+\n+static void\n+__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)\n+{\n+\t*output = scale * vcvts_f32_s32((int32_t)*input);\n+}\n+\n+int\n+ml_int8_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tint8_t *input_buffer;\n+\tfloat *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (int8_t *)input;\n+\toutput_buffer = (float *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(int8_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)\n+{\n+\tuint16x4_t u16x4_l;\n+\tuint16x4_t u16x4_h;\n+\tfloat32x4_t f32x4;\n+\tuint32x4_t u32x4;\n+\tuint16x8_t u16x8;\n+\tuint32x4_t vmax;\n+\tuint8x8_t u8x8;\n+\n+\t/* set constants */\n+\tvmax = vdupq_n_u32(UINT8_MAX);\n+\n+\t/* load 4 float elements, scale, convert, update range and narrow to uint16_t.\n+\t * use round to nearest with ties away rounding mode.\n+\t */\n+\tf32x4 = vld1q_f32(input);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\tu32x4 = vcvtaq_u32_f32(f32x4);\n+\tu32x4 = vminq_u32(u32x4, vmax);\n+\tu16x4_l = vmovn_u32(u32x4);\n+\n+\t/* load next 4 float elements, scale, convert, update range and narrow to uint16_t\n+\t * use round to nearest with ties away rounding mode.\n+\t */\n+\tf32x4 = vld1q_f32(input + 4);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\tu32x4 = vcvtaq_u32_f32(f32x4);\n+\tu32x4 = vminq_u32(u32x4, vmax);\n+\tu16x4_h = vmovn_u32(u32x4);\n+\n+\t/* combine lower and higher uint16x4_t 
*/\n+\tu16x8 = vcombine_u16(u16x4_l, u16x4_h);\n+\n+\t/* narrow to uint8x8_t */\n+\tu8x8 = vmovn_u16(u16x8);\n+\n+\t/* store 8 elements */\n+\tvst1_u8(output, u8x8);\n+}\n+\n+static void\n+__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)\n+{\n+\tfloat32x2_t f32x2;\n+\tuint32x2_t u32x2;\n+\tuint32x2_t vmax;\n+\tuint8x8_t u8x8;\n+\n+\t/* set constants */\n+\tvmax = vdup_n_u32(UINT8_MAX);\n+\n+\t/* load element to 2 lanes */\n+\tf32x2 = vld1_dup_f32(input);\n+\n+\t/* scale */\n+\tf32x2 = vmul_n_f32(f32x2, scale);\n+\n+\t/* convert to uin32_t using round to nearest with ties away rounding mode */\n+\tu32x2 = vcvta_u32_f32(f32x2);\n+\n+\t/* update range [0:UINT8_MAX] */\n+\tu32x2 = vmin_u32(u32x2, vmax);\n+\n+\t/* convert to uint8x8_t */\n+\tu8x8 = vreinterpret_u8_u32(u32x2);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_u8(output, u8x8, 0);\n+}\n+\n+int\n+ml_float32_to_uint8_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat *input_buffer;\n+\tuint8_t *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float *)input;\n+\toutput_buffer = (uint8_t *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(uint8_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tuint16x8_t u16x8;\n+\tuint16x4_t u16x4;\n+\tuint32x4_t u32x4;\n+\tuint8x8_t u8x8;\n+\n+\t/* load 8 x uint8_t elements */\n+\tu8x8 = vld1_u8(input);\n+\n+\t/* widen uint8_t to uint16_t */\n+\tu16x8 = vmovl_u8(u8x8);\n+\n+\t/* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */\n+\tu16x4 = vget_low_u16(u16x8);\n+\tu32x4 = vmovl_u16(u16x4);\n+\tf32x4 = vcvtq_f32_u32(u32x4);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\tvst1q_f32(output, f32x4);\n+\n+\t/* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */\n+\tu16x4 = vget_high_u16(u16x8);\n+\tu32x4 = vmovl_u16(u16x4);\n+\tf32x4 = vcvtq_f32_u32(u32x4);\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\tvst1q_f32(output + 4, f32x4);\n+}\n+\n+static void\n+__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)\n+{\n+\t*output = scale * vcvts_f32_u32((uint32_t)*input);\n+}\n+\n+int\n+ml_uint8_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tuint8_t *input_buffer;\n+\tfloat *output_buffer;\n+\tuint64_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (uint8_t *)input;\n+\toutput_buffer = (float *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(uint8_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) 
{\n+\t\t__uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tint16x4_t s16x4;\n+\tint32x4_t s32x4;\n+\tint32x4_t vmin;\n+\tint32x4_t vmax;\n+\n+\t/* set constants */\n+\tvmin = vdupq_n_s32(INT16_MIN);\n+\tvmax = vdupq_n_s32(INT16_MAX);\n+\n+\t/* load 4 x float elements */\n+\tf32x4 = vld1q_f32(input);\n+\n+\t/* scale */\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\n+\t/* convert to int32x4_t using round to nearest with ties away rounding mode */\n+\ts32x4 = vcvtaq_s32_f32(f32x4);\n+\n+\t/* update range [INT16_MIN:INT16_MAX] */\n+\ts32x4 = vminq_s32(s32x4, vmax);\n+\ts32x4 = vmaxq_s32(s32x4, vmin);\n+\n+\t/* narrow to int16x4_t */\n+\ts16x4 = vmovn_s32(s32x4);\n+\n+\t/* store 4 elements */\n+\tvst1_s16(output, s16x4);\n+}\n+\n+static void\n+__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)\n+{\n+\tfloat32x2_t f32x2;\n+\tint32x2_t s32x2;\n+\tint16x4_t s16x4;\n+\tint32x2_t vmin;\n+\tint32x2_t vmax;\n+\n+\t/* set constants */\n+\tvmin = vdup_n_s32(INT16_MIN);\n+\tvmax = vdup_n_s32(INT16_MAX);\n+\n+\t/* load element to 2 lanes */\n+\tf32x2 = vld1_dup_f32(input);\n+\n+\t/* scale */\n+\tf32x2 = vmul_n_f32(f32x2, scale);\n+\n+\t/* convert using round to nearest with ties to away rounding mode */\n+\ts32x2 = vcvta_s32_f32(f32x2);\n+\n+\t/* update range [INT16_MIN:INT16_MAX] */\n+\ts32x2 = vmin_s32(s32x2, vmax);\n+\ts32x2 = vmax_s32(s32x2, vmin);\n+\n+\t/* convert to int16x4_t */\n+\ts16x4 = vreinterpret_s16_s32(s32x2);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_s16(output, s16x4, 0);\n+}\n+\n+int\n+ml_float32_to_int16_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat *input_buffer;\n+\tint16_t *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float *)input;\n+\toutput_buffer = (int16_t *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(int16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tint16x4_t s16x4;\n+\tint32x4_t s32x4;\n+\n+\t/* load 4 x int16_t elements */\n+\ts16x4 = vld1_s16(input);\n+\n+\t/* widen int16_t to int32_t */\n+\ts32x4 = vmovl_s16(s16x4);\n+\n+\t/* convert uint32_t to float */\n+\tf32x4 = vcvtq_f32_s32(s32x4);\n+\n+\t/* scale */\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\n+\t/* store float32x4_t */\n+\tvst1q_f32(output, f32x4);\n+}\n+\n+static void\n+__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)\n+{\n+\t*output = scale * vcvts_f32_s32((int32_t)*input);\n+}\n+\n+int\n+ml_int16_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tint16_t *input_buffer;\n+\tfloat *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || 
(output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (int16_t *)input;\n+\toutput_buffer = (float *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(int16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tuint16x4_t u16x4;\n+\tuint32x4_t u32x4;\n+\tuint32x4_t vmax;\n+\n+\t/* set constants */\n+\tvmax = vdupq_n_u32(UINT16_MAX);\n+\n+\t/* load 4 float elements */\n+\tf32x4 = vld1q_f32(input);\n+\n+\t/* scale */\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\n+\t/* convert using round to nearest with ties to away rounding mode */\n+\tu32x4 = vcvtaq_u32_f32(f32x4);\n+\n+\t/* update range [0:UINT16_MAX] */\n+\tu32x4 = vminq_u32(u32x4, vmax);\n+\n+\t/* narrow */\n+\tu16x4 = vmovn_u32(u32x4);\n+\n+\t/* store 4 elements */\n+\tvst1_u16(output, u16x4);\n+}\n+\n+static void\n+__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)\n+{\n+\tfloat32x2_t f32x2;\n+\tuint16x4_t u16x4;\n+\tint32x2_t s32x2;\n+\tint32x2_t vmax;\n+\n+\t/* set constants */\n+\tvmax = vdup_n_s32(UINT16_MAX);\n+\n+\t/* load element to 2 lanes */\n+\tf32x2 = vld1_dup_f32(input);\n+\n+\t/* scale */\n+\tf32x2 = vmul_n_f32(f32x2, scale);\n+\n+\t/* convert using round to nearest with ties to away rounding mode */\n+\ts32x2 = vcvta_s32_f32(f32x2);\n+\n+\t/* update range [0:UINT16_MAX] */\n+\ts32x2 = vmin_s32(s32x2, vmax);\n+\n+\t/* convert to uint16x4_t */\n+\tu16x4 = vreinterpret_u16_s32(s32x2);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_u16(output, u16x4, 0);\n+}\n+\n+int\n+ml_float32_to_uint16_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat *input_buffer;\n+\tuint16_t *output_buffer;\n+\tuint64_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float *)input;\n+\toutput_buffer = (uint16_t *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(uint16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tuint16x4_t u16x4;\n+\tuint32x4_t u32x4;\n+\n+\t/* load 4 x uint16_t elements */\n+\tu16x4 = vld1_u16(input);\n+\n+\t/* widen uint16_t to uint32_t */\n+\tu32x4 = vmovl_u16(u16x4);\n+\n+\t/* convert uint32_t to float */\n+\tf32x4 = vcvtq_f32_u32(u32x4);\n+\n+\t/* scale */\n+\tf32x4 = vmulq_n_f32(f32x4, scale);\n+\n+\t/* store float32x4_t */\n+\tvst1q_f32(output, f32x4);\n+}\n+\n+static void\n+__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)\n+{\n+\t*output = scale * 
vcvts_f32_u32((uint32_t)*input);\n+}\n+\n+int\n+ml_uint16_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output)\n+{\n+\tuint16_t *input_buffer;\n+\tfloat *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (uint16_t *)input;\n+\toutput_buffer = (float *)output;\n+\tvlen = 2 * sizeof(float) / sizeof(uint16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tfloat16x4_t f16x4;\n+\n+\t/* load 4 x float32_t elements */\n+\tf32x4 = vld1q_f32(input);\n+\n+\t/* convert to float16x4_t */\n+\tf16x4 = vcvt_f16_f32(f32x4);\n+\n+\t/* store float16x4_t */\n+\tvst1_f16(output, f16x4);\n+}\n+\n+static void\n+__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tfloat16x4_t f16x4;\n+\n+\t/* load element to 4 lanes */\n+\tf32x4 = vld1q_dup_f32(input);\n+\n+\t/* convert float32_t to float16_t */\n+\tf16x4 = vcvt_f16_f32(f32x4);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_f16(output, f16x4, 0);\n+}\n+\n+int\n+ml_float32_to_float16_neon(uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat32_t *input_buffer;\n+\tfloat16_t *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float32_t *)input;\n+\toutput_buffer = (float16_t *)output;\n+\tvlen = 2 * sizeof(float32_t) / sizeof(float16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__float32_to_float16_neon_f16x4(input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float32_to_float16_neon_f16x1(input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)\n+{\n+\tfloat16x4_t f16x4;\n+\tfloat32x4_t f32x4;\n+\n+\t/* load 4 x float16_t elements */\n+\tf16x4 = vld1_f16(input);\n+\n+\t/* convert float16x4_t to float32x4_t */\n+\tf32x4 = vcvt_f32_f16(f16x4);\n+\n+\t/* store float32x4_t */\n+\tvst1q_f32(output, f32x4);\n+}\n+\n+static void\n+__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)\n+{\n+\tfloat16x4_t f16x4;\n+\tfloat32x4_t f32x4;\n+\n+\t/* load element to 4 lanes */\n+\tf16x4 = vld1_dup_f16(input);\n+\n+\t/* convert float16_t to float32_t */\n+\tf32x4 = vcvt_f32_f16(f16x4);\n+\n+\t/* store 1 element */\n+\tvst1q_lane_f32(output, f32x4, 0);\n+}\n+\n+int\n+ml_float16_to_float32_neon(uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat16_t *input_buffer;\n+\tfloat32_t *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float16_t 
*)input;\n+\toutput_buffer = (float32_t *)output;\n+\tvlen = 2 * sizeof(float32_t) / sizeof(float16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__float16_to_float32_neon_f32x4(input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float16_to_float32_neon_f32x1(input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+#ifdef __ARM_FEATURE_BF16\n+\n+static void\n+__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tbfloat16x4_t bf16x4;\n+\n+\t/* load 4 x float32_t elements */\n+\tf32x4 = vld1q_f32(input);\n+\n+\t/* convert float32x4_t to bfloat16x4_t */\n+\tbf16x4 = vcvt_bf16_f32(f32x4);\n+\n+\t/* store bfloat16x4_t */\n+\tvst1_bf16(output, bf16x4);\n+}\n+\n+static void\n+__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tbfloat16x4_t bf16x4;\n+\n+\t/* load element to 4 lanes */\n+\tf32x4 = vld1q_dup_f32(input);\n+\n+\t/* convert float32_t to bfloat16_t */\n+\tbf16x4 = vcvt_bf16_f32(f32x4);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_bf16(output, bf16x4, 0);\n+}\n+\n+int\n+ml_float32_to_bfloat16_neon(uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat32_t *input_buffer;\n+\tbfloat16_t *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float32_t *)input;\n+\toutput_buffer = (bfloat16_t *)output;\n+\tvlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static void\n+__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)\n+{\n+\tbfloat16x4_t bf16x4;\n+\tfloat32x4_t f32x4;\n+\n+\t/* load 4 x bfloat16_t elements */\n+\tbf16x4 = vld1_bf16(input);\n+\n+\t/* convert bfloat16x4_t to float32x4_t */\n+\tf32x4 = vcvt_f32_bf16(bf16x4);\n+\n+\t/* store float32x4_t */\n+\tvst1q_f32(output, f32x4);\n+}\n+\n+static void\n+__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)\n+{\n+\tbfloat16x4_t bf16x4;\n+\tfloat32x4_t f32x4;\n+\n+\t/* load element to 4 lanes */\n+\tbf16x4 = vld1_dup_bf16(input);\n+\n+\t/* convert bfloat16_t to float32_t */\n+\tf32x4 = vcvt_f32_bf16(bf16x4);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1q_lane_f32(output, f32x4, 0);\n+}\n+\n+int\n+ml_bfloat16_to_float32_neon(uint64_t nb_elements, void *input, void *output)\n+{\n+\tbfloat16_t *input_buffer;\n+\tfloat32_t *output_buffer;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (bfloat16_t *)input;\n+\toutput_buffer = (float32_t *)output;\n+\tvlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < (nb_elements / vlen); i++) {\n+\t\t__bfloat16_to_float32_neon_f32x4(input_buffer, 
output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+#endif /* __ARM_FEATURE_BF16 */\ndiff --git a/drivers/common/ml/ml_utils_neon.h b/drivers/common/ml/ml_utils_neon.h\nnew file mode 100644\nindex 0000000000..d912049779\n--- /dev/null\n+++ b/drivers/common/ml/ml_utils_neon.h\n@@ -0,0 +1,23 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright (c) 2022 Marvell.\n+ */\n+\n+#ifndef _ML_UTILS_NEON_H_\n+#define _ML_UTILS_NEON_H_\n+\n+#include <stdint.h>\n+\n+int ml_float32_to_int8_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_int8_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_float32_to_uint8_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_uint8_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_float32_to_int16_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_int16_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_float32_to_uint16_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_uint16_to_float32_neon(float scale, uint64_t nb_elements, void *input, void *output);\n+int ml_float32_to_float16_neon(uint64_t nb_elements, void *input, void *output);\n+int ml_float16_to_float32_neon(uint64_t nb_elements, void *input, void *output);\n+int ml_float32_to_bfloat16_neon(uint64_t nb_elements, void *input, void *output);\n+int ml_bfloat16_to_float32_neon(uint64_t nb_elements, void *input, void *output);\n+\n+#endif /*_ML_UTILS_NEON_H_ */\n",
    "prefixes": [
        "v1",
        "4/4"
    ]
}
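
For the "patch" and "put" operations, a hedged sketch of a partial update follows. It assumes token authentication with maintainer rights on the project, and the writable field names ("state", "archived") are inferred from the response above rather than confirmed by this page; the token value is a placeholder.

    import requests

    BASE = "https://patches.dpdk.org/api"

    # Write access requires an API token for an account with maintainer
    # rights on the project; "<your-api-token>" is a placeholder.
    headers = {"Authorization": "Token <your-api-token>"}

    # Partial update (PATCH): only the fields in the request body change.
    # A full update (PUT) would instead require all writable fields.
    resp = requests.patch(
        f"{BASE}/patches/120599/",
        headers=headers,
        json={"state": "superseded", "archived": True},
    )
    resp.raise_for_status()
    print(resp.json()["state"])
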