get:
Show a patch.

patch:
Partially update a patch (only the fields provided are changed).

put:
Fully update a patch (all writable fields are replaced).

GET /api/patches/125064/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 125064,
    "url": "http://patches.dpdk.org/api/patches/125064/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20230313114342.10812-1-syalavarthi@marvell.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20230313114342.10812-1-syalavarthi@marvell.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20230313114342.10812-1-syalavarthi@marvell.com",
    "date": "2023-03-13T11:43:42",
    "name": "[1/1] mldev: split bfloat16 routines to separate files",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "62d96dcad2bd9641c60df34fd6bbae8a7425eefb",
    "submitter": {
        "id": 2480,
        "url": "http://patches.dpdk.org/api/people/2480/?format=api",
        "name": "Srikanth Yalavarthi",
        "email": "syalavarthi@marvell.com"
    },
    "delegate": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/users/1/?format=api",
        "username": "tmonjalo",
        "first_name": "Thomas",
        "last_name": "Monjalon",
        "email": "thomas@monjalon.net"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20230313114342.10812-1-syalavarthi@marvell.com/mbox/",
    "series": [
        {
            "id": 27356,
            "url": "http://patches.dpdk.org/api/series/27356/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=27356",
            "date": "2023-03-13T11:43:42",
            "name": "[1/1] mldev: split bfloat16 routines to separate files",
            "version": 1,
            "mbox": "http://patches.dpdk.org/series/27356/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/125064/comments/",
    "check": "warning",
    "checks": "http://patches.dpdk.org/api/patches/125064/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 6ACF941DCC;\n\tMon, 13 Mar 2023 12:43:51 +0100 (CET)",
            "from mails.dpdk.org (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id 08A7941140;\n\tMon, 13 Mar 2023 12:43:51 +0100 (CET)",
            "from mx0b-0016f401.pphosted.com (mx0b-0016f401.pphosted.com\n [67.231.156.173])\n by mails.dpdk.org (Postfix) with ESMTP id 13B3B40151\n for <dev@dpdk.org>; Mon, 13 Mar 2023 12:43:49 +0100 (CET)",
            "from pps.filterd (m0045851.ppops.net [127.0.0.1])\n by mx0b-0016f401.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id\n 32DBKW34028251; Mon, 13 Mar 2023 04:43:47 -0700",
            "from dc5-exch02.marvell.com ([199.233.59.182])\n by mx0b-0016f401.pphosted.com (PPS) with ESMTPS id 3p8t1t5ada-1\n (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384 bits=256 verify=NOT);\n Mon, 13 Mar 2023 04:43:47 -0700",
            "from DC5-EXCH02.marvell.com (10.69.176.39) by DC5-EXCH02.marvell.com\n (10.69.176.39) with Microsoft SMTP Server (TLS) id 15.0.1497.42;\n Mon, 13 Mar 2023 04:43:44 -0700",
            "from maili.marvell.com (10.69.176.80) by DC5-EXCH02.marvell.com\n (10.69.176.39) with Microsoft SMTP Server id 15.0.1497.42 via Frontend\n Transport; Mon, 13 Mar 2023 04:43:44 -0700",
            "from ml-host-33.caveonetworks.com (unknown [10.110.143.233])\n by maili.marvell.com (Postfix) with ESMTP id 93A6B3F704A;\n Mon, 13 Mar 2023 04:43:43 -0700 (PDT)"
        ],
        "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=marvell.com;\n h=from : to : cc :\n subject : date : message-id : mime-version : content-type; s=pfpt0220;\n bh=GZJoGHaOTXGj6Knnt2q3zdE9KdSkpMtLOjmIsBtJDzc=;\n b=ebOBKfmlPMA+7fQWZxYxzCEpjkGJ27rpxcDC3wSDsXYnZE+MOFMbjJsrrSxcGMAF3c0d\n KjyOBxLIIYxwdvEwnQ8izlneZeQiOf+z89zSeUXrucZT+ffonq+Ez1KGQ3itRnTQ0RgJ\n yzWj/XH+523bC/nLPzS7mzMZWa96e38W/n7i+5xUNxBTedpYXte7cjH5CQmk8SAbXyRp\n iprnVqjIwt4gtTrMcpq2tkUoIToekvfBkK8TXoDwkzPYhuoVj0jjwxBS/zUjf0KgvU94\n S61ntAsAIAnUUs1XOvriSq9GeJB81GkuVkCKHq2BhUaDiuYXCaoOEzaVf/tCdDiWsSpr yg==",
        "From": "Srikanth Yalavarthi <syalavarthi@marvell.com>",
        "To": "Srikanth Yalavarthi <syalavarthi@marvell.com>, Ruifeng Wang\n <ruifeng.wang@arm.com>",
        "CC": "<dev@dpdk.org>, <sshankarnara@marvell.com>, <david.marchand@redhat.com>",
        "Subject": "[PATCH 1/1] mldev: split bfloat16 routines to separate files",
        "Date": "Mon, 13 Mar 2023 04:43:42 -0700",
        "Message-ID": "<20230313114342.10812-1-syalavarthi@marvell.com>",
        "X-Mailer": "git-send-email 2.17.1",
        "MIME-Version": "1.0",
        "Content-Type": "text/plain",
        "X-Proofpoint-GUID": "BCjbRjO8MTTy5m1LLlpnmCavhqxeDTAs",
        "X-Proofpoint-ORIG-GUID": "BCjbRjO8MTTy5m1LLlpnmCavhqxeDTAs",
        "X-Proofpoint-Virus-Version": "vendor=baseguard\n engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22\n definitions=2023-03-13_05,2023-03-13_01,2023-02-09_01",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.29",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org"
    },
    "content": "Since bfloat16 intrinsics are not supported on all ARM platforms\nthat support NEON, bfloat16 routines are moved to separate files.\nThis would enable using scalar implementation for bfloat16 on\nunsupported ARM platforms.\n\nBugzilla ID: 1179\nFixes: fc54766b1612 (\"mldev: add Arm NEON type conversion\")\n\nSigned-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>\n---\nDepends-on: patch-120653 (\"mldev: remove weak symbols use in type conversions\")\n\n lib/mldev/meson.build                   |  11 +-\n lib/mldev/mldev_utils_neon.c            | 142 +-----------\n lib/mldev/mldev_utils_neon_bfloat16.c   | 154 +++++++++++++\n lib/mldev/mldev_utils_scalar.c          | 278 +-----------------------\n lib/mldev/mldev_utils_scalar.h          |  80 +++++++\n lib/mldev/mldev_utils_scalar_bfloat16.c | 197 +++++++++++++++++\n 6 files changed, 453 insertions(+), 409 deletions(-)\n create mode 100644 lib/mldev/mldev_utils_neon_bfloat16.c\n create mode 100644 lib/mldev/mldev_utils_scalar.h\n create mode 100644 lib/mldev/mldev_utils_scalar_bfloat16.c\n\n--\n2.17.1",
    "diff": "diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build\nindex c9db42257b..5769b0640a 100644\n--- a/lib/mldev/meson.build\n+++ b/lib/mldev/meson.build\n@@ -7,12 +7,21 @@ sources = files(\n         'mldev_utils.c',\n )\n\n-if dpdk_conf.has('RTE_ARCH_ARM64')\n+if (dpdk_conf.has('RTE_ARCH_ARM64') and\n+    cc.get_define('__ARM_NEON', args: machine_args) != '')\n     sources += files('mldev_utils_neon.c')\n else\n     sources += files('mldev_utils_scalar.c')\n endif\n\n+if (dpdk_conf.has('RTE_ARCH_ARM64') and\n+    cc.get_define('__ARM_NEON', args: machine_args) != '' and\n+    cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')\n+    sources += files('mldev_utils_neon_bfloat16.c')\n+else\n+    sources += files('mldev_utils_scalar_bfloat16.c')\n+endif\n+\n headers = files(\n         'rte_mldev.h',\n )\ndiff --git a/lib/mldev/mldev_utils_neon.c b/lib/mldev/mldev_utils_neon.c\nindex 32b620db20..c7baec012b 100644\n--- a/lib/mldev/mldev_utils_neon.c\n+++ b/lib/mldev/mldev_utils_neon.c\n@@ -12,8 +12,8 @@\n\n /* Description:\n  * This file implements vector versions of Machine Learning utility functions used to convert data\n- * types from higher precision to lower precision and vice-versa. Implementation is based on Arm\n- * Neon intrinsics.\n+ * types from higher precision to lower precision and vice-versa, except bfloat16. 
Implementation\n+ * is based on Arm Neon intrinsics.\n  */\n\n static inline void\n@@ -733,141 +733,3 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)\n\n \treturn 0;\n }\n-\n-#ifdef __ARM_FEATURE_BF16\n-\n-static inline void\n-__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)\n-{\n-\tfloat32x4_t f32x4;\n-\tbfloat16x4_t bf16x4;\n-\n-\t/* load 4 x float32_t elements */\n-\tf32x4 = vld1q_f32(input);\n-\n-\t/* convert float32x4_t to bfloat16x4_t */\n-\tbf16x4 = vcvt_bf16_f32(f32x4);\n-\n-\t/* store bfloat16x4_t */\n-\tvst1_bf16(output, bf16x4);\n-}\n-\n-static inline void\n-__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)\n-{\n-\tfloat32x4_t f32x4;\n-\tbfloat16x4_t bf16x4;\n-\n-\t/* load element to 4 lanes */\n-\tf32x4 = vld1q_dup_f32(input);\n-\n-\t/* convert float32_t to bfloat16_t */\n-\tbf16x4 = vcvt_bf16_f32(f32x4);\n-\n-\t/* store lane 0 / 1 element */\n-\tvst1_lane_bf16(output, bf16x4, 0);\n-}\n-\n-int\n-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)\n-{\n-\tfloat32_t *input_buffer;\n-\tbfloat16_t *output_buffer;\n-\tuint64_t nb_iterations;\n-\tuint32_t vlen;\n-\tuint64_t i;\n-\n-\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n-\t\treturn -EINVAL;\n-\n-\tinput_buffer = (float32_t *)input;\n-\toutput_buffer = (bfloat16_t *)output;\n-\tvlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);\n-\tnb_iterations = nb_elements / vlen;\n-\n-\t/* convert vlen elements in each iteration */\n-\tfor (i = 0; i < nb_iterations; i++) {\n-\t\t__float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);\n-\t\tinput_buffer += vlen;\n-\t\toutput_buffer += vlen;\n-\t}\n-\n-\t/* convert leftover elements */\n-\ti = i * vlen;\n-\tfor (; i < nb_elements; i++) {\n-\t\t__float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);\n-\t\tinput_buffer++;\n-\t\toutput_buffer++;\n-\t}\n-\n-\treturn 0;\n-}\n-\n-static inline 
void\n-__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)\n-{\n-\tbfloat16x4_t bf16x4;\n-\tfloat32x4_t f32x4;\n-\n-\t/* load 4 x bfloat16_t elements */\n-\tbf16x4 = vld1_bf16(input);\n-\n-\t/* convert bfloat16x4_t to float32x4_t */\n-\tf32x4 = vcvt_f32_bf16(bf16x4);\n-\n-\t/* store float32x4_t */\n-\tvst1q_f32(output, f32x4);\n-}\n-\n-static inline void\n-__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)\n-{\n-\tbfloat16x4_t bf16x4;\n-\tfloat32x4_t f32x4;\n-\n-\t/* load element to 4 lanes */\n-\tbf16x4 = vld1_dup_bf16(input);\n-\n-\t/* convert bfloat16_t to float32_t */\n-\tf32x4 = vcvt_f32_bf16(bf16x4);\n-\n-\t/* store lane 0 / 1 element */\n-\tvst1q_lane_f32(output, f32x4, 0);\n-}\n-\n-int\n-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)\n-{\n-\tbfloat16_t *input_buffer;\n-\tfloat32_t *output_buffer;\n-\tuint64_t nb_iterations;\n-\tuint32_t vlen;\n-\tuint64_t i;\n-\n-\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n-\t\treturn -EINVAL;\n-\n-\tinput_buffer = (bfloat16_t *)input;\n-\toutput_buffer = (float32_t *)output;\n-\tvlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);\n-\tnb_iterations = nb_elements / vlen;\n-\n-\t/* convert vlen elements in each iteration */\n-\tfor (i = 0; i < nb_iterations; i++) {\n-\t\t__bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);\n-\t\tinput_buffer += vlen;\n-\t\toutput_buffer += vlen;\n-\t}\n-\n-\t/* convert leftover elements */\n-\ti = i * vlen;\n-\tfor (; i < nb_elements; i++) {\n-\t\t__bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);\n-\t\tinput_buffer++;\n-\t\toutput_buffer++;\n-\t}\n-\n-\treturn 0;\n-}\n-\n-#endif /* __ARM_FEATURE_BF16 */\ndiff --git a/lib/mldev/mldev_utils_neon_bfloat16.c b/lib/mldev/mldev_utils_neon_bfloat16.c\nnew file mode 100644\nindex 0000000000..8dec3fd834\n--- /dev/null\n+++ b/lib/mldev/mldev_utils_neon_bfloat16.c\n@@ -0,0 +1,154 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright 
(c) 2023 Marvell.\n+ */\n+\n+#include <errno.h>\n+#include <stdint.h>\n+#include <stdlib.h>\n+\n+#include \"mldev_utils.h\"\n+\n+#include <arm_neon.h>\n+\n+/* Description:\n+ * This file implements vector versions of Machine Learning utility functions used to convert data\n+ * types from bfloat16 to float and vice-versa. Implementation is based on Arm Neon intrinsics.\n+ */\n+\n+#ifdef __ARM_FEATURE_BF16\n+\n+static inline void\n+__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tbfloat16x4_t bf16x4;\n+\n+\t/* load 4 x float32_t elements */\n+\tf32x4 = vld1q_f32(input);\n+\n+\t/* convert float32x4_t to bfloat16x4_t */\n+\tbf16x4 = vcvt_bf16_f32(f32x4);\n+\n+\t/* store bfloat16x4_t */\n+\tvst1_bf16(output, bf16x4);\n+}\n+\n+static inline void\n+__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)\n+{\n+\tfloat32x4_t f32x4;\n+\tbfloat16x4_t bf16x4;\n+\n+\t/* load element to 4 lanes */\n+\tf32x4 = vld1q_dup_f32(input);\n+\n+\t/* convert float32_t to bfloat16_t */\n+\tbf16x4 = vcvt_bf16_f32(f32x4);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1_lane_bf16(output, bf16x4, 0);\n+}\n+\n+int\n+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat32_t *input_buffer;\n+\tbfloat16_t *output_buffer;\n+\tuint64_t nb_iterations;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float32_t *)input;\n+\toutput_buffer = (bfloat16_t *)output;\n+\tvlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);\n+\tnb_iterations = nb_elements / vlen;\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < nb_iterations; i++) {\n+\t\t__float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) 
{\n+\t\t__float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static inline void\n+__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)\n+{\n+\tbfloat16x4_t bf16x4;\n+\tfloat32x4_t f32x4;\n+\n+\t/* load 4 x bfloat16_t elements */\n+\tbf16x4 = vld1_bf16(input);\n+\n+\t/* convert bfloat16x4_t to float32x4_t */\n+\tf32x4 = vcvt_f32_bf16(bf16x4);\n+\n+\t/* store float32x4_t */\n+\tvst1q_f32(output, f32x4);\n+}\n+\n+static inline void\n+__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)\n+{\n+\tbfloat16x4_t bf16x4;\n+\tfloat32x4_t f32x4;\n+\n+\t/* load element to 4 lanes */\n+\tbf16x4 = vld1_dup_bf16(input);\n+\n+\t/* convert bfloat16_t to float32_t */\n+\tf32x4 = vcvt_f32_bf16(bf16x4);\n+\n+\t/* store lane 0 / 1 element */\n+\tvst1q_lane_f32(output, f32x4, 0);\n+}\n+\n+int\n+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)\n+{\n+\tbfloat16_t *input_buffer;\n+\tfloat32_t *output_buffer;\n+\tuint64_t nb_iterations;\n+\tuint32_t vlen;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (bfloat16_t *)input;\n+\toutput_buffer = (float32_t *)output;\n+\tvlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);\n+\tnb_iterations = nb_elements / vlen;\n+\n+\t/* convert vlen elements in each iteration */\n+\tfor (i = 0; i < nb_iterations; i++) {\n+\t\t__bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);\n+\t\tinput_buffer += vlen;\n+\t\toutput_buffer += vlen;\n+\t}\n+\n+\t/* convert leftover elements */\n+\ti = i * vlen;\n+\tfor (; i < nb_elements; i++) {\n+\t\t__bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);\n+\t\tinput_buffer++;\n+\t\toutput_buffer++;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+#endif /* __ARM_FEATURE_BF16 */\ndiff --git a/lib/mldev/mldev_utils_scalar.c b/lib/mldev/mldev_utils_scalar.c\nindex 322b009f5d..a345b1e73c 100644\n--- 
a/lib/mldev/mldev_utils_scalar.c\n+++ b/lib/mldev/mldev_utils_scalar.c\n@@ -2,88 +2,13 @@\n  * Copyright (c) 2022 Marvell.\n  */\n\n-#include <errno.h>\n-#include <math.h>\n-#include <stdint.h>\n-\n-#include \"mldev_utils.h\"\n+#include \"mldev_utils_scalar.h\"\n\n /* Description:\n  * This file implements scalar versions of Machine Learning utility functions used to convert data\n- * types from higher precision to lower precision and vice-versa.\n+ * types from higher precision to lower precision and vice-versa, except bfloat16.\n  */\n\n-#ifndef BIT\n-#define BIT(nr) (1UL << (nr))\n-#endif\n-\n-#ifndef BITS_PER_LONG\n-#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)\n-#endif\n-\n-#ifndef GENMASK_U32\n-#define GENMASK_U32(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))\n-#endif\n-\n-/* float32: bit index of MSB & LSB of sign, exponent and mantissa */\n-#define FP32_LSB_M 0\n-#define FP32_MSB_M 22\n-#define FP32_LSB_E 23\n-#define FP32_MSB_E 30\n-#define FP32_LSB_S 31\n-#define FP32_MSB_S 31\n-\n-/* float32: bitmask for sign, exponent and mantissa */\n-#define FP32_MASK_S GENMASK_U32(FP32_MSB_S, FP32_LSB_S)\n-#define FP32_MASK_E GENMASK_U32(FP32_MSB_E, FP32_LSB_E)\n-#define FP32_MASK_M GENMASK_U32(FP32_MSB_M, FP32_LSB_M)\n-\n-/* float16: bit index of MSB & LSB of sign, exponent and mantissa */\n-#define FP16_LSB_M 0\n-#define FP16_MSB_M 9\n-#define FP16_LSB_E 10\n-#define FP16_MSB_E 14\n-#define FP16_LSB_S 15\n-#define FP16_MSB_S 15\n-\n-/* float16: bitmask for sign, exponent and mantissa */\n-#define FP16_MASK_S GENMASK_U32(FP16_MSB_S, FP16_LSB_S)\n-#define FP16_MASK_E GENMASK_U32(FP16_MSB_E, FP16_LSB_E)\n-#define FP16_MASK_M GENMASK_U32(FP16_MSB_M, FP16_LSB_M)\n-\n-/* bfloat16: bit index of MSB & LSB of sign, exponent and mantissa */\n-#define BF16_LSB_M 0\n-#define BF16_MSB_M 6\n-#define BF16_LSB_E 7\n-#define BF16_MSB_E 14\n-#define BF16_LSB_S 15\n-#define BF16_MSB_S 15\n-\n-/* bfloat16: bitmask for sign, exponent and mantissa */\n-#define BF16_MASK_S 
GENMASK_U32(BF16_MSB_S, BF16_LSB_S)\n-#define BF16_MASK_E GENMASK_U32(BF16_MSB_E, BF16_LSB_E)\n-#define BF16_MASK_M GENMASK_U32(BF16_MSB_M, BF16_LSB_M)\n-\n-/* Exponent bias */\n-#define FP32_BIAS_E 127\n-#define FP16_BIAS_E 15\n-#define BF16_BIAS_E 127\n-\n-#define FP32_PACK(sign, exponent, mantissa)                                                        \\\n-\t(((sign) << FP32_LSB_S) | ((exponent) << FP32_LSB_E) | (mantissa))\n-\n-#define FP16_PACK(sign, exponent, mantissa)                                                        \\\n-\t(((sign) << FP16_LSB_S) | ((exponent) << FP16_LSB_E) | (mantissa))\n-\n-#define BF16_PACK(sign, exponent, mantissa)                                                        \\\n-\t(((sign) << BF16_LSB_S) | ((exponent) << BF16_LSB_E) | (mantissa))\n-\n-/* Represent float32 as float and uint32_t */\n-union float32 {\n-\tfloat f;\n-\tuint32_t u;\n-};\n-\n int\n rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)\n {\n@@ -334,18 +259,18 @@ __float32_to_float16_scalar_rtn(float x)\n \tf16_m = 0;\n\n \tswitch (f32_e) {\n-\tcase (0): /* float32: zero or subnormal number */\n+\tcase (0):\t\t/* float32: zero or subnormal number */\n \t\tf16_e = 0;\n \t\tif (f32_m == 0) /* zero */\n \t\t\tf16_m = 0;\n-\t\telse /* subnormal number, convert to zero */\n+\t\telse\t\t/* subnormal number, convert to zero */\n \t\t\tf16_m = 0;\n \t\tbreak;\n \tcase (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */\n \t\tf16_e = FP16_MASK_E >> FP16_LSB_E;\n-\t\tif (f32_m == 0) { /* infinity */\n+\t\tif (f32_m == 0) {\t  /* infinity */\n \t\t\tf16_m = 0;\n-\t\t} else { /* nan, propagate mantissa and set MSB of mantissa to 1 */\n+\t\t} else {\t\t  /* nan, propagate mantissa and set MSB of mantissa to 1 */\n \t\t\tf16_m = f32_m >> (FP32_MSB_M - FP16_MSB_M);\n \t\t\tf16_m |= BIT(FP16_MSB_M);\n \t\t}\n@@ -477,20 +402,20 @@ __float16_to_float32_scalar_rtx(uint16_t f16)\n \tswitch (f16_e) {\n \tcase (FP16_MASK_E >> FP16_LSB_E): 
/* float16: infinity or nan */\n \t\tf32_e = FP32_MASK_E >> FP32_LSB_E;\n-\t\tif (f16_m == 0x0) { /* infinity */\n+\t\tif (f16_m == 0x0) {\t  /* infinity */\n \t\t\tf32_m = f16_m;\n-\t\t} else { /* nan, propagate mantissa, set MSB of mantissa to 1 */\n+\t\t} else {\t\t  /* nan, propagate mantissa, set MSB of mantissa to 1 */\n \t\t\tf32_m = f16_m;\n \t\t\tshift = FP32_MSB_M - FP16_MSB_M;\n \t\t\tf32_m = (f32_m << shift) & FP32_MASK_M;\n \t\t\tf32_m |= BIT(FP32_MSB_M);\n \t\t}\n \t\tbreak;\n-\tcase 0: /* float16: zero or sub-normal */\n+\tcase 0:\t\t\t  /* float16: zero or sub-normal */\n \t\tf32_m = f16_m;\n \t\tif (f16_m == 0) { /* zero signed */\n \t\t\tf32_e = 0;\n-\t\t} else { /* subnormal numbers */\n+\t\t} else {\t  /* subnormal numbers */\n \t\t\tclz = __builtin_clz((uint32_t)f16_m) - sizeof(uint32_t) * 8 + FP16_LSB_E;\n \t\t\te_16 = (int)f16_e - clz;\n \t\t\tf32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E;\n@@ -535,186 +460,3 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)\n\n \treturn 0;\n }\n-\n-/* Convert a single precision floating point number (float32) into a\n- * brain float number (bfloat16) using round to nearest rounding mode.\n- */\n-static uint16_t\n-__float32_to_bfloat16_scalar_rtn(float x)\n-{\n-\tunion float32 f32; /* float32 input */\n-\tuint32_t f32_s;\t   /* float32 sign */\n-\tuint32_t f32_e;\t   /* float32 exponent */\n-\tuint32_t f32_m;\t   /* float32 mantissa */\n-\tuint16_t b16_s;\t   /* float16 sign */\n-\tuint16_t b16_e;\t   /* float16 exponent */\n-\tuint16_t b16_m;\t   /* float16 mantissa */\n-\tuint32_t tbits;\t   /* number of truncated bits */\n-\tuint16_t u16;\t   /* float16 output */\n-\n-\tf32.f = x;\n-\tf32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;\n-\tf32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;\n-\tf32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;\n-\n-\tb16_s = f32_s;\n-\tb16_e = 0;\n-\tb16_m = 0;\n-\n-\tswitch (f32_e) {\n-\tcase (0): /* float32: zero or subnormal number */\n-\t\tb16_e = 0;\n-\t\tif 
(f32_m == 0) /* zero */\n-\t\t\tb16_m = 0;\n-\t\telse /* subnormal float32 number, normal bfloat16 */\n-\t\t\tgoto bf16_normal;\n-\t\tbreak;\n-\tcase (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */\n-\t\tb16_e = BF16_MASK_E >> BF16_LSB_E;\n-\t\tif (f32_m == 0) { /* infinity */\n-\t\t\tb16_m = 0;\n-\t\t} else { /* nan, propagate mantissa and set MSB of mantissa to 1 */\n-\t\t\tb16_m = f32_m >> (FP32_MSB_M - BF16_MSB_M);\n-\t\t\tb16_m |= BIT(BF16_MSB_M);\n-\t\t}\n-\t\tbreak;\n-\tdefault: /* float32: normal number, normal bfloat16 */\n-\t\tgoto bf16_normal;\n-\t}\n-\n-\tgoto bf16_pack;\n-\n-bf16_normal:\n-\tb16_e = f32_e;\n-\ttbits = FP32_MSB_M - BF16_MSB_M;\n-\tb16_m = f32_m >> tbits;\n-\n-\t/* if non-leading truncated bits are set */\n-\tif ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {\n-\t\tb16_m++;\n-\n-\t\t/* if overflow into exponent */\n-\t\tif (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)\n-\t\t\tb16_e++;\n-\t} else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {\n-\t\t/* if only leading truncated bit is set */\n-\t\tif ((b16_m & 0x1) == 0x1) {\n-\t\t\tb16_m++;\n-\n-\t\t\t/* if overflow into exponent */\n-\t\t\tif (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)\n-\t\t\t\tb16_e++;\n-\t\t}\n-\t}\n-\tb16_m = b16_m & BF16_MASK_M;\n-\n-bf16_pack:\n-\tu16 = BF16_PACK(b16_s, b16_e, b16_m);\n-\n-\treturn u16;\n-}\n-\n-int\n-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)\n-{\n-\tfloat *input_buffer;\n-\tuint16_t *output_buffer;\n-\tuint64_t i;\n-\n-\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n-\t\treturn -EINVAL;\n-\n-\tinput_buffer = (float *)input;\n-\toutput_buffer = (uint16_t *)output;\n-\n-\tfor (i = 0; i < nb_elements; i++) {\n-\t\t*output_buffer = __float32_to_bfloat16_scalar_rtn(*input_buffer);\n-\n-\t\tinput_buffer = input_buffer + 1;\n-\t\toutput_buffer = output_buffer + 1;\n-\t}\n-\n-\treturn 0;\n-}\n-\n-/* Convert a brain float number (bfloat16) into a\n- * single 
precision floating point number (float32).\n- */\n-static float\n-__bfloat16_to_float32_scalar_rtx(uint16_t f16)\n-{\n-\tunion float32 f32; /* float32 output */\n-\tuint16_t b16_s;\t   /* float16 sign */\n-\tuint16_t b16_e;\t   /* float16 exponent */\n-\tuint16_t b16_m;\t   /* float16 mantissa */\n-\tuint32_t f32_s;\t   /* float32 sign */\n-\tuint32_t f32_e;\t   /* float32 exponent */\n-\tuint32_t f32_m;\t   /* float32 mantissa*/\n-\tuint8_t shift;\t   /* number of bits to be shifted */\n-\n-\tb16_s = (f16 & BF16_MASK_S) >> BF16_LSB_S;\n-\tb16_e = (f16 & BF16_MASK_E) >> BF16_LSB_E;\n-\tb16_m = (f16 & BF16_MASK_M) >> BF16_LSB_M;\n-\n-\tf32_s = b16_s;\n-\tswitch (b16_e) {\n-\tcase (BF16_MASK_E >> BF16_LSB_E): /* bfloat16: infinity or nan */\n-\t\tf32_e = FP32_MASK_E >> FP32_LSB_E;\n-\t\tif (b16_m == 0x0) { /* infinity */\n-\t\t\tf32_m = 0;\n-\t\t} else { /* nan, propagate mantissa, set MSB of mantissa to 1 */\n-\t\t\tf32_m = b16_m;\n-\t\t\tshift = FP32_MSB_M - BF16_MSB_M;\n-\t\t\tf32_m = (f32_m << shift) & FP32_MASK_M;\n-\t\t\tf32_m |= BIT(FP32_MSB_M);\n-\t\t}\n-\t\tbreak;\n-\tcase 0: /* bfloat16: zero or subnormal */\n-\t\tf32_m = b16_m;\n-\t\tif (b16_m == 0) { /* zero signed */\n-\t\t\tf32_e = 0;\n-\t\t} else { /* subnormal numbers */\n-\t\t\tgoto fp32_normal;\n-\t\t}\n-\t\tbreak;\n-\tdefault: /* bfloat16: normal number */\n-\t\tgoto fp32_normal;\n-\t}\n-\n-\tgoto fp32_pack;\n-\n-fp32_normal:\n-\tf32_m = b16_m;\n-\tf32_e = FP32_BIAS_E + b16_e - BF16_BIAS_E;\n-\n-\tshift = (FP32_MSB_M - BF16_MSB_M);\n-\tf32_m = (f32_m << shift) & FP32_MASK_M;\n-\n-fp32_pack:\n-\tf32.u = FP32_PACK(f32_s, f32_e, f32_m);\n-\n-\treturn f32.f;\n-}\n-\n-int\n-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)\n-{\n-\tuint16_t *input_buffer;\n-\tfloat *output_buffer;\n-\tuint64_t i;\n-\n-\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n-\t\treturn -EINVAL;\n-\n-\tinput_buffer = (uint16_t *)input;\n-\toutput_buffer = (float *)output;\n-\n-\tfor 
(i = 0; i < nb_elements; i++) {\n-\t\t*output_buffer = __bfloat16_to_float32_scalar_rtx(*input_buffer);\n-\n-\t\tinput_buffer = input_buffer + 1;\n-\t\toutput_buffer = output_buffer + 1;\n-\t}\n-\n-\treturn 0;\n-}\ndiff --git a/lib/mldev/mldev_utils_scalar.h b/lib/mldev/mldev_utils_scalar.h\nnew file mode 100644\nindex 0000000000..57e66ddb60\n--- /dev/null\n+++ b/lib/mldev/mldev_utils_scalar.h\n@@ -0,0 +1,80 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright (c) 2023 Marvell.\n+ */\n+\n+#include <errno.h>\n+#include <math.h>\n+#include <stdint.h>\n+\n+#include \"mldev_utils.h\"\n+\n+#ifndef BIT\n+#define BIT(nr) (1UL << (nr))\n+#endif\n+\n+#ifndef BITS_PER_LONG\n+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)\n+#endif\n+\n+#ifndef GENMASK_U32\n+#define GENMASK_U32(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))\n+#endif\n+\n+/* float32: bit index of MSB & LSB of sign, exponent and mantissa */\n+#define FP32_LSB_M 0\n+#define FP32_MSB_M 22\n+#define FP32_LSB_E 23\n+#define FP32_MSB_E 30\n+#define FP32_LSB_S 31\n+#define FP32_MSB_S 31\n+\n+/* float32: bitmask for sign, exponent and mantissa */\n+#define FP32_MASK_S GENMASK_U32(FP32_MSB_S, FP32_LSB_S)\n+#define FP32_MASK_E GENMASK_U32(FP32_MSB_E, FP32_LSB_E)\n+#define FP32_MASK_M GENMASK_U32(FP32_MSB_M, FP32_LSB_M)\n+\n+/* float16: bit index of MSB & LSB of sign, exponent and mantissa */\n+#define FP16_LSB_M 0\n+#define FP16_MSB_M 9\n+#define FP16_LSB_E 10\n+#define FP16_MSB_E 14\n+#define FP16_LSB_S 15\n+#define FP16_MSB_S 15\n+\n+/* float16: bitmask for sign, exponent and mantissa */\n+#define FP16_MASK_S GENMASK_U32(FP16_MSB_S, FP16_LSB_S)\n+#define FP16_MASK_E GENMASK_U32(FP16_MSB_E, FP16_LSB_E)\n+#define FP16_MASK_M GENMASK_U32(FP16_MSB_M, FP16_LSB_M)\n+\n+/* bfloat16: bit index of MSB & LSB of sign, exponent and mantissa */\n+#define BF16_LSB_M 0\n+#define BF16_MSB_M 6\n+#define BF16_LSB_E 7\n+#define BF16_MSB_E 14\n+#define BF16_LSB_S 15\n+#define BF16_MSB_S 15\n+\n+/* bfloat16: bitmask for 
sign, exponent and mantissa */\n+#define BF16_MASK_S GENMASK_U32(BF16_MSB_S, BF16_LSB_S)\n+#define BF16_MASK_E GENMASK_U32(BF16_MSB_E, BF16_LSB_E)\n+#define BF16_MASK_M GENMASK_U32(BF16_MSB_M, BF16_LSB_M)\n+\n+/* Exponent bias */\n+#define FP32_BIAS_E 127\n+#define FP16_BIAS_E 15\n+#define BF16_BIAS_E 127\n+\n+#define FP32_PACK(sign, exponent, mantissa)                                                        \\\n+\t(((sign) << FP32_LSB_S) | ((exponent) << FP32_LSB_E) | (mantissa))\n+\n+#define FP16_PACK(sign, exponent, mantissa)                                                        \\\n+\t(((sign) << FP16_LSB_S) | ((exponent) << FP16_LSB_E) | (mantissa))\n+\n+#define BF16_PACK(sign, exponent, mantissa)                                                        \\\n+\t(((sign) << BF16_LSB_S) | ((exponent) << BF16_LSB_E) | (mantissa))\n+\n+/* Represent float32 as float and uint32_t */\n+union float32 {\n+\tfloat f;\n+\tuint32_t u;\n+};\ndiff --git a/lib/mldev/mldev_utils_scalar_bfloat16.c b/lib/mldev/mldev_utils_scalar_bfloat16.c\nnew file mode 100644\nindex 0000000000..43f9431835\n--- /dev/null\n+++ b/lib/mldev/mldev_utils_scalar_bfloat16.c\n@@ -0,0 +1,197 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright (c) 2023 Marvell.\n+ */\n+\n+#include <errno.h>\n+#include <math.h>\n+#include <stdint.h>\n+\n+#include \"mldev_utils_scalar.h\"\n+\n+/* Description:\n+ * This file implements scalar versions of Machine Learning utility functions used to convert data\n+ * types from bfloat16 to float32 and vice-versa.\n+ */\n+\n+/* Convert a single precision floating point number (float32) into a\n+ * brain float number (bfloat16) using round to nearest rounding mode.\n+ */\n+static uint16_t\n+__float32_to_bfloat16_scalar_rtn(float x)\n+{\n+\tunion float32 f32; /* float32 input */\n+\tuint32_t f32_s;\t   /* float32 sign */\n+\tuint32_t f32_e;\t   /* float32 exponent */\n+\tuint32_t f32_m;\t   /* float32 mantissa */\n+\tuint16_t b16_s;\t   /* float16 sign */\n+\tuint16_t 
b16_e;\t   /* float16 exponent */\n+\tuint16_t b16_m;\t   /* float16 mantissa */\n+\tuint32_t tbits;\t   /* number of truncated bits */\n+\tuint16_t u16;\t   /* float16 output */\n+\n+\tf32.f = x;\n+\tf32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;\n+\tf32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;\n+\tf32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;\n+\n+\tb16_s = f32_s;\n+\tb16_e = 0;\n+\tb16_m = 0;\n+\n+\tswitch (f32_e) {\n+\tcase (0):\t\t/* float32: zero or subnormal number */\n+\t\tb16_e = 0;\n+\t\tif (f32_m == 0) /* zero */\n+\t\t\tb16_m = 0;\n+\t\telse\t\t/* subnormal float32 number, normal bfloat16 */\n+\t\t\tgoto bf16_normal;\n+\t\tbreak;\n+\tcase (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */\n+\t\tb16_e = BF16_MASK_E >> BF16_LSB_E;\n+\t\tif (f32_m == 0) {\t  /* infinity */\n+\t\t\tb16_m = 0;\n+\t\t} else {\t\t  /* nan, propagate mantissa and set MSB of mantissa to 1 */\n+\t\t\tb16_m = f32_m >> (FP32_MSB_M - BF16_MSB_M);\n+\t\t\tb16_m |= BIT(BF16_MSB_M);\n+\t\t}\n+\t\tbreak;\n+\tdefault: /* float32: normal number, normal bfloat16 */\n+\t\tgoto bf16_normal;\n+\t}\n+\n+\tgoto bf16_pack;\n+\n+bf16_normal:\n+\tb16_e = f32_e;\n+\ttbits = FP32_MSB_M - BF16_MSB_M;\n+\tb16_m = f32_m >> tbits;\n+\n+\t/* if non-leading truncated bits are set */\n+\tif ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {\n+\t\tb16_m++;\n+\n+\t\t/* if overflow into exponent */\n+\t\tif (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)\n+\t\t\tb16_e++;\n+\t} else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {\n+\t\t/* if only leading truncated bit is set */\n+\t\tif ((b16_m & 0x1) == 0x1) {\n+\t\t\tb16_m++;\n+\n+\t\t\t/* if overflow into exponent */\n+\t\t\tif (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)\n+\t\t\t\tb16_e++;\n+\t\t}\n+\t}\n+\tb16_m = b16_m & BF16_MASK_M;\n+\n+bf16_pack:\n+\tu16 = BF16_PACK(b16_s, b16_e, b16_m);\n+\n+\treturn u16;\n+}\n+\n+int\n+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)\n+{\n+\tfloat 
*input_buffer;\n+\tuint16_t *output_buffer;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (float *)input;\n+\toutput_buffer = (uint16_t *)output;\n+\n+\tfor (i = 0; i < nb_elements; i++) {\n+\t\t*output_buffer = __float32_to_bfloat16_scalar_rtn(*input_buffer);\n+\n+\t\tinput_buffer = input_buffer + 1;\n+\t\toutput_buffer = output_buffer + 1;\n+\t}\n+\n+\treturn 0;\n+}\n+\n+/* Convert a brain float number (bfloat16) into a\n+ * single precision floating point number (float32).\n+ */\n+static float\n+__bfloat16_to_float32_scalar_rtx(uint16_t f16)\n+{\n+\tunion float32 f32; /* float32 output */\n+\tuint16_t b16_s;\t   /* float16 sign */\n+\tuint16_t b16_e;\t   /* float16 exponent */\n+\tuint16_t b16_m;\t   /* float16 mantissa */\n+\tuint32_t f32_s;\t   /* float32 sign */\n+\tuint32_t f32_e;\t   /* float32 exponent */\n+\tuint32_t f32_m;\t   /* float32 mantissa*/\n+\tuint8_t shift;\t   /* number of bits to be shifted */\n+\n+\tb16_s = (f16 & BF16_MASK_S) >> BF16_LSB_S;\n+\tb16_e = (f16 & BF16_MASK_E) >> BF16_LSB_E;\n+\tb16_m = (f16 & BF16_MASK_M) >> BF16_LSB_M;\n+\n+\tf32_s = b16_s;\n+\tswitch (b16_e) {\n+\tcase (BF16_MASK_E >> BF16_LSB_E): /* bfloat16: infinity or nan */\n+\t\tf32_e = FP32_MASK_E >> FP32_LSB_E;\n+\t\tif (b16_m == 0x0) {\t  /* infinity */\n+\t\t\tf32_m = 0;\n+\t\t} else {\t\t  /* nan, propagate mantissa, set MSB of mantissa to 1 */\n+\t\t\tf32_m = b16_m;\n+\t\t\tshift = FP32_MSB_M - BF16_MSB_M;\n+\t\t\tf32_m = (f32_m << shift) & FP32_MASK_M;\n+\t\t\tf32_m |= BIT(FP32_MSB_M);\n+\t\t}\n+\t\tbreak;\n+\tcase 0:\t\t\t  /* bfloat16: zero or subnormal */\n+\t\tf32_m = b16_m;\n+\t\tif (b16_m == 0) { /* zero signed */\n+\t\t\tf32_e = 0;\n+\t\t} else {\t  /* subnormal numbers */\n+\t\t\tgoto fp32_normal;\n+\t\t}\n+\t\tbreak;\n+\tdefault: /* bfloat16: normal number */\n+\t\tgoto fp32_normal;\n+\t}\n+\n+\tgoto fp32_pack;\n+\n+fp32_normal:\n+\tf32_m = b16_m;\n+\tf32_e = FP32_BIAS_E 
+ b16_e - BF16_BIAS_E;\n+\n+\tshift = (FP32_MSB_M - BF16_MSB_M);\n+\tf32_m = (f32_m << shift) & FP32_MASK_M;\n+\n+fp32_pack:\n+\tf32.u = FP32_PACK(f32_s, f32_e, f32_m);\n+\n+\treturn f32.f;\n+}\n+\n+int\n+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)\n+{\n+\tuint16_t *input_buffer;\n+\tfloat *output_buffer;\n+\tuint64_t i;\n+\n+\tif ((nb_elements == 0) || (input == NULL) || (output == NULL))\n+\t\treturn -EINVAL;\n+\n+\tinput_buffer = (uint16_t *)input;\n+\toutput_buffer = (float *)output;\n+\n+\tfor (i = 0; i < nb_elements; i++) {\n+\t\t*output_buffer = __bfloat16_to_float32_scalar_rtx(*input_buffer);\n+\n+\t\tinput_buffer = input_buffer + 1;\n+\t\toutput_buffer = output_buffer + 1;\n+\t}\n+\n+\treturn 0;\n+}\n",
    "prefixes": [
        "1/1"
    ]
}