kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_bf16_neon.c

Directory:	./
File:	kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_bf16_neon.c
Date:	2025-10-20 13:18:31
	Coverage	Exec	Excl	Total
Lines:	97.8%	359	5	372
Functions:	85.7%	6	0	7
Branches:	77.4%	65	6	90
  
      Line
      Branch
      Exec
      Source
    
      //
    
      // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      //
    
      #if (!defined(__aarch64__) && !defined(_M_ARM64))
    
      #error This file must be compiled for AArch64.
    
      #else  // Architectural features check.
    
      #include "kai_lhs_quant_pack_qai8dxp_bf16_neon.h"
    
      #include <arm_neon.h>
    
      #endif
    
      #include <float.h>
    
      #include <math.h>
    
      #include <stddef.h>
    
      #include <stdint.h>
    
      #include "kai/kai_common.h"
    
      // Bytes stored per packed row for the float multiplier (the reciprocal of the quantization scale).
    
      static const size_t kai_num_bytes_per_multiplier = sizeof(float);
    
      // Bytes stored per packed row for the int32_t offset (the negated, rounded zero point).
    
      static const size_t kai_num_bytes_per_offset = sizeof(int32_t);
    
      // Returns the K length used by the packed layout: k passed through kai_roundup with a multiple of 32.
    
      3840
      inline static size_t kai_k_roundedup(size_t k) {
    
          // Round up k to be a multiple of 32.
    
          static const size_t kai_k_multiple_of = 32;
    
      3840
          return kai_roundup(k, kai_k_multiple_of);
    
      }
    
      // Returns the size in bytes of one packed group of mr rows.
    
      // Each row contributes k_internal int8 values (k rounded up by kai_k_roundedup),
    
      // one multiplier (kai_num_bytes_per_multiplier) and one offset (kai_num_bytes_per_offset).
    
      2880
      inline static size_t kai_lhs_packed_stride(size_t k, size_t mr) {
    
      2880
          const size_t k_internal = kai_k_roundedup(k);
    
      −
          KAI_ASSERT((k_internal % 2) == 0);
    
      5760
          return mr * (k_internal * sizeof(int8_t) + kai_num_bytes_per_multiplier + kai_num_bytes_per_offset);
    
      2880
      }
    
      // Returns the M step for this packing routine. The value is always 1; mr is ignored.
    
      ✗
      size_t kai_get_m_step_lhs_quant_pack_qai8dxp_bf16_neon(size_t mr) {
    
      ✗
          KAI_UNUSED(mr);
    
      ✗
          return 1;
    
      }
    
      // Returns the offset of row m_idx in the unpacked LHS matrix, computed as m_idx * lhs_stride.
    
      // NOTE(review): lhs_stride is presumably in bytes, since the run function divides it by
    
      // sizeof(uint16_t) to step between rows of 16-bit elements - confirm against the header.
    
      960
      size_t kai_get_lhs_offset_lhs_quant_pack_qai8dxp_bf16_neon(size_t m_idx, size_t lhs_stride) {
    
      960
          return m_idx * lhs_stride;
    
      }
    
      // Returns the byte offset, within the packed LHS buffer, of the group of mr rows that contains m_idx.
    
      // The offset is (m_idx / mr) whole groups times the packed stride for (k, mr).
    
      // kr and sr are accepted but not used by this computation.
    
      960
      size_t kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_bf16_neon(
    
          size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr) {
    
      960
          KAI_UNUSED(kr);
    
      960
          KAI_UNUSED(sr);
    
          // It always points to the beginning of the row
    
      960
          return (m_idx / mr) * kai_lhs_packed_stride(k, mr);
    
      }
    
      // Returns the number of bytes needed for the packed LHS buffer holding m rows.
    
      // m is passed through kai_roundup with mr and divided by mr to get the number of row groups;
    
      // each group occupies kai_lhs_packed_stride(k, mr) bytes.
    
      // kr and sr are accepted but not used by this computation.
    
      960
      size_t kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_bf16_neon(size_t m, size_t k, size_t mr, size_t kr, size_t sr) {
    
      960
          KAI_UNUSED(kr);
    
      960
          KAI_UNUSED(sr);
    
      960
          const size_t num_rows = kai_roundup(m, mr) / mr;
    
      1920
          return num_rows * kai_lhs_packed_stride(k, mr);
    
      960
      }
    
      // Note: The lhs parameter type has been changed from float* to void*.
    
      // The bfloat16 values (packed in 16 bits) will be converted to float32.
    
      960
      void kai_run_lhs_quant_pack_qai8dxp_bf16_neon(
    
          size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* restrict lhs,
    
          size_t lhs_stride, void* restrict lhs_packed) {
    
      −
          KAI_ASSERT((kr % sr) == 0);
    
        1/2✓ Branch 0 taken 960 times.
✗ Branch 1 not taken.

      960
          if (m == 0) {
    
      ✗
              return;
    
          }
    
          // Now lhs is assumed to contain bfloat16 values encoded in uint16_t.
    
      960
          const uint16_t* src_ptr = (uint16_t const*)lhs;
    
      960
          const size_t dst_stride = kai_lhs_packed_stride(k, mr);
    
      960
          const size_t k_internal = kai_k_roundedup(k);
    
      960
          const int32_t k_block_len = (int32_t)(kr / sr);
    
      −
          KAI_ASSERT(k_block_len == 8);
    
      960
          const int32_t num_blocks_k = (int32_t)(k / k_block_len);
    
      960
          const int32_t num_blocks_k_internal = (int32_t)(k_internal / k_block_len);
    
      960
          size_t row_idx = 0;
    
        2/2✓ Branch 0 taken 464 times.
✓ Branch 1 taken 496 times.

      960
          if (mr == 4) {
    
        2/2✓ Branch 0 taken 2228 times.
✓ Branch 1 taken 496 times.

      2724
              for (; row_idx + 3 < m; row_idx += 4) {
    
      2228
                  float max0 = -FLT_MAX;
    
      2228
                  float min0 = FLT_MAX;
    
      2228
                  float max1 = -FLT_MAX;
    
      2228
                  float min1 = FLT_MAX;
    
      2228
                  float max2 = -FLT_MAX;
    
      2228
                  float min2 = FLT_MAX;
    
      2228
                  float max3 = -FLT_MAX;
    
      2228
                  float min3 = FLT_MAX;
    
                  // Find min/max for each channel
    
      2228
                  int32_t k_idx = 0;
    
      2228
                  float32x4_t vmax0 = vdupq_n_f32(-FLT_MAX);
    
      2228
                  float32x4_t vmin0 = vdupq_n_f32(FLT_MAX);
    
      2228
                  float32x4_t vmax1 = vmax0;
    
      2228
                  float32x4_t vmin1 = vmin0;
    
      2228
                  float32x4_t vmax2 = vmax0;
    
      2228
                  float32x4_t vmin2 = vmin0;
    
      2228
                  float32x4_t vmax3 = vmax0;
    
      2228
                  float32x4_t vmin3 = vmin0;
    
      2228
                  const uint16x8_t zero = vdupq_n_u16(0);
    
                  // Process 8 bfloat16 values per iteration.
    
        2/2✓ Branch 0 taken 25608 times.
✓ Branch 1 taken 2228 times.

      27836
                  for (; k_idx <= ((int32_t)k - 8); k_idx += 8) {
    
                      // Load eight bfloat16 values.
    
      25608
                      const uint16x8_t bf16_vec_0 = vld1q_u16(src_ptr + k_idx);
    
      25608
                      const uint16x8_t bf16_vec_1 = vld1q_u16(src_ptr + k_idx + (lhs_stride / sizeof(uint16_t)));
    
      25608
                      const uint16x8_t bf16_vec_2 = vld1q_u16(src_ptr + k_idx + (2 * (lhs_stride / sizeof(uint16_t))));
    
      25608
                      const uint16x8_t bf16_vec_3 = vld1q_u16(src_ptr + k_idx + (3 * (lhs_stride / sizeof(uint16_t))));
    
      25608
                      const uint16x8_t bf16_vec1_0 = vzip1q_u16(zero, bf16_vec_0);
    
      25608
                      const uint16x8_t bf16_vec2_0 = vzip2q_u16(zero, bf16_vec_0);
    
      25608
                      const uint16x8_t bf16_vec1_1 = vzip1q_u16(zero, bf16_vec_1);
    
      25608
                      const uint16x8_t bf16_vec2_1 = vzip2q_u16(zero, bf16_vec_1);
    
      25608
                      const uint16x8_t bf16_vec1_2 = vzip1q_u16(zero, bf16_vec_2);
    
      25608
                      const uint16x8_t bf16_vec2_2 = vzip2q_u16(zero, bf16_vec_2);
    
      25608
                      const uint16x8_t bf16_vec1_3 = vzip1q_u16(zero, bf16_vec_3);
    
      25608
                      const uint16x8_t bf16_vec2_3 = vzip2q_u16(zero, bf16_vec_3);
    
      25608
                      const float32x4_t src0_0 = vreinterpretq_f32_u16(bf16_vec1_0);
    
      25608
                      const float32x4_t src0_1 = vreinterpretq_f32_u16(bf16_vec2_0);
    
      25608
                      const float32x4_t src1_0 = vreinterpretq_f32_u16(bf16_vec1_1);
    
      25608
                      const float32x4_t src1_1 = vreinterpretq_f32_u16(bf16_vec2_1);
    
      25608
                      const float32x4_t src2_0 = vreinterpretq_f32_u16(bf16_vec1_2);
    
      25608
                      const float32x4_t src2_1 = vreinterpretq_f32_u16(bf16_vec2_2);
    
      25608
                      const float32x4_t src3_0 = vreinterpretq_f32_u16(bf16_vec1_3);
    
      25608
                      const float32x4_t src3_1 = vreinterpretq_f32_u16(bf16_vec2_3);
    
                      // Calculate the maximum
    
      25608
                      vmax0 = vmaxq_f32(src0_0, vmax0);
    
      25608
                      vmax0 = vmaxq_f32(vmax0, src0_1);
    
      25608
                      vmax1 = vmaxq_f32(src1_0, vmax1);
    
      25608
                      vmax1 = vmaxq_f32(vmax1, src1_1);
    
      25608
                      vmax2 = vmaxq_f32(src2_0, vmax2);
    
      25608
                      vmax2 = vmaxq_f32(vmax2, src2_1);
    
      25608
                      vmax3 = vmaxq_f32(src3_0, vmax3);
    
      25608
                      vmax3 = vmaxq_f32(vmax3, src3_1);
    
                      // Calculate the minimum
    
      25608
                      vmin0 = vminq_f32(src0_0, vmin0);
    
      25608
                      vmin0 = vminq_f32(vmin0, src0_1);
    
      25608
                      vmin1 = vminq_f32(src1_0, vmin1);
    
      25608
                      vmin1 = vminq_f32(vmin1, src1_1);
    
      25608
                      vmin2 = vminq_f32(src2_0, vmin2);
    
      25608
                      vmin2 = vminq_f32(vmin2, src2_1);
    
      25608
                      vmin3 = vminq_f32(src3_0, vmin3);
    
      25608
                      vmin3 = vminq_f32(vmin3, src3_1);
    
      25608
                  }
    
                  // Get the max/min scalar values.
    
      2228
                  max0 = vmaxvq_f32(vmax0);
    
      2228
                  min0 = vminvq_f32(vmin0);
    
      2228
                  max1 = vmaxvq_f32(vmax1);
    
      2228
                  min1 = vminvq_f32(vmin1);
    
      2228
                  max2 = vmaxvq_f32(vmax2);
    
      2228
                  min2 = vminvq_f32(vmin2);
    
      2228
                  max3 = vmaxvq_f32(vmax3);
    
      2228
                  min3 = vminvq_f32(vmin3);
    
                  // Process leftover elements with a scalar loop.
    
        2/2✓ Branch 0 taken 3528 times.
✓ Branch 1 taken 2228 times.

      5756
                  for (; k_idx < (int32_t)k; ++k_idx) {
    
      3528
                      const float src0 = kai_cast_f32_bf16(*(src_ptr + k_idx));
    
      3528
                      max0 = fmaxf(src0, max0);
    
      3528
                      min0 = fminf(src0, min0);
    
      3528
                      const float src1 = kai_cast_f32_bf16(*(src_ptr + k_idx + (lhs_stride / sizeof(uint16_t))));
    
      3528
                      max1 = fmaxf(src1, max1);
    
      3528
                      min1 = fminf(src1, min1);
    
      3528
                      const float src2 = kai_cast_f32_bf16(*(src_ptr + k_idx + (2 * (lhs_stride / sizeof(uint16_t)))));
    
      3528
                      max2 = fmaxf(src2, max2);
    
      3528
                      min2 = fminf(src2, min2);
    
      3528
                      const float src3 = kai_cast_f32_bf16(*(src_ptr + k_idx + (3 * (lhs_stride / sizeof(uint16_t)))));
    
      3528
                      max3 = fmaxf(src3, max3);
    
      3528
                      min3 = fminf(src3, min3);
    
      3528
                  }
    
                  // Maximum/minimum int8 values
    
      2228
                  const float qmin = (float)INT8_MIN;
    
      2228
                  const float qmax = (float)INT8_MAX;
    
      2228
                  const float rmin0 = fminf(0.0F, min0);
    
      2228
                  const float rmax0 = fmaxf(0.0F, max0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2228 times.

      2228
                  const float scale0 = rmin0 == rmax0 ? 1.F : (qmax - qmin) / (rmax0 - rmin0);
    
      2228
                  const float rmin1 = fminf(0.0F, min1);
    
      2228
                  const float rmax1 = fmaxf(0.0F, max1);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2228 times.

      2228
                  const float scale1 = rmin1 == rmax1 ? 1.F : (qmax - qmin) / (rmax1 - rmin1);
    
      2228
                  const float rmin2 = fminf(0.0F, min2);
    
      2228
                  const float rmax2 = fmaxf(0.0F, max2);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2228 times.

      2228
                  const float scale2 = rmin2 == rmax2 ? 1.F : (qmax - qmin) / (rmax2 - rmin2);
    
      2228
                  const float rmin3 = fminf(0.0F, min3);
    
      2228
                  const float rmax3 = fmaxf(0.0F, max3);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2228 times.

      2228
                  const float scale3 = rmin3 == rmax3 ? 1.F : (qmax - qmin) / (rmax3 - rmin3);
    
                  // Reciprocal to quantize
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  const float recip_scale0 = scale0 ? 1.0F / scale0 : 0.0F;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  const float recip_scale1 = scale1 ? 1.0F / scale1 : 0.0F;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  const float recip_scale2 = scale2 ? 1.0F / scale2 : 0.0F;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  const float recip_scale3 = scale3 ? 1.0F / scale3 : 0.0F;
    
      2228
                  const float descaled_min0 = rmin0 * scale0;
    
      2228
                  const float descaled_max0 = rmax0 * scale0;
    
      2228
                  const float descaled_min1 = rmin1 * scale1;
    
      2228
                  const float descaled_max1 = rmax1 * scale1;
    
      2228
                  const float descaled_min2 = rmin2 * scale2;
    
      2228
                  const float descaled_max2 = rmax2 * scale2;
    
      2228
                  const float descaled_min3 = rmin3 * scale3;
    
      2228
                  const float descaled_max3 = rmax3 * scale3;
    
      2228
                  const float zero_point_from_min_error0 = qmin + descaled_min0;
    
      2228
                  const float zero_point_from_max_error0 = qmax + descaled_max0;
    
      2228
                  const float zero_point_from_min_error1 = qmin + descaled_min1;
    
      2228
                  const float zero_point_from_max_error1 = qmax + descaled_max1;
    
      2228
                  const float zero_point_from_min_error2 = qmin + descaled_min2;
    
      2228
                  const float zero_point_from_max_error2 = qmax + descaled_max2;
    
      2228
                  const float zero_point_from_min_error3 = qmin + descaled_min3;
    
      2228
                  const float zero_point_from_max_error3 = qmax + descaled_max3;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  float zero_point0 = (zero_point_from_min_error0 + zero_point_from_max_error0 > 0) ? qmin - descaled_min0
    
      ✗
                                                                                                    : qmax - descaled_max0;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  float zero_point1 = (zero_point_from_min_error1 + zero_point_from_max_error1 > 0) ? qmin - descaled_min1
    
      ✗
                                                                                                    : qmax - descaled_max1;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  float zero_point2 = (zero_point_from_min_error2 + zero_point_from_max_error2 > 0) ? qmin - descaled_min2
    
      ✗
                                                                                                    : qmax - descaled_max2;
    
        1/2✓ Branch 0 taken 2228 times.
✗ Branch 1 not taken.

      2228
                  float zero_point3 = (zero_point_from_min_error3 + zero_point_from_max_error3 > 0) ? qmin - descaled_min3
    
      ✗
                                                                                                    : qmax - descaled_max3;
    
      2228
                  zero_point0 = fmaxf(zero_point0, qmin);
    
      2228
                  zero_point0 = fminf(zero_point0, qmax);
    
      2228
                  zero_point1 = fmaxf(zero_point1, qmin);
    
      2228
                  zero_point1 = fminf(zero_point1, qmax);
    
      2228
                  zero_point2 = fmaxf(zero_point2, qmin);
    
      2228
                  zero_point2 = fminf(zero_point2, qmax);
    
      2228
                  zero_point3 = fmaxf(zero_point3, qmin);
    
      2228
                  zero_point3 = fminf(zero_point3, qmax);
    
                  // Round to nearest integer
    
      2228
                  const int32_t nudged_zero_point0 = (int32_t)rintf(zero_point0);
    
      2228
                  const int32_t nudged_zero_point1 = (int32_t)rintf(zero_point1);
    
      2228
                  const int32_t nudged_zero_point2 = (int32_t)rintf(zero_point2);
    
      2228
                  const int32_t nudged_zero_point3 = (int32_t)rintf(zero_point3);
    
      2228
                  const size_t dst_x = ((row_idx + m_idx_start) % mr);
    
      2228
                  uint8_t* dst_ptr = (uint8_t*)lhs_packed + (dst_x * k_block_len);
    
                  // Quantize the channels
    
      2228
                  int32_t block_idx = 0;
    
        2/2✓ Branch 0 taken 25608 times.
✓ Branch 1 taken 2228 times.

      27836
                  for (; block_idx < num_blocks_k; ++block_idx) {
    
                      // First k-index of this full block (no clamping is needed for complete blocks)
    
      25608
                      const int32_t k_idx_start = block_idx * k_block_len;
    
                      // Load eight bfloat16 values and convert them to float32.
    
      25608
                      const uint16x8_t bf16_vec_0 = vld1q_u16(src_ptr + k_idx_start);
    
      25608
                      const uint16x8_t bf16_vec_1 = vld1q_u16(src_ptr + k_idx_start + (lhs_stride / sizeof(uint16_t)));
    
      25608
                      const uint16x8_t bf16_vec_2 = vld1q_u16(src_ptr + k_idx_start + (2 * (lhs_stride / sizeof(uint16_t))));
    
      25608
                      const uint16x8_t bf16_vec_3 = vld1q_u16(src_ptr + k_idx_start + (3 * (lhs_stride / sizeof(uint16_t))));
    
      25608
                      const uint16x8_t bf16_vec1_0 = vzip1q_u16(zero, bf16_vec_0);
    
      25608
                      const uint16x8_t bf16_vec2_0 = vzip2q_u16(zero, bf16_vec_0);
    
      25608
                      const uint16x8_t bf16_vec1_1 = vzip1q_u16(zero, bf16_vec_1);
    
      25608
                      const uint16x8_t bf16_vec2_1 = vzip2q_u16(zero, bf16_vec_1);
    
      25608
                      const uint16x8_t bf16_vec1_2 = vzip1q_u16(zero, bf16_vec_2);
    
      25608
                      const uint16x8_t bf16_vec2_2 = vzip2q_u16(zero, bf16_vec_2);
    
      25608
                      const uint16x8_t bf16_vec1_3 = vzip1q_u16(zero, bf16_vec_3);
    
      25608
                      const uint16x8_t bf16_vec2_3 = vzip2q_u16(zero, bf16_vec_3);
    
      25608
                      const float32x4_t src0_0 = vreinterpretq_f32_u16(bf16_vec1_0);
    
      25608
                      const float32x4_t src0_1 = vreinterpretq_f32_u16(bf16_vec2_0);
    
      25608
                      const float32x4_t src1_0 = vreinterpretq_f32_u16(bf16_vec1_1);
    
      25608
                      const float32x4_t src1_1 = vreinterpretq_f32_u16(bf16_vec2_1);
    
      25608
                      const float32x4_t src2_0 = vreinterpretq_f32_u16(bf16_vec1_2);
    
      25608
                      const float32x4_t src2_1 = vreinterpretq_f32_u16(bf16_vec2_2);
    
      25608
                      const float32x4_t src3_0 = vreinterpretq_f32_u16(bf16_vec1_3);
    
      25608
                      const float32x4_t src3_1 = vreinterpretq_f32_u16(bf16_vec2_3);
    
                      // Scale the values.
    
      25608
                      const int16x4_t v0_0 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src0_0, scale0)));
    
      25608
                      const int16x4_t v1_0 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src0_1, scale0)));
    
      25608
                      const int16x4_t v0_1 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src1_0, scale1)));
    
      25608
                      const int16x4_t v1_1 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src1_1, scale1)));
    
      25608
                      const int16x4_t v0_2 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src2_0, scale2)));
    
      25608
                      const int16x4_t v1_2 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src2_1, scale2)));
    
      25608
                      const int16x4_t v0_3 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src3_0, scale3)));
    
      25608
                      const int16x4_t v1_3 = vqmovn_s32(vcvtnq_s32_f32(vmulq_n_f32(src3_1, scale3)));
    
      25608
                      int16x8_t v0_s16 = vcombine_s16(v0_0, v1_0);
    
      25608
                      int16x8_t v1_s16 = vcombine_s16(v0_1, v1_1);
    
      25608
                      int16x8_t v2_s16 = vcombine_s16(v0_2, v1_2);
    
      25608
                      int16x8_t v3_s16 = vcombine_s16(v0_3, v1_3);
    
                      // Add zero points.
    
      25608
                      const int16x8_t vnzp0 = vdupq_n_s16((int16_t)nudged_zero_point0);
    
      25608
                      const int16x8_t vnzp1 = vdupq_n_s16((int16_t)nudged_zero_point1);
    
      25608
                      const int16x8_t vnzp2 = vdupq_n_s16((int16_t)nudged_zero_point2);
    
      25608
                      const int16x8_t vnzp3 = vdupq_n_s16((int16_t)nudged_zero_point3);
    
      25608
                      v0_s16 = vaddq_s16(v0_s16, vnzp0);
    
      25608
                      v0_s16 = vmaxq_s16(v0_s16, vdupq_n_s16(INT8_MIN));
    
      25608
                      v0_s16 = vminq_s16(v0_s16, vdupq_n_s16(INT8_MAX));
    
      25608
                      v1_s16 = vaddq_s16(v1_s16, vnzp1);
    
      25608
                      v1_s16 = vmaxq_s16(v1_s16, vdupq_n_s16(INT8_MIN));
    
      25608
                      v1_s16 = vminq_s16(v1_s16, vdupq_n_s16(INT8_MAX));
    
      25608
                      v2_s16 = vaddq_s16(v2_s16, vnzp2);
    
      25608
                      v2_s16 = vmaxq_s16(v2_s16, vdupq_n_s16(INT8_MIN));
    
      25608
                      v2_s16 = vminq_s16(v2_s16, vdupq_n_s16(INT8_MAX));
    
      25608
                      v3_s16 = vaddq_s16(v3_s16, vnzp3);
    
      25608
                      v3_s16 = vmaxq_s16(v3_s16, vdupq_n_s16(INT8_MIN));
    
      25608
                      v3_s16 = vminq_s16(v3_s16, vdupq_n_s16(INT8_MAX));
    
      25608
                      const int8x8_t v0_s8 = vqmovn_s16(v0_s16);
    
      25608
                      const int8x8_t v1_s8 = vqmovn_s16(v1_s16);
    
      25608
                      const int8x8_t v2_s8 = vqmovn_s16(v2_s16);
    
      25608
                      const int8x8_t v3_s8 = vqmovn_s16(v3_s16);
    
      25608
                      vst1_s8((int8_t*)(dst_ptr), v0_s8);
    
      25608
                      vst1_s8((int8_t*)(dst_ptr + sizeof(int8x8_t)), v1_s8);
    
      25608
                      vst1_s8((int8_t*)(dst_ptr + 2 * sizeof(int8x8_t)), v2_s8);
    
      25608
                      vst1_s8((int8_t*)(dst_ptr + 3 * sizeof(int8x8_t)), v3_s8);
    
      25608
                      dst_ptr += 4 * sizeof(int8x8_t);
    
      25608
                  }
    
        2/2✓ Branch 0 taken 1960 times.
✓ Branch 1 taken 2228 times.

      4188
                  for (; block_idx < num_blocks_k_internal; ++block_idx) {
    
                      // Left over k
    
        2/2✓ Branch 0 taken 15680 times.
✓ Branch 1 taken 1960 times.

      17640
                      for (int32_t k_block_idx = 0; k_block_idx < k_block_len; ++k_block_idx) {
    
                          // Clamp at the last valid k-index.
    
        2/2✓ Branch 0 taken 2744 times.
✓ Branch 1 taken 12936 times.

      15680
                          const size_t k_idx_start = KAI_MIN((size_t)((block_idx * k_block_len) + k_block_idx), k - 1);
    
      15680
                          const float src0 = kai_cast_f32_bf16(*(src_ptr + k_idx_start));
    
      15680
                          const float src1 = kai_cast_f32_bf16(*(src_ptr + k_idx_start + (lhs_stride / sizeof(uint16_t))));
    
      31360
                          const float src2 =
    
      15680
                              kai_cast_f32_bf16(*(src_ptr + k_idx_start + (2 * (lhs_stride / sizeof(uint16_t)))));
    
      31360
                          const float src3 =
    
      15680
                              kai_cast_f32_bf16(*(src_ptr + k_idx_start + (3 * (lhs_stride / sizeof(uint16_t)))));
    
                          // Scale the value.
    
      15680
                          int32_t v0_s32 = (int32_t)(roundf(src0 * scale0));
    
      15680
                          int32_t v1_s32 = (int32_t)(roundf(src1 * scale1));
    
      15680
                          int32_t v2_s32 = (int32_t)(roundf(src2 * scale2));
    
      15680
                          int32_t v3_s32 = (int32_t)(roundf(src3 * scale3));
    
      15680
                          v0_s32 = v0_s32 + nudged_zero_point0;
    
        2/2✓ Branch 0 taken 15620 times.
✓ Branch 1 taken 60 times.

      15680
                          v0_s32 = KAI_MAX(v0_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 15520 times.
✓ Branch 1 taken 160 times.

      15680
                          v0_s32 = KAI_MIN(v0_s32, INT8_MAX);
    
      15680
                          v1_s32 = v1_s32 + nudged_zero_point1;
    
        1/2✓ Branch 0 taken 15680 times.
✗ Branch 1 not taken.

      15680
                          v1_s32 = KAI_MAX(v1_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 15616 times.
✓ Branch 1 taken 64 times.

      15680
                          v1_s32 = KAI_MIN(v1_s32, INT8_MAX);
    
      15680
                          v2_s32 = v2_s32 + nudged_zero_point2;
    
        1/2✓ Branch 0 taken 15680 times.
✗ Branch 1 not taken.

      15680
                          v2_s32 = KAI_MAX(v2_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 15532 times.
✓ Branch 1 taken 148 times.

      15680
                          v2_s32 = KAI_MIN(v2_s32, INT8_MAX);
    
      15680
                          v3_s32 = v3_s32 + nudged_zero_point3;
    
        1/2✓ Branch 0 taken 15680 times.
✗ Branch 1 not taken.

      15680
                          v3_s32 = KAI_MAX(v3_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 15584 times.
✓ Branch 1 taken 96 times.

      15680
                          v3_s32 = KAI_MIN(v3_s32, INT8_MAX);
    
      15680
                          *(int8_t*)dst_ptr = (int8_t)v0_s32;
    
      15680
                          *(int8_t*)(dst_ptr + sizeof(int8x8_t)) = (int8_t)v1_s32;
    
      15680
                          *(int8_t*)(dst_ptr + 2 * sizeof(int8x8_t)) = (int8_t)v2_s32;
    
      15680
                          *(int8_t*)(dst_ptr + 3 * sizeof(int8x8_t)) = (int8_t)v3_s32;
    
      15680
                          dst_ptr += sizeof(int8_t);
    
      15680
                      }
    
      1960
                      dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t);
    
      1960
                  }
    
      2228
                  uint8_t* dst_base = (uint8_t*)lhs_packed + mr * (k_internal * sizeof(int8_t));
    
      2228
                  dst_ptr = dst_base + dst_x * kai_num_bytes_per_offset;
    
                  // LHS offset at the beginning of the row.
    
      2228
                  *((int32_t*)(dst_ptr)) = -nudged_zero_point0;
    
      2228
                  *((int32_t*)(dst_ptr + kai_num_bytes_per_offset)) = -nudged_zero_point1;
    
      2228
                  *((int32_t*)(dst_ptr + 2 * kai_num_bytes_per_offset)) = -nudged_zero_point2;
    
      2228
                  *((int32_t*)(dst_ptr + 3 * kai_num_bytes_per_offset)) = -nudged_zero_point3;
    
                  // Assuming the same sizeof() for kai_num_bytes_per_offset and kai_num_bytes_per_multiplier.
    
      −
                  KAI_ASSERT(kai_num_bytes_per_offset == kai_num_bytes_per_multiplier);
    
      2228
                  dst_ptr += mr * kai_num_bytes_per_offset;
    
                  // Store the scale quantization params.
    
      2228
                  *((float*)(dst_ptr)) = recip_scale0;
    
      2228
                  *((float*)(dst_ptr + kai_num_bytes_per_multiplier)) = recip_scale1;
    
      2228
                  *((float*)(dst_ptr + 2 * kai_num_bytes_per_multiplier)) = recip_scale2;
    
      2228
                  *((float*)(dst_ptr + 3 * kai_num_bytes_per_multiplier)) = recip_scale3;
    
                  // Update src_ptr. Note: now lhs contains bfloat16 values (2 bytes each).
    
      2228
                  src_ptr += (4 * lhs_stride / sizeof(uint16_t));
    
                  // Move to the next row as we have interleaved all Mr rows.
    
      2228
                  lhs_packed = (void*)((uint8_t*)lhs_packed + dst_stride);
    
      2228
              }
    
      496
          }
    
        2/2✓ Branch 0 taken 9500 times.
✓ Branch 1 taken 960 times.

      10460
          for (; row_idx < m; ++row_idx) {
    
      9500
              float max0 = -FLT_MAX;
    
      9500
              float min0 = FLT_MAX;
    
              // Find min/max for each channel
    
      9500
              int32_t k_idx = 0;
    
      9500
              float32x4_t vmax0 = vdupq_n_f32(-FLT_MAX);
    
      9500
              float32x4_t vmin0 = vdupq_n_f32(FLT_MAX);
    
      9500
              const uint16x8_t zero = vdupq_n_u16(0);
    
              // Process 8 bfloat16 values per iteration.
    
        2/2✓ Branch 0 taken 106808 times.
✓ Branch 1 taken 9500 times.

      116308
              for (; k_idx <= ((int32_t)k - 8); k_idx += 8) {
    
                  // Load eight bfloat16 values.
    
      106808
                  const uint16x8_t bf16_vec = vld1q_u16(src_ptr + k_idx);
    
      106808
                  const uint16x8_t bf16_vec1 = vzip1q_u16(zero, bf16_vec);
    
      106808
                  const uint16x8_t bf16_vec2 = vzip2q_u16(zero, bf16_vec);
    
      106808
                  const float32x4_t src0_0 = vreinterpretq_f32_u16(bf16_vec1);
    
      106808
                  const float32x4_t src0_1 = vreinterpretq_f32_u16(bf16_vec2);
    
                  // Calculate the maximum
    
      106808
                  vmax0 = vmaxq_f32(src0_0, vmax0);
    
      106808
                  vmax0 = vmaxq_f32(vmax0, src0_1);
    
                  // Calculate the minimum
    
      106808
                  vmin0 = vminq_f32(src0_0, vmin0);
    
      106808
                  vmin0 = vminq_f32(vmin0, src0_1);
    
      106808
              }
    
              // Get the max/min scalar values.
    
      9500
              max0 = vmaxvq_f32(vmax0);
    
      9500
              min0 = vminvq_f32(vmin0);
    
              // Process leftover elements with a scalar loop.
    
        2/2✓ Branch 0 taken 14328 times.
✓ Branch 1 taken 9500 times.

      23828
              for (; k_idx < (int32_t)k; ++k_idx) {
    
      14328
                  const float src0_0 = kai_cast_f32_bf16(*(src_ptr + k_idx));
    
      14328
                  max0 = fmaxf(src0_0, max0);
    
      14328
                  min0 = fminf(src0_0, min0);
    
      14328
              }
    
              // Maximum/minimum int8 values
    
      9500
              const float qmin = (float)INT8_MIN;
    
      9500
              const float qmax = (float)INT8_MAX;
    
      9500
              const float rmin0 = fminf(0.0F, min0);
    
      9500
              const float rmax0 = fmaxf(0.0F, max0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 9500 times.

      9500
              const float scale0 = rmin0 == rmax0 ? 1.F : (qmax - qmin) / (rmax0 - rmin0);
    
              // Reciprocal to quantize
    
        1/2✓ Branch 0 taken 9500 times.
✗ Branch 1 not taken.

      9500
              const float recip_scale0 = scale0 ? 1.0F / scale0 : 0.0F;
    
      9500
              const float descaled_min0 = rmin0 * scale0;
    
      9500
              const float descaled_max0 = rmax0 * scale0;
    
      9500
              const float zero_point_from_min_error0 = qmin + descaled_min0;
    
      9500
              const float zero_point_from_max_error0 = qmax + descaled_max0;
    
      19000
              float zero_point0 =
    
        1/2✓ Branch 0 taken 9500 times.
✗ Branch 1 not taken.

      9500
                  (zero_point_from_min_error0 + zero_point_from_max_error0 > 0) ? qmin - descaled_min0 : qmax - descaled_max0;
    
      9500
              zero_point0 = fmaxf(zero_point0, qmin);
    
      9500
              zero_point0 = fminf(zero_point0, qmax);
    
              // Round to nearest integer
    
      9500
              const int32_t nudged_zero_point0 = (int32_t)rintf(zero_point0);
    
      9500
              const size_t dst_x = ((row_idx + m_idx_start) % mr);
    
      9500
              uint8_t* dst_ptr = (uint8_t*)lhs_packed + (dst_x * k_block_len * sizeof(int8_t));
    
              // Quantize the channels
    
      9500
              int32_t block_idx = 0;
    
        2/2✓ Branch 0 taken 106808 times.
✓ Branch 1 taken 9500 times.

      116308
              for (; block_idx < num_blocks_k; ++block_idx) {
    
                  // First k-index covered by this block (full blocks need no clamping).
    
      106808
                  const int32_t k_idx_start = block_idx * k_block_len;
    
                  // Load eight bfloat16 values and convert them to float32.
    
      106808
                  const uint16x8_t bf16_vec = vld1q_u16(src_ptr + k_idx_start);
    
      106808
                  const uint16x8_t bf16_vec1 = vzip1q_u16(zero, bf16_vec);
    
      106808
                  const uint16x8_t bf16_vec2 = vzip2q_u16(zero, bf16_vec);
    
      106808
                  const float32x4_t src0_0 = vreinterpretq_f32_u16(bf16_vec1);
    
      106808
                  const float32x4_t src0_1 = vreinterpretq_f32_u16(bf16_vec2);
    
                  // Scale the values.
    
      106808
                  const float32x4_t v0_f32 = vmulq_n_f32(src0_0, scale0);
    
      106808
                  const float32x4_t v1_f32 = vmulq_n_f32(src0_1, scale0);
    
      106808
                  const int32x4_t v0_s32 = vcvtnq_s32_f32(v0_f32);
    
      106808
                  const int32x4_t v1_s32 = vcvtnq_s32_f32(v1_f32);
    
      106808
                  const int16x4_t v0_s16 = vqmovn_s32(v0_s32);
    
      106808
                  const int16x4_t v1_s16 = vqmovn_s32(v1_s32);
    
      106808
                  int16x8_t v_s16 = vcombine_s16(v0_s16, v1_s16);
    
                  // Add zero points.
    
      106808
                  int16_t nzp_s16 = (int16_t)nudged_zero_point0;
    
      106808
                  int16x8_t vnzp_s16 = vdupq_n_s16(nzp_s16);
    
      106808
                  v_s16 = vaddq_s16(v_s16, vnzp_s16);
    
      106808
                  v_s16 = vmaxq_s16(v_s16, vdupq_n_s16(INT8_MIN));
    
      106808
                  v_s16 = vminq_s16(v_s16, vdupq_n_s16(INT8_MAX));
    
      106808
                  const int8x8_t v0_s8 = vqmovn_s16(v_s16);
    
      106808
                  vst1_s8((int8_t*)(dst_ptr), v0_s8);
    
      106808
                  dst_ptr += 8 * sizeof(int8_t);
    
      106808
                  dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t);
    
      106808
              }
    
        2/2✓ Branch 0 taken 7960 times.
✓ Branch 1 taken 9500 times.

      17460
              for (; block_idx < num_blocks_k_internal; ++block_idx) {
    
                  // Left over k
    
        2/2✓ Branch 0 taken 63680 times.
✓ Branch 1 taken 7960 times.

      71640
                  for (int32_t k_block_idx = 0; k_block_idx < k_block_len; ++k_block_idx) {
    
                      // Clamp at the last valid k-index.
    
        2/2✓ Branch 0 taken 11144 times.
✓ Branch 1 taken 52536 times.

      63680
                      const size_t k_idx_start = KAI_MIN((size_t)((block_idx * k_block_len) + k_block_idx), k - 1);
    
      63680
                      const float src0_0 = kai_cast_f32_bf16(*(src_ptr + k_idx_start));
    
                      // Scale the value.
    
      63680
                      int32_t v0_s32 = (int32_t)(roundf(src0_0 * scale0));
    
      63680
                      v0_s32 = v0_s32 + nudged_zero_point0;
    
        2/2✓ Branch 0 taken 63620 times.
✓ Branch 1 taken 60 times.

      63680
                      v0_s32 = KAI_MAX(v0_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 63224 times.
✓ Branch 1 taken 456 times.

      63680
                      v0_s32 = KAI_MIN(v0_s32, INT8_MAX);
    
      63680
                      *((int8_t*)(dst_ptr)) = (int8_t)v0_s32;
    
      63680
                      dst_ptr += sizeof(int8_t);
    
      63680
                  }
    
      7960
                  dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t);
    
      7960
              }
    
      9500
              dst_ptr = (uint8_t*)lhs_packed + mr * (k_internal * sizeof(int8_t));
    
      9500
              dst_ptr += dst_x * kai_num_bytes_per_offset;
    
              // Store the LHS offset (negated zero point) for this row, after the quantized values.
    
      9500
              *((int32_t*)(dst_ptr)) = -nudged_zero_point0;
    
              // Assuming the same sizeof() for kai_num_bytes_per_offset and kai_num_bytes_per_multiplier.
    
      −
              KAI_ASSERT(kai_num_bytes_per_offset == kai_num_bytes_per_multiplier);
    
      9500
              dst_ptr += mr * kai_num_bytes_per_offset;
    
              // Store the scale quantization params.
    
      9500
              *((float*)(dst_ptr)) = recip_scale0;
    
              // Update src_ptr. Note: now lhs contains bfloat16 values (2 bytes each).
    
      9500
              src_ptr += (lhs_stride / sizeof(uint16_t));
    
              // Move to the next row if we have interleaved all Mr rows.
    
        2/2✓ Branch 0 taken 484 times.
✓ Branch 1 taken 9016 times.

      9500
              if ((((row_idx + 1) + m_idx_start) % mr) == 0) {
    
      9016
                  lhs_packed = (void*)((uint8_t*)lhs_packed + dst_stride);
    
      9016
              }
    
      9500
          }
    
      960
      }
Function (Line)	Call count	Line coverage	Branch coverage	Block coverage
kai_get_lhs_offset_lhs_quant_pack_qai8dxp_bf16_neon (line 43)	called 960 times	100.0%	-%	100.0%
kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_bf16_neon (line 47)	called 960 times	100.0%	-%	100.0%
kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_bf16_neon (line 55)	called 960 times	100.0%	-%	100.0%
kai_get_m_step_lhs_quant_pack_qai8dxp_bf16_neon (line 38)	not called	0.0%	-%	0.0%
kai_k_roundedup (line 24)	called 3840 times	100.0%	-%	100.0%
kai_lhs_packed_stride (line 30)	called 2880 times	100.0%	-%	55.0%
kai_run_lhs_quant_pack_qai8dxp_bf16_neon (line 65)	called 960 times	98.6%	77.4%	83.0%