kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.c

Directory:	./
File:	kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.c
Date:	2025-10-20 13:18:31
	Coverage	Exec	Excl	Total
Lines:	97.5%	318	5	331
Functions:	85.7%	6	0	7
Branches:	75.6%	65	8	94
  
      Line
      Branch
      Exec
      Source
    
      //
    
      // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      //
    
      #if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \
    
          !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    
      #error This file must be compiled for AArch64, FEAT_FP16.
    
      #else  // Architectural features check.
    
      #include "kai_lhs_quant_pack_qai8dxp_f16_neon.h"
    
      #include <arm_fp16.h>
    
      #include <arm_neon.h>
    
      #include <float.h>
    
      #include <math.h>
    
      #include <stddef.h>
    
      #include <stdint.h>
    
      #include "kai/kai_common.h"
    
      // Largest finite value representable in IEEE 754 half precision (binary16).
      // Used below as the starting value of the per-row minimum reduction.
      #define FLT16_MAX (65504.0F)

      // Most negative finite half-precision value. NOTE: unlike FLT_MIN from <float.h>,
      // this is NOT the smallest positive value; it is the starting value of the
      // per-row maximum reduction.
      #define FLT16_MIN (-65504.0F)

      // Size in bytes of the per-row dequantization multiplier stored in the packed buffer.
      static const size_t kai_num_bytes_per_multiplier = sizeof(float);

      // Size in bytes of the per-row offset (negated zero point) stored in the packed buffer.
      static const size_t kai_num_bytes_per_offset = sizeof(int32_t);
    
      4480
      // Returns k rounded up to the next multiple of 32 (k itself when already a multiple).
      // This padded length is the number of int8 values stored per row in the packed buffer.
      inline static size_t kai_k_roundedup(size_t k) {
          const size_t k_alignment = 32;

          return kai_roundup(k, k_alignment);
      }
    
      3360
      inline static size_t kai_lhs_packed_stride(size_t k, size_t mr, size_t kr, size_t sr) {
    
      3360
          KAI_UNUSED(kr);
    
      3360
          KAI_UNUSED(sr);
    
      3360
          const size_t k_internal = kai_k_roundedup(k);
    
      −
          KAI_ASSERT((k_internal % 2) == 0);
    
      6720
          return mr * (k_internal * sizeof(int8_t) + kai_num_bytes_per_multiplier + kai_num_bytes_per_offset);
    
      3360
      }
    
      ✗
      // Returns the row step of this packing routine, which is 1 for every mr.
      size_t kai_get_m_step_lhs_quant_pack_qai8dxp_f16_neon(size_t mr) {
          KAI_UNUSED(mr);

          const size_t m_step = 1;
          return m_step;
      }
    
      1120
      // Returns the offset of row m_idx in the unpacked LHS matrix, i.e. m_idx * lhs_stride.
      // The result is expressed in the same unit as lhs_stride.
      size_t kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f16_neon(size_t m_idx, size_t lhs_stride) {
          const size_t row_offset = lhs_stride * m_idx;
          return row_offset;
      }
    
      1120
      // Returns the byte offset, inside the packed LHS buffer, of the group of mr rows
      // that contains row m_idx. The offset always refers to the start of that group,
      // whatever the position of m_idx inside it.
      size_t kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f16_neon(
          size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr) {
          const size_t row_group_idx = m_idx / mr;
          const size_t row_group_bytes = kai_lhs_packed_stride(k, mr, kr, sr);

          return row_group_idx * row_group_bytes;
      }
    
      1120
      // Returns the size in bytes of the packed LHS buffer for an m x k matrix:
      // the number of mr-row groups (m rounded up to a multiple of mr) times the
      // size of one packed group.
      size_t kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f16_neon(size_t m, size_t k, size_t mr, size_t kr, size_t sr) {
          const size_t m_padded = kai_roundup(m, mr);
          const size_t num_row_groups = m_padded / mr;
          const size_t row_group_bytes = kai_lhs_packed_stride(k, mr, kr, sr);

          return num_row_groups * row_group_bytes;
      }
    
      1120
      void kai_run_lhs_quant_pack_qai8dxp_f16_neon(
    
          size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* restrict lhs,
    
          size_t lhs_stride, void* restrict lhs_packed) {
    
      −
          KAI_ASSERT((kr % sr) == 0);
    
      −
          KAI_ASSUME((kr / sr == 8) || (kr / sr == 4));
    
        1/2✓ Branch 0 taken 1120 times.
✗ Branch 1 not taken.

      1120
          if (m == 0) {
    
      ✗
              return;
    
          }
    
      1120
          const size_t num_rows = m;
    
      1120
          float16_t const* src_ptr = (float16_t const*)lhs;
    
      1120
          const size_t dst_stride = kai_lhs_packed_stride(k, mr, kr, sr);
    
      1120
          const size_t k_internal = kai_k_roundedup(k);
    
      1120
          const int32_t k_block_len = (int32_t)(kr / sr);
    
      1120
          const int32_t num_blocks_k = (int32_t)(k / k_block_len);
    
      1120
          const int32_t num_blocks_k_internal = (int32_t)(k_internal / k_block_len);
    
      1120
          const size_t lhs_row_length = lhs_stride / sizeof(float16_t);
    
      1120
          const float16x8_t vmax = vdupq_n_f16((float16_t)FLT16_MIN);
    
      1120
          const float16x8_t vmin = vdupq_n_f16((float16_t)FLT16_MAX);
    
          // As we load 8-element vectors, limit vectorized loop to avoid reading out-of-bounds
    
      1120
          const int32_t blocks_lim_k = num_blocks_k - (8 / k_block_len);
    
      1120
          size_t row_idx = 0;
    
          // Improved performance with 4x loop unrolling where packing parameters allow
    
        2/2✓ Branch 0 taken 232 times.
✓ Branch 1 taken 888 times.

      1120
          if (mr == 4) {
    
        2/2✓ Branch 0 taken 2496 times.
✓ Branch 1 taken 888 times.

      3384
              for (; row_idx + 3 < m; row_idx += 4) {
    
                  // Find min/max for each channel
    
      2496
                  int32_t k_idx = 0;
    
      2496
                  float16x8_t vmax0 = vmax;
    
      2496
                  float16x8_t vmin0 = vmin;
    
      2496
                  float16x8_t vmax1 = vmax;
    
      2496
                  float16x8_t vmin1 = vmin;
    
      2496
                  float16x8_t vmax2 = vmax;
    
      2496
                  float16x8_t vmin2 = vmin;
    
      2496
                  float16x8_t vmax3 = vmax;
    
      2496
                  float16x8_t vmin3 = vmin;
    
        2/2✓ Branch 0 taken 15376 times.
✓ Branch 1 taken 2496 times.

      17872
                  for (; k_idx <= ((int32_t)k - 8); k_idx += 8) {
    
      15376
                      const float16x8_t src0 = vld1q_f16(src_ptr + k_idx);
    
      15376
                      const float16x8_t src1 = vld1q_f16(src_ptr + k_idx + lhs_row_length);
    
      15376
                      const float16x8_t src2 = vld1q_f16(src_ptr + k_idx + (2 * lhs_row_length));
    
      15376
                      const float16x8_t src3 = vld1q_f16(src_ptr + k_idx + (3 * lhs_row_length));
    
      15376
                      vmax0 = vmaxq_f16(src0, vmax0);
    
      15376
                      vmax1 = vmaxq_f16(src1, vmax1);
    
      15376
                      vmax2 = vmaxq_f16(src2, vmax2);
    
      15376
                      vmax3 = vmaxq_f16(src3, vmax3);
    
      15376
                      vmin0 = vminq_f16(src0, vmin0);
    
      15376
                      vmin1 = vminq_f16(src1, vmin1);
    
      15376
                      vmin2 = vminq_f16(src2, vmin2);
    
      15376
                      vmin3 = vminq_f16(src3, vmin3);
    
      15376
                  }
    
      2496
                  float16_t max0 = vmaxvq_f16(vmax0);
    
      2496
                  float16_t min0 = vminvq_f16(vmin0);
    
      2496
                  float16_t max1 = vmaxvq_f16(vmax1);
    
      2496
                  float16_t min1 = vminvq_f16(vmin1);
    
      2496
                  float16_t max2 = vmaxvq_f16(vmax2);
    
      2496
                  float16_t min2 = vminvq_f16(vmin2);
    
      2496
                  float16_t max3 = vmaxvq_f16(vmax3);
    
      2496
                  float16_t min3 = vminvq_f16(vmin3);
    
                  // Process leftover elements with a scalar loop.
    
        2/2✓ Branch 0 taken 2296 times.
✓ Branch 1 taken 2496 times.

      4792
                  for (; k_idx < (int32_t)k; ++k_idx) {
    
      2296
                      const float16_t src0 = *(src_ptr + (size_t)k_idx);
    
      2296
                      max0 = vmaxh_f16(src0, max0);
    
      2296
                      min0 = vminh_f16(src0, min0);
    
      2296
                      const float16_t src1 = *(src_ptr + (size_t)k_idx + lhs_row_length);
    
      2296
                      max1 = vmaxh_f16(src1, max1);
    
      2296
                      min1 = vminh_f16(src1, min1);
    
      2296
                      const float16_t src2 = *(src_ptr + (size_t)k_idx + (2 * lhs_row_length));
    
      2296
                      max2 = vmaxh_f16(src2, max2);
    
      2296
                      min2 = vminh_f16(src2, min2);
    
      2296
                      const float16_t src3 = *(src_ptr + (size_t)k_idx + (3 * lhs_row_length));
    
      2296
                      max3 = vmaxh_f16(src3, max3);
    
      2296
                      min3 = vminh_f16(src3, min3);
    
      2296
                  }
    
                  // Maximum/minimum int8 values
    
      2496
                  const float qmin = (float)INT8_MIN;
    
      2496
                  const float qmax = (float)INT8_MAX;
    
      2496
                  const float rmin0 = fminf(0.0F, min0);
    
      2496
                  const float rmax0 = fmaxf(0.0F, max0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2496 times.

      2496
                  const float scale0 = rmin0 == rmax0 ? 1.F : (qmax - qmin) / (rmax0 - rmin0);
    
      2496
                  const float rmin1 = fminf(0.0F, min1);
    
      2496
                  const float rmax1 = fmaxf(0.0F, max1);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2496 times.

      2496
                  const float scale1 = rmin1 == rmax1 ? 1.F : (qmax - qmin) / (rmax1 - rmin1);
    
      2496
                  const float rmin2 = fminf(0.0F, min2);
    
      2496
                  const float rmax2 = fmaxf(0.0F, max2);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2496 times.

      2496
                  const float scale2 = rmin2 == rmax2 ? 1.F : (qmax - qmin) / (rmax2 - rmin2);
    
      2496
                  const float rmin3 = fminf(0.0F, min3);
    
      2496
                  const float rmax3 = fmaxf(0.0F, max3);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2496 times.

      2496
                  const float scale3 = rmin3 == rmax3 ? 1.F : (qmax - qmin) / (rmax3 - rmin3);
    
                  // Reciprocal to quantize
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  const float recip_scale0 = scale0 ? 1.0F / scale0 : 0.0F;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  const float recip_scale1 = scale1 ? 1.0F / scale1 : 0.0F;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  const float recip_scale2 = scale2 ? 1.0F / scale2 : 0.0F;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  const float recip_scale3 = scale3 ? 1.0F / scale3 : 0.0F;
    
      2496
                  const float descaled_min0 = rmin0 * scale0;
    
      2496
                  const float descaled_max0 = rmax0 * scale0;
    
      2496
                  const float descaled_min1 = rmin1 * scale1;
    
      2496
                  const float descaled_max1 = rmax1 * scale1;
    
      2496
                  const float descaled_min2 = rmin2 * scale2;
    
      2496
                  const float descaled_max2 = rmax2 * scale2;
    
      2496
                  const float descaled_min3 = rmin3 * scale3;
    
      2496
                  const float descaled_max3 = rmax3 * scale3;
    
      2496
                  const float zero_point_from_min_error0 = qmin + descaled_min0;
    
      2496
                  const float zero_point_from_max_error0 = qmax + descaled_max0;
    
      2496
                  const float zero_point_from_min_error1 = qmin + descaled_min1;
    
      2496
                  const float zero_point_from_max_error1 = qmax + descaled_max1;
    
      2496
                  const float zero_point_from_min_error2 = qmin + descaled_min2;
    
      2496
                  const float zero_point_from_max_error2 = qmax + descaled_max2;
    
      2496
                  const float zero_point_from_min_error3 = qmin + descaled_min3;
    
      2496
                  const float zero_point_from_max_error3 = qmax + descaled_max3;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  float zero_point0 = (zero_point_from_min_error0 + zero_point_from_max_error0 > 0) ? qmin - descaled_min0
    
      ✗
                                                                                                    : qmax - descaled_max0;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  float zero_point1 = (zero_point_from_min_error1 + zero_point_from_max_error1 > 0) ? qmin - descaled_min1
    
      ✗
                                                                                                    : qmax - descaled_max1;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  float zero_point2 = (zero_point_from_min_error2 + zero_point_from_max_error2 > 0) ? qmin - descaled_min2
    
      ✗
                                                                                                    : qmax - descaled_max2;
    
        1/2✓ Branch 0 taken 2496 times.
✗ Branch 1 not taken.

      2496
                  float zero_point3 = (zero_point_from_min_error3 + zero_point_from_max_error3 > 0) ? qmin - descaled_min3
    
      ✗
                                                                                                    : qmax - descaled_max3;
    
      2496
                  zero_point0 = fmaxf(zero_point0, qmin);
    
      2496
                  zero_point0 = fminf(zero_point0, qmax);
    
      2496
                  zero_point1 = fmaxf(zero_point1, qmin);
    
      2496
                  zero_point1 = fminf(zero_point1, qmax);
    
      2496
                  zero_point2 = fmaxf(zero_point2, qmin);
    
      2496
                  zero_point2 = fminf(zero_point2, qmax);
    
      2496
                  zero_point3 = fmaxf(zero_point3, qmin);
    
      2496
                  zero_point3 = fminf(zero_point3, qmax);
    
                  // Round to nearest integer
    
      2496
                  const int32_t nudged_zero_point0 = (int32_t)rintf(zero_point0);
    
      2496
                  const int32_t nudged_zero_point1 = (int32_t)rintf(zero_point1);
    
      2496
                  const int32_t nudged_zero_point2 = (int32_t)rintf(zero_point2);
    
      2496
                  const int32_t nudged_zero_point3 = (int32_t)rintf(zero_point3);
    
      2496
                  const size_t dst_x = ((row_idx + m_idx_start) % mr);
    
      2496
                  uint8_t* dst_ptr = (uint8_t*)lhs_packed + (dst_x * k_block_len);
    
                  // Quantize the channels
    
      2496
                  int32_t block_idx = 0;
    
      2496
                  const int32_t block_incr = 8 / k_block_len;
    
        2/2✓ Branch 0 taken 15376 times.
✓ Branch 1 taken 2496 times.

      17872
                  for (; block_idx <= blocks_lim_k; block_idx += block_incr) {
    
                      // Clamp at the last valid k-index
    
      15376
                      const int32_t k_idx_start = block_idx * k_block_len;
    
      15376
                      const float16x8_t src0 = vld1q_f16(src_ptr + k_idx_start);
    
      15376
                      const float16x8_t src1 = vld1q_f16(src_ptr + k_idx_start + lhs_row_length);
    
      15376
                      const float16x8_t src2 = vld1q_f16(src_ptr + k_idx_start + (2 * lhs_row_length));
    
      15376
                      const float16x8_t src3 = vld1q_f16(src_ptr + k_idx_start + (3 * lhs_row_length));
    
                      // Scale the values.
    
      15376
                      const int32x4_t v0_0_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_f32_f16(vget_low_f16(src0)), scale0));
    
      15376
                      const int32x4_t v0_1_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_high_f32_f16(src0), scale0));
    
      15376
                      const int32x4_t v1_0_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_f32_f16(vget_low_f16(src1)), scale1));
    
      15376
                      const int32x4_t v1_1_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_high_f32_f16(src1), scale1));
    
      15376
                      const int32x4_t v2_0_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_f32_f16(vget_low_f16(src2)), scale2));
    
      15376
                      const int32x4_t v2_1_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_high_f32_f16(src2), scale2));
    
      15376
                      const int32x4_t v3_0_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_f32_f16(vget_low_f16(src3)), scale3));
    
      15376
                      const int32x4_t v3_1_s32 = vcvtq_s32_f32(vmulq_n_f32(vcvt_high_f32_f16(src3), scale3));
    
      15376
                      const int16x4_t v0_0_s16 = vqmovn_s32(v0_0_s32);
    
      15376
                      const int16x4_t v0_1_s16 = vqmovn_s32(v0_1_s32);
    
      15376
                      const int16x4_t v1_0_s16 = vqmovn_s32(v1_0_s32);
    
      15376
                      const int16x4_t v1_1_s16 = vqmovn_s32(v1_1_s32);
    
      15376
                      const int16x4_t v2_0_s16 = vqmovn_s32(v2_0_s32);
    
      15376
                      const int16x4_t v2_1_s16 = vqmovn_s32(v2_1_s32);
    
      15376
                      const int16x4_t v3_0_s16 = vqmovn_s32(v3_0_s32);
    
      15376
                      const int16x4_t v3_1_s16 = vqmovn_s32(v3_1_s32);
    
      15376
                      int16x8_t v0_s16;
    
      15376
                      int16x8_t v1_s16;
    
      15376
                      int16x8_t v2_s16;
    
      15376
                      int16x8_t v3_s16;
    
        2/2✓ Branch 0 taken 7688 times.
✓ Branch 1 taken 7688 times.

      15376
                      if (k_block_len == 8) {
    
      7688
                          v0_s16 = vcombine_s16(v0_0_s16, v0_1_s16);
    
      7688
                          v1_s16 = vcombine_s16(v1_0_s16, v1_1_s16);
    
      7688
                          v2_s16 = vcombine_s16(v2_0_s16, v2_1_s16);
    
      7688
                          v3_s16 = vcombine_s16(v3_0_s16, v3_1_s16);
    
      7688
                      } else {  // k_block_len == 4
    
      7688
                          v0_s16 = vcombine_s16(v0_0_s16, v1_0_s16);
    
      7688
                          v1_s16 = vcombine_s16(v2_0_s16, v3_0_s16);
    
      7688
                          v2_s16 = vcombine_s16(v0_1_s16, v1_1_s16);
    
      7688
                          v3_s16 = vcombine_s16(v2_1_s16, v3_1_s16);
    
                      }
    
                      // Add zero points.
    
      15376
                      const int16x8_t vnzp0 = vdupq_n_s16((int16_t)nudged_zero_point0);
    
      15376
                      const int16x8_t vnzp1 = vdupq_n_s16((int16_t)nudged_zero_point1);
    
      15376
                      const int16x8_t vnzp2 = vdupq_n_s16((int16_t)nudged_zero_point2);
    
      15376
                      const int16x8_t vnzp3 = vdupq_n_s16((int16_t)nudged_zero_point3);
    
      15376
                      v0_s16 = vaddq_s16(v0_s16, vnzp0);
    
      15376
                      v0_s16 = vmaxq_s16(v0_s16, vdupq_n_s16(INT8_MIN));
    
      15376
                      v0_s16 = vminq_s16(v0_s16, vdupq_n_s16(INT8_MAX));
    
      15376
                      v1_s16 = vaddq_s16(v1_s16, vnzp1);
    
      15376
                      v1_s16 = vmaxq_s16(v1_s16, vdupq_n_s16(INT8_MIN));
    
      15376
                      v1_s16 = vminq_s16(v1_s16, vdupq_n_s16(INT8_MAX));
    
      15376
                      v2_s16 = vaddq_s16(v2_s16, vnzp2);
    
      15376
                      v2_s16 = vmaxq_s16(v2_s16, vdupq_n_s16(INT8_MIN));
    
      15376
                      v2_s16 = vminq_s16(v2_s16, vdupq_n_s16(INT8_MAX));
    
      15376
                      v3_s16 = vaddq_s16(v3_s16, vnzp3);
    
      15376
                      v3_s16 = vmaxq_s16(v3_s16, vdupq_n_s16(INT8_MIN));
    
      15376
                      v3_s16 = vminq_s16(v3_s16, vdupq_n_s16(INT8_MAX));
    
      15376
                      int8x8_t v0_s8 = vqmovn_s16(v0_s16);
    
      15376
                      int8x8_t v1_s8 = vqmovn_s16(v1_s16);
    
      15376
                      int8x8_t v2_s8 = vqmovn_s16(v2_s16);
    
      15376
                      int8x8_t v3_s8 = vqmovn_s16(v3_s16);
    
      15376
                      vst1_s8((int8_t*)(dst_ptr), v0_s8);
    
      15376
                      vst1_s8((int8_t*)(dst_ptr + sizeof(int8x8_t)), v1_s8);
    
      15376
                      vst1_s8((int8_t*)(dst_ptr + 2 * sizeof(int8x8_t)), v2_s8);
    
      15376
                      vst1_s8((int8_t*)(dst_ptr + 3 * sizeof(int8x8_t)), v3_s8);
    
      15376
                      dst_ptr += block_incr * mr * k_block_len * sizeof(int8_t);
    
      15376
                  }
    
        2/2✓ Branch 0 taken 3192 times.
✓ Branch 1 taken 2496 times.

      5688
                  for (; block_idx < num_blocks_k_internal; ++block_idx) {
    
                      // left over k
    
        2/2✓ Branch 0 taken 17024 times.
✓ Branch 1 taken 3192 times.

      20216
                      for (int32_t k_block_idx = 0; k_block_idx < k_block_len; ++k_block_idx) {
    
                          // Clamp at the last valid k-index.
    
        2/2✓ Branch 0 taken 1680 times.
✓ Branch 1 taken 15344 times.

      17024
                          const size_t k_idx_start = KAI_MIN((size_t)((block_idx * k_block_len) + k_block_idx), k - 1);
    
      17024
                          const float src0 = (float)(*(src_ptr + k_idx_start));
    
      17024
                          const float src1 = (float)(*(src_ptr + k_idx_start + lhs_row_length));
    
      17024
                          const float src2 = (float)(*(src_ptr + k_idx_start + (2 * lhs_row_length)));
    
      17024
                          const float src3 = (float)(*(src_ptr + k_idx_start + (3 * lhs_row_length)));
    
                          // Scale the value.
    
      17024
                          int32_t d0_s32 = (int32_t)(roundf(src0 * scale0));
    
      17024
                          int32_t d1_s32 = (int32_t)(roundf(src1 * scale1));
    
      17024
                          int32_t d2_s32 = (int32_t)(roundf(src2 * scale2));
    
      17024
                          int32_t d3_s32 = (int32_t)(roundf(src3 * scale3));
    
      17024
                          d0_s32 = d0_s32 + nudged_zero_point0;
    
        1/2✓ Branch 0 taken 17024 times.
✗ Branch 1 not taken.

      17024
                          d0_s32 = KAI_MAX(d0_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 16912 times.
✓ Branch 1 taken 112 times.

      17024
                          d0_s32 = KAI_MIN(d0_s32, INT8_MAX);
    
      17024
                          d1_s32 = d1_s32 + nudged_zero_point1;
    
        1/2✓ Branch 0 taken 17024 times.
✗ Branch 1 not taken.

      17024
                          d1_s32 = KAI_MAX(d1_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 16912 times.
✓ Branch 1 taken 112 times.

      17024
                          d1_s32 = KAI_MIN(d1_s32, INT8_MAX);
    
      17024
                          d2_s32 = d2_s32 + nudged_zero_point2;
    
        1/2✓ Branch 0 taken 17024 times.
✗ Branch 1 not taken.

      17024
                          d2_s32 = KAI_MAX(d2_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 16912 times.
✓ Branch 1 taken 112 times.

      17024
                          d2_s32 = KAI_MIN(d2_s32, INT8_MAX);
    
      17024
                          d3_s32 = d3_s32 + nudged_zero_point3;
    
        1/2✓ Branch 0 taken 17024 times.
✗ Branch 1 not taken.

      17024
                          d3_s32 = KAI_MAX(d3_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 16912 times.
✓ Branch 1 taken 112 times.

      17024
                          d3_s32 = KAI_MIN(d3_s32, INT8_MAX);
    
      17024
                          *(int8_t*)dst_ptr = (int8_t)d0_s32;
    
      17024
                          *(int8_t*)(dst_ptr + k_block_len * sizeof(int8_t)) = (int8_t)d1_s32;
    
      17024
                          *(int8_t*)(dst_ptr + 2 * (k_block_len * sizeof(int8_t))) = (int8_t)d2_s32;
    
      17024
                          *(int8_t*)(dst_ptr + 3 * (k_block_len * sizeof(int8_t))) = (int8_t)d3_s32;
    
      17024
                          dst_ptr += sizeof(int8_t);
    
      17024
                      }
    
      3192
                      dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t);
    
      3192
                  }
    
      2496
                  uint8_t* dst_base = (uint8_t*)lhs_packed + mr * (k_internal * sizeof(int8_t));
    
      2496
                  dst_ptr = dst_base + dst_x * kai_num_bytes_per_offset;
    
                  // LHS offset at the beginning of the row.
    
      2496
                  *((int32_t*)(dst_ptr)) = -nudged_zero_point0;
    
      2496
                  *((int32_t*)(dst_ptr + kai_num_bytes_per_offset)) = -nudged_zero_point1;
    
      2496
                  *((int32_t*)(dst_ptr + 2 * kai_num_bytes_per_offset)) = -nudged_zero_point2;
    
      2496
                  *((int32_t*)(dst_ptr + 3 * kai_num_bytes_per_offset)) = -nudged_zero_point3;
    
                  // Assuming the same sizeof() for kai_num_bytes_per_offset and kai_num_bytes_per_multiplier.
    
      −
                  KAI_ASSERT(kai_num_bytes_per_offset == kai_num_bytes_per_multiplier);
    
      2496
                  dst_ptr += mr * kai_num_bytes_per_offset;
    
                  // Store the scale quantization params.
    
      2496
                  *((float*)(dst_ptr)) = recip_scale0;
    
      2496
                  *((float*)(dst_ptr + kai_num_bytes_per_multiplier)) = recip_scale1;
    
      2496
                  *((float*)(dst_ptr + 2 * kai_num_bytes_per_multiplier)) = recip_scale2;
    
      2496
                  *((float*)(dst_ptr + 3 * kai_num_bytes_per_multiplier)) = recip_scale3;
    
                  // Update src_ptr. Note: now lhs contains fp16 values (2 bytes each).
    
      2496
                  src_ptr += (4 * lhs_row_length);
    
                  // Move to the next row as we have interleaved all Mr rows.
    
      2496
                  lhs_packed = (void*)((uint8_t*)lhs_packed + dst_stride);
    
      2496
              }
    
      888
          }
    
        2/2✓ Branch 0 taken 1168 times.
✓ Branch 1 taken 1120 times.

      2288
          for (; row_idx < num_rows; ++row_idx) {
    
              // Find min/max for each channel
    
      1168
              int32_t k_idx = 0;
    
      1168
              float16x8_t vmax0 = vmax;
    
      1168
              float16x8_t vmin0 = vmin;
    
        2/2✓ Branch 0 taken 5936 times.
✓ Branch 1 taken 1168 times.

      7104
              for (; k_idx <= ((int32_t)k - 8); k_idx += 8) {
    
      5936
                  const float16x8_t src0_0 = vld1q_f16(src_ptr + (size_t)k_idx);
    
      5936
                  vmax0 = vmaxq_f16(vmax0, src0_0);
    
      5936
                  vmin0 = vminq_f16(vmin0, src0_0);
    
      5936
              }
    
              // Get the max/min
    
      1168
              float16_t max0 = vmaxvq_f16(vmax0);
    
      1168
              float16_t min0 = vminvq_f16(vmin0);
    
        2/2✓ Branch 0 taken 2168 times.
✓ Branch 1 taken 1168 times.

      3336
              for (; k_idx < (int32_t)k; ++k_idx) {
    
      2168
                  const float16_t src0 = *(src_ptr + (size_t)k_idx);
    
      2168
                  max0 = vmaxh_f16(src0, max0);
    
      2168
                  min0 = vminh_f16(src0, min0);
    
      2168
              }
    
              // Maximum/minimum int8 values
    
      1168
              const float qmin = (float)INT8_MIN;
    
      1168
              const float qmax = (float)INT8_MAX;
    
      1168
              const float rmin0 = fminf(0.0F, min0);
    
      1168
              const float rmax0 = fmaxf(0.0F, max0);
    
        1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1168 times.

      1168
              const float scale0 = rmin0 == rmax0 ? 1.F : (qmax - qmin) / (rmax0 - rmin0);
    
              // Reciprocal to quantize
    
        1/2✓ Branch 0 taken 1168 times.
✗ Branch 1 not taken.

      1168
              const float recip_scale0 = scale0 ? 1.0F / scale0 : 0.0F;
    
      1168
              const float descaled_min0 = rmin0 * scale0;
    
      1168
              const float descaled_max0 = rmax0 * scale0;
    
      1168
              const float zero_point_from_min_error0 = qmin + descaled_min0;
    
      1168
              const float zero_point_from_max_error0 = qmax + descaled_max0;
    
      2336
              float zero_point0 =
    
        1/2✓ Branch 0 taken 1168 times.
✗ Branch 1 not taken.

      1168
                  zero_point_from_min_error0 + zero_point_from_max_error0 > 0 ? qmin - descaled_min0 : qmax - descaled_max0;
    
      1168
              zero_point0 = fmaxf(zero_point0, qmin);
    
      1168
              zero_point0 = fminf(zero_point0, qmax);
    
              // Round to nearest integer
    
      1168
              const int32_t nudged_zero_point0 = (int32_t)rintf(zero_point0);
    
      1168
              const size_t dst_x = ((row_idx + m_idx_start) % mr);
    
      1168
              uint8_t* dst_ptr = (uint8_t*)lhs_packed + (dst_x * k_block_len * sizeof(int8_t));
    
              // Quantize the channels
    
      1168
              int32_t block_idx = 0;
    
        2/2✓ Branch 0 taken 8480 times.
✓ Branch 1 taken 1168 times.

      9648
              for (; block_idx <= blocks_lim_k; ++block_idx) {
    
      8480
                  const int32_t k_idx_start = block_idx * k_block_len;
    
      8480
                  const float16x8_t src_0 = vld1q_f16(src_ptr + k_idx_start);
    
                  // Scale the values
    
      8480
                  const float32x4_t v0_f32 = vmulq_n_f32(vcvt_f32_f16(vget_low_f16(src_0)), scale0);
    
      8480
                  const float32x4_t v1_f32 = vmulq_n_f32(vcvt_high_f32_f16(src_0), scale0);
    
      8480
                  const int32x4_t v0_s32 = vcvtnq_s32_f32(v0_f32);
    
      8480
                  const int32x4_t v1_s32 = vcvtnq_s32_f32(v1_f32);
    
      8480
                  const int16x4_t v0_s16 = vqmovn_s32(v0_s32);
    
      8480
                  const int16x4_t v1_s16 = vqmovn_s32(v1_s32);
    
      8480
                  int16x8_t v_s16 = vcombine_s16(v0_s16, v1_s16);
    
                  // Add zero points
    
      8480
                  int16_t nzp_s16 = (int16_t)nudged_zero_point0;
    
      8480
                  int16x8_t vnzp_s16 = vdupq_n_s16(nzp_s16);
    
      8480
                  v_s16 = vaddq_s16(v_s16, vnzp_s16);
    
      8480
                  v_s16 = vmaxq_s16(v_s16, vdupq_n_s16(INT8_MIN));
    
      8480
                  v_s16 = vminq_s16(v_s16, vdupq_n_s16(INT8_MAX));
    
      8480
                  int8x8_t v_s8 = vqmovn_s16(v_s16);
    
      8480
                  vst1_s8((int8_t*)(dst_ptr), v_s8);
    
      8480
                  dst_ptr += mr * k_block_len * sizeof(int8_t);
    
      8480
              }
    
        2/2✓ Branch 0 taken 2992 times.
✓ Branch 1 taken 1168 times.

      4160
              for (; block_idx < num_blocks_k_internal; ++block_idx) {
    
                  // left over k
    
        2/2✓ Branch 0 taken 15392 times.
✓ Branch 1 taken 2992 times.

      18384
                  for (int32_t k_block_idx = 0; k_block_idx < k_block_len; ++k_block_idx) {
    
                      // Clamp at the last valid k-index
    
        2/2✓ Branch 0 taken 2988 times.
✓ Branch 1 taken 12404 times.

      15392
                      const size_t k_idx_start = KAI_MIN((size_t)((block_idx * k_block_len) + k_block_idx), k - 1);
    
      15392
                      const float src0 = (float)(*(src_ptr + k_idx_start));
    
                      // Scale the values
    
      15392
                      int32_t d0_s32 = (int32_t)(roundf(src0 * scale0));
    
      15392
                      d0_s32 = d0_s32 + nudged_zero_point0;
    
        1/2✓ Branch 0 taken 15392 times.
✗ Branch 1 not taken.

      15392
                      d0_s32 = KAI_MAX(d0_s32, INT8_MIN);
    
        2/2✓ Branch 0 taken 15308 times.
✓ Branch 1 taken 84 times.

      15392
                      d0_s32 = KAI_MIN(d0_s32, INT8_MAX);
    
      15392
                      *((int8_t*)(dst_ptr)) = (int8_t)d0_s32;
    
      15392
                      dst_ptr += sizeof(int8_t);
    
      15392
                  }
    
      2992
                  dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t);
    
      2992
              }
    
      1168
              dst_ptr = (uint8_t*)lhs_packed + mr * (k_internal * sizeof(int8_t));
    
      1168
              dst_ptr += dst_x * kai_num_bytes_per_offset;
    
              // LHS offset at the beginning of the row
    
      1168
              *((int32_t*)(dst_ptr)) = -nudged_zero_point0;
    
              // Assuming the same sizeof() for kai_num_bytes_per_offset and kai_num_bytes_per_multiplier
    
      −
              KAI_ASSERT(kai_num_bytes_per_offset == kai_num_bytes_per_multiplier);
    
      1168
              dst_ptr += mr * kai_num_bytes_per_offset;
    
              // Store the scale quantization params
    
      1168
              *((float*)(dst_ptr)) = recip_scale0;
    
      1168
              src_ptr += lhs_row_length;
    
              // Move to the next row if we have interleaved all Mr rows
    
        2/2✓ Branch 0 taken 936 times.
✓ Branch 1 taken 232 times.

      1168
              if ((((row_idx + 1) + m_idx_start) % mr) == 0) {
    
      232
                  lhs_packed = (void*)((uint8_t*)lhs_packed + dst_stride);
    
      232
              }
    
      1168
          }
    
      1120
      }
    
      #endif  // Architectural features check.
Function (Line)	Call count	Line coverage	Branch coverage	Block coverage
kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f16_neon (line 50)	called 1120 times	100.0%	-%	100.0%
kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f16_neon (line 54)	called 1120 times	100.0%	-%	100.0%
kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f16_neon (line 60)	called 1120 times	100.0%	-%	100.0%
kai_get_m_step_lhs_quant_pack_qai8dxp_f16_neon (line 45)	not called	0.0%	-%	0.0%
kai_k_roundedup (line 28)	called 4480 times	100.0%	-%	100.0%
kai_lhs_packed_stride (line 34)	called 3360 times	100.0%	-%	55.0%
kai_run_lhs_quant_pack_qai8dxp_f16_neon (line 66)	called 1120 times	98.4%	75.6%	83.0%