kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.c

Directory:	./
File:	kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.c
Date:	2025-10-20 13:18:31

	Coverage	Exec	Excl	Total
Lines:	98.4%	62	11	74
Functions:	100.0%	16	0	16
Branches:	75.0%	3	22	26

  
      Line
      Branch
      Exec
      Source
    
      //
    
      // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      //
    
      #if !defined(__aarch64__) && !defined(__ARM_FEATURE_DOTPROD) && !defined(_M_ARM64)
    
      #error "Dotprod extension required to compile this micro-kernel"
    
      #else  // Architectural features check.
    
      #include "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
    
      #include <stddef.h>
    
      #include <stdint.h>
    
      #include "kai/kai_common.h"
    
      typedef struct {
    
          float* dst;
    
          const void* lhs_packed;
    
          const void* rhs_packed;
    
          const float* clamp_vals;
    
          size_t dst_stride_row;
    
          size_t m;
    
          size_t n;
    
          size_t num_blocks;
    
          size_t num_subblocks;
    
      } KernelArgs;
    
      void kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(KernelArgs* args_ptr);
    
      void kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_opt32_neon_dotprod(KernelArgs* args_ptr);
    
      // Compute args
    
      static const size_t kai_m_step = 16;
    
      static const size_t kai_n_step = 4;
    
      // Packing args
    
      static const size_t kai_mr = 4;
    
      static const size_t kai_nr = 4;
    
      static const size_t kai_kr = 8;
    
      static const size_t kai_sr = 2;
    
      // LHS format args (num. bytes per value, multiplier, zero_point (if asymmetric))
    
      static const size_t kai_num_bytes_qvalue_lhs = 1;
    
      static const size_t kai_num_bytes_multiplier_lhs = 4;
    
      static const size_t kai_num_bytes_zp_lhs = 4;
    
      // RHS format args (num. bytes per value, multiplier, zero_point (if asymmetric), and reduction sum (if LHS is
    
      // asymmetric))
    
      static const size_t kai_num_bytes_recip_qvalue_rhs = 2;
    
      static const size_t kai_num_bytes_multiplier_rhs = 2;
    
      static const size_t kai_num_bytes_rsum_rhs = 4;
    
      // DST format args
    
      static const size_t kai_num_bytes_dst_value = 4;
    
      // Extra args
    
      static const size_t kai_num_bytes_bias = 4;
    
      static const size_t kai_k_multiple_of = 32;
    
      static const size_t kai_bl = 32;
    
      108
      inline static size_t kai_get_k_roundedup(size_t k) {
    
      108
          return kai_roundup(k, kai_k_multiple_of);
    
      }
    
      108
      inline static size_t kai_get_num_bytes_per_block_rhs(size_t bl) {
    
      −
          KAI_ASSUME((bl % kai_bl) == 0);
    
      108
          size_t num_bytes_per_block_rhs = (bl / kai_num_bytes_recip_qvalue_rhs) + kai_num_bytes_multiplier_rhs;
    
      216
          return num_bytes_per_block_rhs;
    
      108
      }
    
      271
      inline static size_t kai_get_num_blocks_per_row(size_t k, size_t bl) {
    
      −
          KAI_ASSUME((bl % kai_bl) == 0);
    
      271
          return kai_roundup(k, bl) / bl;
    
      }
    
      108
      inline static size_t kai_get_lhs_packed_stride(size_t k) {
    
      108
          const size_t k_internal = kai_get_k_roundedup(k);
    
      108
          size_t lhs_packed_stride = kai_mr * ((k_internal * kai_num_bytes_qvalue_lhs) + kai_num_bytes_multiplier_lhs);
    
          // Since the LHS matrix is asymmetric with per-row quantization, we must include
    
          // the number of bytes to hold the zero point value
    
      108
          lhs_packed_stride += kai_mr * kai_num_bytes_zp_lhs;
    
      216
          return lhs_packed_stride;
    
      108
      }
    
      108
      inline static size_t kai_get_rhs_packed_stride(size_t k, size_t bl) {
    
      −
          KAI_ASSUME((bl % kai_bl) == 0);
    
      108
          const size_t num_blocks_per_row = kai_get_num_blocks_per_row(k, bl);
    
      108
          const size_t num_bytes_per_block = kai_get_num_bytes_per_block_rhs(bl);
    
      108
          size_t rhs_packed_stride = kai_nr * (num_bytes_per_block * num_blocks_per_row);
    
          // Since the LHS matrix is quantized asymmetric with per-row quantization, we also include
    
          // the number of bytes for the reduction sum
    
      108
          rhs_packed_stride += kai_nr * kai_num_bytes_rsum_rhs;
    
          // Since the bias is packed with the RHS matrix, the stride is adjusted with the number of bytes of the bias
    
      108
          rhs_packed_stride += kai_nr * kai_num_bytes_bias;
    
      216
          return rhs_packed_stride;
    
      108
      }
    
      112
      size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
    
      112
          return kai_m_step;
    
      }
    
      112
      size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
    
      112
          return kai_n_step;
    
      }
    
      112
      size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
    
      112
          return kai_mr;
    
      }
    
      112
      size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
    
      112
          return kai_nr;
    
      }
    
      112
      size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
    
      112
          return kai_kr;
    
      }
    
      112
      size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
    
      112
          return kai_sr;
    
      }
    
      108
      size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(size_t m_idx, size_t k) {
    
      −
          KAI_ASSUME((m_idx % kai_m_step) == 0);
    
      108
          return (m_idx / kai_mr) * kai_get_lhs_packed_stride(k);
    
      }
    
      108
      size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(
    
          size_t n_idx, size_t k, size_t bl) {
    
      −
          KAI_ASSUME((k % bl) == 0);
    
      −
          KAI_ASSUME((n_idx % kai_n_step) == 0);
    
      108
          return (n_idx / kai_nr) * kai_get_rhs_packed_stride(k, bl);
    
      }
    
      108
      size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(
    
          size_t m_idx, size_t n_idx, size_t dst_stride) {
    
      −
          KAI_ASSUME((m_idx % kai_m_step) == 0);
    
      −
          KAI_ASSUME((n_idx % kai_n_step) == 0);
    
      108
          return (n_idx * kai_num_bytes_dst_value) + m_idx * dst_stride;
    
      }
    
      108
      size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(size_t m, size_t n) {
    
      108
          return m * n * kai_num_bytes_dst_value;
    
      }
    
      163
      void kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(
    
          size_t m,                         //
    
          size_t n,                         //
    
          size_t k,                         //
    
          size_t bl,                        //
    
          const void* restrict lhs_packed,  //
    
          const void* restrict rhs_packed,  //
    
          float* restrict dst,              // NOLINT(readability-non-const-parameter)
    
          size_t dst_stride_row,            //
    
          size_t dst_stride_col,            //
    
          float scalar_min,                 //
    
          float scalar_max) {
    
      −
          KAI_ASSUME(dst_stride_col == sizeof(float));
    
      −
          KAI_ASSUME((k % bl) == 0);
    
      −
          KAI_ASSUME((bl % kai_bl) == 0);
    
        1/2✓ Branch 0 taken 163 times.
✗ Branch 1 not taken.

      163
          if (m == 0) {
    
      ✗
              return;
    
          }
    
      163
          const size_t num_subblocks = bl / kai_bl;
    
      163
          const size_t num_blocks = kai_get_num_blocks_per_row(k, bl);
    
      163
          const float clamp_vals[2] = {scalar_min, scalar_max};
    
      163
          KernelArgs args;
    
      163
          args.dst = dst;
    
      163
          args.lhs_packed = lhs_packed;
    
      163
          args.rhs_packed = rhs_packed;
    
      163
          args.clamp_vals = clamp_vals;
    
      163
          args.dst_stride_row = dst_stride_row;
    
      163
          args.m = m;
    
      163
          args.n = n;
    
      163
          args.num_blocks = num_blocks;
    
      163
          args.num_subblocks = num_subblocks;
    
        2/2✓ Branch 0 taken 82 times.
✓ Branch 1 taken 81 times.

      163
          if (bl == 32) {
    
      82
              kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_opt32_neon_dotprod(&args);
    
      82
          } else {
    
      81
              kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(&args);
    
          }
    
      163
      }
    
      #endif  // Architectural features check.

Line	Branch	Exec	Source
1			//
2			// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3			//
4			// SPDX-License-Identifier: Apache-2.0
5			//
6			#if !defined(__aarch64__) && !defined(__ARM_FEATURE_DOTPROD) && !defined(_M_ARM64)
7			#error "Dotprod extension required to compile this micro-kernel"
8			#else // Architectural features check.
9
10			#include "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
11
12			#include <stddef.h>
13			#include <stdint.h>
14
15			#include "kai/kai_common.h"
16
17			typedef struct {
18			float* dst;
19			const void* lhs_packed;
20			const void* rhs_packed;
21			const float* clamp_vals;
22			size_t dst_stride_row;
23			size_t m;
24			size_t n;
25			size_t num_blocks;
26			size_t num_subblocks;
27			} KernelArgs;
28
29			void kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(KernelArgs* args_ptr);
30			void kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_opt32_neon_dotprod(KernelArgs* args_ptr);
31
32			// Compute args
33			static const size_t kai_m_step = 16;
34			static const size_t kai_n_step = 4;
35			// Packing args
36			static const size_t kai_mr = 4;
37			static const size_t kai_nr = 4;
38			static const size_t kai_kr = 8;
39			static const size_t kai_sr = 2;
40			// LHS format args (num. bytes per value, multiplier, zero_point (if asymmetric))
41			static const size_t kai_num_bytes_qvalue_lhs = 1;
42			static const size_t kai_num_bytes_multiplier_lhs = 4;
43			static const size_t kai_num_bytes_zp_lhs = 4;
44			// RHS format args (num. bytes per value, multiplier, zero_point (if asymmetric), and reduction sum (if LHS is
45			// asymmetric))
46			static const size_t kai_num_bytes_recip_qvalue_rhs = 2;
47			static const size_t kai_num_bytes_multiplier_rhs = 2;
48			static const size_t kai_num_bytes_rsum_rhs = 4;
49			// DST format args
50			static const size_t kai_num_bytes_dst_value = 4;
51			// Extra args
52			static const size_t kai_num_bytes_bias = 4;
53			static const size_t kai_k_multiple_of = 32;
54			static const size_t kai_bl = 32;
55
56		108	inline static size_t kai_get_k_roundedup(size_t k) {
57		108	return kai_roundup(k, kai_k_multiple_of);
58			}
59
60		108	inline static size_t kai_get_num_bytes_per_block_rhs(size_t bl) {
61		−	KAI_ASSUME((bl % kai_bl) == 0);
62		108	size_t num_bytes_per_block_rhs = (bl / kai_num_bytes_recip_qvalue_rhs) + kai_num_bytes_multiplier_rhs;
63		216	return num_bytes_per_block_rhs;
64		108	}
65
66		271	inline static size_t kai_get_num_blocks_per_row(size_t k, size_t bl) {
67		−	KAI_ASSUME((bl % kai_bl) == 0);
68
69		271	return kai_roundup(k, bl) / bl;
70			}
71
72		108	inline static size_t kai_get_lhs_packed_stride(size_t k) {
73		108	const size_t k_internal = kai_get_k_roundedup(k);
74		108	size_t lhs_packed_stride = kai_mr * ((k_internal * kai_num_bytes_qvalue_lhs) + kai_num_bytes_multiplier_lhs);
75			// Since the LHS matrix is asymmetric with per-row quantization, we must include
76			// the number of bytes to hold the zero point value
77		108	lhs_packed_stride += kai_mr * kai_num_bytes_zp_lhs;
78
79		216	return lhs_packed_stride;
80		108	}
81
82		108	inline static size_t kai_get_rhs_packed_stride(size_t k, size_t bl) {
83		−	KAI_ASSUME((bl % kai_bl) == 0);
84
85		108	const size_t num_blocks_per_row = kai_get_num_blocks_per_row(k, bl);
86		108	const size_t num_bytes_per_block = kai_get_num_bytes_per_block_rhs(bl);
87
88		108	size_t rhs_packed_stride = kai_nr * (num_bytes_per_block * num_blocks_per_row);
89			// Since the LHS matrix is quantized asymmetric with per-row quantization, we also include
90			// the number of bytes for the reduction sum
91		108	rhs_packed_stride += kai_nr * kai_num_bytes_rsum_rhs;
92			// Since the bias is packed with the RHS matrix, the stride is adjusted with the number of bytes of the bias
93		108	rhs_packed_stride += kai_nr * kai_num_bytes_bias;
94
95		216	return rhs_packed_stride;
96		108	}
97
98		112	size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
99		112	return kai_m_step;
100			}
101
102		112	size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
103		112	return kai_n_step;
104			}
105
106		112	size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
107		112	return kai_mr;
108			}
109
110		112	size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
111		112	return kai_nr;
112			}
113
114		112	size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
115		112	return kai_kr;
116			}
117
118		112	size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
119		112	return kai_sr;
120			}
121
122		108	size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(size_t m_idx, size_t k) {
123		−	KAI_ASSUME((m_idx % kai_m_step) == 0);
124
125		108	return (m_idx / kai_mr) * kai_get_lhs_packed_stride(k);
126			}
127
128		108	size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(
129			size_t n_idx, size_t k, size_t bl) {
130		−	KAI_ASSUME((k % bl) == 0);
131		−	KAI_ASSUME((n_idx % kai_n_step) == 0);
132
133		108	return (n_idx / kai_nr) * kai_get_rhs_packed_stride(k, bl);
134			}
135
136		108	size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(
137			size_t m_idx, size_t n_idx, size_t dst_stride) {
138		−	KAI_ASSUME((m_idx % kai_m_step) == 0);
139		−	KAI_ASSUME((n_idx % kai_n_step) == 0);
140
141		108	return (n_idx * kai_num_bytes_dst_value) + m_idx * dst_stride;
142			}
143
144		108	size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(size_t m, size_t n) {
145		108	return m * n * kai_num_bytes_dst_value;
146			}
147
148		163	void kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(
149			size_t m, //
150			size_t n, //
151			size_t k, //
152			size_t bl, //
153			const void* restrict lhs_packed, //
154			const void* restrict rhs_packed, //
155			float* restrict dst, // NOLINT(readability-non-const-parameter)
156			size_t dst_stride_row, //
157			size_t dst_stride_col, //
158			float scalar_min, //
159			float scalar_max) {
160		−	KAI_ASSUME(dst_stride_col == sizeof(float));
161		−	KAI_ASSUME((k % bl) == 0);
162		−	KAI_ASSUME((bl % kai_bl) == 0);
163
164	1/2 ✓ Branch 0 taken 163 times. ✗ Branch 1 not taken.	163	if (m == 0) {
165		✗	return;
166			}
167		163	const size_t num_subblocks = bl / kai_bl;
168		163	const size_t num_blocks = kai_get_num_blocks_per_row(k, bl);
169		163	const float clamp_vals[2] = {scalar_min, scalar_max};
170
171		163	KernelArgs args;
172
173		163	args.dst = dst;
174		163	args.lhs_packed = lhs_packed;
175		163	args.rhs_packed = rhs_packed;
176		163	args.clamp_vals = clamp_vals;
177		163	args.dst_stride_row = dst_stride_row;
178		163	args.m = m;
179		163	args.n = n;
180		163	args.num_blocks = num_blocks;
181		163	args.num_subblocks = num_subblocks;
182
183	2/2 ✓ Branch 0 taken 82 times. ✓ Branch 1 taken 81 times.	163	if (bl == 32) {
184		82	kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_opt32_neon_dotprod(&args);
185		82	} else {
186		81	kai_kernel_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(&args);
187			}
188		163	}
189
190			#endif // Architectural features check.
191