KleidiAI Coverage Report


Directory: ./
Coverage thresholds: low ≥ 0%   medium ≥ 75.0%   high ≥ 90.0%

             Coverage    Exec / Excl / Total
Lines:          96.6%     112 /   35 /   151
Functions:      88.9%       8 /    0 /     9
Branches:       84.4%      27 /   70 /   102
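
For reference, the percentages above use the non-excluded counts as the denominator (the usual gcovr convention): 112 / (151 − 35) ≈ 96.6% for lines, 8 / 9 ≈ 88.9% for functions, and 27 / (102 − 70) ≈ 84.4% for branches. Judging from the listing below, the uncovered function is kai_get_n_step_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0 (never called in this run), the uncovered line is the memset on the bias == NULL path (source line 264), and the five untaken branches are the four k-bound guards in the inner packing loop (source lines 208, 212, 216 and 220) plus the bias == NULL check (source line 263).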

kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6 #include "kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h"
7
8 #include <stddef.h>
9 #include <stdint.h>
10 #include <string.h>
11
12 #include "kai/kai_common.h"
13
14 static const size_t kai_num_bytes_sum_rhs = sizeof(float);
15 static const size_t kai_num_bytes_bias = sizeof(float);
16 static const size_t kai_nr_multiple_of = 4;
17 static const size_t kai_bl_multiple_of = 32;
18
19 6568 inline static size_t kai_get_num_blocks_per_row(size_t k, size_t bl) {
20 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
21 6568 return kai_roundup(k, bl) / bl;
22 }
23
24 5460 inline static size_t kai_get_num_bytes_per_block(size_t bl, size_t num_bytes_multiplier_rhs) {
25 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
26 5460 return (bl / 2) + num_bytes_multiplier_rhs;
27 }
28
29 1108 inline static size_t kai_get_rhs_packed_offset_end_of_all_blocks(
30 size_t k, size_t nr, size_t kr, size_t bl, size_t num_bytes_multiplier_rhs) {
31 KAI_ASSERT((bl % kr) == 0);
32 KAI_ASSERT((nr % kai_nr_multiple_of) == 0);
33 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
34
35 1108 const size_t num_blocks_per_row = kai_get_num_blocks_per_row(k, bl);
36 1108 const size_t num_bytes_per_block = kai_get_num_bytes_per_block(bl, num_bytes_multiplier_rhs);
37
38 2216 return (nr * num_bytes_per_block * num_blocks_per_row);
39 1108 }
40
41 size_t kai_get_n_step_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(size_t nr) {
42 return nr;
43 }
44
45 1108 size_t kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
46 size_t n_idx, //
47 size_t rhs_stride) {
48 1108 return n_idx * rhs_stride;
49 }
50
51 4352 size_t kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
52 size_t k, //
53 size_t nr, //
54 size_t kr, //
55 size_t sr, //
56 size_t bl, //
57 enum kai_datatype scale_dt) {
58 KAI_ASSERT((k % bl) == 0);
59 KAI_ASSERT((bl % kr) == 0);
60 KAI_ASSERT((nr % kai_nr_multiple_of) == 0);
61 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
62 KAI_ASSERT(scale_dt == kai_dt_bf16);
63
64 4352 KAI_UNUSED(kr);
65 4352 KAI_UNUSED(sr);
66
67 4352 const size_t num_bytes_multiplier_rhs = kai_get_datatype_size_in_bytes(scale_dt);
68 4352 const size_t num_blocks_per_row = kai_get_num_blocks_per_row(k, bl);
69 4352 const size_t num_bytes_per_block = kai_get_num_bytes_per_block(bl, num_bytes_multiplier_rhs);
70
71 8704 return nr * ((num_bytes_per_block * num_blocks_per_row) + kai_num_bytes_sum_rhs + kai_num_bytes_bias);
72 4352 }
73
74 3244 size_t kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
75 size_t n_idx, //
76 size_t k, //
77 size_t nr, //
78 size_t kr, //
79 size_t sr, //
80 size_t bl, //
81 enum kai_datatype scale_dt) {
82 KAI_ASSERT((n_idx % nr) == 0);
83 KAI_ASSERT((k % bl) == 0);
84 KAI_ASSERT((bl % kr) == 0);
85 KAI_ASSERT((nr % kai_nr_multiple_of) == 0);
86 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
87 KAI_ASSERT(scale_dt == kai_dt_bf16);
88
89 3244 return (n_idx / nr) * kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(k, nr, kr, sr, bl, scale_dt);
90 }
91
92 1108 size_t kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
93 size_t n, //
94 size_t k, //
95 size_t nr, //
96 size_t kr, //
97 size_t sr, //
98 size_t bl, //
99 enum kai_datatype scale_dt) {
100 KAI_ASSERT((k % bl) == 0);
101 KAI_ASSERT((bl % kr) == 0);
102 KAI_ASSERT((nr % kai_nr_multiple_of) == 0);
103 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
104 KAI_ASSERT(scale_dt == kai_dt_bf16);
105
106 1108 const size_t num_rows = kai_roundup(n, nr) / nr;
107
108 2216 return num_rows * kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(k, nr, kr, sr, bl, scale_dt);
109 1108 }
110
111 1108 void kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
112 size_t num_groups, //
113 size_t n, //
114 size_t k, //
115 size_t nr, //
116 size_t kr, //
117 size_t sr, //
118 size_t bl, //
119 const uint8_t* rhs, //
120 size_t rhs_stride, //
121 const float* bias, //
122 const void* scale, //
123 size_t scale_stride, //
124 void* rhs_packed, //
125 size_t extra_bytes, //
126 const struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params* params) {
127 KAI_ASSERT(num_groups == 1);
128 KAI_ASSERT(extra_bytes == 0);
129 KAI_ASSERT(rhs != NULL);
130 KAI_ASSERT(scale != NULL);
131 KAI_ASSERT(rhs_packed != NULL);
132 KAI_ASSERT(params != NULL);
133 KAI_ASSERT(params->rhs_zero_point == 8);
134 KAI_ASSERT(params->lhs_zero_point == 1);
135
136 KAI_ASSERT((k % bl) == 0);
137 KAI_ASSERT((bl % kr) == 0);
138 KAI_ASSERT((kr % sr) == 0);
139 KAI_ASSERT((nr % kai_nr_multiple_of) == 0);
140 KAI_ASSERT((bl % kai_bl_multiple_of) == 0);
141 KAI_ASSERT(params->scale_dt == kai_dt_bf16);
142
143 // Note: The input matrix (rhs) is expected with:
144 // "k" columns and "n" rows (NxK)
145 1108 const enum kai_datatype scale_dt = params->scale_dt;
146 1108 const size_t num_bytes_multiplier_rhs = kai_get_datatype_size_in_bytes(scale_dt);
147 2216 const size_t rhs_packed_offset_end_of_all_blocks =
148 1108 kai_get_rhs_packed_offset_end_of_all_blocks(k, nr, kr, bl, num_bytes_multiplier_rhs);
149 1108 const size_t num_qblocks_per_row = kai_get_num_blocks_per_row(k, bl);
150 1108 const size_t num_bytes_per_block_k = bl / 2;
151 1108 const size_t dst_num_rows = kai_roundup(n, nr);
152 1108 const size_t block_length_in_bytes = kr / sr;
153
154 1108 uint8_t* dst_row = (uint8_t*)rhs_packed;
155
156
2/2
✓ Branch 0 taken 1108 times.
✓ Branch 1 taken 17166 times.
18274 for (size_t dst_row_idx = 0; dst_row_idx < dst_num_rows; dst_row_idx += nr) {
157 17166 float* sums = (float*)(dst_row + rhs_packed_offset_end_of_all_blocks);
158
159 // Initialize the RHS reduction sums to zero
160 17166 memset(sums, 0, nr * kai_num_bytes_sum_rhs);
161
162 // Iterate over the quantized blocks
163
2/2
✓ Branch 0 taken 136876 times.
✓ Branch 1 taken 17166 times.
154042 for (size_t dst_qblock_idx = 0; dst_qblock_idx < num_qblocks_per_row; ++dst_qblock_idx) {
164 // Store the scales after packing all K values in the block
165 136876 uint8_t* rhs_packed_scale = dst_row + num_bytes_per_block_k * nr;
166 136876 const uint8_t* scale_ptr = (const uint8_t*)scale + dst_qblock_idx * num_bytes_multiplier_rhs;
167
168
2/2
✓ Branch 0 taken 738160 times.
✓ Branch 1 taken 136876 times.
875036 for (size_t i = 0; i < nr; ++i) {
169
2/2
✓ Branch 0 taken 727132 times.
✓ Branch 1 taken 11028 times.
738160 const size_t src_row_idx = KAI_MIN(dst_row_idx + i, n - 1);
170 738160 const void* src_scales_ptr = scale_ptr + src_row_idx * scale_stride;
171 738160 void* dst_scales_ptr = rhs_packed_scale + i * num_bytes_multiplier_rhs;
172
173 738160 memcpy(
174 369080 dst_scales_ptr, //
175 369080 src_scales_ptr, //
176 369080 num_bytes_multiplier_rhs); //
177 738160 }
178
179 136876 size_t k0_idx_i = dst_qblock_idx * bl;
180
181
2/2
✓ Branch 0 taken 151276 times.
✓ Branch 1 taken 136876 times.
288152 for (size_t dst_byte_idx = 0; dst_byte_idx < num_bytes_per_block_k; dst_byte_idx += 16) {
182
2/2
✓ Branch 0 taken 408376 times.
✓ Branch 1 taken 151276 times.
559652 for (size_t segment_idx = 0; segment_idx < 16 / block_length_in_bytes; ++segment_idx) {
183
2/2
✓ Branch 0 taken 2199520 times.
✓ Branch 1 taken 408376 times.
2607896 for (size_t nr_idx = 0; nr_idx < nr; ++nr_idx) {
184 2199520 const size_t n0_idx = dst_row_idx + nr_idx;
185
186 // Two int4 values are stored in one byte.
187 // The lower-order part of the byte (low) holds the first nibble (K-index + 0).
188 // The higher-order part of the byte holds the second nibble (K-index + 16).
189 2199520 size_t k0_idx = k0_idx_i;
190 2199520 size_t k1_idx = k0_idx_i + 16;
191
192 // Clamp the index to avoid out-of-bound reads
193
2/2
✓ Branch 0 taken 2167792 times.
✓ Branch 1 taken 31728 times.
2199520 const size_t n0_valid_idx = KAI_MIN(n0_idx, n - 1);
194 2199520 float d = kai_cast_f32_bf16(((uint16_t*)rhs_packed_scale)[nr_idx]);
195
196 2199520 int32_t partial_sum = 0;
197
198 2199520 size_t src_addr_byte0 = (k0_idx / 2) + n0_valid_idx * rhs_stride;
199
200
2/2
✓ Branch 0 taken 6538880 times.
✓ Branch 1 taken 2199520 times.
8738400 for (size_t block_byte_idx = 0; block_byte_idx < block_length_in_bytes; block_byte_idx += 2) {
201 // Initialize the byte with the zero-point (8)
202 // e.g. uint8_t byte0 = 8 | 8 << 4
203 6538880 uint8_t byte0 = 136;
204 6538880 uint8_t byte1 = 136;
205 6538880 uint8_t byte2 = 136;
206 6538880 uint8_t byte3 = 136;
207
208
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
6538880 if (k0_idx < k) {
209 6538880 byte0 = rhs[src_addr_byte0];
210 6538880 }
211
212
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
6538880 if (k1_idx < k) {
213 6538880 byte1 = rhs[src_addr_byte0 + 8];
214 6538880 }
215
216
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
6538880 if (k0_idx + 1 < k) {
217 6538880 byte2 = byte0;
218 6538880 }
219
220
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
6538880 if (k1_idx + 1 < k) {
221 6538880 byte3 = byte1;
222 6538880 }
223
224 6538880 k0_idx += 2;
225 6538880 k1_idx += 2;
226
227 6538880 const uint8_t src_x0_lo = byte0 & 0x0F;
228 6538880 const uint8_t src_x0_hi = byte1 & 0x0F;
229 6538880 const uint8_t src_x1_lo = (byte2 >> 4) & 0x0F;
230 6538880 const uint8_t src_x1_hi = (byte3 >> 4) & 0x0F;
231
232 6538880 partial_sum += (int32_t)src_x0_lo;
233 6538880 partial_sum += (int32_t)src_x0_hi;
234 6538880 partial_sum += (int32_t)src_x1_lo;
235 6538880 partial_sum += (int32_t)src_x1_hi;
236 6538880 partial_sum -= 32; // 4 * zero_point (8)
237
238 13077760 const uint16_t dst_q =
239 6538880 ((src_x0_lo)) | ((src_x0_hi) << 4) | ((src_x1_lo) << 8) | ((src_x1_hi) << 12);
240
241 6538880 *((uint16_t*)dst_row) = dst_q ^ 0x8888;
242
243 6538880 dst_row += 2;
244 6538880 src_addr_byte0 += 1;
245 6538880 }
246 // NOLINTBEGIN(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
247 2199520 sums[nr_idx] += (float)partial_sum * d;
248 // NOLINTEND(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
249 2199520 }
250
251 408376 k0_idx_i += block_length_in_bytes;
252 408376 }
253 151276 k0_idx_i += 16;
254 151276 }
255 // Move the pointer after scales
256 136876 dst_row += num_bytes_multiplier_rhs * nr;
257 136876 }
258
259 // Move the pointer after the row sum
260 17166 dst_row += kai_num_bytes_sum_rhs * nr;
261
262 // Set the bias
263
1/2
✓ Branch 0 taken 17166 times.
✗ Branch 1 not taken.
17166 if (bias == NULL) {
264 memset(dst_row, 0, nr * kai_num_bytes_bias);
265 } else {
266
2/2
✓ Branch 0 taken 92088 times.
✓ Branch 1 taken 17166 times.
109254 for (size_t i = 0; i < nr; ++i) {
267 // Clamp the row index to avoid out-of-bound reads
268
2/2
✓ Branch 0 taken 90364 times.
✓ Branch 1 taken 1724 times.
92088 const size_t src_row_idx = KAI_MIN(dst_row_idx + i, n - 1);
269 92088 ((float*)dst_row)[i] = bias[src_row_idx];
270 92088 }
271 }
272 // Move the pointer after the bias
273 17166 dst_row += kai_num_bytes_bias * nr;
274 17166 }
275 1108 }
276
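
Usage sketch (editorial addition, not part of the coverage data): the entry points listed above could be exercised roughly as follows. The helper name pack_rhs_example and the shape values (n, k, nr, kr, sr, bl) are illustrative assumptions; kr and sr must match the target matmul micro-kernel, and the zero points mirror the assertions in kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

#include "kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h"

// Packs an NxK int4 RHS matrix (two values per byte) with per-block bf16
// scales and an optional bias into the layout produced by this file.
int pack_rhs_example(
    const uint8_t* rhs,       // NxK int4 data, two values per byte
    size_t rhs_stride,        // bytes between consecutive rows of rhs
    const void* scales_bf16,  // per-row, per-block bf16 scales
    size_t scale_stride,      // bytes between consecutive rows of scales
    const float* bias) {      // n bias values, or NULL
    const size_t n = 64, k = 256;                   // illustrative shape
    const size_t nr = 4, kr = 16, sr = 2, bl = 32;  // illustrative packing params

    struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params;
    params.lhs_zero_point = 1;      // required by the kernel's assertions
    params.rhs_zero_point = 8;      // required by the kernel's assertions
    params.scale_dt = kai_dt_bf16;  // only bf16 scales are accepted

    const size_t packed_size =
        kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(n, k, nr, kr, sr, bl, kai_dt_bf16);

    void* rhs_packed = malloc(packed_size);
    if (rhs_packed == NULL) {
        return -1;
    }

    kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
        /* num_groups */ 1, n, k, nr, kr, sr, bl,  //
        rhs, rhs_stride,                           //
        bias,                                      //
        scales_bf16, scale_stride,                 //
        rhs_packed, /* extra_bytes */ 0, &params);

    // ... hand rhs_packed to the matching qsi4c32p matmul micro-kernel ...

    free(rhs_packed);
    return 0;
}

For these illustrative parameters, the per-row-block stride from the formula at source line 71 works out to nr * ((bl/2 + 2) * (k/bl) + 4 + 4) = 4 * (18 * 8 + 8) = 608 bytes: eight quantized blocks of 18 bytes per row (16 packed data bytes plus a 2-byte bf16 scale), followed by one float row sum and one float bias per row. The dst_q ^ 0x8888 store at source line 241 converts the unsigned 4-bit source values (zero point 8) to the signed qsi4 representation, since flipping the top bit of a nibble is equivalent to subtracting 8 in 4-bit two's complement.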