KleidiAI Coverage Report


Directory: ./
File: kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 97.8% 45 10 56
Functions: 100.0% 16 0 16
Branches: 50.0% 1 20 22

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 // Do not flag up inline assembly blocks
8 #pragma GCC diagnostic ignored "-Woverlength-strings"
9
// NOTE(review): because the two tests are joined with &&, the #error below is only raised when
// BOTH __aarch64__ and __ARM_FEATURE_DOTPROD are undefined. A build where just one of them is
// missing still compiles the kernel. If the intent is to require both, this would need || instead -
// confirm against the build configurations that rely on the current behaviour before changing it.
10 #if !defined(__aarch64__) && !defined(__ARM_FEATURE_DOTPROD)
11 #error "Dotprod extension required to compile this micro-kernel"
12 #else // Architectural features check.
13
14 #include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
15
16 #include <stddef.h>
17 #include <stdint.h>
18
19 #include "kai/kai_common.h"
20
// ---- Compute arguments ------------------------------------------------------------------------
// Step sizes reported by the get_m_step/get_n_step getters; the offset helpers in this file
// require row/column indices to be multiples of these values.
static const size_t kai_m_step = 16;
static const size_t kai_n_step = 4;

// ---- Packing arguments ------------------------------------------------------------------------
// mr/nr scale the packed LHS/RHS strides; kr and sr are only exposed through their getters here.
static const size_t kai_mr = 4;
static const size_t kai_nr = 4;
static const size_t kai_kr = 8;
static const size_t kai_sr = 2;

// ---- LHS format arguments ---------------------------------------------------------------------
// Bytes per quantized value and bytes per multiplier (no zero point is listed for this format).
static const size_t kai_num_bytes_qvalue_lhs = 1;
static const size_t kai_num_bytes_multiplier_lhs = 2;

// ---- RHS format arguments ---------------------------------------------------------------------
// The quantized-value size is stored as a reciprocal: the block length is DIVIDED by this value,
// i.e. two quantized values share one byte. The multiplier takes two bytes.
static const size_t kai_recip_num_bytes_qvalue_rhs = 2;
static const size_t kai_num_bytes_multiplier_rhs = 2;

// ---- DST format arguments ---------------------------------------------------------------------
// Bytes per destination element.
static const size_t kai_num_bytes_dst_value = 4;

// ---- Extra arguments --------------------------------------------------------------------------
// The only block length accepted by the helpers in this file (checked with KAI_ASSUME).
static const size_t kai_bl = 32;
40
41 46 inline static size_t kai_num_bytes_per_block_lhs(size_t bl) {
42 46 return (bl * kai_num_bytes_qvalue_lhs) + kai_num_bytes_multiplier_lhs;
43 }
44
45 46 inline static size_t kai_num_bytes_per_block_rhs(size_t bl) {
46 KAI_ASSUME(bl == kai_bl);
47 46 size_t num_bytes_per_block_rhs = (bl / kai_recip_num_bytes_qvalue_rhs) + kai_num_bytes_multiplier_rhs;
48 92 return num_bytes_per_block_rhs;
49 46 }
50
51 116 inline static size_t kai_num_blocks_per_row(size_t k, size_t bl) {
52 KAI_ASSUME(bl == kai_bl);
53 KAI_ASSUME((k % kai_bl) == 0);
54
55 116 return kai_roundup(k, bl) / bl;
56 }
57
58 46 inline static size_t kai_lhs_packed_stride(size_t k, size_t bl) {
59 46 return kai_mr * kai_num_blocks_per_row(k, bl) * kai_num_bytes_per_block_lhs(bl);
60 }
61
62 46 inline static size_t kai_rhs_packed_stride(size_t k, size_t bl) {
63 KAI_ASSUME(bl == kai_bl);
64 KAI_ASSUME((k % kai_bl) == 0);
65
66 46 const size_t num_blocks_per_row = kai_num_blocks_per_row(k, bl);
67 46 const size_t num_bytes_per_block = kai_num_bytes_per_block_rhs(bl);
68
69 46 size_t rhs_packed_stride = kai_nr * (num_bytes_per_block * num_blocks_per_row);
70
71 92 return rhs_packed_stride;
72 46 }
73
74 72 size_t kai_get_m_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
75 72 return kai_m_step;
76 }
77
78 72 size_t kai_get_n_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
79 72 return kai_n_step;
80 }
81
82 48 size_t kai_get_mr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
83 48 return kai_mr;
84 }
85
86 48 size_t kai_get_nr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
87 48 return kai_nr;
88 }
89
90 72 size_t kai_get_kr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
91 72 return kai_kr;
92 }
93
94 48 size_t kai_get_sr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(void) {
95 48 return kai_sr;
96 }
97
98 46 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(
99 size_t m_idx, size_t k, size_t bl) {
100 KAI_ASSUME((m_idx % kai_m_step) == 0);
101
102 46 return (m_idx / kai_mr) * kai_lhs_packed_stride(k, bl);
103 }
104
105 46 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(
106 size_t n_idx, size_t k, size_t bl) {
107 KAI_ASSUME((n_idx % kai_n_step) == 0);
108
109 46 return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl);
110 }
111
112 23 size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(
113 size_t m_idx, size_t n_idx, size_t dst_stride) {
114 KAI_ASSUME((m_idx % kai_m_step) == 0);
115 KAI_ASSUME((n_idx % kai_n_step) == 0);
116
117 23 return (n_idx * kai_num_bytes_dst_value) + m_idx * dst_stride;
118 }
119
120 23 size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(size_t m, size_t n) {
121 23 return m * n * kai_num_bytes_dst_value;
122 }
123
// Runs the matrix multiplication micro-kernel and writes the f32 result to `dst`.
//
// Parameters:
//   m, n           : number of rows / columns of the destination to produce. If either is zero the
//                    function returns without touching memory.
//   k, bl          : reduction length and block length; passed to kai_num_blocks_per_row(), which
//                    requires bl == kai_bl and k to be a multiple of kai_bl.
//   lhs_packed     : packed LHS buffer, read by the assembly.
//   rhs_packed     : packed RHS buffer, read by the assembly.
//   dst            : destination buffer, written by the assembly.
//   dst_stride_row : distance in bytes between consecutive destination rows.
//   dst_stride_col : must equal sizeof(float).
//   scalar_min,
//   scalar_max     : accepted for interface compatibility but marked unused; the code below does
//                    not apply them to the result.
//
// Structure of the assembly:
//   * x13 holds the remaining row count and x12 = 0x88 * num_blocks is the byte stride of one group
//     of four packed LHS rows (0x88 = 4 * (32 + 2), matching kai_lhs_packed_stride()).
//   * "Row loop" (labels 1-13) processes 16 rows per iteration as four groups of four rows
//     (x27, x22, x21, x20), accumulating into v16-v31.
//   * "Row tail" (labels 15-22) processes the remaining rows four at a time, accumulating into
//     v28-v31 and only storing the rows that exist.
//   * Each column-loop iteration produces four columns; when fewer than four remain, the
//     "Partial output" paths store two and/or one element(s) per row depending on bits 1 and 0 of
//     the remaining column count.
//   * Each RHS byte is split with `shl #4` (low nibble) and `and 0xf0` (high nibble), so both
//     nibbles sit in the top four bits of a byte for the sdot instructions; the following
//     `scvtf ..., #0x4` converts the integer sums to float while dividing by 16 to undo that shift.
//   * sdot is emitted through `.inst` encodings; the mnemonic is kept in the trailing comment.
void kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod(
    size_t m,                         //
    size_t n,                         //
    size_t k,                         //
    size_t bl,                        //
    const void* restrict lhs_packed,  //
    const void* restrict rhs_packed,  //
    float* restrict dst,              // NOLINT(readability-non-const-parameter)
    size_t dst_stride_row,            //
    size_t dst_stride_col,            //
    float scalar_min,                 //
    float scalar_max) {
    KAI_ASSUME(dst_stride_col == sizeof(float));

    KAI_UNUSED(scalar_min);
    KAI_UNUSED(scalar_max);

    // Nothing to compute for an empty destination. The n == 0 check is required for memory safety:
    // the column loops below are bottom-tested, so with n == 0 their body would still execute once
    // and fall into the partial-output path, which stores one element per row.
    if (m == 0 || n == 0) {
        return;
    }

    size_t num_blocks = kai_num_blocks_per_row(k, bl);

    __asm__ __volatile__(
        "mov x13, %x[m]\n"
        "mov x12, #0x88\n"
        "cmp x13, #0x10\n"
        "mul x12, %x[num_blocks], x12\n"
        "blt 14f\n"
        "1:"  // Row loop
        "mov x11, %x[rhs_packed]\n"
        "mov x10, %x[n]\n"
        "add x9, %x[dst], %x[dst_stride_row], LSL #4\n"
        "2:"  // Column loop
        "mov x27, %x[lhs_packed]\n"
        "movi v31.16b, #0x0\n"
        "movi v30.16b, #0x0\n"
        "mov x23, %x[num_blocks]\n"
        "movi v29.16b, #0x0\n"
        "movi v28.16b, #0x0\n"
        "movi v27.16b, #0x0\n"
        "movi v26.16b, #0x0\n"
        "add x22, x27, x12\n"
        "add x21, x22, x12\n"
        "movi v25.16b, #0x0\n"
        "movi v24.16b, #0x0\n"
        "add x20, x21, x12\n"
        "movi v23.16b, #0x0\n"
        "movi v22.16b, #0x0\n"
        "movi v21.16b, #0x0\n"
        "movi v20.16b, #0x0\n"
        "movi v19.16b, #0x0\n"
        "movi v18.16b, #0x0\n"
        "movi v17.16b, #0x0\n"
        "movi v16.16b, #0x0\n"
        "3:"  // Block loop
        "ldr d15, [x11, #0x0]\n"
        "ldr d1, [x27, #0x0]\n"
        "add x11, x11, #0x8\n"
        "add x27, x27, #0x8\n"
        "ldr q0, [x11, #0x0]\n"
        "ldr q6, [x27, #0x0]\n"
        "movi v12.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "ldr q11, [x11, #0x10]\n"
        "ldr q5, [x27, #0x10]\n"
        "movi v10.4s, #0x0\n"
        "movi v13.4s, #0x0\n"
        "ldr q7, [x11, #0x20]\n"
        "ldr q3, [x27, #0x20]\n"
        "movi v9.16b, #0xf0\n"
        "fcvtl v14.4s, v15.4h\n"
        "ldr q15, [x11, #0x30]\n"
        "ldr q4, [x27, #0x30]\n"
        "shl v2.16b, v0.16b, #0x4\n"
        "fcvtl v1.4s, v1.4h\n"
        "and v0.16b, v0.16b, v9.16b\n"
        "add x11, x11, #0x40\n"
        ".inst 0x4f86e04c  // sdot v12.4s, v2.16b, v6.4b[0]\n"
        ".inst 0x4fa6e048  // sdot v8.4s, v2.16b, v6.4b[1]\n"
        ".inst 0x4f86e84a  // sdot v10.4s, v2.16b, v6.4b[2]\n"
        ".inst 0x4fa6e84d  // sdot v13.4s, v2.16b, v6.4b[3]\n"
        "shl v6.16b, v11.16b, #0x4\n"
        "and v11.16b, v11.16b, v9.16b\n"
        ".inst 0x4f85e0cc  // sdot v12.4s, v6.16b, v5.4b[0]\n"
        ".inst 0x4fa5e0c8  // sdot v8.4s, v6.16b, v5.4b[1]\n"
        ".inst 0x4f85e8ca  // sdot v10.4s, v6.16b, v5.4b[2]\n"
        ".inst 0x4fa5e8cd  // sdot v13.4s, v6.16b, v5.4b[3]\n"
        "shl v5.16b, v7.16b, #0x4\n"
        "and v7.16b, v7.16b, v9.16b\n"
        ".inst 0x4f83e0ac  // sdot v12.4s, v5.16b, v3.4b[0]\n"
        ".inst 0x4fa3e0a8  // sdot v8.4s, v5.16b, v3.4b[1]\n"
        ".inst 0x4f83e8aa  // sdot v10.4s, v5.16b, v3.4b[2]\n"
        ".inst 0x4fa3e8ad  // sdot v13.4s, v5.16b, v3.4b[3]\n"
        "shl v3.16b, v15.16b, #0x4\n"
        "and v15.16b, v15.16b, v9.16b\n"
        "ldr q9, [x27, #0x40]\n"
        ".inst 0x4f84e06c  // sdot v12.4s, v3.16b, v4.4b[0]\n"
        ".inst 0x4fa4e068  // sdot v8.4s, v3.16b, v4.4b[1]\n"
        ".inst 0x4f84e86a  // sdot v10.4s, v3.16b, v4.4b[2]\n"
        ".inst 0x4fa4e86d  // sdot v13.4s, v3.16b, v4.4b[3]\n"
        "ldr q4, [x27, #0x50]\n"
        ".inst 0x4f89e00c  // sdot v12.4s, v0.16b, v9.4b[0]\n"
        ".inst 0x4fa9e008  // sdot v8.4s, v0.16b, v9.4b[1]\n"
        ".inst 0x4f89e80a  // sdot v10.4s, v0.16b, v9.4b[2]\n"
        ".inst 0x4fa9e80d  // sdot v13.4s, v0.16b, v9.4b[3]\n"
        "ldr q9, [x27, #0x60]\n"
        ".inst 0x4f84e16c  // sdot v12.4s, v11.16b, v4.4b[0]\n"
        ".inst 0x4fa4e168  // sdot v8.4s, v11.16b, v4.4b[1]\n"
        ".inst 0x4f84e96a  // sdot v10.4s, v11.16b, v4.4b[2]\n"
        ".inst 0x4fa4e96d  // sdot v13.4s, v11.16b, v4.4b[3]\n"
        "ldr q4, [x27, #0x70]\n"
        "add x27, x27, #0x80\n"
        ".inst 0x4f89e0ec  // sdot v12.4s, v7.16b, v9.4b[0]\n"
        ".inst 0x4fa9e0e8  // sdot v8.4s, v7.16b, v9.4b[1]\n"
        ".inst 0x4f89e8ea  // sdot v10.4s, v7.16b, v9.4b[2]\n"
        ".inst 0x4fa9e8ed  // sdot v13.4s, v7.16b, v9.4b[3]\n"
        "fmul v9.4s, v14.4s, v1.s[0]\n"
        ".inst 0x4f84e1ec  // sdot v12.4s, v15.16b, v4.4b[0]\n"
        ".inst 0x4fa4e1e8  // sdot v8.4s, v15.16b, v4.4b[1]\n"
        ".inst 0x4f84e9ea  // sdot v10.4s, v15.16b, v4.4b[2]\n"
        ".inst 0x4fa4e9ed  // sdot v13.4s, v15.16b, v4.4b[3]\n"
        "fmul v4.4s, v14.4s, v1.s[1]\n"
        "scvtf v12.4s, v12.4s, #0x4\n"
        "scvtf v8.4s, v8.4s, #0x4\n"
        "scvtf v10.4s, v10.4s, #0x4\n"
        "scvtf v13.4s, v13.4s, #0x4\n"
        "fmla v31.4s, v12.4s, v9.4s\n"
        "fmul v12.4s, v14.4s, v1.s[2]\n"
        "fmul v1.4s, v14.4s, v1.s[3]\n"
        "fmla v30.4s, v8.4s, v4.4s\n"
        "fmla v29.4s, v10.4s, v12.4s\n"
        "fmla v28.4s, v13.4s, v1.4s\n"
        "ldr d13, [x22, #0x0]\n"
        "add x22, x22, #0x8\n"
        "movi v10.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "ldr q12, [x22, #0x0]\n"
        "ldr q1, [x22, #0x10]\n"
        "movi v9.4s, #0x0\n"
        "movi v4.4s, #0x0\n"
        "fcvtl v13.4s, v13.4h\n"
        ".inst 0x4f8ce04a  // sdot v10.4s, v2.16b, v12.4b[0]\n"
        ".inst 0x4face048  // sdot v8.4s, v2.16b, v12.4b[1]\n"
        ".inst 0x4f8ce849  // sdot v9.4s, v2.16b, v12.4b[2]\n"
        ".inst 0x4face844  // sdot v4.4s, v2.16b, v12.4b[3]\n"
        "ldr q12, [x22, #0x20]\n"
        ".inst 0x4f81e0ca  // sdot v10.4s, v6.16b, v1.4b[0]\n"
        ".inst 0x4fa1e0c8  // sdot v8.4s, v6.16b, v1.4b[1]\n"
        ".inst 0x4f81e8c9  // sdot v9.4s, v6.16b, v1.4b[2]\n"
        ".inst 0x4fa1e8c4  // sdot v4.4s, v6.16b, v1.4b[3]\n"
        "ldr q1, [x22, #0x30]\n"
        ".inst 0x4f8ce0aa  // sdot v10.4s, v5.16b, v12.4b[0]\n"
        ".inst 0x4face0a8  // sdot v8.4s, v5.16b, v12.4b[1]\n"
        ".inst 0x4f8ce8a9  // sdot v9.4s, v5.16b, v12.4b[2]\n"
        ".inst 0x4face8a4  // sdot v4.4s, v5.16b, v12.4b[3]\n"
        "ldr q12, [x22, #0x40]\n"
        ".inst 0x4f81e06a  // sdot v10.4s, v3.16b, v1.4b[0]\n"
        ".inst 0x4fa1e068  // sdot v8.4s, v3.16b, v1.4b[1]\n"
        ".inst 0x4f81e869  // sdot v9.4s, v3.16b, v1.4b[2]\n"
        ".inst 0x4fa1e864  // sdot v4.4s, v3.16b, v1.4b[3]\n"
        "ldr q1, [x22, #0x50]\n"
        ".inst 0x4f8ce00a  // sdot v10.4s, v0.16b, v12.4b[0]\n"
        ".inst 0x4face008  // sdot v8.4s, v0.16b, v12.4b[1]\n"
        ".inst 0x4f8ce809  // sdot v9.4s, v0.16b, v12.4b[2]\n"
        ".inst 0x4face804  // sdot v4.4s, v0.16b, v12.4b[3]\n"
        "ldr q12, [x22, #0x60]\n"
        ".inst 0x4f81e16a  // sdot v10.4s, v11.16b, v1.4b[0]\n"
        ".inst 0x4fa1e168  // sdot v8.4s, v11.16b, v1.4b[1]\n"
        ".inst 0x4f81e969  // sdot v9.4s, v11.16b, v1.4b[2]\n"
        ".inst 0x4fa1e964  // sdot v4.4s, v11.16b, v1.4b[3]\n"
        "ldr q1, [x22, #0x70]\n"
        "add x22, x22, #0x80\n"
        ".inst 0x4f8ce0ea  // sdot v10.4s, v7.16b, v12.4b[0]\n"
        ".inst 0x4face0e8  // sdot v8.4s, v7.16b, v12.4b[1]\n"
        ".inst 0x4f8ce8e9  // sdot v9.4s, v7.16b, v12.4b[2]\n"
        ".inst 0x4face8e4  // sdot v4.4s, v7.16b, v12.4b[3]\n"
        "fmul v12.4s, v14.4s, v13.s[0]\n"
        ".inst 0x4f81e1ea  // sdot v10.4s, v15.16b, v1.4b[0]\n"
        ".inst 0x4fa1e1e8  // sdot v8.4s, v15.16b, v1.4b[1]\n"
        ".inst 0x4f81e9e9  // sdot v9.4s, v15.16b, v1.4b[2]\n"
        ".inst 0x4fa1e9e4  // sdot v4.4s, v15.16b, v1.4b[3]\n"
        "fmul v1.4s, v14.4s, v13.s[1]\n"
        "scvtf v10.4s, v10.4s, #0x4\n"
        "scvtf v8.4s, v8.4s, #0x4\n"
        "scvtf v9.4s, v9.4s, #0x4\n"
        "scvtf v4.4s, v4.4s, #0x4\n"
        "fmla v27.4s, v10.4s, v12.4s\n"
        "fmul v10.4s, v14.4s, v13.s[2]\n"
        "fmul v13.4s, v14.4s, v13.s[3]\n"
        "fmla v26.4s, v8.4s, v1.4s\n"
        "fmla v25.4s, v9.4s, v10.4s\n"
        "fmla v24.4s, v4.4s, v13.4s\n"
        "ldr d12, [x21, #0x0]\n"
        "add x21, x21, #0x8\n"
        "movi v9.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "ldr q1, [x21, #0x0]\n"
        "ldr q4, [x21, #0x10]\n"
        "movi v10.4s, #0x0\n"
        "movi v13.4s, #0x0\n"
        "fcvtl v12.4s, v12.4h\n"
        ".inst 0x4f81e049  // sdot v9.4s, v2.16b, v1.4b[0]\n"
        ".inst 0x4fa1e048  // sdot v8.4s, v2.16b, v1.4b[1]\n"
        ".inst 0x4f81e84a  // sdot v10.4s, v2.16b, v1.4b[2]\n"
        ".inst 0x4fa1e84d  // sdot v13.4s, v2.16b, v1.4b[3]\n"
        "ldr q1, [x21, #0x20]\n"
        ".inst 0x4f84e0c9  // sdot v9.4s, v6.16b, v4.4b[0]\n"
        ".inst 0x4fa4e0c8  // sdot v8.4s, v6.16b, v4.4b[1]\n"
        ".inst 0x4f84e8ca  // sdot v10.4s, v6.16b, v4.4b[2]\n"
        ".inst 0x4fa4e8cd  // sdot v13.4s, v6.16b, v4.4b[3]\n"
        "ldr q4, [x21, #0x30]\n"
        ".inst 0x4f81e0a9  // sdot v9.4s, v5.16b, v1.4b[0]\n"
        ".inst 0x4fa1e0a8  // sdot v8.4s, v5.16b, v1.4b[1]\n"
        ".inst 0x4f81e8aa  // sdot v10.4s, v5.16b, v1.4b[2]\n"
        ".inst 0x4fa1e8ad  // sdot v13.4s, v5.16b, v1.4b[3]\n"
        "ldr q1, [x21, #0x40]\n"
        ".inst 0x4f84e069  // sdot v9.4s, v3.16b, v4.4b[0]\n"
        ".inst 0x4fa4e068  // sdot v8.4s, v3.16b, v4.4b[1]\n"
        ".inst 0x4f84e86a  // sdot v10.4s, v3.16b, v4.4b[2]\n"
        ".inst 0x4fa4e86d  // sdot v13.4s, v3.16b, v4.4b[3]\n"
        "ldr q4, [x21, #0x50]\n"
        ".inst 0x4f81e009  // sdot v9.4s, v0.16b, v1.4b[0]\n"
        ".inst 0x4fa1e008  // sdot v8.4s, v0.16b, v1.4b[1]\n"
        ".inst 0x4f81e80a  // sdot v10.4s, v0.16b, v1.4b[2]\n"
        ".inst 0x4fa1e80d  // sdot v13.4s, v0.16b, v1.4b[3]\n"
        "ldr q1, [x21, #0x60]\n"
        ".inst 0x4f84e169  // sdot v9.4s, v11.16b, v4.4b[0]\n"
        ".inst 0x4fa4e168  // sdot v8.4s, v11.16b, v4.4b[1]\n"
        ".inst 0x4f84e96a  // sdot v10.4s, v11.16b, v4.4b[2]\n"
        ".inst 0x4fa4e96d  // sdot v13.4s, v11.16b, v4.4b[3]\n"
        "ldr q4, [x21, #0x70]\n"
        "add x21, x21, #0x80\n"
        ".inst 0x4f81e0e9  // sdot v9.4s, v7.16b, v1.4b[0]\n"
        ".inst 0x4fa1e0e8  // sdot v8.4s, v7.16b, v1.4b[1]\n"
        ".inst 0x4f81e8ea  // sdot v10.4s, v7.16b, v1.4b[2]\n"
        ".inst 0x4fa1e8ed  // sdot v13.4s, v7.16b, v1.4b[3]\n"
        "fmul v1.4s, v14.4s, v12.s[0]\n"
        ".inst 0x4f84e1e9  // sdot v9.4s, v15.16b, v4.4b[0]\n"
        ".inst 0x4fa4e1e8  // sdot v8.4s, v15.16b, v4.4b[1]\n"
        ".inst 0x4f84e9ea  // sdot v10.4s, v15.16b, v4.4b[2]\n"
        ".inst 0x4fa4e9ed  // sdot v13.4s, v15.16b, v4.4b[3]\n"
        "fmul v4.4s, v14.4s, v12.s[1]\n"
        "scvtf v9.4s, v9.4s, #0x4\n"
        "scvtf v8.4s, v8.4s, #0x4\n"
        "scvtf v10.4s, v10.4s, #0x4\n"
        "scvtf v13.4s, v13.4s, #0x4\n"
        "fmla v23.4s, v9.4s, v1.4s\n"
        "fmul v1.4s, v14.4s, v12.s[2]\n"
        "fmul v9.4s, v14.4s, v12.s[3]\n"
        "fmla v22.4s, v8.4s, v4.4s\n"
        "fmla v21.4s, v10.4s, v1.4s\n"
        "fmla v20.4s, v13.4s, v9.4s\n"
        "ldr d13, [x20, #0x0]\n"
        "add x20, x20, #0x8\n"
        "movi v12.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "ldr q4, [x20, #0x0]\n"
        "ldr q1, [x20, #0x10]\n"
        "movi v10.4s, #0x0\n"
        "movi v9.4s, #0x0\n"
        "fcvtl v13.4s, v13.4h\n"
        ".inst 0x4f84e04c  // sdot v12.4s, v2.16b, v4.4b[0]\n"
        ".inst 0x4fa4e048  // sdot v8.4s, v2.16b, v4.4b[1]\n"
        ".inst 0x4f84e84a  // sdot v10.4s, v2.16b, v4.4b[2]\n"
        ".inst 0x4fa4e849  // sdot v9.4s, v2.16b, v4.4b[3]\n"
        "ldr q2, [x20, #0x20]\n"
        "ldr q4, [x20, #0x30]\n"
        ".inst 0x4f81e0cc  // sdot v12.4s, v6.16b, v1.4b[0]\n"
        ".inst 0x4fa1e0c8  // sdot v8.4s, v6.16b, v1.4b[1]\n"
        ".inst 0x4f81e8ca  // sdot v10.4s, v6.16b, v1.4b[2]\n"
        ".inst 0x4fa1e8c9  // sdot v9.4s, v6.16b, v1.4b[3]\n"
        "ldr q1, [x20, #0x40]\n"
        "ldr q6, [x20, #0x50]\n"
        ".inst 0x4f82e0ac  // sdot v12.4s, v5.16b, v2.4b[0]\n"
        ".inst 0x4fa2e0a8  // sdot v8.4s, v5.16b, v2.4b[1]\n"
        ".inst 0x4f82e8aa  // sdot v10.4s, v5.16b, v2.4b[2]\n"
        ".inst 0x4fa2e8a9  // sdot v9.4s, v5.16b, v2.4b[3]\n"
        "ldr q2, [x20, #0x60]\n"
        "ldr q5, [x20, #0x70]\n"
        "add x20, x20, #0x80\n"
        ".inst 0x4f84e06c  // sdot v12.4s, v3.16b, v4.4b[0]\n"
        ".inst 0x4fa4e068  // sdot v8.4s, v3.16b, v4.4b[1]\n"
        ".inst 0x4f84e86a  // sdot v10.4s, v3.16b, v4.4b[2]\n"
        ".inst 0x4fa4e869  // sdot v9.4s, v3.16b, v4.4b[3]\n"
        "fmul v3.4s, v14.4s, v13.s[0]\n"
        "fmul v4.4s, v14.4s, v13.s[1]\n"
        ".inst 0x4f81e00c  // sdot v12.4s, v0.16b, v1.4b[0]\n"
        ".inst 0x4fa1e008  // sdot v8.4s, v0.16b, v1.4b[1]\n"
        ".inst 0x4f81e80a  // sdot v10.4s, v0.16b, v1.4b[2]\n"
        ".inst 0x4fa1e809  // sdot v9.4s, v0.16b, v1.4b[3]\n"
        "fmul v1.4s, v14.4s, v13.s[2]\n"
        "fmul v0.4s, v14.4s, v13.s[3]\n"
        ".inst 0x4f86e16c  // sdot v12.4s, v11.16b, v6.4b[0]\n"
        ".inst 0x4fa6e168  // sdot v8.4s, v11.16b, v6.4b[1]\n"
        ".inst 0x4f86e96a  // sdot v10.4s, v11.16b, v6.4b[2]\n"
        ".inst 0x4fa6e969  // sdot v9.4s, v11.16b, v6.4b[3]\n"
        ".inst 0x4f82e0ec  // sdot v12.4s, v7.16b, v2.4b[0]\n"
        ".inst 0x4fa2e0e8  // sdot v8.4s, v7.16b, v2.4b[1]\n"
        ".inst 0x4f82e8ea  // sdot v10.4s, v7.16b, v2.4b[2]\n"
        ".inst 0x4fa2e8e9  // sdot v9.4s, v7.16b, v2.4b[3]\n"
        ".inst 0x4f85e1ec  // sdot v12.4s, v15.16b, v5.4b[0]\n"
        ".inst 0x4fa5e1e8  // sdot v8.4s, v15.16b, v5.4b[1]\n"
        ".inst 0x4f85e9ea  // sdot v10.4s, v15.16b, v5.4b[2]\n"
        ".inst 0x4fa5e9e9  // sdot v9.4s, v15.16b, v5.4b[3]\n"
        "scvtf v12.4s, v12.4s, #0x4\n"
        "scvtf v8.4s, v8.4s, #0x4\n"
        "fmla v19.4s, v12.4s, v3.4s\n"
        "scvtf v10.4s, v10.4s, #0x4\n"
        "scvtf v9.4s, v9.4s, #0x4\n"
        "fmla v18.4s, v8.4s, v4.4s\n"
        "fmla v17.4s, v10.4s, v1.4s\n"
        "fmla v16.4s, v9.4s, v0.4s\n"
        "subs x23, x23, #0x1\n"
        "bgt 3b\n"
        "cmp x10, #0x4\n"
        "blt 8f\n"
        "mov x20, %x[dst]\n"
        "str q31, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q30, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q29, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q28, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q27, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q26, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q25, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q24, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q23, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q22, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q21, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q20, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q19, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q18, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q17, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "str q16, [x20, #0x0]\n"
        "b 13f\n"
        "8:"  // Partial output
        "mov x28, %x[dst]\n"
        "add x26, x28, %x[dst_stride_row], LSL #2\n"
        "add x25, x26, %x[dst_stride_row], LSL #1\n"
        "add x24, x26, %x[dst_stride_row]\n"
        "add x23, x25, %x[dst_stride_row]\n"
        "add x22, x28, %x[dst_stride_row], LSL #1\n"
        "add x21, x28, %x[dst_stride_row]\n"
        "add x20, x22, %x[dst_stride_row]\n"
        "add x27, x23, %x[dst_stride_row]\n"
        "tbz x10, #1, 9f\n"
        "st1 { v24.d }[0], [x23], #0x8\n"
        "st1 { v25.d }[0], [x25], #0x8\n"
        "st1 { v26.d }[0], [x24], #0x8\n"
        "st1 { v27.d }[0], [x26], #0x8\n"
        "st1 { v28.d }[0], [x20], #0x8\n"
        "st1 { v29.d }[0], [x22], #0x8\n"
        "st1 { v30.d }[0], [x21], #0x8\n"
        "st1 { v31.d }[0], [x28], #0x8\n"
        "tbz x10, #0, 10f\n"
        "st1 { v24.s }[2], [x23]\n"
        "st1 { v25.s }[2], [x25]\n"
        "st1 { v26.s }[2], [x24]\n"
        "st1 { v27.s }[2], [x26]\n"
        "st1 { v28.s }[2], [x20]\n"
        "st1 { v29.s }[2], [x22]\n"
        "st1 { v30.s }[2], [x21]\n"
        "st1 { v31.s }[2], [x28]\n"
        "b 10f\n"
        "9:"  // Output block 0: partial_1_0
        "st1 { v24.s }[0], [x23]\n"
        "st1 { v25.s }[0], [x25]\n"
        "st1 { v26.s }[0], [x24]\n"
        "st1 { v27.s }[0], [x26]\n"
        "st1 { v28.s }[0], [x20]\n"
        "st1 { v29.s }[0], [x22]\n"
        "st1 { v30.s }[0], [x21]\n"
        "st1 { v31.s }[0], [x28]\n"
        "10:"  // Output block 0: Done
        "add x26, x27, %x[dst_stride_row], LSL #2\n"
        "add x25, x27, %x[dst_stride_row], LSL #1\n"
        "add x24, x26, %x[dst_stride_row], LSL #1\n"
        "add x23, x27, %x[dst_stride_row]\n"
        "add x22, x25, %x[dst_stride_row]\n"
        "add x21, x26, %x[dst_stride_row]\n"
        "add x20, x24, %x[dst_stride_row]\n"
        "tbz x10, #1, 11f\n"
        "st1 { v16.d }[0], [x20], #0x8\n"
        "st1 { v17.d }[0], [x24], #0x8\n"
        "st1 { v18.d }[0], [x21], #0x8\n"
        "st1 { v19.d }[0], [x26], #0x8\n"
        "st1 { v20.d }[0], [x22], #0x8\n"
        "st1 { v21.d }[0], [x25], #0x8\n"
        "st1 { v22.d }[0], [x23], #0x8\n"
        "st1 { v23.d }[0], [x27], #0x8\n"
        "tbz x10, #0, 12f\n"
        "st1 { v16.s }[2], [x20]\n"
        "st1 { v17.s }[2], [x24]\n"
        "st1 { v18.s }[2], [x21]\n"
        "st1 { v19.s }[2], [x26]\n"
        "st1 { v20.s }[2], [x22]\n"
        "st1 { v21.s }[2], [x25]\n"
        "st1 { v22.s }[2], [x23]\n"
        "st1 { v23.s }[2], [x27]\n"
        "b 12f\n"
        "11:"  // Output block 1: partial_1_0
        "st1 { v16.s }[0], [x20]\n"
        "st1 { v17.s }[0], [x24]\n"
        "st1 { v18.s }[0], [x21]\n"
        "st1 { v19.s }[0], [x26]\n"
        "st1 { v20.s }[0], [x22]\n"
        "st1 { v21.s }[0], [x25]\n"
        "st1 { v22.s }[0], [x23]\n"
        "st1 { v23.s }[0], [x27]\n"
        "12:"  // Output block 1: Done
        "13:"  // Output stage exit
        "subs x10, x10, #0x4\n"
        "add %x[dst], %x[dst], #0x10\n"
        "bgt 2b\n"
        "mov x20, #0x4\n"
        "sub x13, x13, #0x10\n"
        "cmp x13, #0x10\n"
        "mov %x[dst], x9\n"
        "madd %x[lhs_packed], x20, x12, %x[lhs_packed]\n"
        "bge 1b\n"
        "14:"  // Row loop skip
        "cbz x13, 23f\n"
        "15:"  // Row tail: Row loop
        "mov x26, %x[rhs_packed]\n"
        "mov x25, %x[n]\n"
        "add x24, %x[dst], %x[dst_stride_row], LSL #2\n"
        "16:"  // Row tail: Column loop
        "movi v31.16b, #0x0\n"
        "movi v30.16b, #0x0\n"
        "mov x27, %x[lhs_packed]\n"
        "mov x20, %x[num_blocks]\n"
        "movi v29.16b, #0x0\n"
        "movi v28.16b, #0x0\n"
        "17:"  // Row tail: Block loop
        "ldr d16, [x26, #0x0]\n"
        "ldr d11, [x27, #0x0]\n"
        "add x26, x26, #0x8\n"
        "add x27, x27, #0x8\n"
        "ldr q10, [x26, #0x0]\n"
        "ldr q18, [x27, #0x0]\n"
        "movi v9.4s, #0x0\n"
        "movi v8.4s, #0x0\n"
        "ldr q7, [x26, #0x10]\n"
        "ldr q6, [x27, #0x10]\n"
        "movi v5.4s, #0x0\n"
        "movi v4.4s, #0x0\n"
        "ldr q3, [x26, #0x20]\n"
        "ldr q2, [x27, #0x20]\n"
        "movi v17.16b, #0xf0\n"
        "fcvtl v1.4s, v16.4h\n"
        "ldr q0, [x26, #0x30]\n"
        "ldr q27, [x27, #0x30]\n"
        "shl v16.16b, v10.16b, #0x4\n"
        "fcvtl v11.4s, v11.4h\n"
        "ldr q26, [x27, #0x40]\n"
        "ldr q25, [x27, #0x50]\n"
        "shl v24.16b, v7.16b, #0x4\n"
        "and v10.16b, v10.16b, v17.16b\n"
        "ldr q23, [x27, #0x60]\n"
        "ldr q22, [x27, #0x70]\n"
        "shl v21.16b, v3.16b, #0x4\n"
        "and v7.16b, v7.16b, v17.16b\n"
        ".inst 0x4f92e209  // sdot v9.4s, v16.16b, v18.4b[0]\n"
        ".inst 0x4fb2e208  // sdot v8.4s, v16.16b, v18.4b[1]\n"
        "shl v20.16b, v0.16b, #0x4\n"
        "add x26, x26, #0x40\n"
        ".inst 0x4f92ea05  // sdot v5.4s, v16.16b, v18.4b[2]\n"
        ".inst 0x4fb2ea04  // sdot v4.4s, v16.16b, v18.4b[3]\n"
        "and v3.16b, v3.16b, v17.16b\n"
        "add x27, x27, #0x80\n"
        "and v0.16b, v0.16b, v17.16b\n"
        "fmul v19.4s, v1.4s, v11.s[0]\n"
        "fmul v18.4s, v1.4s, v11.s[1]\n"
        "fmul v17.4s, v1.4s, v11.s[2]\n"
        ".inst 0x4f86e309  // sdot v9.4s, v24.16b, v6.4b[0]\n"
        ".inst 0x4fa6e308  // sdot v8.4s, v24.16b, v6.4b[1]\n"
        "fmul v16.4s, v1.4s, v11.s[3]\n"
        ".inst 0x4f86eb05  // sdot v5.4s, v24.16b, v6.4b[2]\n"
        ".inst 0x4fa6eb04  // sdot v4.4s, v24.16b, v6.4b[3]\n"
        ".inst 0x4f82e2a9  // sdot v9.4s, v21.16b, v2.4b[0]\n"
        ".inst 0x4fa2e2a8  // sdot v8.4s, v21.16b, v2.4b[1]\n"
        ".inst 0x4f82eaa5  // sdot v5.4s, v21.16b, v2.4b[2]\n"
        ".inst 0x4fa2eaa4  // sdot v4.4s, v21.16b, v2.4b[3]\n"
        ".inst 0x4f9be289  // sdot v9.4s, v20.16b, v27.4b[0]\n"
        ".inst 0x4fbbe288  // sdot v8.4s, v20.16b, v27.4b[1]\n"
        ".inst 0x4f9bea85  // sdot v5.4s, v20.16b, v27.4b[2]\n"
        ".inst 0x4fbbea84  // sdot v4.4s, v20.16b, v27.4b[3]\n"
        ".inst 0x4f9ae149  // sdot v9.4s, v10.16b, v26.4b[0]\n"
        ".inst 0x4fbae148  // sdot v8.4s, v10.16b, v26.4b[1]\n"
        ".inst 0x4f9ae945  // sdot v5.4s, v10.16b, v26.4b[2]\n"
        ".inst 0x4fbae944  // sdot v4.4s, v10.16b, v26.4b[3]\n"
        ".inst 0x4f99e0e9  // sdot v9.4s, v7.16b, v25.4b[0]\n"
        ".inst 0x4fb9e0e8  // sdot v8.4s, v7.16b, v25.4b[1]\n"
        ".inst 0x4f99e8e5  // sdot v5.4s, v7.16b, v25.4b[2]\n"
        ".inst 0x4fb9e8e4  // sdot v4.4s, v7.16b, v25.4b[3]\n"
        ".inst 0x4f97e069  // sdot v9.4s, v3.16b, v23.4b[0]\n"
        ".inst 0x4fb7e068  // sdot v8.4s, v3.16b, v23.4b[1]\n"
        ".inst 0x4f97e865  // sdot v5.4s, v3.16b, v23.4b[2]\n"
        ".inst 0x4fb7e864  // sdot v4.4s, v3.16b, v23.4b[3]\n"
        ".inst 0x4f96e009  // sdot v9.4s, v0.16b, v22.4b[0]\n"
        ".inst 0x4fb6e008  // sdot v8.4s, v0.16b, v22.4b[1]\n"
        ".inst 0x4f96e805  // sdot v5.4s, v0.16b, v22.4b[2]\n"
        ".inst 0x4fb6e804  // sdot v4.4s, v0.16b, v22.4b[3]\n"
        "scvtf v9.4s, v9.4s, #0x4\n"
        "scvtf v8.4s, v8.4s, #0x4\n"
        "scvtf v5.4s, v5.4s, #0x4\n"
        "fmla v31.4s, v9.4s, v19.4s\n"
        "scvtf v4.4s, v4.4s, #0x4\n"
        "fmla v30.4s, v8.4s, v18.4s\n"
        "fmla v29.4s, v5.4s, v17.4s\n"
        "fmla v28.4s, v4.4s, v16.4s\n"
        "subs x20, x20, #0x1\n"
        "bgt 17b\n"
        "cmp x25, #0x4\n"
        "blt 19f\n"
        "mov x20, %x[dst]\n"
        "cmp x13, #0x1\n"
        "str q31, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "ble 22f\n"
        "cmp x13, #0x2\n"
        "str q30, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "ble 22f\n"
        "cmp x13, #0x3\n"
        "str q29, [x20, #0x0]\n"
        "add x20, x20, %x[dst_stride_row]\n"
        "ble 22f\n"
        "str q28, [x20, #0x0]\n"
        "b 22f\n"
        "19:"  // Row tail: Partial output
        "mov x23, %x[dst]\n"
        "cmp x13, #0x1\n"
        "add x22, x23, %x[dst_stride_row]\n"
        "csel x22, x22, x23, GT\n"
        "cmp x13, #0x2\n"
        "add x21, x23, %x[dst_stride_row], LSL #1\n"
        "csel x21, x21, x22, GT\n"
        "cmp x13, #0x3\n"
        "add x20, x21, %x[dst_stride_row]\n"
        "csel x20, x20, x21, GT\n"
        "tbz x25, #1, 20f\n"
        "st1 { v28.d }[0], [x20], #0x8\n"
        "st1 { v29.d }[0], [x21], #0x8\n"
        "st1 { v30.d }[0], [x22], #0x8\n"
        "st1 { v31.d }[0], [x23], #0x8\n"
        "tbz x25, #0, 21f\n"
        "st1 { v28.s }[2], [x20]\n"
        "st1 { v29.s }[2], [x21]\n"
        "st1 { v30.s }[2], [x22]\n"
        "st1 { v31.s }[2], [x23]\n"
        "b 21f\n"
        "20:"  // Row tail: Output block 0: partial_1_0
        "st1 { v28.s }[0], [x20]\n"
        "st1 { v29.s }[0], [x21]\n"
        "st1 { v30.s }[0], [x22]\n"
        "st1 { v31.s }[0], [x23]\n"
        "21:"  // Row tail: Output block 0: Done
        "22:"  // Row tail: Output stage exit
        "subs x25, x25, #0x4\n"
        "add %x[dst], %x[dst], #0x10\n"
        "bgt 16b\n"
        "subs x13, x13, #0x4\n"
        "add %x[lhs_packed], %x[lhs_packed], x12\n"
        "mov %x[dst], x24\n"
        "bgt 15b\n"
        "23:"  // Row tail: Row loop skip
        // dst and lhs_packed are advanced by the assembly, hence read-write early-clobber operands.
        : [dst] "+&r"(dst), [lhs_packed] "+&r"(lhs_packed)
        : [dst_stride_row] "r"(dst_stride_row), [m] "r"(m), [n] "r"(n), [num_blocks] "r"(num_blocks),
          [rhs_packed] "r"(rhs_packed)
        : "cc", "memory", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20",
          "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7",
          "v8", "v9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x9");
}
713
714 #endif // Architectural features check.
715