KleidiAI Coverage Report


Directory: ./
File: kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 86.2% 25 16 45
Functions: 66.7% 4 0 6
Branches: 100.0% 4 32 36

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 // Do not flag up inline assembly blocks
8 #pragma GCC diagnostic ignored "-Woverlength-strings"
9
10 #if !defined(__aarch64__) || !defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
11 #error This file must be compiled for AArch64, FEAT_BF16.
12 #else // Architectural features check.
13
14 #define MAX_NR 12
15
16 #include "kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.h"
17
18 #include <arm_neon.h>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <string.h>
22
23 #include "kai/kai_common.h"
24
25 static const size_t kai_nr = 12;
26 static const size_t kai_kr = 4;
27 static const size_t kai_sr = 1;
28
29 size_t kai_get_n_step_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(void) {
30 return kai_nr;
31 }
32
33 184 size_t kai_get_rhs_offset_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t n_idx) {
34 KAI_ASSUME(n_idx % kai_nr == 0);
35 184 return n_idx * sizeof(float);
36 }
37
38 size_t kai_get_bias_offset_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t n_idx) {
39 KAI_ASSUME(n_idx % kai_nr == 0);
40 return n_idx * sizeof(uint32_t);
41 }
42
43 228 size_t kai_get_rhs_packed_offset_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(
44 size_t n_idx, size_t k, size_t nr, size_t kr) {
45 KAI_ASSUME(n_idx % nr == 0);
46 KAI_ASSUME(kai_nr == nr);
47 KAI_ASSUME(kai_kr == kr);
48
49 228 return n_idx * (sizeof(uint32_t) + kai_roundup(k, kr) * sizeof(uint16_t));
50 }
51
52 228 size_t kai_get_rhs_packed_size_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t n, size_t k, size_t nr, size_t kr) {
53 228 return kai_get_rhs_packed_offset_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(kai_roundup(n, nr), k, nr, kr);
54 }
55
56 228 void kai_run_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(
57 size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
58 const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params) {
59 KAI_ASSUME(num_groups == 1);
60 KAI_ASSUME(kai_nr == nr);
61 KAI_ASSUME(kai_kr == kr);
62 KAI_ASSUME(kai_sr == sr);
63 KAI_ASSUME(sr == 1);
64 KAI_ASSUME(rhs != NULL);
65 KAI_ASSUME(scale == NULL);
66 KAI_ASSUME(rhs_packed != NULL);
67 KAI_ASSUME(extra_bytes == 0);
68 KAI_ASSUME(params == NULL);
69 KAI_ASSUME(nr <= MAX_NR);
70
71 228 size_t height = k;
72 228 const size_t width = n;
73 228 const void* in = rhs;
74 228 void* out = rhs_packed;
75 228 const size_t in_stride = rhs_stride;
76 228 const float* pad_row = rhs;
77
78 // Fill zeros if bias is nullptr
79 228 size_t bias_step = nr * sizeof(float);
80 228 uint8_t zero_bias[MAX_NR * sizeof(float)];
81
82
2/2
✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
228 if (bias == NULL) {
83 114 memset(zero_bias, 0, MAX_NR * sizeof(float));
84 114 bias_step = 0;
85 114 }
86
87
2/2
✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
228 const void* bias_ptr = bias == NULL ? (const void*)zero_bias : bias;
88
89 228 const size_t out_stride = nr * kai_roundup(height, kr) * sizeof(uint16_t) + nr * sizeof(uint32_t);
90
91 456 __asm__ __volatile__(
92 "mov x22, %x[width]\n"
93 "mov x21, %x[out]\n"
94 "cmp x22, #0xc\n"
95 "blt 2f\n"
96 "1:" // Bias: Full loop
97 "ldr q16, [%x[bias], #0x0]\n"
98 "ldr q26, [%x[bias], #0x10]\n"
99 "sub x22, x22, #0xc\n"
100 "ldr q8, [%x[bias], #0x20]\n"
101 "cmp x22, #0xc\n"
102 "add %x[bias], %x[bias], %x[bias_step]\n"
103 "str q16, [x21, #0x0]\n"
104 "str q26, [x21, #0x10]\n"
105 "str q8, [x21, #0x20]\n"
106 "add x21, x21, %x[out_stride]\n"
107 "bge 1b\n"
108 "cbz x22, 3f\n"
109 "2:" // Bias: Tail loop
110 "ldr w20, [%x[bias], #0x0]\n"
111 "sub x22, x22, #0x1\n"
112 "add %x[bias], %x[bias], #0x4\n"
113 "cmp x22, #0x0\n"
114 "str w20, [x21]\n"
115 "add x21, x21, #0x4\n"
116 "bgt 2b\n"
117 "3:" // Bias: Done
118 "cmp %x[height], #0x8\n"
119 "add %x[out], %x[out], #0x30\n"
120 "blt 12f\n"
121 "4:" // Main row loop: Head
122 "mov x9, %x[in]\n"
123 "mov x28, %x[width]\n"
124 "mov x27, %x[out]\n"
125 "sub %x[height], %x[height], #0x8\n"
126 "add x26, x9, %x[in_stride]\n"
127 "add x25, x26, %x[in_stride]\n"
128 "add x24, x25, %x[in_stride]\n"
129 "cmp x28, #0xc\n"
130 "add x23, x24, %x[in_stride]\n"
131 "add x22, x23, %x[in_stride]\n"
132 "add x21, x22, %x[in_stride]\n"
133 "add x20, x21, %x[in_stride]\n"
134 "add %x[in], x20, %x[in_stride]\n"
135 "blt 6f\n"
136 "5:" // Main row loop: Column loop
137 "ldr q28, [x9], #0x10\n"
138 "ldr q27, [x26], #0x10\n"
139 "sub x28, x28, #0xc\n"
140 "ldr q11, [x25], #0x10\n"
141 "ldr q5, [x24], #0x10\n"
142 "cmp x28, #0xc\n"
143 "ldr q14, [x23], #0x10\n"
144 "ldr q6, [x22], #0x10\n"
145 "ldr q2, [x21], #0x10\n"
146 "ldr q18, [x20], #0x10\n"
147 "ldr q1, [x9], #0x10\n"
148 "ldr q7, [x26], #0x10\n"
149 "zip1 v15.4s, v28.4s, v11.4s\n"
150 "zip1 v8.4s, v27.4s, v5.4s\n"
151 "ldr q3, [x25], #0x10\n"
152 "ldr q23, [x24], #0x10\n"
153 "zip2 v17.4s, v28.4s, v11.4s\n"
154 "zip2 v27.4s, v27.4s, v5.4s\n"
155 "ldr q5, [x23], #0x10\n"
156 "ldr q30, [x22], #0x10\n"
157 "zip1 v26.4s, v14.4s, v2.4s\n"
158 "zip1 v31.4s, v6.4s, v18.4s\n"
159 "ldr q20, [x21], #0x10\n"
160 "ldr q16, [x20], #0x10\n"
161 "zip2 v12.4s, v14.4s, v2.4s\n"
162 "zip2 v24.4s, v6.4s, v18.4s\n"
163 "ldr q29, [x9], #0x10\n"
164 "ldr q6, [x26], #0x10\n"
165 "zip1 v18.4s, v1.4s, v3.4s\n"
166 "zip1 v4.4s, v7.4s, v23.4s\n"
167 "ldr q22, [x25], #0x10\n"
168 "ldr q0, [x24], #0x10\n"
169 "zip2 v3.4s, v1.4s, v3.4s\n"
170 "zip2 v1.4s, v7.4s, v23.4s\n"
171 "ldr q2, [x23], #0x10\n"
172 "ldr q10, [x22], #0x10\n"
173 "zip1 v28.4s, v5.4s, v20.4s\n"
174 "zip1 v14.4s, v30.4s, v16.4s\n"
175 "ldr q9, [x21], #0x10\n"
176 "ldr q23, [x20], #0x10\n"
177 "zip2 v13.4s, v5.4s, v20.4s\n"
178 "zip2 v30.4s, v30.4s, v16.4s\n"
179 "zip1 v16.4s, v29.4s, v22.4s\n"
180 "zip1 v5.4s, v6.4s, v0.4s\n"
181 "zip2 v22.4s, v29.4s, v22.4s\n"
182 "zip2 v0.4s, v6.4s, v0.4s\n"
183 "zip1 v7.4s, v2.4s, v9.4s\n"
184 "zip1 v19.4s, v10.4s, v23.4s\n"
185 "zip2 v21.4s, v2.4s, v9.4s\n"
186 "zip2 v25.4s, v10.4s, v23.4s\n"
187 "zip1 v11.4s, v15.4s, v8.4s\n"
188 "zip1 v9.4s, v17.4s, v27.4s\n"
189 "zip1 v6.4s, v18.4s, v4.4s\n"
190 "zip1 v2.4s, v3.4s, v1.4s\n"
191 "zip1 v29.4s, v16.4s, v5.4s\n"
192 "zip1 v20.4s, v22.4s, v0.4s\n"
193 "zip1 v10.4s, v26.4s, v31.4s\n"
194 "zip1 v23.4s, v12.4s, v24.4s\n"
195 ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
196 "zip2 v8.4s, v15.4s, v8.4s\n"
197 "zip1 v15.4s, v28.4s, v14.4s\n"
198 ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
199 "zip2 v27.4s, v17.4s, v27.4s\n"
200 "zip1 v17.4s, v13.4s, v30.4s\n"
201 ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
202 "zip2 v4.4s, v18.4s, v4.4s\n"
203 "zip1 v18.4s, v7.4s, v19.4s\n"
204 ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
205 "zip2 v1.4s, v3.4s, v1.4s\n"
206 "zip1 v3.4s, v21.4s, v25.4s\n"
207 ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n"
208 "zip2 v5.4s, v16.4s, v5.4s\n"
209 ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n"
210 "zip2 v0.4s, v22.4s, v0.4s\n"
211 ".inst 0x0ea16956 // bfcvtn v22.4h, v10.4s\n"
212 "zip2 v31.4s, v26.4s, v31.4s\n"
213 ".inst 0x0ea16aea // bfcvtn v10.4h, v23.4s\n"
214 "zip2 v26.4s, v12.4s, v24.4s\n"
215 ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n"
216 "zip2 v12.4s, v28.4s, v14.4s\n"
217 ".inst 0x0ea16a2e // bfcvtn v14.4h, v17.4s\n"
218 "zip2 v24.4s, v13.4s, v30.4s\n"
219 ".inst 0x0ea16a57 // bfcvtn v23.4h, v18.4s\n"
220 "zip2 v18.4s, v7.4s, v19.4s\n"
221 ".inst 0x0ea16871 // bfcvtn v17.4h, v3.4s\n"
222 "zip2 v16.4s, v21.4s, v25.4s\n"
223 ".inst 0x4ea1690b // bfcvtn2 v11.8h, v8.4s\n"
224 ".inst 0x4ea16b69 // bfcvtn2 v9.8h, v27.4s\n"
225 ".inst 0x4ea16886 // bfcvtn2 v6.8h, v4.4s\n"
226 ".inst 0x4ea16822 // bfcvtn2 v2.8h, v1.4s\n"
227 ".inst 0x4ea168bd // bfcvtn2 v29.8h, v5.4s\n"
228 ".inst 0x4ea16814 // bfcvtn2 v20.8h, v0.4s\n"
229 ".inst 0x4ea16bf6 // bfcvtn2 v22.8h, v31.4s\n"
230 ".inst 0x4ea16b4a // bfcvtn2 v10.8h, v26.4s\n"
231 "str q11, [x27, #0x0]\n"
232 ".inst 0x4ea1698f // bfcvtn2 v15.8h, v12.4s\n"
233 ".inst 0x4ea16b0e // bfcvtn2 v14.8h, v24.4s\n"
234 "str q9, [x27, #0x10]\n"
235 ".inst 0x4ea16a57 // bfcvtn2 v23.8h, v18.4s\n"
236 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
237 "str q6, [x27, #0x20]\n"
238 "str q2, [x27, #0x30]\n"
239 "str q29, [x27, #0x40]\n"
240 "str q20, [x27, #0x50]\n"
241 "str q22, [x27, #0x60]\n"
242 "str q10, [x27, #0x70]\n"
243 "str q15, [x27, #0x80]\n"
244 "str q14, [x27, #0x90]\n"
245 "str q23, [x27, #0xa0]\n"
246 "str q17, [x27, #0xb0]\n"
247 "add x27, x27, %x[out_stride]\n"
248 "bge 5b\n"
249 "6:" // Main row loop: Column loop skip
250 "cbz x28, 11f\n"
251 "cmp x28, #0x4\n"
252 "movi v16.16b, #0x0\n"
253 "str q16, [x27, #0x0]\n"
254 "str q16, [x27, #0x10]\n"
255 "str q16, [x27, #0x20]\n"
256 "str q16, [x27, #0x30]\n"
257 "str q16, [x27, #0x40]\n"
258 "str q16, [x27, #0x50]\n"
259 "str q16, [x27, #0x60]\n"
260 "str q16, [x27, #0x70]\n"
261 "str q16, [x27, #0x80]\n"
262 "str q16, [x27, #0x90]\n"
263 "str q16, [x27, #0xa0]\n"
264 "str q16, [x27, #0xb0]\n"
265 "blt 8f\n"
266 "7:" // Main row loop: width 4 loop: loop
267 "ldr q25, [x9], #0x10\n"
268 "ldr q24, [x26], #0x10\n"
269 "sub x28, x28, #0x4\n"
270 "ldr q21, [x25], #0x10\n"
271 "ldr q20, [x24], #0x10\n"
272 "cmp x28, #0x4\n"
273 "ldr q23, [x23], #0x10\n"
274 "ldr q19, [x22], #0x10\n"
275 "ldr q18, [x21], #0x10\n"
276 "ldr q17, [x20], #0x10\n"
277 "zip1 v22.4s, v25.4s, v21.4s\n"
278 "zip1 v16.4s, v24.4s, v20.4s\n"
279 "zip2 v21.4s, v25.4s, v21.4s\n"
280 "zip2 v20.4s, v24.4s, v20.4s\n"
281 "zip1 v27.4s, v23.4s, v18.4s\n"
282 "zip1 v26.4s, v19.4s, v17.4s\n"
283 "zip2 v25.4s, v23.4s, v18.4s\n"
284 "zip2 v24.4s, v19.4s, v17.4s\n"
285 "zip1 v19.4s, v22.4s, v16.4s\n"
286 "zip1 v18.4s, v21.4s, v20.4s\n"
287 "zip1 v17.4s, v27.4s, v26.4s\n"
288 "zip2 v23.4s, v22.4s, v16.4s\n"
289 "zip1 v16.4s, v25.4s, v24.4s\n"
290 "zip2 v22.4s, v21.4s, v20.4s\n"
291 ".inst 0x0ea16a75 // bfcvtn v21.4h, v19.4s\n"
292 ".inst 0x0ea16a54 // bfcvtn v20.4h, v18.4s\n"
293 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
294 "zip2 v18.4s, v27.4s, v26.4s\n"
295 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
296 "zip2 v16.4s, v25.4s, v24.4s\n"
297 ".inst 0x4ea16af5 // bfcvtn2 v21.8h, v23.4s\n"
298 ".inst 0x4ea16ad4 // bfcvtn2 v20.8h, v22.4s\n"
299 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
300 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
301 "str q21, [x27, #0x0]\n"
302 "str q20, [x27, #0x10]\n"
303 "str q19, [x27, #0x60]\n"
304 "str q17, [x27, #0x70]\n"
305 "add x27, x27, #0x20\n"
306 "bge 7b\n"
307 "8:" // Main row loop: width 4 loop: skip
308 "cmp x28, #0x1\n"
309 "blt 10f\n"
310 "9:" // Main row loop: width 1 loop: loop
311 "ldr s23, [x9], #0x4\n"
312 "ldr s22, [x26], #0x4\n"
313 "sub x28, x28, #0x1\n"
314 "ldr s19, [x25], #0x4\n"
315 "ldr s17, [x24], #0x4\n"
316 "cmp x28, #0x1\n"
317 "ldr s21, [x23], #0x4\n"
318 "ldr s20, [x22], #0x4\n"
319 "ldr s18, [x21], #0x4\n"
320 "ldr s16, [x20], #0x4\n"
321 "zip1 v19.4s, v23.4s, v19.4s\n"
322 "zip1 v17.4s, v22.4s, v17.4s\n"
323 "zip1 v18.4s, v21.4s, v18.4s\n"
324 "zip1 v16.4s, v20.4s, v16.4s\n"
325 "zip1 v17.4s, v19.4s, v17.4s\n"
326 "zip1 v16.4s, v18.4s, v16.4s\n"
327 ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n"
328 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
329 "str d17, [x27, #0x0]\n"
330 "str d16, [x27, #0x60]\n"
331 "add x27, x27, #0x8\n"
332 "bge 9b\n"
333 "10:" // Main row loop: width 1 loop: skip
334 "11:" // Main row loop: odd col skip
335 "cmp %x[height], #0x8\n"
336 "add %x[out], %x[out], #0xc0\n"
337 "bge 4b\n"
338 "cbz %x[height], 21f\n"
339 "12:" // Main loop skip
340 "13:" // Tail row loop: Head
341 "mov x9, %x[in]\n"
342 "mov x20, %x[width]\n"
343 "cmp %x[height], #0x3\n"
344 "mov x27, %x[out]\n"
345 "add x26, x9, %x[in_stride]\n"
346 "add x25, x26, %x[in_stride]\n"
347 "add x24, x25, %x[in_stride]\n"
348 "csel x25, x25, %x[pad_row], GE\n"
349 "add %x[in], x24, %x[in_stride]\n"
350 "csel x24, x24, %x[pad_row], GT\n"
351 "cmp %x[height], #0x1\n"
352 "sub %x[height], %x[height], #0x4\n"
353 "csel x26, x26, %x[pad_row], GT\n"
354 "cmp x20, #0xc\n"
355 "blt 15f\n"
356 "14:" // Tail row loop: Column loop
357 "ldr q24, [x9], #0x10\n"
358 "ldr q23, [x26], #0x10\n"
359 "sub x20, x20, #0xc\n"
360 "ldr q22, [x25], #0x10\n"
361 "ldr q16, [x24], #0x10\n"
362 "cmp x20, #0xc\n"
363 "ldr q28, [x9], #0x10\n"
364 "ldr q27, [x26], #0x10\n"
365 "ldr q21, [x25], #0x10\n"
366 "ldr q20, [x24], #0x10\n"
367 "ldr q19, [x9], #0x10\n"
368 "zip1 v26.4s, v24.4s, v22.4s\n"
369 "zip1 v25.4s, v23.4s, v16.4s\n"
370 "ldr q18, [x26], #0x10\n"
371 "ldr q17, [x25], #0x10\n"
372 "zip2 v24.4s, v24.4s, v22.4s\n"
373 "zip2 v23.4s, v23.4s, v16.4s\n"
374 "ldr q16, [x24], #0x10\n"
375 "zip1 v2.4s, v28.4s, v21.4s\n"
376 "zip1 v22.4s, v27.4s, v20.4s\n"
377 "zip2 v1.4s, v28.4s, v21.4s\n"
378 "zip2 v0.4s, v27.4s, v20.4s\n"
379 "zip1 v31.4s, v19.4s, v17.4s\n"
380 "zip1 v30.4s, v18.4s, v16.4s\n"
381 "zip2 v29.4s, v19.4s, v17.4s\n"
382 "zip2 v28.4s, v18.4s, v16.4s\n"
383 "zip1 v21.4s, v26.4s, v25.4s\n"
384 "zip1 v20.4s, v24.4s, v23.4s\n"
385 "zip1 v19.4s, v2.4s, v22.4s\n"
386 "zip1 v18.4s, v1.4s, v0.4s\n"
387 "zip1 v17.4s, v31.4s, v30.4s\n"
388 "zip1 v16.4s, v29.4s, v28.4s\n"
389 ".inst 0x0ea16abb // bfcvtn v27.4h, v21.4s\n"
390 "zip2 v26.4s, v26.4s, v25.4s\n"
391 ".inst 0x0ea16a99 // bfcvtn v25.4h, v20.4s\n"
392 "zip2 v24.4s, v24.4s, v23.4s\n"
393 ".inst 0x0ea16a77 // bfcvtn v23.4h, v19.4s\n"
394 "zip2 v22.4s, v2.4s, v22.4s\n"
395 ".inst 0x0ea16a55 // bfcvtn v21.4h, v18.4s\n"
396 "zip2 v20.4s, v1.4s, v0.4s\n"
397 ".inst 0x0ea16a33 // bfcvtn v19.4h, v17.4s\n"
398 "zip2 v18.4s, v31.4s, v30.4s\n"
399 ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
400 "zip2 v16.4s, v29.4s, v28.4s\n"
401 ".inst 0x4ea16b5b // bfcvtn2 v27.8h, v26.4s\n"
402 ".inst 0x4ea16b19 // bfcvtn2 v25.8h, v24.4s\n"
403 ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
404 ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
405 ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
406 ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
407 "str q27, [x27, #0x0]\n"
408 "str q25, [x27, #0x10]\n"
409 "str q23, [x27, #0x20]\n"
410 "str q21, [x27, #0x30]\n"
411 "str q19, [x27, #0x40]\n"
412 "str q17, [x27, #0x50]\n"
413 "add x27, x27, %x[out_stride]\n"
414 "bge 14b\n"
415 "15:" // Tail row loop: Column loop skip
416 "cbz x20, 20f\n"
417 "cmp x20, #0x4\n"
418 "movi v16.16b, #0x0\n"
419 "str q16, [x27, #0x0]\n"
420 "str q16, [x27, #0x10]\n"
421 "str q16, [x27, #0x20]\n"
422 "str q16, [x27, #0x30]\n"
423 "str q16, [x27, #0x40]\n"
424 "str q16, [x27, #0x50]\n"
425 "blt 17f\n"
426 "16:" // Tail row loop: width 4 loop: loop
427 "ldr q21, [x9], #0x10\n"
428 "ldr q20, [x26], #0x10\n"
429 "sub x20, x20, #0x4\n"
430 "ldr q19, [x25], #0x10\n"
431 "ldr q17, [x24], #0x10\n"
432 "cmp x20, #0x4\n"
433 "zip1 v18.4s, v21.4s, v19.4s\n"
434 "zip1 v16.4s, v20.4s, v17.4s\n"
435 "zip2 v21.4s, v21.4s, v19.4s\n"
436 "zip2 v20.4s, v20.4s, v17.4s\n"
437 "zip1 v17.4s, v18.4s, v16.4s\n"
438 "zip2 v19.4s, v18.4s, v16.4s\n"
439 "zip1 v16.4s, v21.4s, v20.4s\n"
440 ".inst 0x0ea16a32 // bfcvtn v18.4h, v17.4s\n"
441 "zip2 v17.4s, v21.4s, v20.4s\n"
442 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
443 ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
444 ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
445 "str q18, [x27, #0x0]\n"
446 "str q16, [x27, #0x10]\n"
447 "add x27, x27, #0x20\n"
448 "bge 16b\n"
449 "17:" // Tail row loop: width 4 loop: skip
450 "cmp x20, #0x1\n"
451 "blt 19f\n"
452 "18:" // Tail row loop: width 1 loop: loop
453 "ldr s19, [x9], #0x4\n"
454 "ldr s18, [x26], #0x4\n"
455 "sub x20, x20, #0x1\n"
456 "ldr s17, [x25], #0x4\n"
457 "ldr s16, [x24], #0x4\n"
458 "cmp x20, #0x1\n"
459 "zip1 v17.4s, v19.4s, v17.4s\n"
460 "zip1 v16.4s, v18.4s, v16.4s\n"
461 "zip1 v16.4s, v17.4s, v16.4s\n"
462 ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
463 "str d16, [x27, #0x0]\n"
464 "add x27, x27, #0x8\n"
465 "bge 18b\n"
466 "19:" // Tail row loop: width 1 loop: skip
467 "20:" // Tail row loop: odd col skip
468 "cmp %x[height], #0x1\n"
469 "add %x[out], %x[out], #0x60\n"
470 "bge 13b\n"
471 "21:" // Done
472 : [bias] "+&r"(bias_ptr), [height] "+&r"(height), [in] "+&r"(in), [out] "+&r"(out)
473 228 : [bias_step] "r"(bias_step), [in_stride] "r"(in_stride), [out_stride] "r"(out_stride), [pad_row] "r"(pad_row),
474 228 [width] "r"(width)
475 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
476 "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
477 "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
478 228 }
479
480 #endif // Architectural features check.
481