KleidiAI Coverage Report


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 75.0% high: ≥ 90.0%
Coverage Exec / Excl / Total
Lines: 94.3% 33 / 5 / 40
Functions: 90.9% 10 / 0 / 11
Branches: -% 0 / 10 / 10

kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.c
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 // Do not flag up inline assembly blocks
8 #pragma GCC diagnostic ignored "-Woverlength-strings"
9
10 #if !defined(__aarch64__) || !defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
11 #error This file must be compiled for AArch64, FEAT_BF16.
12 #else // Architectural features check.
13
14 #include "kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.h"
15
16 #include <stddef.h>
17 #include <stdint.h>
18
19 #include "kai/kai_common.h"
20
// Fixed parameters of this micro-kernel variant; each one is returned unchanged by the
// matching kai_get_*() accessor below. kai_kr is additionally used as the rounding
// granularity for k in the packed-offset helpers.
// NOTE(review): presumably mr/nr/kr/sr describe the LHS/RHS packing block layout - confirm
// against the packing routines, which are not visible here.
21 static const size_t kai_mr = 1;
22 static const size_t kai_nr = 12;
23 static const size_t kai_kr = 4;
24 static const size_t kai_sr = 1;
25
// Step sizes; the offset helpers below assume their m_idx / n_idx arguments are multiples
// of these values.
26 static const size_t kai_m_step = 1;
27 static const size_t kai_n_step = 36;
28
// Returns the m step of this micro-kernel (kai_m_step, which is 1 in this file).
// kai_get_dst_offset_*() assumes its m_idx argument is a multiple of this value.
29 1152 size_t kai_get_m_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
30 1152 return kai_m_step;
31 }
32
// Returns the n step of this micro-kernel (kai_n_step, which is 36 in this file).
// The RHS-packed and dst offset helpers assume their n_idx argument is a multiple of
// this value.
33 1728 size_t kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
34 1728 return kai_n_step;
35 }
36
// Returns the mr parameter of this micro-kernel (kai_mr, which is 1 in this file).
// NOTE(review): presumably the row block size expected of the packed LHS - confirm.
37 size_t kai_get_mr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
38 return kai_mr;
39 }
40
// Returns the nr parameter of this micro-kernel (kai_nr, which is 12 in this file).
// NOTE(review): presumably the column block size expected of the packed RHS - confirm.
41 144 size_t kai_get_nr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
42 144 return kai_nr;
43 }
44
// Returns the kr parameter of this micro-kernel (kai_kr, which is 4 in this file).
// The packed-offset helpers pass this same value to kai_roundup() together with k.
45 144 size_t kai_get_kr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
46 144 return kai_kr;
47 }
48
// Returns the sr parameter of this micro-kernel (kai_sr, which is 1 in this file).
49 144 size_t kai_get_sr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
50 144 return kai_sr;
51 }
52
// Returns the offset into the packed LHS buffer for row index m_idx, computed as
// m_idx * kai_roundup(k, kai_kr) * sizeof(uint16_t).
//
// m_idx: row index; the only value accepted by the assumption below is 0, in which case
//        the result is 0.
// k:     K dimension; passed through kai_roundup() with kai_kr (presumably rounding k up
//        to a multiple of kai_kr - kai_roundup is declared outside this file, confirm).
//
// NOTE(review): sizeof(uint16_t) is presumably the storage size of one bf16 element.
// KAI_ASSUME comes from kai/kai_common.h; its failure behaviour is not visible here.
53 576 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t m_idx, size_t k) {
54 KAI_ASSUME(m_idx == 0);
55
56 576 return m_idx * kai_roundup(k, kai_kr) * sizeof(uint16_t);
57 }
58
// Returns the offset into the packed RHS buffer for column index n_idx.
//
// Each unit of n_idx advances by kai_roundup(k, kai_kr) 16-bit elements plus one float.
// NOTE(review): the extra float per column is presumably a packed bias value - confirm
// against the RHS packing routine.
//
// n_idx: column index; assumed to be a multiple of the n step (36).
// k:     K dimension, passed through kai_roundup() with kai_kr.
59 576 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t n_idx, size_t k) {
60 KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);
61 576 return n_idx * (kai_roundup(k, kai_kr) * sizeof(uint16_t) + sizeof(float));
62 }
63
// Returns the offset of element (m_idx, n_idx) in the destination matrix, computed as
// (m_idx * dst_stride) + (n_idx * sizeof(float)).
//
// m_idx:      row index; assumed to be a multiple of the m step (1).
// n_idx:      column index; assumed to be a multiple of the n step (36).
// dst_stride: distance between consecutive rows; it is added to a byte count
//             (n_idx * sizeof(float)), so it is treated as a stride in bytes.
64 576 size_t kai_get_dst_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(
65 size_t m_idx, size_t n_idx, size_t dst_stride) {
66 KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);
67 KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);
68
69 576 return (m_idx * dst_stride) + (n_idx * sizeof(float));
70 }
71
// Returns the size in bytes of an m x n destination matrix of float elements
// (m * n * sizeof(float)). No overflow check is performed on the multiplication.
72 576 size_t kai_get_dst_size_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t m, size_t n) {
73 576 return m * n * sizeof(float);
74 }
75
76 582 void kai_run_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(
77 size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row,
78 size_t dst_stride_col, float clamp_min, float clamp_max) {
79 582 KAI_UNUSED(dst_stride_row);
80 582 KAI_UNUSED(dst_stride_col);
81
82 KAI_ASSUME(m == 1);
83
84 typedef struct {
85 float maxval;
86 float minval;
87 } KernelArgs;
88
89 582 KernelArgs ka;
90 582 ka.maxval = clamp_max;
91 582 ka.minval = clamp_min;
92
93 582 size_t N = n;
94 582 size_t K = k;
95
96 582 const void* A_ptr = lhs_packed;
97 582 const void* B_ptr = rhs_packed;
98 582 void* output_ptr = dst;
99
100 // Add clamping flag
101 582 uint64_t flags = 0x2;
102
103 1164 __asm__ __volatile__(
104 "add x26, %x[K], #0x3\n"
105 "mov x20, #0xc\n"
106 "bic x26, x26, #0x3\n"
107 "add x25, %x[N], #0x3\n"
108 "lsr x25, x25, #0x2\n"
109 "lsl x26, x26, #0x1\n"
110 "add x26, x26, #0x4\n"
111 "mul x26, x26, x20\n"
112 "1:" // Column loop
113 "cmp x25, #0x9\n"
114 "bge 89f\n"
115 "cmp x25, #0x7\n"
116 "bgt 78f\n"
117 "beq 67f\n"
118 "cmp x25, #0x5\n"
119 "bgt 56f\n"
120 "beq 45f\n"
121 "cmp x25, #0x3\n"
122 "bgt 34f\n"
123 "beq 23f\n"
124 "cmp x25, #0x1\n"
125 "bgt 12f\n"
126 "ldr q14, [%x[B_ptr], #0x0]\n"
127 "mov x24, %x[K]\n"
128 "movi v16.16b, #0x0\n"
129 "mov x23, %x[A_ptr]\n"
130 "add %x[B_ptr], %x[B_ptr], #0x30\n"
131 "cmp x24, #0x4\n"
132 "zip2 v15.4s, v14.4s, v16.4s\n"
133 "zip1 v14.4s, v14.4s, v16.4s\n"
134 "blt 4f\n"
135 "cmp x24, #0x8\n"
136 "blt 3f\n"
137 "2:" // Width 1: Multiply loop: Main loop head
138 "ld1r { v0.2d }, [x23]\n"
139 "ldr q1, [%x[B_ptr], #0x0]\n"
140 "sub x24, x24, #0x4\n"
141 "add x23, x23, #0x8\n"
142 "ldr q2, [%x[B_ptr], #0x10]\n"
143 "add %x[B_ptr], %x[B_ptr], #0x60\n"
144 "cmp x24, #0x8\n"
145 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
146 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
147 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
148 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
149 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
150 "prfm pldl1keep, [x23, #0x80]\n"
151 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
152 "bge 2b\n"
153 "3:" // Width 1: Multiply loop: Single iteration only
154 "ld1r { v0.2d }, [x23]\n"
155 "ldr q3, [%x[B_ptr], #0x0]\n"
156 "add x23, x23, #0x8\n"
157 "sub x24, x24, #0x4\n"
158 "ldr q4, [%x[B_ptr], #0x10]\n"
159 "add %x[B_ptr], %x[B_ptr], #0x60\n"
160 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
161 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
162 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
163 ".inst 0x6e40fc6e // bfdot v14.4s, v3.8h, v0.8h\n"
164 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
165 "prfm pldl1keep, [x23, #0x80]\n"
166 ".inst 0x6e40fc8f // bfdot v15.4s, v4.8h, v0.8h\n"
167 "4:" // Width 1: Multiply loop: Main loop skip
168 "cbz x24, 7f\n"
169 "tbz x24, #1, 5f\n"
170 "ldr s0, [x23], #0x4\n"
171 "tbz x24, #0, 6f\n"
172 "ld1 { v0.h }[2], [x23]\n"
173 "b 6f\n"
174 "5:" // Width 1: Multiply loop: Ragged operand read: partial_1_0
175 "ldr h0, [x23, #0x0]\n"
176 "6:" // Width 1: Multiply loop: Ragged operand read: Done
177 "ldr q5, [%x[B_ptr], #0x0]\n"
178 "ldr q6, [%x[B_ptr], #0x10]\n"
179 "dup v0.2d, v0.d[0]\n"
180 "add %x[B_ptr], %x[B_ptr], #0x90\n"
181 ".inst 0x6e40fcae // bfdot v14.4s, v5.8h, v0.8h\n"
182 ".inst 0x6e40fccf // bfdot v15.4s, v6.8h, v0.8h\n"
183 "7:" // Width 1: Multiply loop: No odd multiplies
184 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
185 "faddp v14.4s, v14.4s, v15.4s\n"
186 "tbz %x[flags], #1, 8f\n"
187 "add x21, %x[args_ptr], %[offset_max]\n"
188 "add x20, %x[args_ptr], %[offset_min]\n"
189 "ld1r { v17.4s }, [x21]\n"
190 "ld1r { v16.4s }, [x20]\n"
191 "fmin v14.4s, v14.4s, v17.4s\n"
192 "fmax v14.4s, v14.4s, v16.4s\n"
193 "8:" // Width 1: No activation
194 "cmp %x[N], #0x4\n"
195 "blt 9f\n"
196 "str q14, [%x[output_ptr], #0x0]\n"
197 "add %x[output_ptr], %x[output_ptr], #0x10\n"
198 "b 11f\n"
199 "9:" // Width 1: Partial writeback
200 "tbz %x[N], #1, 10f\n"
201 "str d14, [%x[output_ptr]], #0x8\n"
202 "tbz %x[N], #0, 11f\n"
203 "st1 { v14.s }[2], [%x[output_ptr]]\n"
204 "b 11f\n"
205 "10:" // Width 1: Partial direct writeback: partial_1_0
206 "str s14, [%x[output_ptr], #0x0]\n"
207 "11:" // Width 1: Writeback done
208 "b 100f\n"
209 "12:" // Width 2
210 "ldr q14, [%x[B_ptr], #0x0]\n"
211 "ldr q15, [%x[B_ptr], #0x10]\n"
212 "mov x24, %x[K]\n"
213 "movi v18.16b, #0x0\n"
214 "mov x23, %x[A_ptr]\n"
215 "add %x[B_ptr], %x[B_ptr], #0x30\n"
216 "cmp x24, #0x4\n"
217 "zip2 v17.4s, v15.4s, v18.4s\n"
218 "zip1 v16.4s, v15.4s, v18.4s\n"
219 "zip2 v15.4s, v14.4s, v18.4s\n"
220 "zip1 v14.4s, v14.4s, v18.4s\n"
221 "blt 15f\n"
222 "cmp x24, #0x8\n"
223 "blt 14f\n"
224 "13:" // Width 2: Multiply loop: Main loop head
225 "ld1r { v0.2d }, [x23]\n"
226 "ldr q1, [%x[B_ptr], #0x0]\n"
227 "sub x24, x24, #0x4\n"
228 "add x23, x23, #0x8\n"
229 "ldr q2, [%x[B_ptr], #0x10]\n"
230 "ldr q3, [%x[B_ptr], #0x20]\n"
231 "cmp x24, #0x8\n"
232 "ldr q4, [%x[B_ptr], #0x30]\n"
233 "add %x[B_ptr], %x[B_ptr], #0x60\n"
234 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
235 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
236 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
237 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
238 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
239 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
240 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
241 "prfm pldl1keep, [x23, #0x80]\n"
242 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
243 "bge 13b\n"
244 "14:" // Width 2: Multiply loop: Single iteration only
245 "ld1r { v0.2d }, [x23]\n"
246 "ldr q5, [%x[B_ptr], #0x0]\n"
247 "add x23, x23, #0x8\n"
248 "sub x24, x24, #0x4\n"
249 "ldr q6, [%x[B_ptr], #0x10]\n"
250 "ldr q7, [%x[B_ptr], #0x20]\n"
251 "ldr q8, [%x[B_ptr], #0x30]\n"
252 "add %x[B_ptr], %x[B_ptr], #0x60\n"
253 ".inst 0x6e40fcae // bfdot v14.4s, v5.8h, v0.8h\n"
254 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
255 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
256 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
257 ".inst 0x6e40fccf // bfdot v15.4s, v6.8h, v0.8h\n"
258 ".inst 0x6e40fcf0 // bfdot v16.4s, v7.8h, v0.8h\n"
259 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
260 "prfm pldl1keep, [x23, #0x80]\n"
261 ".inst 0x6e40fd11 // bfdot v17.4s, v8.8h, v0.8h\n"
262 "15:" // Width 2: Multiply loop: Main loop skip
263 "cbz x24, 18f\n"
264 "tbz x24, #1, 16f\n"
265 "ldr s0, [x23], #0x4\n"
266 "tbz x24, #0, 17f\n"
267 "ld1 { v0.h }[2], [x23]\n"
268 "b 17f\n"
269 "16:" // Width 2: Multiply loop: Ragged operand read: partial_1_0
270 "ldr h0, [x23, #0x0]\n"
271 "17:" // Width 2: Multiply loop: Ragged operand read: Done
272 "ldr q9, [%x[B_ptr], #0x0]\n"
273 "ldr q10, [%x[B_ptr], #0x10]\n"
274 "dup v0.2d, v0.d[0]\n"
275 "ldr q11, [%x[B_ptr], #0x20]\n"
276 "ldr q12, [%x[B_ptr], #0x30]\n"
277 "add %x[B_ptr], %x[B_ptr], #0x90\n"
278 ".inst 0x6e40fd2e // bfdot v14.4s, v9.8h, v0.8h\n"
279 ".inst 0x6e40fd4f // bfdot v15.4s, v10.8h, v0.8h\n"
280 ".inst 0x6e40fd70 // bfdot v16.4s, v11.8h, v0.8h\n"
281 ".inst 0x6e40fd91 // bfdot v17.4s, v12.8h, v0.8h\n"
282 "18:" // Width 2: Multiply loop: No odd multiplies
283 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
284 "faddp v14.4s, v14.4s, v15.4s\n"
285 "faddp v15.4s, v16.4s, v17.4s\n"
286 "tbz %x[flags], #1, 19f\n"
287 "add x21, %x[args_ptr], %[offset_max]\n"
288 "add x20, %x[args_ptr], %[offset_min]\n"
289 "ld1r { v17.4s }, [x21]\n"
290 "ld1r { v16.4s }, [x20]\n"
291 "fmin v14.4s, v14.4s, v17.4s\n"
292 "fmin v15.4s, v15.4s, v17.4s\n"
293 "fmax v14.4s, v14.4s, v16.4s\n"
294 "fmax v15.4s, v15.4s, v16.4s\n"
295 "19:" // Width 2: No activation
296 "cmp %x[N], #0x8\n"
297 "str q14, [%x[output_ptr], #0x0]\n"
298 "add %x[output_ptr], %x[output_ptr], #0x10\n"
299 "blt 20f\n"
300 "str q15, [%x[output_ptr], #0x0]\n"
301 "add %x[output_ptr], %x[output_ptr], #0x10\n"
302 "b 22f\n"
303 "20:" // Width 2: Partial writeback
304 "tbz %x[N], #1, 21f\n"
305 "str d15, [%x[output_ptr]], #0x8\n"
306 "tbz %x[N], #0, 22f\n"
307 "st1 { v15.s }[2], [%x[output_ptr]]\n"
308 "b 22f\n"
309 "21:" // Width 2: Partial direct writeback: partial_1_4
310 "tbz %x[N], #0, 22f\n"
311 "str s15, [%x[output_ptr], #0x0]\n"
312 "22:" // Width 2: Writeback done
313 "b 100f\n"
314 "23:" // Width 3
315 "ldr q14, [%x[B_ptr], #0x0]\n"
316 "ldr q15, [%x[B_ptr], #0x10]\n"
317 "mov x24, %x[K]\n"
318 "movi v20.16b, #0x0\n"
319 "ldr q16, [%x[B_ptr], #0x20]\n"
320 "mov x23, %x[A_ptr]\n"
321 "add %x[B_ptr], %x[B_ptr], #0x30\n"
322 "cmp x24, #0x4\n"
323 "zip2 v17.4s, v15.4s, v20.4s\n"
324 "zip2 v19.4s, v16.4s, v20.4s\n"
325 "zip1 v18.4s, v16.4s, v20.4s\n"
326 "zip1 v16.4s, v15.4s, v20.4s\n"
327 "zip2 v15.4s, v14.4s, v20.4s\n"
328 "zip1 v14.4s, v14.4s, v20.4s\n"
329 "blt 26f\n"
330 "cmp x24, #0x8\n"
331 "blt 25f\n"
332 "24:" // Width 3: Multiply loop: Main loop head
333 "ld1r { v0.2d }, [x23]\n"
334 "ldr q1, [%x[B_ptr], #0x0]\n"
335 "sub x24, x24, #0x4\n"
336 "add x23, x23, #0x8\n"
337 "ldr q2, [%x[B_ptr], #0x10]\n"
338 "ldr q3, [%x[B_ptr], #0x20]\n"
339 "cmp x24, #0x8\n"
340 "ldr q4, [%x[B_ptr], #0x30]\n"
341 "ldr q5, [%x[B_ptr], #0x40]\n"
342 "ldr q6, [%x[B_ptr], #0x50]\n"
343 "add %x[B_ptr], %x[B_ptr], #0x60\n"
344 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
345 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
346 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
347 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
348 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
349 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
350 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
351 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
352 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
353 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
354 "prfm pldl1keep, [x23, #0x80]\n"
355 "bge 24b\n"
356 "25:" // Width 3: Multiply loop: Single iteration only
357 "ld1r { v0.2d }, [x23]\n"
358 "ldr q7, [%x[B_ptr], #0x0]\n"
359 "add x23, x23, #0x8\n"
360 "sub x24, x24, #0x4\n"
361 "ldr q8, [%x[B_ptr], #0x10]\n"
362 "ldr q9, [%x[B_ptr], #0x20]\n"
363 "ldr q10, [%x[B_ptr], #0x30]\n"
364 "ldr q11, [%x[B_ptr], #0x40]\n"
365 "ldr q12, [%x[B_ptr], #0x50]\n"
366 "add %x[B_ptr], %x[B_ptr], #0x60\n"
367 ".inst 0x6e40fcee // bfdot v14.4s, v7.8h, v0.8h\n"
368 ".inst 0x6e40fd0f // bfdot v15.4s, v8.8h, v0.8h\n"
369 ".inst 0x6e40fd30 // bfdot v16.4s, v9.8h, v0.8h\n"
370 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
371 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
372 ".inst 0x6e40fd51 // bfdot v17.4s, v10.8h, v0.8h\n"
373 ".inst 0x6e40fd72 // bfdot v18.4s, v11.8h, v0.8h\n"
374 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
375 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
376 ".inst 0x6e40fd93 // bfdot v19.4s, v12.8h, v0.8h\n"
377 "prfm pldl1keep, [x23, #0x80]\n"
378 "26:" // Width 3: Multiply loop: Main loop skip
379 "cbz x24, 29f\n"
380 "tbz x24, #1, 27f\n"
381 "ldr s0, [x23], #0x4\n"
382 "tbz x24, #0, 28f\n"
383 "ld1 { v0.h }[2], [x23]\n"
384 "b 28f\n"
385 "27:" // Width 3: Multiply loop: Ragged operand read: partial_1_0
386 "ldr h0, [x23, #0x0]\n"
387 "28:" // Width 3: Multiply loop: Ragged operand read: Done
388 "ldr q13, [%x[B_ptr], #0x0]\n"
389 "ldr q1, [%x[B_ptr], #0x10]\n"
390 "dup v0.2d, v0.d[0]\n"
391 "ldr q2, [%x[B_ptr], #0x20]\n"
392 "ldr q3, [%x[B_ptr], #0x30]\n"
393 "ldr q4, [%x[B_ptr], #0x40]\n"
394 "ldr q5, [%x[B_ptr], #0x50]\n"
395 "add %x[B_ptr], %x[B_ptr], #0x90\n"
396 ".inst 0x6e40fdae // bfdot v14.4s, v13.8h, v0.8h\n"
397 ".inst 0x6e40fc2f // bfdot v15.4s, v1.8h, v0.8h\n"
398 ".inst 0x6e40fc50 // bfdot v16.4s, v2.8h, v0.8h\n"
399 ".inst 0x6e40fc71 // bfdot v17.4s, v3.8h, v0.8h\n"
400 ".inst 0x6e40fc92 // bfdot v18.4s, v4.8h, v0.8h\n"
401 ".inst 0x6e40fcb3 // bfdot v19.4s, v5.8h, v0.8h\n"
402 "29:" // Width 3: Multiply loop: No odd multiplies
403 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
404 "faddp v14.4s, v14.4s, v15.4s\n"
405 "faddp v15.4s, v16.4s, v17.4s\n"
406 "faddp v16.4s, v18.4s, v19.4s\n"
407 "tbz %x[flags], #1, 30f\n"
408 "add x21, %x[args_ptr], %[offset_max]\n"
409 "add x20, %x[args_ptr], %[offset_min]\n"
410 "ld1r { v18.4s }, [x21]\n"
411 "ld1r { v17.4s }, [x20]\n"
412 "fmin v14.4s, v14.4s, v18.4s\n"
413 "fmin v15.4s, v15.4s, v18.4s\n"
414 "fmin v16.4s, v16.4s, v18.4s\n"
415 "fmax v14.4s, v14.4s, v17.4s\n"
416 "fmax v15.4s, v15.4s, v17.4s\n"
417 "fmax v16.4s, v16.4s, v17.4s\n"
418 "30:" // Width 3: No activation
419 "cmp %x[N], #0xc\n"
420 "str q14, [%x[output_ptr], #0x0]\n"
421 "str q15, [%x[output_ptr], #0x10]\n"
422 "add %x[output_ptr], %x[output_ptr], #0x20\n"
423 "blt 31f\n"
424 "str q16, [%x[output_ptr], #0x0]\n"
425 "add %x[output_ptr], %x[output_ptr], #0x10\n"
426 "b 33f\n"
427 "31:" // Width 3: Partial writeback
428 "tbz %x[N], #1, 32f\n"
429 "str d16, [%x[output_ptr]], #0x8\n"
430 "tbz %x[N], #0, 33f\n"
431 "st1 { v16.s }[2], [%x[output_ptr]]\n"
432 "b 33f\n"
433 "32:" // Width 3: Partial direct writeback: partial_1_8
434 "tbz %x[N], #0, 33f\n"
435 "str s16, [%x[output_ptr], #0x0]\n"
436 "33:" // Width 3: Writeback done
437 "b 100f\n"
438 "34:" // Width 4
439 "ldr q14, [%x[B_ptr], #0x0]\n"
440 "ldr q15, [%x[B_ptr], #0x10]\n"
441 "mov x24, %x[K]\n"
442 "add x20, %x[B_ptr], x26\n"
443 "ldr q16, [%x[B_ptr], #0x20]\n"
444 "ldr q17, [x20, #0x0]\n"
445 "movi v22.16b, #0x0\n"
446 "mov x23, %x[A_ptr]\n"
447 "add %x[B_ptr], %x[B_ptr], #0x30\n"
448 "add x20, x20, #0x30\n"
449 "cmp x24, #0x4\n"
450 "zip2 v21.4s, v17.4s, v22.4s\n"
451 "zip1 v20.4s, v17.4s, v22.4s\n"
452 "zip2 v19.4s, v16.4s, v22.4s\n"
453 "zip1 v18.4s, v16.4s, v22.4s\n"
454 "zip2 v17.4s, v15.4s, v22.4s\n"
455 "zip1 v16.4s, v15.4s, v22.4s\n"
456 "zip2 v15.4s, v14.4s, v22.4s\n"
457 "zip1 v14.4s, v14.4s, v22.4s\n"
458 "blt 37f\n"
459 "cmp x24, #0x8\n"
460 "blt 36f\n"
461 "35:" // Width 4: Multiply loop: Main loop head
462 "ld1r { v0.2d }, [x23]\n"
463 "ldr q1, [%x[B_ptr], #0x0]\n"
464 "sub x24, x24, #0x4\n"
465 "add x23, x23, #0x8\n"
466 "ldr q2, [%x[B_ptr], #0x10]\n"
467 "ldr q3, [%x[B_ptr], #0x20]\n"
468 "cmp x24, #0x8\n"
469 "ldr q4, [%x[B_ptr], #0x30]\n"
470 "ldr q5, [%x[B_ptr], #0x40]\n"
471 "ldr q6, [%x[B_ptr], #0x50]\n"
472 "ldr q7, [x20, #0x0]\n"
473 "add %x[B_ptr], %x[B_ptr], #0x60\n"
474 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
475 "ldr q8, [x20, #0x10]\n"
476 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
477 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
478 "add x20, x20, #0x60\n"
479 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
480 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
481 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
482 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
483 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
484 ".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n"
485 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
486 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
487 ".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n"
488 "prfm pldl1keep, [x23, #0x80]\n"
489 "bge 35b\n"
490 "36:" // Width 4: Multiply loop: Single iteration only
491 "ld1r { v0.2d }, [x23]\n"
492 "ldr q9, [%x[B_ptr], #0x0]\n"
493 "add x23, x23, #0x8\n"
494 "sub x24, x24, #0x4\n"
495 "ldr q10, [%x[B_ptr], #0x10]\n"
496 "ldr q11, [%x[B_ptr], #0x20]\n"
497 "ldr q12, [%x[B_ptr], #0x30]\n"
498 "ldr q13, [%x[B_ptr], #0x40]\n"
499 "ldr q1, [%x[B_ptr], #0x50]\n"
500 "ldr q2, [x20, #0x0]\n"
501 "add %x[B_ptr], %x[B_ptr], #0x60\n"
502 ".inst 0x6e40fd2e // bfdot v14.4s, v9.8h, v0.8h\n"
503 "ldr q3, [x20, #0x10]\n"
504 ".inst 0x6e40fd4f // bfdot v15.4s, v10.8h, v0.8h\n"
505 ".inst 0x6e40fd70 // bfdot v16.4s, v11.8h, v0.8h\n"
506 "add x20, x20, #0x60\n"
507 ".inst 0x6e40fd91 // bfdot v17.4s, v12.8h, v0.8h\n"
508 ".inst 0x6e40fdb2 // bfdot v18.4s, v13.8h, v0.8h\n"
509 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
510 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
511 ".inst 0x6e40fc33 // bfdot v19.4s, v1.8h, v0.8h\n"
512 ".inst 0x6e40fc54 // bfdot v20.4s, v2.8h, v0.8h\n"
513 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
514 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
515 ".inst 0x6e40fc75 // bfdot v21.4s, v3.8h, v0.8h\n"
516 "prfm pldl1keep, [x23, #0x80]\n"
517 "37:" // Width 4: Multiply loop: Main loop skip
518 "cbz x24, 40f\n"
519 "tbz x24, #1, 38f\n"
520 "ldr s0, [x23], #0x4\n"
521 "tbz x24, #0, 39f\n"
522 "ld1 { v0.h }[2], [x23]\n"
523 "b 39f\n"
524 "38:" // Width 4: Multiply loop: Ragged operand read: partial_1_0
525 "ldr h0, [x23, #0x0]\n"
526 "39:" // Width 4: Multiply loop: Ragged operand read: Done
527 "ldr q4, [%x[B_ptr], #0x0]\n"
528 "ldr q5, [%x[B_ptr], #0x10]\n"
529 "dup v0.2d, v0.d[0]\n"
530 "ldr q6, [%x[B_ptr], #0x20]\n"
531 "ldr q7, [%x[B_ptr], #0x30]\n"
532 "ldr q8, [%x[B_ptr], #0x40]\n"
533 "ldr q9, [%x[B_ptr], #0x50]\n"
534 "add %x[B_ptr], %x[B_ptr], #0x90\n"
535 "ldr q10, [x20, #0x0]\n"
536 "ldr q11, [x20, #0x10]\n"
537 ".inst 0x6e40fc8e // bfdot v14.4s, v4.8h, v0.8h\n"
538 ".inst 0x6e40fcaf // bfdot v15.4s, v5.8h, v0.8h\n"
539 ".inst 0x6e40fcd0 // bfdot v16.4s, v6.8h, v0.8h\n"
540 ".inst 0x6e40fcf1 // bfdot v17.4s, v7.8h, v0.8h\n"
541 ".inst 0x6e40fd12 // bfdot v18.4s, v8.8h, v0.8h\n"
542 ".inst 0x6e40fd33 // bfdot v19.4s, v9.8h, v0.8h\n"
543 ".inst 0x6e40fd54 // bfdot v20.4s, v10.8h, v0.8h\n"
544 ".inst 0x6e40fd75 // bfdot v21.4s, v11.8h, v0.8h\n"
545 "40:" // Width 4: Multiply loop: No odd multiplies
546 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
547 "faddp v14.4s, v14.4s, v15.4s\n"
548 "faddp v15.4s, v16.4s, v17.4s\n"
549 "faddp v16.4s, v18.4s, v19.4s\n"
550 "faddp v17.4s, v20.4s, v21.4s\n"
551 "tbz %x[flags], #1, 41f\n"
552 "add x21, %x[args_ptr], %[offset_max]\n"
553 "add x20, %x[args_ptr], %[offset_min]\n"
554 "ld1r { v19.4s }, [x21]\n"
555 "ld1r { v18.4s }, [x20]\n"
556 "fmin v14.4s, v14.4s, v19.4s\n"
557 "fmin v15.4s, v15.4s, v19.4s\n"
558 "fmin v16.4s, v16.4s, v19.4s\n"
559 "fmin v17.4s, v17.4s, v19.4s\n"
560 "fmax v14.4s, v14.4s, v18.4s\n"
561 "fmax v15.4s, v15.4s, v18.4s\n"
562 "fmax v16.4s, v16.4s, v18.4s\n"
563 "fmax v17.4s, v17.4s, v18.4s\n"
564 "41:" // Width 4: No activation
565 "cmp %x[N], #0x10\n"
566 "str q14, [%x[output_ptr], #0x0]\n"
567 "str q15, [%x[output_ptr], #0x10]\n"
568 "str q16, [%x[output_ptr], #0x20]\n"
569 "add %x[output_ptr], %x[output_ptr], #0x30\n"
570 "blt 42f\n"
571 "str q17, [%x[output_ptr], #0x0]\n"
572 "add %x[output_ptr], %x[output_ptr], #0x10\n"
573 "b 44f\n"
574 "42:" // Width 4: Partial writeback
575 "tbz %x[N], #1, 43f\n"
576 "str d17, [%x[output_ptr]], #0x8\n"
577 "tbz %x[N], #0, 44f\n"
578 "st1 { v17.s }[2], [%x[output_ptr]]\n"
579 "b 44f\n"
580 "43:" // Width 4: Partial direct writeback: partial_1_12
581 "tbz %x[N], #0, 44f\n"
582 "str s17, [%x[output_ptr], #0x0]\n"
583 "44:" // Width 4: Writeback done
584 "b 100f\n"
585 "45:" // Width 5
586 "ldr q14, [%x[B_ptr], #0x0]\n"
587 "ldr q15, [%x[B_ptr], #0x10]\n"
588 "mov x24, %x[K]\n"
589 "add x20, %x[B_ptr], x26\n"
590 "ldr q16, [%x[B_ptr], #0x20]\n"
591 "ldr q17, [x20, #0x0]\n"
592 "movi v24.16b, #0x0\n"
593 "mov x23, %x[A_ptr]\n"
594 "ldr q18, [x20, #0x10]\n"
595 "add %x[B_ptr], %x[B_ptr], #0x30\n"
596 "add x20, x20, #0x30\n"
597 "cmp x24, #0x4\n"
598 "zip2 v21.4s, v17.4s, v24.4s\n"
599 "zip1 v20.4s, v17.4s, v24.4s\n"
600 "zip2 v19.4s, v16.4s, v24.4s\n"
601 "zip2 v17.4s, v15.4s, v24.4s\n"
602 "zip2 v23.4s, v18.4s, v24.4s\n"
603 "zip1 v22.4s, v18.4s, v24.4s\n"
604 "zip1 v18.4s, v16.4s, v24.4s\n"
605 "zip1 v16.4s, v15.4s, v24.4s\n"
606 "zip2 v15.4s, v14.4s, v24.4s\n"
607 "zip1 v14.4s, v14.4s, v24.4s\n"
608 "blt 48f\n"
609 "cmp x24, #0x8\n"
610 "blt 47f\n"
611 "46:" // Width 5: Multiply loop: Main loop head
612 "ld1r { v0.2d }, [x23]\n"
613 "ldr q1, [%x[B_ptr], #0x0]\n"
614 "sub x24, x24, #0x4\n"
615 "add x23, x23, #0x8\n"
616 "ldr q2, [%x[B_ptr], #0x10]\n"
617 "ldr q3, [%x[B_ptr], #0x20]\n"
618 "cmp x24, #0x8\n"
619 "ldr q4, [%x[B_ptr], #0x30]\n"
620 "ldr q5, [%x[B_ptr], #0x40]\n"
621 "ldr q6, [%x[B_ptr], #0x50]\n"
622 "ldr q7, [x20, #0x0]\n"
623 "add %x[B_ptr], %x[B_ptr], #0x60\n"
624 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
625 "ldr q8, [x20, #0x10]\n"
626 "ldr q9, [x20, #0x20]\n"
627 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
628 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
629 "ldr q10, [x20, #0x30]\n"
630 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
631 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
632 "add x20, x20, #0x60\n"
633 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
634 ".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n"
635 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
636 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
637 ".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n"
638 ".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n"
639 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
640 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
641 ".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n"
642 "prfm pldl1keep, [x23, #0x80]\n"
643 "bge 46b\n"
644 "47:" // Width 5: Multiply loop: Single iteration only
645 "ld1r { v0.2d }, [x23]\n"
646 "ldr q11, [%x[B_ptr], #0x0]\n"
647 "add x23, x23, #0x8\n"
648 "sub x24, x24, #0x4\n"
649 "ldr q12, [%x[B_ptr], #0x10]\n"
650 "ldr q13, [%x[B_ptr], #0x20]\n"
651 "ldr q1, [%x[B_ptr], #0x30]\n"
652 "ldr q2, [%x[B_ptr], #0x40]\n"
653 "ldr q3, [%x[B_ptr], #0x50]\n"
654 "ldr q4, [x20, #0x0]\n"
655 "add %x[B_ptr], %x[B_ptr], #0x60\n"
656 ".inst 0x6e40fd6e // bfdot v14.4s, v11.8h, v0.8h\n"
657 "ldr q5, [x20, #0x10]\n"
658 "ldr q6, [x20, #0x20]\n"
659 ".inst 0x6e40fd8f // bfdot v15.4s, v12.8h, v0.8h\n"
660 ".inst 0x6e40fdb0 // bfdot v16.4s, v13.8h, v0.8h\n"
661 "ldr q7, [x20, #0x30]\n"
662 ".inst 0x6e40fc31 // bfdot v17.4s, v1.8h, v0.8h\n"
663 ".inst 0x6e40fc52 // bfdot v18.4s, v2.8h, v0.8h\n"
664 "add x20, x20, #0x60\n"
665 ".inst 0x6e40fc73 // bfdot v19.4s, v3.8h, v0.8h\n"
666 ".inst 0x6e40fc94 // bfdot v20.4s, v4.8h, v0.8h\n"
667 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
668 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
669 ".inst 0x6e40fcb5 // bfdot v21.4s, v5.8h, v0.8h\n"
670 ".inst 0x6e40fcd6 // bfdot v22.4s, v6.8h, v0.8h\n"
671 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
672 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
673 ".inst 0x6e40fcf7 // bfdot v23.4s, v7.8h, v0.8h\n"
674 "prfm pldl1keep, [x23, #0x80]\n"
675 "48:" // Width 5: Multiply loop: Main loop skip
676 "cbz x24, 51f\n"
677 "tbz x24, #1, 49f\n"
678 "ldr s0, [x23], #0x4\n"
679 "tbz x24, #0, 50f\n"
680 "ld1 { v0.h }[2], [x23]\n"
681 "b 50f\n"
682 "49:" // Width 5: Multiply loop: Ragged operand read: partial_1_0
683 "ldr h0, [x23, #0x0]\n"
684 "50:" // Width 5: Multiply loop: Ragged operand read: Done
685 "ldr q8, [%x[B_ptr], #0x0]\n"
686 "ldr q9, [%x[B_ptr], #0x10]\n"
687 "dup v0.2d, v0.d[0]\n"
688 "ldr q10, [%x[B_ptr], #0x20]\n"
689 "ldr q11, [%x[B_ptr], #0x30]\n"
690 "ldr q12, [%x[B_ptr], #0x40]\n"
691 "ldr q13, [%x[B_ptr], #0x50]\n"
692 "add %x[B_ptr], %x[B_ptr], #0x90\n"
693 "ldr q1, [x20, #0x0]\n"
694 "ldr q2, [x20, #0x10]\n"
695 ".inst 0x6e40fd0e // bfdot v14.4s, v8.8h, v0.8h\n"
696 ".inst 0x6e40fd2f // bfdot v15.4s, v9.8h, v0.8h\n"
697 "ldr q3, [x20, #0x20]\n"
698 "ldr q4, [x20, #0x30]\n"
699 ".inst 0x6e40fd50 // bfdot v16.4s, v10.8h, v0.8h\n"
700 ".inst 0x6e40fd71 // bfdot v17.4s, v11.8h, v0.8h\n"
701 ".inst 0x6e40fd92 // bfdot v18.4s, v12.8h, v0.8h\n"
702 ".inst 0x6e40fdb3 // bfdot v19.4s, v13.8h, v0.8h\n"
703 ".inst 0x6e40fc34 // bfdot v20.4s, v1.8h, v0.8h\n"
704 ".inst 0x6e40fc55 // bfdot v21.4s, v2.8h, v0.8h\n"
705 ".inst 0x6e40fc76 // bfdot v22.4s, v3.8h, v0.8h\n"
706 ".inst 0x6e40fc97 // bfdot v23.4s, v4.8h, v0.8h\n"
707 "51:" // Width 5: Multiply loop: No odd multiplies
708 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
709 "faddp v14.4s, v14.4s, v15.4s\n"
710 "faddp v15.4s, v16.4s, v17.4s\n"
711 "faddp v16.4s, v18.4s, v19.4s\n"
712 "faddp v17.4s, v20.4s, v21.4s\n"
713 "faddp v18.4s, v22.4s, v23.4s\n"
714 "tbz %x[flags], #1, 52f\n"
715 "add x21, %x[args_ptr], %[offset_max]\n"
716 "add x20, %x[args_ptr], %[offset_min]\n"
717 "ld1r { v20.4s }, [x21]\n"
718 "ld1r { v19.4s }, [x20]\n"
719 "fmin v14.4s, v14.4s, v20.4s\n"
720 "fmin v15.4s, v15.4s, v20.4s\n"
721 "fmin v16.4s, v16.4s, v20.4s\n"
722 "fmin v17.4s, v17.4s, v20.4s\n"
723 "fmin v18.4s, v18.4s, v20.4s\n"
724 "fmax v14.4s, v14.4s, v19.4s\n"
725 "fmax v15.4s, v15.4s, v19.4s\n"
726 "fmax v16.4s, v16.4s, v19.4s\n"
727 "fmax v17.4s, v17.4s, v19.4s\n"
728 "fmax v18.4s, v18.4s, v19.4s\n"
729 "52:" // Width 5: No activation
730 "cmp %x[N], #0x14\n"
731 "str q14, [%x[output_ptr], #0x0]\n"
732 "str q15, [%x[output_ptr], #0x10]\n"
733 "str q16, [%x[output_ptr], #0x20]\n"
734 "str q17, [%x[output_ptr], #0x30]\n"
735 "add %x[output_ptr], %x[output_ptr], #0x40\n"
736 "blt 53f\n"
737 "str q18, [%x[output_ptr], #0x0]\n"
738 "add %x[output_ptr], %x[output_ptr], #0x10\n"
739 "b 55f\n"
740 "53:" // Width 5: Partial writeback
741 "tbz %x[N], #1, 54f\n"
742 "str d18, [%x[output_ptr]], #0x8\n"
743 "tbz %x[N], #0, 55f\n"
744 "st1 { v18.s }[2], [%x[output_ptr]]\n"
745 "b 55f\n"
746 "54:" // Width 5: Partial direct writeback: partial_1_16
747 "tbz %x[N], #0, 55f\n"
748 "str s18, [%x[output_ptr], #0x0]\n"
749 "55:" // Width 5: Writeback done
750 "b 100f\n"
751 "56:" // Width 6
752 "ldr q14, [%x[B_ptr], #0x0]\n"
753 "ldr q15, [%x[B_ptr], #0x10]\n"
754 "mov x24, %x[K]\n"
755 "add x20, %x[B_ptr], x26\n"
756 "ldr q16, [%x[B_ptr], #0x20]\n"
757 "ldr q17, [x20, #0x0]\n"
758 "movi v26.16b, #0x0\n"
759 "mov x23, %x[A_ptr]\n"
760 "ldr q18, [x20, #0x10]\n"
761 "ldr q19, [x20, #0x20]\n"
762 "add %x[B_ptr], %x[B_ptr], #0x30\n"
763 "add x20, x20, #0x30\n"
764 "cmp x24, #0x4\n"
765 "zip2 v21.4s, v17.4s, v26.4s\n"
766 "zip1 v20.4s, v17.4s, v26.4s\n"
767 "zip2 v17.4s, v15.4s, v26.4s\n"
768 "zip2 v25.4s, v19.4s, v26.4s\n"
769 "zip1 v24.4s, v19.4s, v26.4s\n"
770 "zip2 v23.4s, v18.4s, v26.4s\n"
771 "zip1 v22.4s, v18.4s, v26.4s\n"
772 "zip2 v19.4s, v16.4s, v26.4s\n"
773 "zip1 v18.4s, v16.4s, v26.4s\n"
774 "zip1 v16.4s, v15.4s, v26.4s\n"
775 "zip2 v15.4s, v14.4s, v26.4s\n"
776 "zip1 v14.4s, v14.4s, v26.4s\n"
777 "blt 59f\n"
778 "cmp x24, #0x8\n"
779 "blt 58f\n"
780 "57:" // Width 6: Multiply loop: Main loop head
781 "ld1r { v0.2d }, [x23]\n"
782 "ldr q1, [%x[B_ptr], #0x0]\n"
783 "sub x24, x24, #0x4\n"
784 "add x23, x23, #0x8\n"
785 "ldr q2, [%x[B_ptr], #0x10]\n"
786 "ldr q3, [%x[B_ptr], #0x20]\n"
787 "cmp x24, #0x8\n"
788 "ldr q4, [%x[B_ptr], #0x30]\n"
789 "ldr q5, [%x[B_ptr], #0x40]\n"
790 "ldr q6, [%x[B_ptr], #0x50]\n"
791 "ldr q7, [x20, #0x0]\n"
792 "add %x[B_ptr], %x[B_ptr], #0x60\n"
793 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
794 "ldr q8, [x20, #0x10]\n"
795 "ldr q9, [x20, #0x20]\n"
796 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
797 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
798 "ldr q10, [x20, #0x30]\n"
799 "ldr q11, [x20, #0x40]\n"
800 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
801 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
802 "ldr q12, [x20, #0x50]\n"
803 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
804 ".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n"
805 "add x20, x20, #0x60\n"
806 ".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n"
807 ".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n"
808 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
809 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
810 ".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n"
811 ".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n"
812 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
813 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
814 ".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n"
815 "prfm pldl1keep, [x23, #0x80]\n"
816 "bge 57b\n"
817 "58:" // Width 6: Multiply loop: Single iteration only
818 "ld1r { v0.2d }, [x23]\n"
819 "ldr q13, [%x[B_ptr], #0x0]\n"
820 "add x23, x23, #0x8\n"
821 "sub x24, x24, #0x4\n"
822 "ldr q1, [%x[B_ptr], #0x10]\n"
823 "ldr q2, [%x[B_ptr], #0x20]\n"
824 "ldr q3, [%x[B_ptr], #0x30]\n"
825 "ldr q4, [%x[B_ptr], #0x40]\n"
826 "ldr q5, [%x[B_ptr], #0x50]\n"
827 "ldr q6, [x20, #0x0]\n"
828 "add %x[B_ptr], %x[B_ptr], #0x60\n"
829 ".inst 0x6e40fdae // bfdot v14.4s, v13.8h, v0.8h\n"
830 "ldr q7, [x20, #0x10]\n"
831 "ldr q8, [x20, #0x20]\n"
832 ".inst 0x6e40fc2f // bfdot v15.4s, v1.8h, v0.8h\n"
833 ".inst 0x6e40fc50 // bfdot v16.4s, v2.8h, v0.8h\n"
834 "ldr q9, [x20, #0x30]\n"
835 "ldr q10, [x20, #0x40]\n"
836 ".inst 0x6e40fc71 // bfdot v17.4s, v3.8h, v0.8h\n"
837 ".inst 0x6e40fc92 // bfdot v18.4s, v4.8h, v0.8h\n"
838 "ldr q11, [x20, #0x50]\n"
839 ".inst 0x6e40fcb3 // bfdot v19.4s, v5.8h, v0.8h\n"
840 ".inst 0x6e40fcd4 // bfdot v20.4s, v6.8h, v0.8h\n"
841 "add x20, x20, #0x60\n"
842 ".inst 0x6e40fcf5 // bfdot v21.4s, v7.8h, v0.8h\n"
843 ".inst 0x6e40fd16 // bfdot v22.4s, v8.8h, v0.8h\n"
844 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
845 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
846 ".inst 0x6e40fd37 // bfdot v23.4s, v9.8h, v0.8h\n"
847 ".inst 0x6e40fd58 // bfdot v24.4s, v10.8h, v0.8h\n"
848 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
849 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
850 ".inst 0x6e40fd79 // bfdot v25.4s, v11.8h, v0.8h\n"
851 "prfm pldl1keep, [x23, #0x80]\n"
852 "59:" // Width 6: Multiply loop: Main loop skip
853 "cbz x24, 62f\n"
854 "tbz x24, #1, 60f\n"
855 "ldr s0, [x23], #0x4\n"
856 "tbz x24, #0, 61f\n"
857 "ld1 { v0.h }[2], [x23]\n"
858 "b 61f\n"
859 "60:" // Width 6: Multiply loop: Ragged operand read: partial_1_0
860 "ldr h0, [x23, #0x0]\n"
861 "61:" // Width 6: Multiply loop: Ragged operand read: Done
862 "ldr q12, [%x[B_ptr], #0x0]\n"
863 "ldr q13, [%x[B_ptr], #0x10]\n"
864 "dup v0.2d, v0.d[0]\n"
865 "ldr q1, [%x[B_ptr], #0x20]\n"
866 "ldr q2, [%x[B_ptr], #0x30]\n"
867 "ldr q3, [%x[B_ptr], #0x40]\n"
868 "ldr q4, [%x[B_ptr], #0x50]\n"
869 "add %x[B_ptr], %x[B_ptr], #0x90\n"
870 "ldr q5, [x20, #0x0]\n"
871 "ldr q6, [x20, #0x10]\n"
872 ".inst 0x6e40fd8e // bfdot v14.4s, v12.8h, v0.8h\n"
873 ".inst 0x6e40fdaf // bfdot v15.4s, v13.8h, v0.8h\n"
874 "ldr q7, [x20, #0x20]\n"
875 "ldr q8, [x20, #0x30]\n"
876 ".inst 0x6e40fc30 // bfdot v16.4s, v1.8h, v0.8h\n"
877 ".inst 0x6e40fc51 // bfdot v17.4s, v2.8h, v0.8h\n"
878 "ldr q9, [x20, #0x40]\n"
879 "ldr q10, [x20, #0x50]\n"
880 ".inst 0x6e40fc72 // bfdot v18.4s, v3.8h, v0.8h\n"
881 ".inst 0x6e40fc93 // bfdot v19.4s, v4.8h, v0.8h\n"
882 ".inst 0x6e40fcb4 // bfdot v20.4s, v5.8h, v0.8h\n"
883 ".inst 0x6e40fcd5 // bfdot v21.4s, v6.8h, v0.8h\n"
884 ".inst 0x6e40fcf6 // bfdot v22.4s, v7.8h, v0.8h\n"
885 ".inst 0x6e40fd17 // bfdot v23.4s, v8.8h, v0.8h\n"
886 ".inst 0x6e40fd38 // bfdot v24.4s, v9.8h, v0.8h\n"
887 ".inst 0x6e40fd59 // bfdot v25.4s, v10.8h, v0.8h\n"
888 "62:" // Width 6: Multiply loop: No odd multiplies
889 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
890 "faddp v14.4s, v14.4s, v15.4s\n"
891 "faddp v15.4s, v16.4s, v17.4s\n"
892 "faddp v16.4s, v18.4s, v19.4s\n"
893 "faddp v17.4s, v20.4s, v21.4s\n"
894 "faddp v18.4s, v22.4s, v23.4s\n"
895 "faddp v19.4s, v24.4s, v25.4s\n"
896 "tbz %x[flags], #1, 63f\n"
897 "add x21, %x[args_ptr], %[offset_max]\n"
898 "add x20, %x[args_ptr], %[offset_min]\n"
899 "ld1r { v21.4s }, [x21]\n"
900 "ld1r { v20.4s }, [x20]\n"
901 "fmin v14.4s, v14.4s, v21.4s\n"
902 "fmin v15.4s, v15.4s, v21.4s\n"
903 "fmin v16.4s, v16.4s, v21.4s\n"
904 "fmin v17.4s, v17.4s, v21.4s\n"
905 "fmin v18.4s, v18.4s, v21.4s\n"
906 "fmin v19.4s, v19.4s, v21.4s\n"
907 "fmax v14.4s, v14.4s, v20.4s\n"
908 "fmax v15.4s, v15.4s, v20.4s\n"
909 "fmax v16.4s, v16.4s, v20.4s\n"
910 "fmax v17.4s, v17.4s, v20.4s\n"
911 "fmax v18.4s, v18.4s, v20.4s\n"
912 "fmax v19.4s, v19.4s, v20.4s\n"
913 "63:" // Width 6: No activation
914 "cmp %x[N], #0x18\n"
915 "str q14, [%x[output_ptr], #0x0]\n"
916 "str q15, [%x[output_ptr], #0x10]\n"
917 "str q16, [%x[output_ptr], #0x20]\n"
918 "str q17, [%x[output_ptr], #0x30]\n"
919 "str q18, [%x[output_ptr], #0x40]\n"
920 "add %x[output_ptr], %x[output_ptr], #0x50\n"
921 "blt 64f\n"
922 "str q19, [%x[output_ptr], #0x0]\n"
923 "add %x[output_ptr], %x[output_ptr], #0x10\n"
924 "b 66f\n"
925 "64:" // Width 6: Partial writeback
926 "tbz %x[N], #1, 65f\n"
927 "str d19, [%x[output_ptr]], #0x8\n"
928 "tbz %x[N], #0, 66f\n"
929 "st1 { v19.s }[2], [%x[output_ptr]]\n"
930 "b 66f\n"
931 "65:" // Width 6: Partial direct writeback: partial_1_20
932 "tbz %x[N], #0, 66f\n"
933 "str s19, [%x[output_ptr], #0x0]\n"
934 "66:" // Width 6: Writeback done
935 "b 100f\n"
936 "67:" // Width 7
937 "ldr q14, [%x[B_ptr], #0x0]\n"
938 "ldr q15, [%x[B_ptr], #0x10]\n"
939 "mov x24, %x[K]\n"
940 "add x21, %x[B_ptr], x26\n"
941 "ldr q16, [%x[B_ptr], #0x20]\n"
942 "ldr q17, [x21, #0x0]\n"
943 "add x20, %x[B_ptr], x26, LSL #1\n"
944 "movi v28.16b, #0x0\n"
945 "ldr q18, [x21, #0x10]\n"
946 "ldr q19, [x21, #0x20]\n"
947 "mov x23, %x[A_ptr]\n"
948 "add %x[B_ptr], %x[B_ptr], #0x30\n"
949 "ldr q20, [x20, #0x0]\n"
950 "cmp x24, #0x4\n"
951 "add x21, x21, #0x30\n"
952 "add x20, x20, #0x30\n"
953 "zip2 v21.4s, v17.4s, v28.4s\n"
954 "zip2 v25.4s, v19.4s, v28.4s\n"
955 "zip1 v24.4s, v19.4s, v28.4s\n"
956 "zip2 v23.4s, v18.4s, v28.4s\n"
957 "zip2 v27.4s, v20.4s, v28.4s\n"
958 "zip1 v26.4s, v20.4s, v28.4s\n"
959 "zip1 v22.4s, v18.4s, v28.4s\n"
960 "zip1 v20.4s, v17.4s, v28.4s\n"
961 "zip2 v19.4s, v16.4s, v28.4s\n"
962 "zip1 v18.4s, v16.4s, v28.4s\n"
963 "zip2 v17.4s, v15.4s, v28.4s\n"
964 "zip1 v16.4s, v15.4s, v28.4s\n"
965 "zip2 v15.4s, v14.4s, v28.4s\n"
966 "zip1 v14.4s, v14.4s, v28.4s\n"
967 "blt 70f\n"
968 "cmp x24, #0x8\n"
969 "blt 69f\n"
970 "68:" // Width 7: Multiply loop: Main loop head
971 "ld1r { v0.2d }, [x23]\n"
972 "ldr q1, [%x[B_ptr], #0x0]\n"
973 "sub x24, x24, #0x4\n"
974 "add x23, x23, #0x8\n"
975 "ldr q2, [%x[B_ptr], #0x10]\n"
976 "ldr q3, [%x[B_ptr], #0x20]\n"
977 "cmp x24, #0x8\n"
978 "ldr q4, [%x[B_ptr], #0x30]\n"
979 "ldr q5, [%x[B_ptr], #0x40]\n"
980 "ldr q6, [%x[B_ptr], #0x50]\n"
981 "ldr q7, [x21, #0x0]\n"
982 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
983 "add %x[B_ptr], %x[B_ptr], #0x60\n"
984 "ldr q8, [x21, #0x10]\n"
985 "ldr q9, [x21, #0x20]\n"
986 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
987 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
988 "ldr q10, [x21, #0x30]\n"
989 "ldr q11, [x21, #0x40]\n"
990 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
991 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
992 "ldr q12, [x21, #0x50]\n"
993 "ldr q13, [x20, #0x0]\n"
994 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
995 ".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n"
996 "ldr q1, [x20, #0x10]\n"
997 ".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n"
998 ".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n"
999 "add x21, x21, #0x60\n"
1000 ".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n"
1001 ".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n"
1002 "add x20, x20, #0x60\n"
1003 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1004 ".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n"
1005 ".inst 0x6e40fdba // bfdot v26.4s, v13.8h, v0.8h\n"
1006 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1007 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
1008 ".inst 0x6e40fc3b // bfdot v27.4s, v1.8h, v0.8h\n"
1009 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
1010 "prfm pldl1keep, [x23, #0x80]\n"
1011 "bge 68b\n"
1012 "69:" // Width 7: Multiply loop: Single iteration only
1013 "ld1r { v0.2d }, [x23]\n"
1014 "ldr q2, [%x[B_ptr], #0x0]\n"
1015 "add x23, x23, #0x8\n"
1016 "sub x24, x24, #0x4\n"
1017 "ldr q3, [%x[B_ptr], #0x10]\n"
1018 "ldr q4, [%x[B_ptr], #0x20]\n"
1019 "ldr q5, [%x[B_ptr], #0x30]\n"
1020 "ldr q6, [%x[B_ptr], #0x40]\n"
1021 "ldr q7, [%x[B_ptr], #0x50]\n"
1022 "ldr q8, [x21, #0x0]\n"
1023 ".inst 0x6e40fc4e // bfdot v14.4s, v2.8h, v0.8h\n"
1024 "add %x[B_ptr], %x[B_ptr], #0x60\n"
1025 "ldr q9, [x21, #0x10]\n"
1026 "ldr q10, [x21, #0x20]\n"
1027 ".inst 0x6e40fc6f // bfdot v15.4s, v3.8h, v0.8h\n"
1028 ".inst 0x6e40fc90 // bfdot v16.4s, v4.8h, v0.8h\n"
1029 "ldr q11, [x21, #0x30]\n"
1030 "ldr q12, [x21, #0x40]\n"
1031 ".inst 0x6e40fcb1 // bfdot v17.4s, v5.8h, v0.8h\n"
1032 ".inst 0x6e40fcd2 // bfdot v18.4s, v6.8h, v0.8h\n"
1033 "ldr q13, [x21, #0x50]\n"
1034 "ldr q1, [x20, #0x0]\n"
1035 ".inst 0x6e40fcf3 // bfdot v19.4s, v7.8h, v0.8h\n"
1036 ".inst 0x6e40fd14 // bfdot v20.4s, v8.8h, v0.8h\n"
1037 "ldr q2, [x20, #0x10]\n"
1038 ".inst 0x6e40fd35 // bfdot v21.4s, v9.8h, v0.8h\n"
1039 ".inst 0x6e40fd56 // bfdot v22.4s, v10.8h, v0.8h\n"
1040 "add x21, x21, #0x60\n"
1041 ".inst 0x6e40fd77 // bfdot v23.4s, v11.8h, v0.8h\n"
1042 ".inst 0x6e40fd98 // bfdot v24.4s, v12.8h, v0.8h\n"
1043 "add x20, x20, #0x60\n"
1044 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1045 ".inst 0x6e40fdb9 // bfdot v25.4s, v13.8h, v0.8h\n"
1046 ".inst 0x6e40fc3a // bfdot v26.4s, v1.8h, v0.8h\n"
1047 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1048 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
1049 ".inst 0x6e40fc5b // bfdot v27.4s, v2.8h, v0.8h\n"
1050 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
1051 "prfm pldl1keep, [x23, #0x80]\n"
1052 "70:" // Width 7: Multiply loop: Main loop skip
1053 "cbz x24, 73f\n"
1054 "tbz x24, #1, 71f\n"
1055 "ldr s0, [x23], #0x4\n"
1056 "tbz x24, #0, 72f\n"
1057 "ld1 { v0.h }[2], [x23]\n"
1058 "b 72f\n"
1059 "71:" // Width 7: Multiply loop: Ragged operand read: partial_1_0
1060 "ldr h0, [x23, #0x0]\n"
1061 "72:" // Width 7: Multiply loop: Ragged operand read: Done
1062 "ldr q3, [%x[B_ptr], #0x0]\n"
1063 "ldr q4, [%x[B_ptr], #0x10]\n"
1064 "dup v0.2d, v0.d[0]\n"
1065 "ldr q5, [%x[B_ptr], #0x20]\n"
1066 "ldr q6, [%x[B_ptr], #0x30]\n"
1067 "ldr q7, [%x[B_ptr], #0x40]\n"
1068 "ldr q8, [%x[B_ptr], #0x50]\n"
1069 "add %x[B_ptr], %x[B_ptr], #0x90\n"
1070 "ldr q9, [x21, #0x0]\n"
1071 "ldr q10, [x21, #0x10]\n"
1072 ".inst 0x6e40fc6e // bfdot v14.4s, v3.8h, v0.8h\n"
1073 ".inst 0x6e40fc8f // bfdot v15.4s, v4.8h, v0.8h\n"
1074 "ldr q11, [x21, #0x20]\n"
1075 "ldr q12, [x21, #0x30]\n"
1076 ".inst 0x6e40fcb0 // bfdot v16.4s, v5.8h, v0.8h\n"
1077 ".inst 0x6e40fcd1 // bfdot v17.4s, v6.8h, v0.8h\n"
1078 "ldr q13, [x21, #0x40]\n"
1079 "ldr q1, [x21, #0x50]\n"
1080 ".inst 0x6e40fcf2 // bfdot v18.4s, v7.8h, v0.8h\n"
1081 ".inst 0x6e40fd13 // bfdot v19.4s, v8.8h, v0.8h\n"
1082 "ldr q2, [x20, #0x0]\n"
1083 "ldr q3, [x20, #0x10]\n"
1084 ".inst 0x6e40fd34 // bfdot v20.4s, v9.8h, v0.8h\n"
1085 ".inst 0x6e40fd55 // bfdot v21.4s, v10.8h, v0.8h\n"
1086 ".inst 0x6e40fd76 // bfdot v22.4s, v11.8h, v0.8h\n"
1087 ".inst 0x6e40fd97 // bfdot v23.4s, v12.8h, v0.8h\n"
1088 ".inst 0x6e40fdb8 // bfdot v24.4s, v13.8h, v0.8h\n"
1089 ".inst 0x6e40fc39 // bfdot v25.4s, v1.8h, v0.8h\n"
1090 ".inst 0x6e40fc5a // bfdot v26.4s, v2.8h, v0.8h\n"
1091 ".inst 0x6e40fc7b // bfdot v27.4s, v3.8h, v0.8h\n"
1092 "73:" // Width 7: Multiply loop: No odd multiplies
1093 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1094 "faddp v14.4s, v14.4s, v15.4s\n"
1095 "faddp v15.4s, v16.4s, v17.4s\n"
1096 "faddp v16.4s, v18.4s, v19.4s\n"
1097 "faddp v17.4s, v20.4s, v21.4s\n"
1098 "faddp v18.4s, v22.4s, v23.4s\n"
1099 "faddp v19.4s, v24.4s, v25.4s\n"
1100 "faddp v20.4s, v26.4s, v27.4s\n"
1101 "tbz %x[flags], #1, 74f\n"
1102 "add x21, %x[args_ptr], %[offset_max]\n"
1103 "add x20, %x[args_ptr], %[offset_min]\n"
1104 "ld1r { v22.4s }, [x21]\n"
1105 "ld1r { v21.4s }, [x20]\n"
1106 "fmin v14.4s, v14.4s, v22.4s\n"
1107 "fmin v15.4s, v15.4s, v22.4s\n"
1108 "fmin v16.4s, v16.4s, v22.4s\n"
1109 "fmin v17.4s, v17.4s, v22.4s\n"
1110 "fmin v18.4s, v18.4s, v22.4s\n"
1111 "fmin v19.4s, v19.4s, v22.4s\n"
1112 "fmin v20.4s, v20.4s, v22.4s\n"
1113 "fmax v14.4s, v14.4s, v21.4s\n"
1114 "fmax v15.4s, v15.4s, v21.4s\n"
1115 "fmax v16.4s, v16.4s, v21.4s\n"
1116 "fmax v17.4s, v17.4s, v21.4s\n"
1117 "fmax v18.4s, v18.4s, v21.4s\n"
1118 "fmax v19.4s, v19.4s, v21.4s\n"
1119 "fmax v20.4s, v20.4s, v21.4s\n"
1120 "74:" // Width 7: No activation
1121 "cmp %x[N], #0x1c\n"
1122 "str q14, [%x[output_ptr], #0x0]\n"
1123 "str q15, [%x[output_ptr], #0x10]\n"
1124 "str q16, [%x[output_ptr], #0x20]\n"
1125 "str q17, [%x[output_ptr], #0x30]\n"
1126 "str q18, [%x[output_ptr], #0x40]\n"
1127 "str q19, [%x[output_ptr], #0x50]\n"
1128 "add %x[output_ptr], %x[output_ptr], #0x60\n"
1129 "blt 75f\n"
1130 "str q20, [%x[output_ptr], #0x0]\n"
1131 "add %x[output_ptr], %x[output_ptr], #0x10\n"
1132 "b 77f\n"
1133 "75:" // Width 7: Partial writeback
1134 "tbz %x[N], #1, 76f\n"
1135 "str d20, [%x[output_ptr]], #0x8\n"
1136 "tbz %x[N], #0, 77f\n"
1137 "st1 { v20.s }[2], [%x[output_ptr]]\n"
1138 "b 77f\n"
1139 "76:" // Width 7: Partial direct writeback: partial_1_24
1140 "tbz %x[N], #0, 77f\n"
1141 "str s20, [%x[output_ptr], #0x0]\n"
1142 "77:" // Width 7: Writeback done
1143 "b 100f\n"
1144 "78:" // Width 8
1145 "ldr q14, [%x[B_ptr], #0x0]\n"
1146 "ldr q15, [%x[B_ptr], #0x10]\n"
1147 "mov x24, %x[K]\n"
1148 "add x21, %x[B_ptr], x26\n"
1149 "ldr q16, [%x[B_ptr], #0x20]\n"
1150 "ldr q17, [x21, #0x0]\n"
1151 "add x20, %x[B_ptr], x26, LSL #1\n"
1152 "movi v30.16b, #0x0\n"
1153 "ldr q18, [x21, #0x10]\n"
1154 "ldr q19, [x21, #0x20]\n"
1155 "mov x23, %x[A_ptr]\n"
1156 "add %x[B_ptr], %x[B_ptr], #0x30\n"
1157 "ldr q20, [x20, #0x0]\n"
1158 "ldr q21, [x20, #0x10]\n"
1159 "cmp x24, #0x4\n"
1160 "add x21, x21, #0x30\n"
1161 "add x20, x20, #0x30\n"
1162 "zip2 v25.4s, v19.4s, v30.4s\n"
1163 "zip1 v24.4s, v19.4s, v30.4s\n"
1164 "zip2 v23.4s, v18.4s, v30.4s\n"
1165 "zip1 v22.4s, v18.4s, v30.4s\n"
1166 "zip2 v29.4s, v21.4s, v30.4s\n"
1167 "zip1 v28.4s, v21.4s, v30.4s\n"
1168 "zip2 v27.4s, v20.4s, v30.4s\n"
1169 "zip1 v26.4s, v20.4s, v30.4s\n"
1170 "zip2 v21.4s, v17.4s, v30.4s\n"
1171 "zip1 v20.4s, v17.4s, v30.4s\n"
1172 "zip2 v19.4s, v16.4s, v30.4s\n"
1173 "zip1 v18.4s, v16.4s, v30.4s\n"
1174 "zip2 v17.4s, v15.4s, v30.4s\n"
1175 "zip1 v16.4s, v15.4s, v30.4s\n"
1176 "zip2 v15.4s, v14.4s, v30.4s\n"
1177 "zip1 v14.4s, v14.4s, v30.4s\n"
1178 "blt 81f\n"
1179 "cmp x24, #0x8\n"
1180 "blt 80f\n"
1181 "79:" // Width 8: Multiply loop: Main loop head
1182 "ld1r { v0.2d }, [x23]\n"
1183 "ldr q1, [%x[B_ptr], #0x0]\n"
1184 "sub x24, x24, #0x4\n"
1185 "add x23, x23, #0x8\n"
1186 "ldr q2, [%x[B_ptr], #0x10]\n"
1187 "ldr q3, [%x[B_ptr], #0x20]\n"
1188 "cmp x24, #0x8\n"
1189 "ldr q4, [%x[B_ptr], #0x30]\n"
1190 "ldr q5, [%x[B_ptr], #0x40]\n"
1191 "ldr q6, [%x[B_ptr], #0x50]\n"
1192 "ldr q7, [x21, #0x0]\n"
1193 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
1194 "add %x[B_ptr], %x[B_ptr], #0x60\n"
1195 "ldr q8, [x21, #0x10]\n"
1196 "ldr q9, [x21, #0x20]\n"
1197 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
1198 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
1199 "ldr q10, [x21, #0x30]\n"
1200 "ldr q11, [x21, #0x40]\n"
1201 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
1202 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
1203 "ldr q12, [x21, #0x50]\n"
1204 "ldr q13, [x20, #0x0]\n"
1205 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
1206 ".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n"
1207 "ldr q1, [x20, #0x10]\n"
1208 "ldr q2, [x20, #0x20]\n"
1209 ".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n"
1210 ".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n"
1211 "ldr q3, [x20, #0x30]\n"
1212 ".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n"
1213 ".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n"
1214 "add x21, x21, #0x60\n"
1215 ".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n"
1216 ".inst 0x6e40fdba // bfdot v26.4s, v13.8h, v0.8h\n"
1217 "add x20, x20, #0x60\n"
1218 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1219 ".inst 0x6e40fc3b // bfdot v27.4s, v1.8h, v0.8h\n"
1220 ".inst 0x6e40fc5c // bfdot v28.4s, v2.8h, v0.8h\n"
1221 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1222 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
1223 ".inst 0x6e40fc7d // bfdot v29.4s, v3.8h, v0.8h\n"
1224 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
1225 "prfm pldl1keep, [x23, #0x80]\n"
1226 "bge 79b\n"
1227 "80:" // Width 8: Multiply loop: Single iteration only
1228 "ld1r { v0.2d }, [x23]\n"
1229 "ldr q4, [%x[B_ptr], #0x0]\n"
1230 "add x23, x23, #0x8\n"
1231 "sub x24, x24, #0x4\n"
1232 "ldr q5, [%x[B_ptr], #0x10]\n"
1233 "ldr q6, [%x[B_ptr], #0x20]\n"
1234 "ldr q7, [%x[B_ptr], #0x30]\n"
1235 "ldr q8, [%x[B_ptr], #0x40]\n"
1236 "ldr q9, [%x[B_ptr], #0x50]\n"
1237 "ldr q10, [x21, #0x0]\n"
1238 ".inst 0x6e40fc8e // bfdot v14.4s, v4.8h, v0.8h\n"
1239 "add %x[B_ptr], %x[B_ptr], #0x60\n"
1240 "ldr q11, [x21, #0x10]\n"
1241 "ldr q12, [x21, #0x20]\n"
1242 ".inst 0x6e40fcaf // bfdot v15.4s, v5.8h, v0.8h\n"
1243 ".inst 0x6e40fcd0 // bfdot v16.4s, v6.8h, v0.8h\n"
1244 "ldr q13, [x21, #0x30]\n"
1245 "ldr q1, [x21, #0x40]\n"
1246 ".inst 0x6e40fcf1 // bfdot v17.4s, v7.8h, v0.8h\n"
1247 ".inst 0x6e40fd12 // bfdot v18.4s, v8.8h, v0.8h\n"
1248 "ldr q2, [x21, #0x50]\n"
1249 "ldr q3, [x20, #0x0]\n"
1250 ".inst 0x6e40fd33 // bfdot v19.4s, v9.8h, v0.8h\n"
1251 ".inst 0x6e40fd54 // bfdot v20.4s, v10.8h, v0.8h\n"
1252 "ldr q4, [x20, #0x10]\n"
1253 "ldr q5, [x20, #0x20]\n"
1254 ".inst 0x6e40fd75 // bfdot v21.4s, v11.8h, v0.8h\n"
1255 ".inst 0x6e40fd96 // bfdot v22.4s, v12.8h, v0.8h\n"
1256 "ldr q6, [x20, #0x30]\n"
1257 ".inst 0x6e40fdb7 // bfdot v23.4s, v13.8h, v0.8h\n"
1258 ".inst 0x6e40fc38 // bfdot v24.4s, v1.8h, v0.8h\n"
1259 "add x21, x21, #0x60\n"
1260 ".inst 0x6e40fc59 // bfdot v25.4s, v2.8h, v0.8h\n"
1261 ".inst 0x6e40fc7a // bfdot v26.4s, v3.8h, v0.8h\n"
1262 "add x20, x20, #0x60\n"
1263 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1264 ".inst 0x6e40fc9b // bfdot v27.4s, v4.8h, v0.8h\n"
1265 ".inst 0x6e40fcbc // bfdot v28.4s, v5.8h, v0.8h\n"
1266 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1267 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
1268 ".inst 0x6e40fcdd // bfdot v29.4s, v6.8h, v0.8h\n"
1269 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
1270 "prfm pldl1keep, [x23, #0x80]\n"
1271 "81:" // Width 8: Multiply loop: Main loop skip
1272 "cbz x24, 84f\n"
1273 "tbz x24, #1, 82f\n"
1274 "ldr s0, [x23], #0x4\n"
1275 "tbz x24, #0, 83f\n"
1276 "ld1 { v0.h }[2], [x23]\n"
1277 "b 83f\n"
1278 "82:" // Width 8: Multiply loop: Ragged operand read: partial_1_0
1279 "ldr h0, [x23, #0x0]\n"
1280 "83:" // Width 8: Multiply loop: Ragged operand read: Done
1281 "ldr q7, [%x[B_ptr], #0x0]\n"
1282 "ldr q8, [%x[B_ptr], #0x10]\n"
1283 "dup v0.2d, v0.d[0]\n"
1284 "ldr q9, [%x[B_ptr], #0x20]\n"
1285 "ldr q10, [%x[B_ptr], #0x30]\n"
1286 "ldr q11, [%x[B_ptr], #0x40]\n"
1287 "ldr q12, [%x[B_ptr], #0x50]\n"
1288 "add %x[B_ptr], %x[B_ptr], #0x90\n"
1289 "ldr q13, [x21, #0x0]\n"
1290 "ldr q1, [x21, #0x10]\n"
1291 ".inst 0x6e40fcee // bfdot v14.4s, v7.8h, v0.8h\n"
1292 ".inst 0x6e40fd0f // bfdot v15.4s, v8.8h, v0.8h\n"
1293 "ldr q2, [x21, #0x20]\n"
1294 "ldr q3, [x21, #0x30]\n"
1295 ".inst 0x6e40fd30 // bfdot v16.4s, v9.8h, v0.8h\n"
1296 ".inst 0x6e40fd51 // bfdot v17.4s, v10.8h, v0.8h\n"
1297 "ldr q4, [x21, #0x40]\n"
1298 "ldr q5, [x21, #0x50]\n"
1299 ".inst 0x6e40fd72 // bfdot v18.4s, v11.8h, v0.8h\n"
1300 ".inst 0x6e40fd93 // bfdot v19.4s, v12.8h, v0.8h\n"
1301 "ldr q6, [x20, #0x0]\n"
1302 "ldr q7, [x20, #0x10]\n"
1303 ".inst 0x6e40fdb4 // bfdot v20.4s, v13.8h, v0.8h\n"
1304 ".inst 0x6e40fc35 // bfdot v21.4s, v1.8h, v0.8h\n"
1305 "ldr q8, [x20, #0x20]\n"
1306 "ldr q9, [x20, #0x30]\n"
1307 ".inst 0x6e40fc56 // bfdot v22.4s, v2.8h, v0.8h\n"
1308 ".inst 0x6e40fc77 // bfdot v23.4s, v3.8h, v0.8h\n"
1309 ".inst 0x6e40fc98 // bfdot v24.4s, v4.8h, v0.8h\n"
1310 ".inst 0x6e40fcb9 // bfdot v25.4s, v5.8h, v0.8h\n"
1311 ".inst 0x6e40fcda // bfdot v26.4s, v6.8h, v0.8h\n"
1312 ".inst 0x6e40fcfb // bfdot v27.4s, v7.8h, v0.8h\n"
1313 ".inst 0x6e40fd1c // bfdot v28.4s, v8.8h, v0.8h\n"
1314 ".inst 0x6e40fd3d // bfdot v29.4s, v9.8h, v0.8h\n"
1315 "84:" // Width 8: Multiply loop: No odd multiplies
1316 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1317 "faddp v14.4s, v14.4s, v15.4s\n"
1318 "faddp v15.4s, v16.4s, v17.4s\n"
1319 "faddp v16.4s, v18.4s, v19.4s\n"
1320 "faddp v17.4s, v20.4s, v21.4s\n"
1321 "faddp v18.4s, v22.4s, v23.4s\n"
1322 "faddp v19.4s, v24.4s, v25.4s\n"
1323 "faddp v20.4s, v26.4s, v27.4s\n"
1324 "faddp v21.4s, v28.4s, v29.4s\n"
1325 "tbz %x[flags], #1, 85f\n"
1326 "add x21, %x[args_ptr], %[offset_max]\n"
1327 "add x20, %x[args_ptr], %[offset_min]\n"
1328 "ld1r { v23.4s }, [x21]\n"
1329 "ld1r { v22.4s }, [x20]\n"
1330 "fmin v14.4s, v14.4s, v23.4s\n"
1331 "fmin v15.4s, v15.4s, v23.4s\n"
1332 "fmin v16.4s, v16.4s, v23.4s\n"
1333 "fmin v17.4s, v17.4s, v23.4s\n"
1334 "fmin v18.4s, v18.4s, v23.4s\n"
1335 "fmin v19.4s, v19.4s, v23.4s\n"
1336 "fmin v20.4s, v20.4s, v23.4s\n"
1337 "fmin v21.4s, v21.4s, v23.4s\n"
1338 "fmax v14.4s, v14.4s, v22.4s\n"
1339 "fmax v15.4s, v15.4s, v22.4s\n"
1340 "fmax v16.4s, v16.4s, v22.4s\n"
1341 "fmax v17.4s, v17.4s, v22.4s\n"
1342 "fmax v18.4s, v18.4s, v22.4s\n"
1343 "fmax v19.4s, v19.4s, v22.4s\n"
1344 "fmax v20.4s, v20.4s, v22.4s\n"
1345 "fmax v21.4s, v21.4s, v22.4s\n"
1346 "85:" // Width 8: No activation
1347 "cmp %x[N], #0x20\n"
1348 "str q14, [%x[output_ptr], #0x0]\n"
1349 "str q15, [%x[output_ptr], #0x10]\n"
1350 "str q16, [%x[output_ptr], #0x20]\n"
1351 "str q17, [%x[output_ptr], #0x30]\n"
1352 "str q18, [%x[output_ptr], #0x40]\n"
1353 "str q19, [%x[output_ptr], #0x50]\n"
1354 "str q20, [%x[output_ptr], #0x60]\n"
1355 "add %x[output_ptr], %x[output_ptr], #0x70\n"
1356 "blt 86f\n"
1357 "str q21, [%x[output_ptr], #0x0]\n"
1358 "add %x[output_ptr], %x[output_ptr], #0x10\n"
1359 "b 88f\n"
1360 "86:" // Width 8: Partial writeback
1361 "tbz %x[N], #1, 87f\n"
1362 "str d21, [%x[output_ptr]], #0x8\n"
1363 "tbz %x[N], #0, 88f\n"
1364 "st1 { v21.s }[2], [%x[output_ptr]]\n"
1365 "b 88f\n"
1366 "87:" // Width 8: Partial direct writeback: partial_1_28
1367 "tbz %x[N], #0, 88f\n"
1368 "str s21, [%x[output_ptr], #0x0]\n"
1369 "88:" // Width 8: Writeback done
1370 "b 100f\n"
1371 "89:" // Width 9
1372 "ldr q14, [%x[B_ptr], #0x0]\n"
1373 "ldr q15, [%x[B_ptr], #0x10]\n"
1374 "mov x24, %x[K]\n"
1375 "add x21, %x[B_ptr], x26\n"
1376 "ldr q16, [%x[B_ptr], #0x20]\n"
1377 "ldr q17, [x21, #0x0]\n"
1378 "add x20, %x[B_ptr], x26, LSL #1\n"
1379 "movi v0.16b, #0x0\n"
1380 "ldr q18, [x21, #0x10]\n"
1381 "ldr q19, [x21, #0x20]\n"
1382 "mov x23, %x[A_ptr]\n"
1383 "add x22, x20, x26\n"
1384 "ldr q20, [x20, #0x0]\n"
1385 "ldr q21, [x20, #0x10]\n"
1386 "cmp x24, #0x4\n"
1387 "add %x[B_ptr], %x[B_ptr], #0x30\n"
1388 "ldr q22, [x20, #0x20]\n"
1389 "add x21, x21, #0x30\n"
1390 "add x20, x20, #0x30\n"
1391 "zip2 v25.4s, v19.4s, v0.4s\n"
1392 "zip1 v24.4s, v19.4s, v0.4s\n"
1393 "zip2 v23.4s, v18.4s, v0.4s\n"
1394 "zip2 v29.4s, v21.4s, v0.4s\n"
1395 "zip1 v28.4s, v21.4s, v0.4s\n"
1396 "zip2 v31.4s, v22.4s, v0.4s\n"
1397 "zip1 v30.4s, v22.4s, v0.4s\n"
1398 "zip2 v27.4s, v20.4s, v0.4s\n"
1399 "zip1 v26.4s, v20.4s, v0.4s\n"
1400 "zip1 v22.4s, v18.4s, v0.4s\n"
1401 "zip2 v21.4s, v17.4s, v0.4s\n"
1402 "zip1 v20.4s, v17.4s, v0.4s\n"
1403 "zip2 v19.4s, v16.4s, v0.4s\n"
1404 "zip1 v18.4s, v16.4s, v0.4s\n"
1405 "zip2 v17.4s, v15.4s, v0.4s\n"
1406 "zip1 v16.4s, v15.4s, v0.4s\n"
1407 "zip2 v15.4s, v14.4s, v0.4s\n"
1408 "zip1 v14.4s, v14.4s, v0.4s\n"
1409 "blt 92f\n"
1410 "cmp x24, #0x8\n"
1411 "blt 91f\n"
1412 "90:" // Width 9: Multiply loop: Main loop head
1413 "ld1r { v0.2d }, [x23]\n"
1414 "ldr q1, [%x[B_ptr], #0x0]\n"
1415 "sub x24, x24, #0x4\n"
1416 "add x23, x23, #0x8\n"
1417 "ldr q2, [%x[B_ptr], #0x10]\n"
1418 "ldr q3, [%x[B_ptr], #0x20]\n"
1419 "cmp x24, #0x8\n"
1420 "ldr q4, [%x[B_ptr], #0x30]\n"
1421 "ldr q5, [%x[B_ptr], #0x40]\n"
1422 "ldr q6, [%x[B_ptr], #0x50]\n"
1423 "ldr q7, [x21, #0x0]\n"
1424 ".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n"
1425 "add %x[B_ptr], %x[B_ptr], #0x60\n"
1426 "ldr q8, [x21, #0x10]\n"
1427 "ldr q9, [x21, #0x20]\n"
1428 ".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n"
1429 ".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n"
1430 "ldr q10, [x21, #0x30]\n"
1431 "ldr q11, [x21, #0x40]\n"
1432 ".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n"
1433 ".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n"
1434 "ldr q12, [x21, #0x50]\n"
1435 "ldr q13, [x20, #0x0]\n"
1436 ".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n"
1437 ".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n"
1438 "ldr q1, [x20, #0x10]\n"
1439 "ldr q2, [x20, #0x20]\n"
1440 ".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n"
1441 ".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n"
1442 "ldr q3, [x20, #0x30]\n"
1443 "ldr q4, [x20, #0x40]\n"
1444 ".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n"
1445 ".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n"
1446 "ldr q5, [x20, #0x50]\n"
1447 ".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n"
1448 ".inst 0x6e40fdba // bfdot v26.4s, v13.8h, v0.8h\n"
1449 "add x21, x21, #0x60\n"
1450 ".inst 0x6e40fc3b // bfdot v27.4s, v1.8h, v0.8h\n"
1451 ".inst 0x6e40fc5c // bfdot v28.4s, v2.8h, v0.8h\n"
1452 "add x20, x20, #0x60\n"
1453 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1454 ".inst 0x6e40fc7d // bfdot v29.4s, v3.8h, v0.8h\n"
1455 ".inst 0x6e40fc9e // bfdot v30.4s, v4.8h, v0.8h\n"
1456 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1457 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
1458 ".inst 0x6e40fcbf // bfdot v31.4s, v5.8h, v0.8h\n"
1459 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
1460 "prfm pldl1keep, [x23, #0x80]\n"
1461 "bge 90b\n"
1462 "91:" // Width 9: Multiply loop: Single iteration only
1463 "ld1r { v0.2d }, [x23]\n"
1464 "ldr q6, [%x[B_ptr], #0x0]\n"
1465 "add x23, x23, #0x8\n"
1466 "sub x24, x24, #0x4\n"
1467 "ldr q7, [%x[B_ptr], #0x10]\n"
1468 "ldr q8, [%x[B_ptr], #0x20]\n"
1469 "ldr q9, [%x[B_ptr], #0x30]\n"
1470 "ldr q10, [%x[B_ptr], #0x40]\n"
1471 "ldr q11, [%x[B_ptr], #0x50]\n"
1472 "ldr q12, [x21, #0x0]\n"
1473 ".inst 0x6e40fcce // bfdot v14.4s, v6.8h, v0.8h\n"
1474 "add %x[B_ptr], %x[B_ptr], #0x60\n"
1475 "ldr q13, [x21, #0x10]\n"
1476 "ldr q1, [x21, #0x20]\n"
1477 ".inst 0x6e40fcef // bfdot v15.4s, v7.8h, v0.8h\n"
1478 ".inst 0x6e40fd10 // bfdot v16.4s, v8.8h, v0.8h\n"
1479 "ldr q2, [x21, #0x30]\n"
1480 "ldr q3, [x21, #0x40]\n"
1481 ".inst 0x6e40fd31 // bfdot v17.4s, v9.8h, v0.8h\n"
1482 ".inst 0x6e40fd52 // bfdot v18.4s, v10.8h, v0.8h\n"
1483 "ldr q4, [x21, #0x50]\n"
1484 "ldr q5, [x20, #0x0]\n"
1485 ".inst 0x6e40fd73 // bfdot v19.4s, v11.8h, v0.8h\n"
1486 ".inst 0x6e40fd94 // bfdot v20.4s, v12.8h, v0.8h\n"
1487 "ldr q6, [x20, #0x10]\n"
1488 "ldr q7, [x20, #0x20]\n"
1489 ".inst 0x6e40fdb5 // bfdot v21.4s, v13.8h, v0.8h\n"
1490 ".inst 0x6e40fc36 // bfdot v22.4s, v1.8h, v0.8h\n"
1491 "ldr q8, [x20, #0x30]\n"
1492 "ldr q9, [x20, #0x40]\n"
1493 ".inst 0x6e40fc57 // bfdot v23.4s, v2.8h, v0.8h\n"
1494 ".inst 0x6e40fc78 // bfdot v24.4s, v3.8h, v0.8h\n"
1495 "ldr q10, [x20, #0x50]\n"
1496 ".inst 0x6e40fc99 // bfdot v25.4s, v4.8h, v0.8h\n"
1497 ".inst 0x6e40fcba // bfdot v26.4s, v5.8h, v0.8h\n"
1498 "add x21, x21, #0x60\n"
1499 ".inst 0x6e40fcdb // bfdot v27.4s, v6.8h, v0.8h\n"
1500 ".inst 0x6e40fcfc // bfdot v28.4s, v7.8h, v0.8h\n"
1501 "add x20, x20, #0x60\n"
1502 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1503 ".inst 0x6e40fd1d // bfdot v29.4s, v8.8h, v0.8h\n"
1504 ".inst 0x6e40fd3e // bfdot v30.4s, v9.8h, v0.8h\n"
1505 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1506 "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
1507 ".inst 0x6e40fd5f // bfdot v31.4s, v10.8h, v0.8h\n"
1508 "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
1509 "prfm pldl1keep, [x23, #0x80]\n"
1510 "92:" // Width 9: Multiply loop: Main loop skip
1511 "cbz x24, 95f\n"
1512 "tbz x24, #1, 93f\n"
1513 "ldr s0, [x23], #0x4\n"
1514 "tbz x24, #0, 94f\n"
1515 "ld1 { v0.h }[2], [x23]\n"
1516 "b 94f\n"
1517 "93:" // Width 9: Multiply loop: Ragged operand read: partial_1_0
1518 "ldr h0, [x23, #0x0]\n"
1519 "94:" // Width 9: Multiply loop: Ragged operand read: Done
1520 "ldr q11, [%x[B_ptr], #0x0]\n"
1521 "ldr q12, [%x[B_ptr], #0x10]\n"
1522 "dup v0.2d, v0.d[0]\n"
1523 "ldr q13, [%x[B_ptr], #0x20]\n"
1524 "ldr q1, [%x[B_ptr], #0x30]\n"
1525 "ldr q2, [%x[B_ptr], #0x40]\n"
1526 "ldr q3, [%x[B_ptr], #0x50]\n"
1527 "add %x[B_ptr], %x[B_ptr], #0x90\n"
1528 "ldr q4, [x21, #0x0]\n"
1529 "ldr q5, [x21, #0x10]\n"
1530 ".inst 0x6e40fd6e // bfdot v14.4s, v11.8h, v0.8h\n"
1531 ".inst 0x6e40fd8f // bfdot v15.4s, v12.8h, v0.8h\n"
1532 "ldr q6, [x21, #0x20]\n"
1533 "ldr q7, [x21, #0x30]\n"
1534 ".inst 0x6e40fdb0 // bfdot v16.4s, v13.8h, v0.8h\n"
1535 ".inst 0x6e40fc31 // bfdot v17.4s, v1.8h, v0.8h\n"
1536 "ldr q8, [x21, #0x40]\n"
1537 "ldr q9, [x21, #0x50]\n"
1538 ".inst 0x6e40fc52 // bfdot v18.4s, v2.8h, v0.8h\n"
1539 ".inst 0x6e40fc73 // bfdot v19.4s, v3.8h, v0.8h\n"
1540 "ldr q10, [x20, #0x0]\n"
1541 "ldr q11, [x20, #0x10]\n"
1542 ".inst 0x6e40fc94 // bfdot v20.4s, v4.8h, v0.8h\n"
1543 ".inst 0x6e40fcb5 // bfdot v21.4s, v5.8h, v0.8h\n"
1544 "ldr q12, [x20, #0x20]\n"
1545 "ldr q13, [x20, #0x30]\n"
1546 ".inst 0x6e40fcd6 // bfdot v22.4s, v6.8h, v0.8h\n"
1547 ".inst 0x6e40fcf7 // bfdot v23.4s, v7.8h, v0.8h\n"
1548 "ldr q1, [x20, #0x40]\n"
1549 "ldr q2, [x20, #0x50]\n"
1550 ".inst 0x6e40fd18 // bfdot v24.4s, v8.8h, v0.8h\n"
1551 ".inst 0x6e40fd39 // bfdot v25.4s, v9.8h, v0.8h\n"
1552 ".inst 0x6e40fd5a // bfdot v26.4s, v10.8h, v0.8h\n"
1553 ".inst 0x6e40fd7b // bfdot v27.4s, v11.8h, v0.8h\n"
1554 ".inst 0x6e40fd9c // bfdot v28.4s, v12.8h, v0.8h\n"
1555 ".inst 0x6e40fdbd // bfdot v29.4s, v13.8h, v0.8h\n"
1556 ".inst 0x6e40fc3e // bfdot v30.4s, v1.8h, v0.8h\n"
1557 ".inst 0x6e40fc5f // bfdot v31.4s, v2.8h, v0.8h\n"
1558 "95:" // Width 9: Multiply loop: No odd multiplies
1559 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1560 "faddp v14.4s, v14.4s, v15.4s\n"
1561 "faddp v15.4s, v16.4s, v17.4s\n"
1562 "faddp v16.4s, v18.4s, v19.4s\n"
1563 "faddp v17.4s, v20.4s, v21.4s\n"
1564 "faddp v18.4s, v22.4s, v23.4s\n"
1565 "faddp v19.4s, v24.4s, v25.4s\n"
1566 "faddp v20.4s, v26.4s, v27.4s\n"
1567 "faddp v21.4s, v28.4s, v29.4s\n"
1568 "faddp v22.4s, v30.4s, v31.4s\n"
1569 "tbz %x[flags], #1, 96f\n"
1570 "add x21, %x[args_ptr], %[offset_max]\n"
1571 "add x20, %x[args_ptr], %[offset_min]\n"
1572 "ld1r { v24.4s }, [x21]\n"
1573 "ld1r { v23.4s }, [x20]\n"
1574 "fmin v14.4s, v14.4s, v24.4s\n"
1575 "fmin v15.4s, v15.4s, v24.4s\n"
1576 "fmin v16.4s, v16.4s, v24.4s\n"
1577 "fmin v17.4s, v17.4s, v24.4s\n"
1578 "fmin v18.4s, v18.4s, v24.4s\n"
1579 "fmin v19.4s, v19.4s, v24.4s\n"
1580 "fmin v20.4s, v20.4s, v24.4s\n"
1581 "fmin v21.4s, v21.4s, v24.4s\n"
1582 "fmin v22.4s, v22.4s, v24.4s\n"
1583 "fmax v14.4s, v14.4s, v23.4s\n"
1584 "fmax v15.4s, v15.4s, v23.4s\n"
1585 "fmax v16.4s, v16.4s, v23.4s\n"
1586 "fmax v17.4s, v17.4s, v23.4s\n"
1587 "fmax v18.4s, v18.4s, v23.4s\n"
1588 "fmax v19.4s, v19.4s, v23.4s\n"
1589 "fmax v20.4s, v20.4s, v23.4s\n"
1590 "fmax v21.4s, v21.4s, v23.4s\n"
1591 "fmax v22.4s, v22.4s, v23.4s\n"
1592 "96:" // Width 9: No activation
1593 "cmp %x[N], #0x24\n"
1594 "str q14, [%x[output_ptr], #0x0]\n"
1595 "str q15, [%x[output_ptr], #0x10]\n"
1596 "str q16, [%x[output_ptr], #0x20]\n"
1597 "str q17, [%x[output_ptr], #0x30]\n"
1598 "str q18, [%x[output_ptr], #0x40]\n"
1599 "str q19, [%x[output_ptr], #0x50]\n"
1600 "str q20, [%x[output_ptr], #0x60]\n"
1601 "str q21, [%x[output_ptr], #0x70]\n"
1602 "add %x[output_ptr], %x[output_ptr], #0x80\n"
1603 "blt 97f\n"
1604 "str q22, [%x[output_ptr], #0x0]\n"
1605 "add %x[output_ptr], %x[output_ptr], #0x10\n"
1606 "b 99f\n"
1607 "97:" // Width 9: Partial writeback
1608 "tbz %x[N], #1, 98f\n"
1609 "str d22, [%x[output_ptr]], #0x8\n"
1610 "tbz %x[N], #0, 99f\n"
1611 "st1 { v22.s }[2], [%x[output_ptr]]\n"
1612 "b 99f\n"
1613 "98:" // Width 9: Partial direct writeback: partial_1_32
1614 "tbz %x[N], #0, 99f\n"
1615 "str s22, [%x[output_ptr], #0x0]\n"
1616 "99:" // Width 9: Writeback done
1617 "subs x25, x25, #0x9\n"
1618 "mov %x[B_ptr], x22\n"
1619 "sub %x[N], %x[N], #0x24\n"
1620 "bgt 1b\n"
1621 "100:" // Exit
1622 : [B_ptr] "+&r"(B_ptr), [N] "+&r"(N), [output_ptr] "+&r"(output_ptr)
1623 582 : [A_ptr] "r"(A_ptr), [K] "r"(K), [args_ptr] "r"(&ka), [flags] "r"(flags),
1624 [offset_max] "I"(offsetof(KernelArgs, maxval)), [offset_min] "I"(offsetof(KernelArgs, minval))
1625 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
1626 "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
1627 "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
1628 582 }
1629
1630 #endif // Architectural features check.
1631