KleidiAI Coverage Report


Directory: ./
File: kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 100.0% 38 5 43
Functions: 100.0% 10 0 10
Branches: -% 0 10 10

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 // Suppress -Woverlength-strings: the inline assembly in this file is written as very long string literals.
8 #pragma GCC diagnostic ignored "-Woverlength-strings"
9
10 #if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \
11 !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
12 #error This file must be compiled for AArch64, FEAT_FP16.
13 #else // Architectural features check.
14
15 #include "kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h"
16
17 #include <arm_neon.h>
18 #include <stddef.h>
19 #include <stdint.h>
20
21 #include "kai/kai_common.h"
22
// Blocking constants of this micro-kernel. The getter functions that follow return them unchanged.
// kai_mr and kai_nr are also the multiples that m_idx and n_idx must satisfy in the offset helpers.
23 static const size_t kai_mr = 6;   // returned by the m_step getter
24 static const size_t kai_nr = 16;  // returned by both the n_step and nr getters
25 static const size_t kai_kr = 1;   // returned by the kr getter
26 static const size_t kai_sr = 1;   // returned by the sr getter
27
// Returns kai_mr (6): the row granularity that the LHS and DST offset helpers in this file
// require m_idx to be a multiple of.
28 18 size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
29 18 return kai_mr;
30 }
31
// Returns kai_nr (16): the column granularity that the packed-RHS and DST offset helpers in this
// file require n_idx to be a multiple of.
32 18 size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
33 18 return kai_nr;
34 }
35
// Returns kai_nr (16), the same value as the n_step getter. The packed-RHS offset helper in this
// file treats the packed buffer as consecutive groups of kai_nr columns.
// NOTE(review): presumably the value the RHS packing routine must be called with - confirm against the header.
36 17 size_t kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
37 17 return kai_nr;
38 }
39
// Returns kai_kr (1).
// NOTE(review): presumably the K-dimension packing factor expected by the RHS packing routine - confirm
// against the header; nothing else in the visible code reads kai_kr.
40 17 size_t kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
41 17 return kai_kr;
42 }
43
// Returns kai_sr (1).
// NOTE(review): presumably a packing parameter consumed by the RHS packing routine - confirm against
// the header; nothing else in the visible code reads kai_sr.
44 17 size_t kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) {
45 17 return kai_sr;
46 }
47
// Returns the offset of row m_idx within the LHS matrix, computed as m_idx * stride.
//   m_idx:  row index; expected to be a multiple of kai_mr (6), see the KAI_ASSUME below.
//   stride: distance between consecutive LHS rows; the result is in the same unit.
// NOTE(review): stride is presumably in bytes, matching the run function, which divides lhs_stride by
// sizeof(uint16_t) to obtain an element count - confirm against the header.
48 16 size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m_idx, size_t stride) {
49 KAI_ASSUME(m_idx % kai_mr == 0);
50
51 16 return m_idx * stride;
52 }
53
// Returns the byte offset into the packed RHS buffer of the column group that starts at n_idx.
//   n_idx: column index; expected to be a multiple of kai_nr (16), see the KAI_ASSUME below.
//   k:     K dimension of the matmul; each column group grows linearly with it.
// Each group of kai_nr columns occupies kai_nr 16-bit values followed by kai_nr * k 16-bit values.
// NOTE(review): the leading kai_nr values are presumably the per-column bias (the kernel loads 32 bytes
// from the packed pointer as initial accumulators before the multiply loop) - confirm with the packing kernel.
54 16 size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t n_idx, size_t k) {
55 KAI_ASSUME(n_idx % kai_nr == 0);
56
57 16 return n_idx / kai_nr * (kai_nr * sizeof(uint16_t) + kai_nr * k * sizeof(uint16_t));
58 }
59
// Returns the offset of element (m_idx, n_idx) within the destination matrix, computed as
// m_idx * stride + n_idx * sizeof(uint16_t).
//   m_idx:  row index; expected to be a multiple of kai_mr (6).
//   n_idx:  column index; expected to be a multiple of kai_nr (16).
//   stride: distance between consecutive destination rows.
// NOTE(review): the column term is in bytes (16-bit elements), so stride is presumably the row stride
// in bytes as well - confirm against the header.
60 16 size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(
61 size_t m_idx, size_t n_idx, size_t stride) {
62 KAI_ASSUME(m_idx % kai_mr == 0);
63 KAI_ASSUME(n_idx % kai_nr == 0);
64
65 16 return m_idx * stride + n_idx * sizeof(uint16_t);
66 }
67
// Returns m * n * sizeof(uint16_t): the size in bytes of an m x n matrix of 16-bit elements with no
// padding between rows.
// NOTE(review): the product is not checked for size_t overflow.
68 16 size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m, size_t n) {
69 16 return m * n * sizeof(uint16_t);
70 }
71
72 17 void kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(
73 size_t m, size_t n, size_t k, //
74 const void* lhs, size_t lhs_stride, //
75 const void* rhs_packed, //
76 void* dst, size_t dst_stride_row, size_t dst_stride_col, //
77 float clamp_min, float clamp_max) {
78 KAI_ASSERT(dst_stride_col == sizeof(uint16_t));
79
80 typedef struct {
81 float16_t maxval;
82 float16_t minval;
83 unsigned int num_strings;
84 const unsigned int* string_lengths;
85 size_t N;
86 const void* B_ptr;
87 size_t output_offset;
88 size_t input_initial_col;
89 size_t input_offset;
90 void* output_ptr;
91 const void* bias;
92 } KernelArgs;
93
94 17 KernelArgs ka;
95
96 17 unsigned long flags = 0;
97
98 17 unsigned int string_length = k;
99 17 ka.num_strings = 1;
100 17 ka.string_lengths = &string_length;
101 17 ka.N = n;
102 17 ka.B_ptr = rhs_packed;
103 17 ka.bias = NULL;
104
105 // Direct input.
106 17 const void* input_ptr = lhs;
107 17 ka.input_offset = lhs_stride / sizeof(uint16_t);
108 17 ka.input_initial_col = 0;
109
110 // Direct output.
111 17 ka.output_ptr = dst;
112 17 ka.output_offset = dst_stride_row / sizeof(uint16_t);
113
114 // Clamping output.
115 17 flags |= 0x2;
116 17 ka.maxval = (float16_t)clamp_max;
117 17 ka.minval = (float16_t)clamp_min;
118
119 34 __asm__ __volatile__(
120 "1:" // Row loop
121 "cmp %x[m], #0x6\n"
122 "bge 166f\n"
123 "cmp %x[m], #0x4\n"
124 "bgt 133f\n"
125 "beq 100f\n"
126 "cmp %x[m], #0x2\n"
127 "bgt 67f\n"
128 "beq 34f\n"
129 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
130 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
131 "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
132 "2:" // Height 1: Column loop
133 "cbz x10, 3f\n"
134 "ldr q20, [x10, #0x0]\n"
135 "ldr q21, [x10, #0x10]\n"
136 "add x10, x10, #0x20\n"
137 "b 14f\n"
138 "3:" // Height 1: no bias
139 "tbz %x[flags], #0, 13f\n"
140 "cmp x11, #0x10\n"
141 "bge 12f\n"
142 "tbz x11, #3, 7f\n"
143 "ld1 { v20.8h }, [x9], #0x10\n"
144 "tbz x11, #2, 5f\n"
145 "ldr d21, [x9], #0x8\n"
146 "tbz x11, #1, 4f\n"
147 "ld1 { v21.s }[2], [x9], #0x4\n"
148 "mov x20, #0x1c\n"
149 "tbz x11, #0, 11f\n"
150 "ld1 { v21.h }[6], [x9]\n"
151 "b 11f\n"
152 "4:" // Height 1: Partial accumulate: partial_1_12
153 "mov x20, #0x18\n"
154 "tbz x11, #0, 11f\n"
155 "ld1 { v21.h }[4], [x9]\n"
156 "b 11f\n"
157 "5:" // Height 1: Partial accumulate: partial_2_8
158 "tbz x11, #1, 6f\n"
159 "ldr s21, [x9], #0x4\n"
160 "mov x20, #0x14\n"
161 "tbz x11, #0, 11f\n"
162 "ld1 { v21.h }[2], [x9]\n"
163 "b 11f\n"
164 "6:" // Height 1: Partial accumulate: partial_1_8
165 "mov x20, #0x10\n"
166 "tbz x11, #0, 11f\n"
167 "ldr h21, [x9, #0x0]\n"
168 "b 11f\n"
169 "7:" // Height 1: Partial accumulate: partial_4_0
170 "tbz x11, #2, 9f\n"
171 "ldr d20, [x9], #0x8\n"
172 "tbz x11, #1, 8f\n"
173 "ld1 { v20.s }[2], [x9], #0x4\n"
174 "mov x20, #0xc\n"
175 "tbz x11, #0, 11f\n"
176 "ld1 { v20.h }[6], [x9]\n"
177 "b 11f\n"
178 "8:" // Height 1: Partial accumulate: partial_1_4
179 "mov x20, #0x8\n"
180 "tbz x11, #0, 11f\n"
181 "ld1 { v20.h }[4], [x9]\n"
182 "b 11f\n"
183 "9:" // Height 1: Partial accumulate: partial_2_0
184 "tbz x11, #1, 10f\n"
185 "ldr s20, [x9], #0x4\n"
186 "mov x20, #0x4\n"
187 "tbz x11, #0, 11f\n"
188 "ld1 { v20.h }[2], [x9]\n"
189 "b 11f\n"
190 "10:" // Height 1: Partial accumulate: partial_1_0
191 "ldr h20, [x9, #0x0]\n"
192 "mov x20, #0x0\n"
193 "11:" // Height 1: Partial accumulate: Done
194 "sub x9, x9, x20\n"
195 "b 14f\n"
196 "12:" // Height 1: full accumulate
197 "ldr q20, [x9, #0x0]\n"
198 "ldr q21, [x9, #0x10]\n"
199 "b 14f\n"
200 "13:" // Height 1: no accumulate
201 "movi v20.16b, #0x0\n"
202 "movi v21.16b, #0x0\n"
203 "14:" // Height 1: setup done
204 "mov x28, #0x0\n"
205 "15:" // Height 1: String loop
206 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
207 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
208 "ldr w27, [x20, x28, LSL #0x2]\n"
209 "tbz %x[flags], #3, 16f\n"
210 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
211 "add x20, x20, x21, LSL #3\n"
212 "ldr x26, [x20, #0x0]\n"
213 "cbnz x28, 17f\n"
214 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
215 "add x26, x26, x20, LSL #1\n"
216 "b 17f\n"
217 "16:" // Height 1: setup direct input
218 "mov x26, %x[input_ptr]\n"
219 "17:" // Height 1: input setup done
220 "cmp x27, #0x8\n"
221 "blt 20f\n"
222 "ldr q0, [x26, #0x0]\n"
223 "ldr q6, [x10, #0x0]\n"
224 "cmp x27, #0x10\n"
225 "ldr q7, [x10, #0x10]\n"
226 "ldr q8, [x10, #0x20]\n"
227 "ldr q9, [x10, #0x30]\n"
228 "ldr q10, [x10, #0x40]\n"
229 "ldr q11, [x10, #0x50]\n"
230 "ldr q12, [x10, #0x60]\n"
231 "ldr q13, [x10, #0x70]\n"
232 "ldr q14, [x10, #0x80]\n"
233 "ldr q15, [x10, #0x90]\n"
234 "ldr q16, [x10, #0xa0]\n"
235 "ldr q17, [x10, #0xb0]\n"
236 "ldr q18, [x10, #0xc0]\n"
237 "ldr q19, [x10, #0xd0]\n"
238 "blt 19f\n"
239 "18:" // Height 1: Multiply loop: Main loop head
240 "fmla v20.8h, v6.8h, v0.h[0]\n"
241 "ldr q6, [x10, #0xe0]\n"
242 "fmla v21.8h, v7.8h, v0.h[0]\n"
243 "ldr q7, [x10, #0xf0]\n"
244 "sub x27, x27, #0x8\n"
245 "add x26, x26, #0x10\n"
246 "cmp x27, #0x10\n"
247 "add x10, x10, #0x100\n"
248 "prfm pldl1keep, [x26, #0x80]\n"
249 "fmla v20.8h, v8.8h, v0.h[1]\n"
250 "ldr q8, [x10, #0x20]\n"
251 "fmla v21.8h, v9.8h, v0.h[1]\n"
252 "ldr q9, [x10, #0x30]\n"
253 "fmla v20.8h, v10.8h, v0.h[2]\n"
254 "ldr q10, [x10, #0x40]\n"
255 "fmla v21.8h, v11.8h, v0.h[2]\n"
256 "ldr q11, [x10, #0x50]\n"
257 "fmla v20.8h, v12.8h, v0.h[3]\n"
258 "ldr q12, [x10, #0x60]\n"
259 "fmla v21.8h, v13.8h, v0.h[3]\n"
260 "ldr q13, [x10, #0x70]\n"
261 "fmla v20.8h, v14.8h, v0.h[4]\n"
262 "ldr q14, [x10, #0x80]\n"
263 "fmla v21.8h, v15.8h, v0.h[4]\n"
264 "ldr q15, [x10, #0x90]\n"
265 "fmla v20.8h, v16.8h, v0.h[5]\n"
266 "ldr q16, [x10, #0xa0]\n"
267 "fmla v21.8h, v17.8h, v0.h[5]\n"
268 "ldr q17, [x10, #0xb0]\n"
269 "fmla v20.8h, v18.8h, v0.h[6]\n"
270 "ldr q18, [x10, #0xc0]\n"
271 "fmla v21.8h, v19.8h, v0.h[6]\n"
272 "ldr q19, [x10, #0xd0]\n"
273 "fmla v20.8h, v6.8h, v0.h[7]\n"
274 "ldr q6, [x10, #0x0]\n"
275 "fmla v21.8h, v7.8h, v0.h[7]\n"
276 "ldr q0, [x26, #0x0]\n"
277 "ldr q7, [x10, #0x10]\n"
278 "bge 18b\n"
279 "19:" // Height 1: Multiply loop: Single iteration only
280 "fmla v20.8h, v6.8h, v0.h[0]\n"
281 "ldr q6, [x10, #0xe0]\n"
282 "fmla v21.8h, v7.8h, v0.h[0]\n"
283 "ldr q7, [x10, #0xf0]\n"
284 "add x26, x26, #0x10\n"
285 "sub x27, x27, #0x8\n"
286 "add x10, x10, #0x100\n"
287 "prfm pldl1keep, [x26, #0x80]\n"
288 "fmla v20.8h, v8.8h, v0.h[1]\n"
289 "fmla v21.8h, v9.8h, v0.h[1]\n"
290 "fmla v20.8h, v10.8h, v0.h[2]\n"
291 "fmla v21.8h, v11.8h, v0.h[2]\n"
292 "fmla v20.8h, v12.8h, v0.h[3]\n"
293 "fmla v21.8h, v13.8h, v0.h[3]\n"
294 "fmla v20.8h, v14.8h, v0.h[4]\n"
295 "fmla v21.8h, v15.8h, v0.h[4]\n"
296 "fmla v20.8h, v16.8h, v0.h[5]\n"
297 "fmla v21.8h, v17.8h, v0.h[5]\n"
298 "fmla v20.8h, v18.8h, v0.h[6]\n"
299 "fmla v21.8h, v19.8h, v0.h[6]\n"
300 "fmla v20.8h, v6.8h, v0.h[7]\n"
301 "fmla v21.8h, v7.8h, v0.h[7]\n"
302 "20:" // Height 1: Multiply loop: Main loop skip
303 "cbz x27, 22f\n"
304 "21:" // Height 1: Multiply loop: Odd block loop
305 "ldr h0, [x26], #0x2\n"
306 "ldr q8, [x10, #0x0]\n"
307 "sub x27, x27, #0x1\n"
308 "ldr q9, [x10, #0x10]\n"
309 "add x10, x10, #0x20\n"
310 "fmla v20.8h, v8.8h, v0.h[0]\n"
311 "fmla v21.8h, v9.8h, v0.h[0]\n"
312 "cbnz x27, 21b\n"
313 "22:" // Height 1: Multiply loop: No odd multiplies
314 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
315 "add x28, x28, #0x1\n"
316 "cmp x28, x20\n"
317 "bne 15b\n"
318 "prfm pstl1keep, [x9, #0x0]\n"
319 "tbz %x[flags], #1, 23f\n"
320 "add x21, %x[args_ptr], %[offset_max]\n"
321 "add x20, %x[args_ptr], %[offset_min]\n"
322 "ld1r { v17.8h }, [x21]\n"
323 "ld1r { v16.8h }, [x20]\n"
324 "fmin v20.8h, v20.8h, v17.8h\n"
325 "fmin v21.8h, v21.8h, v17.8h\n"
326 "fmax v20.8h, v20.8h, v16.8h\n"
327 "fmax v21.8h, v21.8h, v16.8h\n"
328 "23:" // Height 1: No activation
329 "cmp x11, #0x10\n"
330 "bge 32f\n"
331 "tbz x11, #3, 27f\n"
332 "st1 { v20.8h }, [x9], #0x10\n"
333 "tbz x11, #2, 25f\n"
334 "str d21, [x9], #0x8\n"
335 "tbz x11, #1, 24f\n"
336 "st1 { v21.s }[2], [x9], #0x4\n"
337 "tbz x11, #0, 31f\n"
338 "st1 { v21.h }[6], [x9]\n"
339 "b 31f\n"
340 "24:" // Height 1: Partial direct writeback: partial_1_12
341 "tbz x11, #0, 31f\n"
342 "st1 { v21.h }[4], [x9]\n"
343 "b 31f\n"
344 "25:" // Height 1: Partial direct writeback: partial_2_8
345 "tbz x11, #1, 26f\n"
346 "str s21, [x9], #0x4\n"
347 "tbz x11, #0, 31f\n"
348 "st1 { v21.h }[2], [x9]\n"
349 "b 31f\n"
350 "26:" // Height 1: Partial direct writeback: partial_1_8
351 "tbz x11, #0, 31f\n"
352 "str h21, [x9, #0x0]\n"
353 "b 31f\n"
354 "27:" // Height 1: Partial direct writeback: partial_4_0
355 "tbz x11, #2, 29f\n"
356 "str d20, [x9], #0x8\n"
357 "tbz x11, #1, 28f\n"
358 "st1 { v20.s }[2], [x9], #0x4\n"
359 "tbz x11, #0, 31f\n"
360 "st1 { v20.h }[6], [x9]\n"
361 "b 31f\n"
362 "28:" // Height 1: Partial direct writeback: partial_1_4
363 "tbz x11, #0, 31f\n"
364 "st1 { v20.h }[4], [x9]\n"
365 "b 31f\n"
366 "29:" // Height 1: Partial direct writeback: partial_2_0
367 "tbz x11, #1, 30f\n"
368 "str s20, [x9], #0x4\n"
369 "tbz x11, #0, 31f\n"
370 "st1 { v20.h }[2], [x9]\n"
371 "b 31f\n"
372 "30:" // Height 1: Partial direct writeback: partial_1_0
373 "str h20, [x9, #0x0]\n"
374 "31:" // Height 1: Partial direct writeback: Done
375 "b 33f\n"
376 "32:" // Height 1: Full writeback
377 "str q20, [x9, #0x0]\n"
378 "str q21, [x9, #0x10]\n"
379 "add x9, x9, #0x20\n"
380 "33:" // Height 1: Writeback done
381 "subs x11, x11, #0x10\n"
382 "bgt 2b\n"
383 "b 200f\n"
384 "34:" // Height 2
385 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
386 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
387 "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
388 "35:" // Height 2: Column loop
389 "cbz x10, 36f\n"
390 "ldr q20, [x10, #0x0]\n"
391 "ldr q21, [x10, #0x10]\n"
392 "add x10, x10, #0x20\n"
393 "mov v22.16b, v20.16b\n"
394 "mov v23.16b, v21.16b\n"
395 "b 47f\n"
396 "36:" // Height 2: no bias
397 "tbz %x[flags], #0, 46f\n"
398 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
399 "cmp x11, #0x10\n"
400 "add x26, x9, x20, LSL #1\n"
401 "bge 45f\n"
402 "tbz x11, #3, 40f\n"
403 "ld1 { v20.8h }, [x9], #0x10\n"
404 "ld1 { v22.8h }, [x26], #0x10\n"
405 "tbz x11, #2, 38f\n"
406 "ldr d21, [x9], #0x8\n"
407 "ldr d23, [x26], #0x8\n"
408 "tbz x11, #1, 37f\n"
409 "ld1 { v21.s }[2], [x9], #0x4\n"
410 "ld1 { v23.s }[2], [x26], #0x4\n"
411 "mov x20, #0x1c\n"
412 "tbz x11, #0, 44f\n"
413 "ld1 { v21.h }[6], [x9]\n"
414 "ld1 { v23.h }[6], [x26]\n"
415 "b 44f\n"
416 "37:" // Height 2: Partial accumulate: partial_1_12
417 "mov x20, #0x18\n"
418 "tbz x11, #0, 44f\n"
419 "ld1 { v21.h }[4], [x9]\n"
420 "ld1 { v23.h }[4], [x26]\n"
421 "b 44f\n"
422 "38:" // Height 2: Partial accumulate: partial_2_8
423 "tbz x11, #1, 39f\n"
424 "ldr s21, [x9], #0x4\n"
425 "ldr s23, [x26], #0x4\n"
426 "mov x20, #0x14\n"
427 "tbz x11, #0, 44f\n"
428 "ld1 { v21.h }[2], [x9]\n"
429 "ld1 { v23.h }[2], [x26]\n"
430 "b 44f\n"
431 "39:" // Height 2: Partial accumulate: partial_1_8
432 "mov x20, #0x10\n"
433 "tbz x11, #0, 44f\n"
434 "ldr h21, [x9, #0x0]\n"
435 "ldr h23, [x26, #0x0]\n"
436 "b 44f\n"
437 "40:" // Height 2: Partial accumulate: partial_4_0
438 "tbz x11, #2, 42f\n"
439 "ldr d20, [x9], #0x8\n"
440 "ldr d22, [x26], #0x8\n"
441 "tbz x11, #1, 41f\n"
442 "ld1 { v20.s }[2], [x9], #0x4\n"
443 "ld1 { v22.s }[2], [x26], #0x4\n"
444 "mov x20, #0xc\n"
445 "tbz x11, #0, 44f\n"
446 "ld1 { v20.h }[6], [x9]\n"
447 "ld1 { v22.h }[6], [x26]\n"
448 "b 44f\n"
449 "41:" // Height 2: Partial accumulate: partial_1_4
450 "mov x20, #0x8\n"
451 "tbz x11, #0, 44f\n"
452 "ld1 { v20.h }[4], [x9]\n"
453 "ld1 { v22.h }[4], [x26]\n"
454 "b 44f\n"
455 "42:" // Height 2: Partial accumulate: partial_2_0
456 "tbz x11, #1, 43f\n"
457 "ldr s20, [x9], #0x4\n"
458 "ldr s22, [x26], #0x4\n"
459 "mov x20, #0x4\n"
460 "tbz x11, #0, 44f\n"
461 "ld1 { v20.h }[2], [x9]\n"
462 "ld1 { v22.h }[2], [x26]\n"
463 "b 44f\n"
464 "43:" // Height 2: Partial accumulate: partial_1_0
465 "ldr h20, [x9, #0x0]\n"
466 "ldr h22, [x26, #0x0]\n"
467 "mov x20, #0x0\n"
468 "44:" // Height 2: Partial accumulate: Done
469 "sub x9, x9, x20\n"
470 "b 47f\n"
471 "45:" // Height 2: full accumulate
472 "ldr q20, [x9, #0x0]\n"
473 "ldr q21, [x9, #0x10]\n"
474 "ldr q22, [x26, #0x0]\n"
475 "ldr q23, [x26, #0x10]\n"
476 "b 47f\n"
477 "46:" // Height 2: no accumulate
478 "movi v20.16b, #0x0\n"
479 "movi v21.16b, #0x0\n"
480 "movi v22.16b, #0x0\n"
481 "movi v23.16b, #0x0\n"
482 "47:" // Height 2: setup done
483 "mov x28, #0x0\n"
484 "48:" // Height 2: String loop
485 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
486 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
487 "ldr w27, [x20, x28, LSL #0x2]\n"
488 "tbz %x[flags], #3, 49f\n"
489 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
490 "add x20, x20, x21, LSL #3\n"
491 "ldr x26, [x20, #0x0]\n"
492 "ldr x25, [x20, #0x8]\n"
493 "cbnz x28, 50f\n"
494 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
495 "add x26, x26, x20, LSL #1\n"
496 "add x25, x25, x20, LSL #1\n"
497 "b 50f\n"
498 "49:" // Height 2: setup direct input
499 "mov x26, %x[input_ptr]\n"
500 "add x25, x26, x21, LSL #1\n"
501 "50:" // Height 2: input setup done
502 "cmp x27, #0x8\n"
503 "blt 53f\n"
504 "ldr q0, [x26, #0x0]\n"
505 "ldr q1, [x25, #0x0]\n"
506 "cmp x27, #0x10\n"
507 "ldr q6, [x10, #0x0]\n"
508 "ldr q7, [x10, #0x10]\n"
509 "ldr q8, [x10, #0x20]\n"
510 "ldr q9, [x10, #0x30]\n"
511 "ldr q10, [x10, #0x40]\n"
512 "ldr q11, [x10, #0x50]\n"
513 "ldr q12, [x10, #0x60]\n"
514 "ldr q13, [x10, #0x70]\n"
515 "ldr q14, [x10, #0x80]\n"
516 "ldr q15, [x10, #0x90]\n"
517 "ldr q16, [x10, #0xa0]\n"
518 "ldr q17, [x10, #0xb0]\n"
519 "ldr q18, [x10, #0xc0]\n"
520 "ldr q19, [x10, #0xd0]\n"
521 "blt 52f\n"
522 "51:" // Height 2: Multiply loop: Main loop head
523 "fmla v20.8h, v6.8h, v0.h[0]\n"
524 "fmla v22.8h, v6.8h, v1.h[0]\n"
525 "ldr q6, [x10, #0xe0]\n"
526 "sub x27, x27, #0x8\n"
527 "fmla v21.8h, v7.8h, v0.h[0]\n"
528 "fmla v23.8h, v7.8h, v1.h[0]\n"
529 "ldr q7, [x10, #0xf0]\n"
530 "add x26, x26, #0x10\n"
531 "add x25, x25, #0x10\n"
532 "cmp x27, #0x10\n"
533 "prfm pldl1keep, [x26, #0x80]\n"
534 "add x10, x10, #0x100\n"
535 "prfm pldl1keep, [x25, #0x80]\n"
536 "fmla v20.8h, v8.8h, v0.h[1]\n"
537 "fmla v22.8h, v8.8h, v1.h[1]\n"
538 "ldr q8, [x10, #0x20]\n"
539 "fmla v21.8h, v9.8h, v0.h[1]\n"
540 "fmla v23.8h, v9.8h, v1.h[1]\n"
541 "ldr q9, [x10, #0x30]\n"
542 "fmla v20.8h, v10.8h, v0.h[2]\n"
543 "fmla v22.8h, v10.8h, v1.h[2]\n"
544 "ldr q10, [x10, #0x40]\n"
545 "fmla v21.8h, v11.8h, v0.h[2]\n"
546 "fmla v23.8h, v11.8h, v1.h[2]\n"
547 "ldr q11, [x10, #0x50]\n"
548 "fmla v20.8h, v12.8h, v0.h[3]\n"
549 "fmla v22.8h, v12.8h, v1.h[3]\n"
550 "ldr q12, [x10, #0x60]\n"
551 "fmla v21.8h, v13.8h, v0.h[3]\n"
552 "fmla v23.8h, v13.8h, v1.h[3]\n"
553 "ldr q13, [x10, #0x70]\n"
554 "fmla v20.8h, v14.8h, v0.h[4]\n"
555 "fmla v22.8h, v14.8h, v1.h[4]\n"
556 "ldr q14, [x10, #0x80]\n"
557 "fmla v21.8h, v15.8h, v0.h[4]\n"
558 "fmla v23.8h, v15.8h, v1.h[4]\n"
559 "ldr q15, [x10, #0x90]\n"
560 "fmla v20.8h, v16.8h, v0.h[5]\n"
561 "fmla v22.8h, v16.8h, v1.h[5]\n"
562 "ldr q16, [x10, #0xa0]\n"
563 "fmla v21.8h, v17.8h, v0.h[5]\n"
564 "fmla v23.8h, v17.8h, v1.h[5]\n"
565 "ldr q17, [x10, #0xb0]\n"
566 "fmla v20.8h, v18.8h, v0.h[6]\n"
567 "fmla v22.8h, v18.8h, v1.h[6]\n"
568 "ldr q18, [x10, #0xc0]\n"
569 "fmla v21.8h, v19.8h, v0.h[6]\n"
570 "fmla v23.8h, v19.8h, v1.h[6]\n"
571 "ldr q19, [x10, #0xd0]\n"
572 "fmla v20.8h, v6.8h, v0.h[7]\n"
573 "fmla v22.8h, v6.8h, v1.h[7]\n"
574 "ldr q6, [x10, #0x0]\n"
575 "fmla v21.8h, v7.8h, v0.h[7]\n"
576 "ldr q0, [x26, #0x0]\n"
577 "fmla v23.8h, v7.8h, v1.h[7]\n"
578 "ldr q1, [x25, #0x0]\n"
579 "ldr q7, [x10, #0x10]\n"
580 "bge 51b\n"
581 "52:" // Height 2: Multiply loop: Single iteration only
582 "fmla v20.8h, v6.8h, v0.h[0]\n"
583 "fmla v22.8h, v6.8h, v1.h[0]\n"
584 "ldr q6, [x10, #0xe0]\n"
585 "add x26, x26, #0x10\n"
586 "fmla v21.8h, v7.8h, v0.h[0]\n"
587 "fmla v23.8h, v7.8h, v1.h[0]\n"
588 "ldr q7, [x10, #0xf0]\n"
589 "add x25, x25, #0x10\n"
590 "sub x27, x27, #0x8\n"
591 "prfm pldl1keep, [x26, #0x80]\n"
592 "add x10, x10, #0x100\n"
593 "prfm pldl1keep, [x25, #0x80]\n"
594 "fmla v20.8h, v8.8h, v0.h[1]\n"
595 "fmla v22.8h, v8.8h, v1.h[1]\n"
596 "fmla v21.8h, v9.8h, v0.h[1]\n"
597 "fmla v23.8h, v9.8h, v1.h[1]\n"
598 "fmla v20.8h, v10.8h, v0.h[2]\n"
599 "fmla v22.8h, v10.8h, v1.h[2]\n"
600 "fmla v21.8h, v11.8h, v0.h[2]\n"
601 "fmla v23.8h, v11.8h, v1.h[2]\n"
602 "fmla v20.8h, v12.8h, v0.h[3]\n"
603 "fmla v22.8h, v12.8h, v1.h[3]\n"
604 "fmla v21.8h, v13.8h, v0.h[3]\n"
605 "fmla v23.8h, v13.8h, v1.h[3]\n"
606 "fmla v20.8h, v14.8h, v0.h[4]\n"
607 "fmla v22.8h, v14.8h, v1.h[4]\n"
608 "fmla v21.8h, v15.8h, v0.h[4]\n"
609 "fmla v23.8h, v15.8h, v1.h[4]\n"
610 "fmla v20.8h, v16.8h, v0.h[5]\n"
611 "fmla v22.8h, v16.8h, v1.h[5]\n"
612 "fmla v21.8h, v17.8h, v0.h[5]\n"
613 "fmla v23.8h, v17.8h, v1.h[5]\n"
614 "fmla v20.8h, v18.8h, v0.h[6]\n"
615 "fmla v22.8h, v18.8h, v1.h[6]\n"
616 "fmla v21.8h, v19.8h, v0.h[6]\n"
617 "fmla v23.8h, v19.8h, v1.h[6]\n"
618 "fmla v20.8h, v6.8h, v0.h[7]\n"
619 "fmla v22.8h, v6.8h, v1.h[7]\n"
620 "fmla v21.8h, v7.8h, v0.h[7]\n"
621 "fmla v23.8h, v7.8h, v1.h[7]\n"
622 "53:" // Height 2: Multiply loop: Main loop skip
623 "cbz x27, 55f\n"
624 "54:" // Height 2: Multiply loop: Odd block loop
625 "ldr h0, [x26], #0x2\n"
626 "ldr h1, [x25], #0x2\n"
627 "sub x27, x27, #0x1\n"
628 "ldr q8, [x10, #0x0]\n"
629 "ldr q9, [x10, #0x10]\n"
630 "add x10, x10, #0x20\n"
631 "fmla v20.8h, v8.8h, v0.h[0]\n"
632 "fmla v22.8h, v8.8h, v1.h[0]\n"
633 "fmla v21.8h, v9.8h, v0.h[0]\n"
634 "fmla v23.8h, v9.8h, v1.h[0]\n"
635 "cbnz x27, 54b\n"
636 "55:" // Height 2: Multiply loop: No odd multiplies
637 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
638 "add x28, x28, #0x1\n"
639 "cmp x28, x20\n"
640 "bne 48b\n"
641 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
642 "prfm pstl1keep, [x9, #0x0]\n"
643 "add x26, x9, x20, LSL #1\n"
644 "prfm pstl1keep, [x26, #0x0]\n"
645 "tbz %x[flags], #1, 56f\n"
646 "add x21, %x[args_ptr], %[offset_max]\n"
647 "add x20, %x[args_ptr], %[offset_min]\n"
648 "ld1r { v17.8h }, [x21]\n"
649 "ld1r { v16.8h }, [x20]\n"
650 "fmin v20.8h, v20.8h, v17.8h\n"
651 "fmin v21.8h, v21.8h, v17.8h\n"
652 "fmin v22.8h, v22.8h, v17.8h\n"
653 "fmin v23.8h, v23.8h, v17.8h\n"
654 "fmax v20.8h, v20.8h, v16.8h\n"
655 "fmax v21.8h, v21.8h, v16.8h\n"
656 "fmax v22.8h, v22.8h, v16.8h\n"
657 "fmax v23.8h, v23.8h, v16.8h\n"
658 "56:" // Height 2: No activation
659 "cmp x11, #0x10\n"
660 "bge 65f\n"
661 "tbz x11, #3, 60f\n"
662 "st1 { v20.8h }, [x9], #0x10\n"
663 "st1 { v22.8h }, [x26], #0x10\n"
664 "tbz x11, #2, 58f\n"
665 "str d21, [x9], #0x8\n"
666 "str d23, [x26], #0x8\n"
667 "tbz x11, #1, 57f\n"
668 "st1 { v21.s }[2], [x9], #0x4\n"
669 "st1 { v23.s }[2], [x26], #0x4\n"
670 "tbz x11, #0, 64f\n"
671 "st1 { v21.h }[6], [x9]\n"
672 "st1 { v23.h }[6], [x26]\n"
673 "b 64f\n"
674 "57:" // Height 2: Partial direct writeback: partial_1_12
675 "tbz x11, #0, 64f\n"
676 "st1 { v21.h }[4], [x9]\n"
677 "st1 { v23.h }[4], [x26]\n"
678 "b 64f\n"
679 "58:" // Height 2: Partial direct writeback: partial_2_8
680 "tbz x11, #1, 59f\n"
681 "str s21, [x9], #0x4\n"
682 "str s23, [x26], #0x4\n"
683 "tbz x11, #0, 64f\n"
684 "st1 { v21.h }[2], [x9]\n"
685 "st1 { v23.h }[2], [x26]\n"
686 "b 64f\n"
687 "59:" // Height 2: Partial direct writeback: partial_1_8
688 "tbz x11, #0, 64f\n"
689 "str h21, [x9, #0x0]\n"
690 "str h23, [x26, #0x0]\n"
691 "b 64f\n"
692 "60:" // Height 2: Partial direct writeback: partial_4_0
693 "tbz x11, #2, 62f\n"
694 "str d20, [x9], #0x8\n"
695 "str d22, [x26], #0x8\n"
696 "tbz x11, #1, 61f\n"
697 "st1 { v20.s }[2], [x9], #0x4\n"
698 "st1 { v22.s }[2], [x26], #0x4\n"
699 "tbz x11, #0, 64f\n"
700 "st1 { v20.h }[6], [x9]\n"
701 "st1 { v22.h }[6], [x26]\n"
702 "b 64f\n"
703 "61:" // Height 2: Partial direct writeback: partial_1_4
704 "tbz x11, #0, 64f\n"
705 "st1 { v20.h }[4], [x9]\n"
706 "st1 { v22.h }[4], [x26]\n"
707 "b 64f\n"
708 "62:" // Height 2: Partial direct writeback: partial_2_0
709 "tbz x11, #1, 63f\n"
710 "str s20, [x9], #0x4\n"
711 "str s22, [x26], #0x4\n"
712 "tbz x11, #0, 64f\n"
713 "st1 { v20.h }[2], [x9]\n"
714 "st1 { v22.h }[2], [x26]\n"
715 "b 64f\n"
716 "63:" // Height 2: Partial direct writeback: partial_1_0
717 "str h20, [x9, #0x0]\n"
718 "str h22, [x26, #0x0]\n"
719 "64:" // Height 2: Partial direct writeback: Done
720 "b 66f\n"
721 "65:" // Height 2: Full writeback
722 "str q20, [x9, #0x0]\n"
723 "str q21, [x9, #0x10]\n"
724 "add x9, x9, #0x20\n"
725 "str q22, [x26, #0x0]\n"
726 "str q23, [x26, #0x10]\n"
727 "66:" // Height 2: Writeback done
728 "subs x11, x11, #0x10\n"
729 "bgt 35b\n"
730 "b 200f\n"
731 "67:" // Height 3
732 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
733 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
734 "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
735 "68:" // Height 3: Column loop
736 "cbz x10, 69f\n"
737 "ldr q20, [x10, #0x0]\n"
738 "ldr q21, [x10, #0x10]\n"
739 "add x10, x10, #0x20\n"
740 "mov v22.16b, v20.16b\n"
741 "mov v23.16b, v21.16b\n"
742 "mov v24.16b, v20.16b\n"
743 "mov v25.16b, v21.16b\n"
744 "b 80f\n"
745 "69:" // Height 3: no bias
746 "tbz %x[flags], #0, 79f\n"
747 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
748 "cmp x11, #0x10\n"
749 "add x26, x9, x20, LSL #1\n"
750 "add x25, x26, x20, LSL #1\n"
751 "bge 78f\n"
752 "tbz x11, #3, 73f\n"
753 "ld1 { v20.8h }, [x9], #0x10\n"
754 "ld1 { v22.8h }, [x26], #0x10\n"
755 "ld1 { v24.8h }, [x25], #0x10\n"
756 "tbz x11, #2, 71f\n"
757 "ldr d21, [x9], #0x8\n"
758 "ldr d23, [x26], #0x8\n"
759 "ldr d25, [x25], #0x8\n"
760 "tbz x11, #1, 70f\n"
761 "ld1 { v21.s }[2], [x9], #0x4\n"
762 "ld1 { v23.s }[2], [x26], #0x4\n"
763 "mov x20, #0x1c\n"
764 "ld1 { v25.s }[2], [x25], #0x4\n"
765 "tbz x11, #0, 77f\n"
766 "ld1 { v21.h }[6], [x9]\n"
767 "ld1 { v23.h }[6], [x26]\n"
768 "ld1 { v25.h }[6], [x25]\n"
769 "b 77f\n"
770 "70:" // Height 3: Partial accumulate: partial_1_12
771 "mov x20, #0x18\n"
772 "tbz x11, #0, 77f\n"
773 "ld1 { v21.h }[4], [x9]\n"
774 "ld1 { v23.h }[4], [x26]\n"
775 "ld1 { v25.h }[4], [x25]\n"
776 "b 77f\n"
777 "71:" // Height 3: Partial accumulate: partial_2_8
778 "tbz x11, #1, 72f\n"
779 "ldr s21, [x9], #0x4\n"
780 "ldr s23, [x26], #0x4\n"
781 "mov x20, #0x14\n"
782 "ldr s25, [x25], #0x4\n"
783 "tbz x11, #0, 77f\n"
784 "ld1 { v21.h }[2], [x9]\n"
785 "ld1 { v23.h }[2], [x26]\n"
786 "ld1 { v25.h }[2], [x25]\n"
787 "b 77f\n"
788 "72:" // Height 3: Partial accumulate: partial_1_8
789 "mov x20, #0x10\n"
790 "tbz x11, #0, 77f\n"
791 "ldr h21, [x9, #0x0]\n"
792 "ldr h23, [x26, #0x0]\n"
793 "ldr h25, [x25, #0x0]\n"
794 "b 77f\n"
795 "73:" // Height 3: Partial accumulate: partial_4_0
796 "tbz x11, #2, 75f\n"
797 "ldr d20, [x9], #0x8\n"
798 "ldr d22, [x26], #0x8\n"
799 "ldr d24, [x25], #0x8\n"
800 "tbz x11, #1, 74f\n"
801 "ld1 { v20.s }[2], [x9], #0x4\n"
802 "ld1 { v22.s }[2], [x26], #0x4\n"
803 "mov x20, #0xc\n"
804 "ld1 { v24.s }[2], [x25], #0x4\n"
805 "tbz x11, #0, 77f\n"
806 "ld1 { v20.h }[6], [x9]\n"
807 "ld1 { v22.h }[6], [x26]\n"
808 "ld1 { v24.h }[6], [x25]\n"
809 "b 77f\n"
810 "74:" // Height 3: Partial accumulate: partial_1_4
811 "mov x20, #0x8\n"
812 "tbz x11, #0, 77f\n"
813 "ld1 { v20.h }[4], [x9]\n"
814 "ld1 { v22.h }[4], [x26]\n"
815 "ld1 { v24.h }[4], [x25]\n"
816 "b 77f\n"
817 "75:" // Height 3: Partial accumulate: partial_2_0
818 "tbz x11, #1, 76f\n"
819 "ldr s20, [x9], #0x4\n"
820 "ldr s22, [x26], #0x4\n"
821 "mov x20, #0x4\n"
822 "ldr s24, [x25], #0x4\n"
823 "tbz x11, #0, 77f\n"
824 "ld1 { v20.h }[2], [x9]\n"
825 "ld1 { v22.h }[2], [x26]\n"
826 "ld1 { v24.h }[2], [x25]\n"
827 "b 77f\n"
828 "76:" // Height 3: Partial accumulate: partial_1_0
829 "ldr h20, [x9, #0x0]\n"
830 "ldr h22, [x26, #0x0]\n"
831 "mov x20, #0x0\n"
832 "ldr h24, [x25, #0x0]\n"
833 "77:" // Height 3: Partial accumulate: Done
834 "sub x9, x9, x20\n"
835 "b 80f\n"
836 "78:" // Height 3: full accumulate
837 "ldr q20, [x9, #0x0]\n"
838 "ldr q21, [x9, #0x10]\n"
839 "ldr q22, [x26, #0x0]\n"
840 "ldr q23, [x26, #0x10]\n"
841 "ldr q24, [x25, #0x0]\n"
842 "ldr q25, [x25, #0x10]\n"
843 "b 80f\n"
844 "79:" // Height 3: no accumulate
845 "movi v20.16b, #0x0\n"
846 "movi v21.16b, #0x0\n"
847 "movi v22.16b, #0x0\n"
848 "movi v23.16b, #0x0\n"
849 "movi v24.16b, #0x0\n"
850 "movi v25.16b, #0x0\n"
851 "80:" // Height 3: setup done
852 "mov x28, #0x0\n"
853 "81:" // Height 3: String loop
854 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
855 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
856 "ldr w27, [x20, x28, LSL #0x2]\n"
857 "tbz %x[flags], #3, 82f\n"
858 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
859 "add x20, x20, x21, LSL #3\n"
860 "ldr x26, [x20, #0x0]\n"
861 "ldr x25, [x20, #0x8]\n"
862 "ldr x24, [x20, #0x10]\n"
863 "cbnz x28, 83f\n"
864 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
865 "add x26, x26, x20, LSL #1\n"
866 "add x25, x25, x20, LSL #1\n"
867 "add x24, x24, x20, LSL #1\n"
868 "b 83f\n"
869 "82:" // Height 3: setup direct input
870 "mov x26, %x[input_ptr]\n"
871 "add x25, x26, x21, LSL #1\n"
872 "add x24, x25, x21, LSL #1\n"
873 "83:" // Height 3: input setup done
874 "cmp x27, #0x8\n"
875 "blt 86f\n"
876 "ldr q0, [x26, #0x0]\n"
877 "ldr q1, [x25, #0x0]\n"
878 "cmp x27, #0x10\n"
879 "ldr q2, [x24, #0x0]\n"
880 "ldr q6, [x10, #0x0]\n"
881 "ldr q7, [x10, #0x10]\n"
882 "ldr q8, [x10, #0x20]\n"
883 "ldr q9, [x10, #0x30]\n"
884 "ldr q10, [x10, #0x40]\n"
885 "ldr q11, [x10, #0x50]\n"
886 "ldr q12, [x10, #0x60]\n"
887 "ldr q13, [x10, #0x70]\n"
888 "ldr q14, [x10, #0x80]\n"
889 "ldr q15, [x10, #0x90]\n"
890 "ldr q16, [x10, #0xa0]\n"
891 "ldr q17, [x10, #0xb0]\n"
892 "ldr q18, [x10, #0xc0]\n"
893 "ldr q19, [x10, #0xd0]\n"
894 "blt 85f\n"
895 "84:" // Height 3: Multiply loop: Main loop head
896 "fmla v20.8h, v6.8h, v0.h[0]\n"
897 "fmla v22.8h, v6.8h, v1.h[0]\n"
898 "sub x27, x27, #0x8\n"
899 "add x26, x26, #0x10\n"
900 "fmla v24.8h, v6.8h, v2.h[0]\n"
901 "ldr q6, [x10, #0xe0]\n"
902 "fmla v21.8h, v7.8h, v0.h[0]\n"
903 "add x25, x25, #0x10\n"
904 "fmla v23.8h, v7.8h, v1.h[0]\n"
905 "fmla v25.8h, v7.8h, v2.h[0]\n"
906 "ldr q7, [x10, #0xf0]\n"
907 "add x24, x24, #0x10\n"
908 "cmp x27, #0x10\n"
909 "add x10, x10, #0x100\n"
910 "prfm pldl1keep, [x26, #0x80]\n"
911 "prfm pldl1keep, [x25, #0x80]\n"
912 "fmla v20.8h, v8.8h, v0.h[1]\n"
913 "fmla v22.8h, v8.8h, v1.h[1]\n"
914 "prfm pldl1keep, [x24, #0x80]\n"
915 "fmla v24.8h, v8.8h, v2.h[1]\n"
916 "ldr q8, [x10, #0x20]\n"
917 "fmla v21.8h, v9.8h, v0.h[1]\n"
918 "fmla v23.8h, v9.8h, v1.h[1]\n"
919 "fmla v25.8h, v9.8h, v2.h[1]\n"
920 "ldr q9, [x10, #0x30]\n"
921 "fmla v20.8h, v10.8h, v0.h[2]\n"
922 "fmla v22.8h, v10.8h, v1.h[2]\n"
923 "fmla v24.8h, v10.8h, v2.h[2]\n"
924 "ldr q10, [x10, #0x40]\n"
925 "fmla v21.8h, v11.8h, v0.h[2]\n"
926 "fmla v23.8h, v11.8h, v1.h[2]\n"
927 "fmla v25.8h, v11.8h, v2.h[2]\n"
928 "ldr q11, [x10, #0x50]\n"
929 "fmla v20.8h, v12.8h, v0.h[3]\n"
930 "fmla v22.8h, v12.8h, v1.h[3]\n"
931 "fmla v24.8h, v12.8h, v2.h[3]\n"
932 "ldr q12, [x10, #0x60]\n"
933 "fmla v21.8h, v13.8h, v0.h[3]\n"
934 "fmla v23.8h, v13.8h, v1.h[3]\n"
935 "fmla v25.8h, v13.8h, v2.h[3]\n"
936 "ldr q13, [x10, #0x70]\n"
937 "fmla v20.8h, v14.8h, v0.h[4]\n"
938 "fmla v22.8h, v14.8h, v1.h[4]\n"
939 "fmla v24.8h, v14.8h, v2.h[4]\n"
940 "ldr q14, [x10, #0x80]\n"
941 "fmla v21.8h, v15.8h, v0.h[4]\n"
942 "fmla v23.8h, v15.8h, v1.h[4]\n"
943 "fmla v25.8h, v15.8h, v2.h[4]\n"
944 "ldr q15, [x10, #0x90]\n"
945 "fmla v20.8h, v16.8h, v0.h[5]\n"
946 "fmla v22.8h, v16.8h, v1.h[5]\n"
947 "fmla v24.8h, v16.8h, v2.h[5]\n"
948 "ldr q16, [x10, #0xa0]\n"
949 "fmla v21.8h, v17.8h, v0.h[5]\n"
950 "fmla v23.8h, v17.8h, v1.h[5]\n"
951 "fmla v25.8h, v17.8h, v2.h[5]\n"
952 "ldr q17, [x10, #0xb0]\n"
953 "fmla v20.8h, v18.8h, v0.h[6]\n"
954 "fmla v22.8h, v18.8h, v1.h[6]\n"
955 "fmla v24.8h, v18.8h, v2.h[6]\n"
956 "ldr q18, [x10, #0xc0]\n"
957 "fmla v21.8h, v19.8h, v0.h[6]\n"
958 "fmla v23.8h, v19.8h, v1.h[6]\n"
959 "fmla v25.8h, v19.8h, v2.h[6]\n"
960 "ldr q19, [x10, #0xd0]\n"
961 "fmla v20.8h, v6.8h, v0.h[7]\n"
962 "fmla v22.8h, v6.8h, v1.h[7]\n"
963 "fmla v24.8h, v6.8h, v2.h[7]\n"
964 "ldr q6, [x10, #0x0]\n"
965 "fmla v21.8h, v7.8h, v0.h[7]\n"
966 "ldr q0, [x26, #0x0]\n"
967 "fmla v23.8h, v7.8h, v1.h[7]\n"
968 "ldr q1, [x25, #0x0]\n"
969 "fmla v25.8h, v7.8h, v2.h[7]\n"
970 "ldr q2, [x24, #0x0]\n"
971 "ldr q7, [x10, #0x10]\n"
972 "bge 84b\n"
973 "85:" // Height 3: Multiply loop: Single iteration only
974 "fmla v20.8h, v6.8h, v0.h[0]\n"
975 "fmla v22.8h, v6.8h, v1.h[0]\n"
976 "add x26, x26, #0x10\n"
977 "add x25, x25, #0x10\n"
978 "fmla v24.8h, v6.8h, v2.h[0]\n"
979 "ldr q6, [x10, #0xe0]\n"
980 "fmla v21.8h, v7.8h, v0.h[0]\n"
981 "add x24, x24, #0x10\n"
982 "fmla v23.8h, v7.8h, v1.h[0]\n"
983 "fmla v25.8h, v7.8h, v2.h[0]\n"
984 "ldr q7, [x10, #0xf0]\n"
985 "prfm pldl1keep, [x26, #0x80]\n"
986 "sub x27, x27, #0x8\n"
987 "prfm pldl1keep, [x25, #0x80]\n"
988 "add x10, x10, #0x100\n"
989 "prfm pldl1keep, [x24, #0x80]\n"
990 "fmla v20.8h, v8.8h, v0.h[1]\n"
991 "fmla v22.8h, v8.8h, v1.h[1]\n"
992 "fmla v24.8h, v8.8h, v2.h[1]\n"
993 "fmla v21.8h, v9.8h, v0.h[1]\n"
994 "fmla v23.8h, v9.8h, v1.h[1]\n"
995 "fmla v25.8h, v9.8h, v2.h[1]\n"
996 "fmla v20.8h, v10.8h, v0.h[2]\n"
997 "fmla v22.8h, v10.8h, v1.h[2]\n"
998 "fmla v24.8h, v10.8h, v2.h[2]\n"
999 "fmla v21.8h, v11.8h, v0.h[2]\n"
1000 "fmla v23.8h, v11.8h, v1.h[2]\n"
1001 "fmla v25.8h, v11.8h, v2.h[2]\n"
1002 "fmla v20.8h, v12.8h, v0.h[3]\n"
1003 "fmla v22.8h, v12.8h, v1.h[3]\n"
1004 "fmla v24.8h, v12.8h, v2.h[3]\n"
1005 "fmla v21.8h, v13.8h, v0.h[3]\n"
1006 "fmla v23.8h, v13.8h, v1.h[3]\n"
1007 "fmla v25.8h, v13.8h, v2.h[3]\n"
1008 "fmla v20.8h, v14.8h, v0.h[4]\n"
1009 "fmla v22.8h, v14.8h, v1.h[4]\n"
1010 "fmla v24.8h, v14.8h, v2.h[4]\n"
1011 "fmla v21.8h, v15.8h, v0.h[4]\n"
1012 "fmla v23.8h, v15.8h, v1.h[4]\n"
1013 "fmla v25.8h, v15.8h, v2.h[4]\n"
1014 "fmla v20.8h, v16.8h, v0.h[5]\n"
1015 "fmla v22.8h, v16.8h, v1.h[5]\n"
1016 "fmla v24.8h, v16.8h, v2.h[5]\n"
1017 "fmla v21.8h, v17.8h, v0.h[5]\n"
1018 "fmla v23.8h, v17.8h, v1.h[5]\n"
1019 "fmla v25.8h, v17.8h, v2.h[5]\n"
1020 "fmla v20.8h, v18.8h, v0.h[6]\n"
1021 "fmla v22.8h, v18.8h, v1.h[6]\n"
1022 "fmla v24.8h, v18.8h, v2.h[6]\n"
1023 "fmla v21.8h, v19.8h, v0.h[6]\n"
1024 "fmla v23.8h, v19.8h, v1.h[6]\n"
1025 "fmla v25.8h, v19.8h, v2.h[6]\n"
1026 "fmla v20.8h, v6.8h, v0.h[7]\n"
1027 "fmla v22.8h, v6.8h, v1.h[7]\n"
1028 "fmla v24.8h, v6.8h, v2.h[7]\n"
1029 "fmla v21.8h, v7.8h, v0.h[7]\n"
1030 "fmla v23.8h, v7.8h, v1.h[7]\n"
1031 "fmla v25.8h, v7.8h, v2.h[7]\n"
1032 "86:" // Height 3: Multiply loop: Main loop skip
1033 "cbz x27, 88f\n"
1034 "87:" // Height 3: Multiply loop: Odd block loop
1035 "ldr h0, [x26], #0x2\n"
1036 "ldr h1, [x25], #0x2\n"
1037 "sub x27, x27, #0x1\n"
1038 "ldr h2, [x24], #0x2\n"
1039 "ldr q8, [x10, #0x0]\n"
1040 "ldr q9, [x10, #0x10]\n"
1041 "add x10, x10, #0x20\n"
1042 "fmla v20.8h, v8.8h, v0.h[0]\n"
1043 "fmla v22.8h, v8.8h, v1.h[0]\n"
1044 "fmla v24.8h, v8.8h, v2.h[0]\n"
1045 "fmla v21.8h, v9.8h, v0.h[0]\n"
1046 "fmla v23.8h, v9.8h, v1.h[0]\n"
1047 "fmla v25.8h, v9.8h, v2.h[0]\n"
1048 "cbnz x27, 87b\n"
1049 "88:" // Height 3: Multiply loop: No odd multiplies
1050 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1051 "add x28, x28, #0x1\n"
1052 "cmp x28, x20\n"
1053 "bne 81b\n"
1054 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1055 "prfm pstl1keep, [x9, #0x0]\n"
1056 "add x26, x9, x20, LSL #1\n"
1057 "prfm pstl1keep, [x26, #0x0]\n"
1058 "add x25, x26, x20, LSL #1\n"
1059 "prfm pstl1keep, [x25, #0x0]\n"
1060 "tbz %x[flags], #1, 89f\n"
1061 "add x21, %x[args_ptr], %[offset_max]\n"
1062 "add x20, %x[args_ptr], %[offset_min]\n"
1063 "ld1r { v17.8h }, [x21]\n"
1064 "ld1r { v16.8h }, [x20]\n"
1065 "fmin v20.8h, v20.8h, v17.8h\n"
1066 "fmin v21.8h, v21.8h, v17.8h\n"
1067 "fmin v22.8h, v22.8h, v17.8h\n"
1068 "fmin v23.8h, v23.8h, v17.8h\n"
1069 "fmin v24.8h, v24.8h, v17.8h\n"
1070 "fmin v25.8h, v25.8h, v17.8h\n"
1071 "fmax v20.8h, v20.8h, v16.8h\n"
1072 "fmax v21.8h, v21.8h, v16.8h\n"
1073 "fmax v22.8h, v22.8h, v16.8h\n"
1074 "fmax v23.8h, v23.8h, v16.8h\n"
1075 "fmax v24.8h, v24.8h, v16.8h\n"
1076 "fmax v25.8h, v25.8h, v16.8h\n"
1077 "89:" // Height 3: No activation
1078 "cmp x11, #0x10\n"
1079 "bge 98f\n"
1080 "tbz x11, #3, 93f\n"
1081 "st1 { v20.8h }, [x9], #0x10\n"
1082 "st1 { v22.8h }, [x26], #0x10\n"
1083 "st1 { v24.8h }, [x25], #0x10\n"
1084 "tbz x11, #2, 91f\n"
1085 "str d21, [x9], #0x8\n"
1086 "str d23, [x26], #0x8\n"
1087 "str d25, [x25], #0x8\n"
1088 "tbz x11, #1, 90f\n"
1089 "st1 { v21.s }[2], [x9], #0x4\n"
1090 "st1 { v23.s }[2], [x26], #0x4\n"
1091 "st1 { v25.s }[2], [x25], #0x4\n"
1092 "tbz x11, #0, 97f\n"
1093 "st1 { v21.h }[6], [x9]\n"
1094 "st1 { v23.h }[6], [x26]\n"
1095 "st1 { v25.h }[6], [x25]\n"
1096 "b 97f\n"
1097 "90:" // Height 3: Partial direct writeback: partial_1_12
1098 "tbz x11, #0, 97f\n"
1099 "st1 { v21.h }[4], [x9]\n"
1100 "st1 { v23.h }[4], [x26]\n"
1101 "st1 { v25.h }[4], [x25]\n"
1102 "b 97f\n"
1103 "91:" // Height 3: Partial direct writeback: partial_2_8
1104 "tbz x11, #1, 92f\n"
1105 "str s21, [x9], #0x4\n"
1106 "str s23, [x26], #0x4\n"
1107 "str s25, [x25], #0x4\n"
1108 "tbz x11, #0, 97f\n"
1109 "st1 { v21.h }[2], [x9]\n"
1110 "st1 { v23.h }[2], [x26]\n"
1111 "st1 { v25.h }[2], [x25]\n"
1112 "b 97f\n"
1113 "92:" // Height 3: Partial direct writeback: partial_1_8
1114 "tbz x11, #0, 97f\n"
1115 "str h21, [x9, #0x0]\n"
1116 "str h23, [x26, #0x0]\n"
1117 "str h25, [x25, #0x0]\n"
1118 "b 97f\n"
1119 "93:" // Height 3: Partial direct writeback: partial_4_0
1120 "tbz x11, #2, 95f\n"
1121 "str d20, [x9], #0x8\n"
1122 "str d22, [x26], #0x8\n"
1123 "str d24, [x25], #0x8\n"
1124 "tbz x11, #1, 94f\n"
1125 "st1 { v20.s }[2], [x9], #0x4\n"
1126 "st1 { v22.s }[2], [x26], #0x4\n"
1127 "st1 { v24.s }[2], [x25], #0x4\n"
1128 "tbz x11, #0, 97f\n"
1129 "st1 { v20.h }[6], [x9]\n"
1130 "st1 { v22.h }[6], [x26]\n"
1131 "st1 { v24.h }[6], [x25]\n"
1132 "b 97f\n"
1133 "94:" // Height 3: Partial direct writeback: partial_1_4
1134 "tbz x11, #0, 97f\n"
1135 "st1 { v20.h }[4], [x9]\n"
1136 "st1 { v22.h }[4], [x26]\n"
1137 "st1 { v24.h }[4], [x25]\n"
1138 "b 97f\n"
1139 "95:" // Height 3: Partial direct writeback: partial_2_0
1140 "tbz x11, #1, 96f\n"
1141 "str s20, [x9], #0x4\n"
1142 "str s22, [x26], #0x4\n"
1143 "str s24, [x25], #0x4\n"
1144 "tbz x11, #0, 97f\n"
1145 "st1 { v20.h }[2], [x9]\n"
1146 "st1 { v22.h }[2], [x26]\n"
1147 "st1 { v24.h }[2], [x25]\n"
1148 "b 97f\n"
1149 "96:" // Height 3: Partial direct writeback: partial_1_0
1150 "str h20, [x9, #0x0]\n"
1151 "str h22, [x26, #0x0]\n"
1152 "str h24, [x25, #0x0]\n"
1153 "97:" // Height 3: Partial direct writeback: Done
1154 "b 99f\n"
1155 "98:" // Height 3: Full writeback
1156 "str q20, [x9, #0x0]\n"
1157 "str q21, [x9, #0x10]\n"
1158 "add x9, x9, #0x20\n"
1159 "str q22, [x26, #0x0]\n"
1160 "str q23, [x26, #0x10]\n"
1161 "str q24, [x25, #0x0]\n"
1162 "str q25, [x25, #0x10]\n"
1163 "99:" // Height 3: Writeback done
1164 "subs x11, x11, #0x10\n"
1165 "bgt 68b\n"
1166 "b 200f\n"
1167 "100:" // Height 4
1168 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
1169 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1170 "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
1171 "101:" // Height 4: Column loop
1172 "cbz x10, 102f\n"
1173 "ldr q20, [x10, #0x0]\n"
1174 "ldr q21, [x10, #0x10]\n"
1175 "add x10, x10, #0x20\n"
1176 "mov v22.16b, v20.16b\n"
1177 "mov v23.16b, v21.16b\n"
1178 "mov v24.16b, v20.16b\n"
1179 "mov v25.16b, v21.16b\n"
1180 "mov v26.16b, v20.16b\n"
1181 "mov v27.16b, v21.16b\n"
1182 "b 113f\n"
1183 "102:" // Height 4: no bias
1184 "tbz %x[flags], #0, 112f\n"
1185 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1186 "cmp x11, #0x10\n"
1187 "add x26, x9, x20, LSL #1\n"
1188 "add x25, x26, x20, LSL #1\n"
1189 "add x24, x25, x20, LSL #1\n"
1190 "bge 111f\n"
1191 "tbz x11, #3, 106f\n"
1192 "ld1 { v20.8h }, [x9], #0x10\n"
1193 "ld1 { v22.8h }, [x26], #0x10\n"
1194 "ld1 { v24.8h }, [x25], #0x10\n"
1195 "ld1 { v26.8h }, [x24], #0x10\n"
1196 "tbz x11, #2, 104f\n"
1197 "ldr d21, [x9], #0x8\n"
1198 "ldr d23, [x26], #0x8\n"
1199 "ldr d25, [x25], #0x8\n"
1200 "ldr d27, [x24], #0x8\n"
1201 "tbz x11, #1, 103f\n"
1202 "ld1 { v21.s }[2], [x9], #0x4\n"
1203 "ld1 { v23.s }[2], [x26], #0x4\n"
1204 "mov x20, #0x1c\n"
1205 "ld1 { v25.s }[2], [x25], #0x4\n"
1206 "ld1 { v27.s }[2], [x24], #0x4\n"
1207 "tbz x11, #0, 110f\n"
1208 "ld1 { v21.h }[6], [x9]\n"
1209 "ld1 { v23.h }[6], [x26]\n"
1210 "ld1 { v25.h }[6], [x25]\n"
1211 "ld1 { v27.h }[6], [x24]\n"
1212 "b 110f\n"
1213 "103:" // Height 4: Partial accumulate: partial_1_12
1214 "mov x20, #0x18\n"
1215 "tbz x11, #0, 110f\n"
1216 "ld1 { v21.h }[4], [x9]\n"
1217 "ld1 { v23.h }[4], [x26]\n"
1218 "ld1 { v25.h }[4], [x25]\n"
1219 "ld1 { v27.h }[4], [x24]\n"
1220 "b 110f\n"
1221 "104:" // Height 4: Partial accumulate: partial_2_8
1222 "tbz x11, #1, 105f\n"
1223 "ldr s21, [x9], #0x4\n"
1224 "ldr s23, [x26], #0x4\n"
1225 "mov x20, #0x14\n"
1226 "ldr s25, [x25], #0x4\n"
1227 "ldr s27, [x24], #0x4\n"
1228 "tbz x11, #0, 110f\n"
1229 "ld1 { v21.h }[2], [x9]\n"
1230 "ld1 { v23.h }[2], [x26]\n"
1231 "ld1 { v25.h }[2], [x25]\n"
1232 "ld1 { v27.h }[2], [x24]\n"
1233 "b 110f\n"
1234 "105:" // Height 4: Partial accumulate: partial_1_8
1235 "mov x20, #0x10\n"
1236 "tbz x11, #0, 110f\n"
1237 "ldr h21, [x9, #0x0]\n"
1238 "ldr h23, [x26, #0x0]\n"
1239 "ldr h25, [x25, #0x0]\n"
1240 "ldr h27, [x24, #0x0]\n"
1241 "b 110f\n"
1242 "106:" // Height 4: Partial accumulate: partial_4_0
1243 "tbz x11, #2, 108f\n"
1244 "ldr d20, [x9], #0x8\n"
1245 "ldr d22, [x26], #0x8\n"
1246 "ldr d24, [x25], #0x8\n"
1247 "ldr d26, [x24], #0x8\n"
1248 "tbz x11, #1, 107f\n"
1249 "ld1 { v20.s }[2], [x9], #0x4\n"
1250 "ld1 { v22.s }[2], [x26], #0x4\n"
1251 "mov x20, #0xc\n"
1252 "ld1 { v24.s }[2], [x25], #0x4\n"
1253 "ld1 { v26.s }[2], [x24], #0x4\n"
1254 "tbz x11, #0, 110f\n"
1255 "ld1 { v20.h }[6], [x9]\n"
1256 "ld1 { v22.h }[6], [x26]\n"
1257 "ld1 { v24.h }[6], [x25]\n"
1258 "ld1 { v26.h }[6], [x24]\n"
1259 "b 110f\n"
1260 "107:" // Height 4: Partial accumulate: partial_1_4
1261 "mov x20, #0x8\n"
1262 "tbz x11, #0, 110f\n"
1263 "ld1 { v20.h }[4], [x9]\n"
1264 "ld1 { v22.h }[4], [x26]\n"
1265 "ld1 { v24.h }[4], [x25]\n"
1266 "ld1 { v26.h }[4], [x24]\n"
1267 "b 110f\n"
1268 "108:" // Height 4: Partial accumulate: partial_2_0
1269 "tbz x11, #1, 109f\n"
1270 "ldr s20, [x9], #0x4\n"
1271 "ldr s22, [x26], #0x4\n"
1272 "mov x20, #0x4\n"
1273 "ldr s24, [x25], #0x4\n"
1274 "ldr s26, [x24], #0x4\n"
1275 "tbz x11, #0, 110f\n"
1276 "ld1 { v20.h }[2], [x9]\n"
1277 "ld1 { v22.h }[2], [x26]\n"
1278 "ld1 { v24.h }[2], [x25]\n"
1279 "ld1 { v26.h }[2], [x24]\n"
1280 "b 110f\n"
1281 "109:" // Height 4: Partial accumulate: partial_1_0
1282 "ldr h20, [x9, #0x0]\n"
1283 "ldr h22, [x26, #0x0]\n"
1284 "mov x20, #0x0\n"
1285 "ldr h24, [x25, #0x0]\n"
1286 "ldr h26, [x24, #0x0]\n"
1287 "110:" // Height 4: Partial accumulate: Done
1288 "sub x9, x9, x20\n"
1289 "b 113f\n"
1290 "111:" // Height 4: full accumulate
1291 "ldr q20, [x9, #0x0]\n"
1292 "ldr q21, [x9, #0x10]\n"
1293 "ldr q22, [x26, #0x0]\n"
1294 "ldr q23, [x26, #0x10]\n"
1295 "ldr q24, [x25, #0x0]\n"
1296 "ldr q25, [x25, #0x10]\n"
1297 "ldr q26, [x24, #0x0]\n"
1298 "ldr q27, [x24, #0x10]\n"
1299 "b 113f\n"
1300 "112:" // Height 4: no accumulate
1301 "movi v20.16b, #0x0\n"
1302 "movi v21.16b, #0x0\n"
1303 "movi v22.16b, #0x0\n"
1304 "movi v23.16b, #0x0\n"
1305 "movi v24.16b, #0x0\n"
1306 "movi v25.16b, #0x0\n"
1307 "movi v26.16b, #0x0\n"
1308 "movi v27.16b, #0x0\n"
1309 "113:" // Height 4: setup done
1310 "mov x28, #0x0\n"
1311 "114:" // Height 4: String loop
1312 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1313 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1314 "ldr w27, [x20, x28, LSL #0x2]\n"
1315 "tbz %x[flags], #3, 115f\n"
1316 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
1317 "add x20, x20, x21, LSL #3\n"
1318 "ldr x26, [x20, #0x0]\n"
1319 "ldr x25, [x20, #0x8]\n"
1320 "ldr x24, [x20, #0x10]\n"
1321 "ldr x23, [x20, #0x18]\n"
1322 "cbnz x28, 116f\n"
1323 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1324 "add x26, x26, x20, LSL #1\n"
1325 "add x25, x25, x20, LSL #1\n"
1326 "add x24, x24, x20, LSL #1\n"
1327 "add x23, x23, x20, LSL #1\n"
1328 "b 116f\n"
1329 "115:" // Height 4: setup direct input
1330 "mov x26, %x[input_ptr]\n"
1331 "add x25, x26, x21, LSL #1\n"
1332 "add x24, x25, x21, LSL #1\n"
1333 "add x23, x24, x21, LSL #1\n"
1334 "116:" // Height 4: input setup done
1335 "cmp x27, #0x8\n"
1336 "blt 119f\n"
1337 "ldr q0, [x26, #0x0]\n"
1338 "ldr q1, [x25, #0x0]\n"
1339 "cmp x27, #0x10\n"
1340 "ldr q2, [x24, #0x0]\n"
1341 "ldr q3, [x23, #0x0]\n"
1342 "ldr q6, [x10, #0x0]\n"
1343 "ldr q7, [x10, #0x10]\n"
1344 "ldr q8, [x10, #0x20]\n"
1345 "ldr q9, [x10, #0x30]\n"
1346 "ldr q10, [x10, #0x40]\n"
1347 "ldr q11, [x10, #0x50]\n"
1348 "ldr q12, [x10, #0x60]\n"
1349 "ldr q13, [x10, #0x70]\n"
1350 "ldr q14, [x10, #0x80]\n"
1351 "ldr q15, [x10, #0x90]\n"
1352 "ldr q16, [x10, #0xa0]\n"
1353 "ldr q17, [x10, #0xb0]\n"
1354 "ldr q18, [x10, #0xc0]\n"
1355 "ldr q19, [x10, #0xd0]\n"
1356 "blt 118f\n"
1357 "117:" // Height 4: Multiply loop: Main loop head
1358 "fmla v20.8h, v6.8h, v0.h[0]\n"
1359 "fmla v22.8h, v6.8h, v1.h[0]\n"
1360 "sub x27, x27, #0x8\n"
1361 "add x26, x26, #0x10\n"
1362 "fmla v24.8h, v6.8h, v2.h[0]\n"
1363 "fmla v26.8h, v6.8h, v3.h[0]\n"
1364 "ldr q6, [x10, #0xe0]\n"
1365 "add x25, x25, #0x10\n"
1366 "fmla v21.8h, v7.8h, v0.h[0]\n"
1367 "fmla v23.8h, v7.8h, v1.h[0]\n"
1368 "add x24, x24, #0x10\n"
1369 "add x23, x23, #0x10\n"
1370 "fmla v25.8h, v7.8h, v2.h[0]\n"
1371 "fmla v27.8h, v7.8h, v3.h[0]\n"
1372 "ldr q7, [x10, #0xf0]\n"
1373 "cmp x27, #0x10\n"
1374 "fmla v20.8h, v8.8h, v0.h[1]\n"
1375 "fmla v22.8h, v8.8h, v1.h[1]\n"
1376 "add x10, x10, #0x100\n"
1377 "prfm pldl1keep, [x26, #0x80]\n"
1378 "fmla v24.8h, v8.8h, v2.h[1]\n"
1379 "fmla v26.8h, v8.8h, v3.h[1]\n"
1380 "ldr q8, [x10, #0x20]\n"
1381 "prfm pldl1keep, [x25, #0x80]\n"
1382 "fmla v21.8h, v9.8h, v0.h[1]\n"
1383 "fmla v23.8h, v9.8h, v1.h[1]\n"
1384 "prfm pldl1keep, [x24, #0x80]\n"
1385 "prfm pldl1keep, [x23, #0x80]\n"
1386 "fmla v25.8h, v9.8h, v2.h[1]\n"
1387 "fmla v27.8h, v9.8h, v3.h[1]\n"
1388 "ldr q9, [x10, #0x30]\n"
1389 "fmla v20.8h, v10.8h, v0.h[2]\n"
1390 "fmla v22.8h, v10.8h, v1.h[2]\n"
1391 "fmla v24.8h, v10.8h, v2.h[2]\n"
1392 "fmla v26.8h, v10.8h, v3.h[2]\n"
1393 "ldr q10, [x10, #0x40]\n"
1394 "fmla v21.8h, v11.8h, v0.h[2]\n"
1395 "fmla v23.8h, v11.8h, v1.h[2]\n"
1396 "fmla v25.8h, v11.8h, v2.h[2]\n"
1397 "fmla v27.8h, v11.8h, v3.h[2]\n"
1398 "ldr q11, [x10, #0x50]\n"
1399 "fmla v20.8h, v12.8h, v0.h[3]\n"
1400 "fmla v22.8h, v12.8h, v1.h[3]\n"
1401 "fmla v24.8h, v12.8h, v2.h[3]\n"
1402 "fmla v26.8h, v12.8h, v3.h[3]\n"
1403 "ldr q12, [x10, #0x60]\n"
1404 "fmla v21.8h, v13.8h, v0.h[3]\n"
1405 "fmla v23.8h, v13.8h, v1.h[3]\n"
1406 "fmla v25.8h, v13.8h, v2.h[3]\n"
1407 "fmla v27.8h, v13.8h, v3.h[3]\n"
1408 "ldr q13, [x10, #0x70]\n"
1409 "fmla v20.8h, v14.8h, v0.h[4]\n"
1410 "fmla v22.8h, v14.8h, v1.h[4]\n"
1411 "fmla v24.8h, v14.8h, v2.h[4]\n"
1412 "fmla v26.8h, v14.8h, v3.h[4]\n"
1413 "ldr q14, [x10, #0x80]\n"
1414 "fmla v21.8h, v15.8h, v0.h[4]\n"
1415 "fmla v23.8h, v15.8h, v1.h[4]\n"
1416 "fmla v25.8h, v15.8h, v2.h[4]\n"
1417 "fmla v27.8h, v15.8h, v3.h[4]\n"
1418 "ldr q15, [x10, #0x90]\n"
1419 "fmla v20.8h, v16.8h, v0.h[5]\n"
1420 "fmla v22.8h, v16.8h, v1.h[5]\n"
1421 "fmla v24.8h, v16.8h, v2.h[5]\n"
1422 "fmla v26.8h, v16.8h, v3.h[5]\n"
1423 "ldr q16, [x10, #0xa0]\n"
1424 "fmla v21.8h, v17.8h, v0.h[5]\n"
1425 "fmla v23.8h, v17.8h, v1.h[5]\n"
1426 "fmla v25.8h, v17.8h, v2.h[5]\n"
1427 "fmla v27.8h, v17.8h, v3.h[5]\n"
1428 "ldr q17, [x10, #0xb0]\n"
1429 "fmla v20.8h, v18.8h, v0.h[6]\n"
1430 "fmla v22.8h, v18.8h, v1.h[6]\n"
1431 "fmla v24.8h, v18.8h, v2.h[6]\n"
1432 "fmla v26.8h, v18.8h, v3.h[6]\n"
1433 "ldr q18, [x10, #0xc0]\n"
1434 "fmla v21.8h, v19.8h, v0.h[6]\n"
1435 "fmla v23.8h, v19.8h, v1.h[6]\n"
1436 "fmla v25.8h, v19.8h, v2.h[6]\n"
1437 "fmla v27.8h, v19.8h, v3.h[6]\n"
1438 "ldr q19, [x10, #0xd0]\n"
1439 "fmla v20.8h, v6.8h, v0.h[7]\n"
1440 "fmla v22.8h, v6.8h, v1.h[7]\n"
1441 "fmla v24.8h, v6.8h, v2.h[7]\n"
1442 "fmla v26.8h, v6.8h, v3.h[7]\n"
1443 "ldr q6, [x10, #0x0]\n"
1444 "fmla v21.8h, v7.8h, v0.h[7]\n"
1445 "ldr q0, [x26, #0x0]\n"
1446 "fmla v23.8h, v7.8h, v1.h[7]\n"
1447 "ldr q1, [x25, #0x0]\n"
1448 "fmla v25.8h, v7.8h, v2.h[7]\n"
1449 "ldr q2, [x24, #0x0]\n"
1450 "fmla v27.8h, v7.8h, v3.h[7]\n"
1451 "ldr q3, [x23, #0x0]\n"
1452 "ldr q7, [x10, #0x10]\n"
1453 "bge 117b\n"
1454 "118:" // Height 4: Multiply loop: Single iteration only
1455 "fmla v20.8h, v6.8h, v0.h[0]\n"
1456 "fmla v22.8h, v6.8h, v1.h[0]\n"
1457 "add x26, x26, #0x10\n"
1458 "add x25, x25, #0x10\n"
1459 "fmla v24.8h, v6.8h, v2.h[0]\n"
1460 "fmla v26.8h, v6.8h, v3.h[0]\n"
1461 "ldr q6, [x10, #0xe0]\n"
1462 "add x24, x24, #0x10\n"
1463 "fmla v21.8h, v7.8h, v0.h[0]\n"
1464 "fmla v23.8h, v7.8h, v1.h[0]\n"
1465 "add x23, x23, #0x10\n"
1466 "sub x27, x27, #0x8\n"
1467 "fmla v25.8h, v7.8h, v2.h[0]\n"
1468 "fmla v27.8h, v7.8h, v3.h[0]\n"
1469 "ldr q7, [x10, #0xf0]\n"
1470 "prfm pldl1keep, [x26, #0x80]\n"
1471 "fmla v20.8h, v8.8h, v0.h[1]\n"
1472 "fmla v22.8h, v8.8h, v1.h[1]\n"
1473 "prfm pldl1keep, [x25, #0x80]\n"
1474 "prfm pldl1keep, [x24, #0x80]\n"
1475 "fmla v24.8h, v8.8h, v2.h[1]\n"
1476 "fmla v26.8h, v8.8h, v3.h[1]\n"
1477 "prfm pldl1keep, [x23, #0x80]\n"
1478 "add x10, x10, #0x100\n"
1479 "fmla v21.8h, v9.8h, v0.h[1]\n"
1480 "fmla v23.8h, v9.8h, v1.h[1]\n"
1481 "fmla v25.8h, v9.8h, v2.h[1]\n"
1482 "fmla v27.8h, v9.8h, v3.h[1]\n"
1483 "fmla v20.8h, v10.8h, v0.h[2]\n"
1484 "fmla v22.8h, v10.8h, v1.h[2]\n"
1485 "fmla v24.8h, v10.8h, v2.h[2]\n"
1486 "fmla v26.8h, v10.8h, v3.h[2]\n"
1487 "fmla v21.8h, v11.8h, v0.h[2]\n"
1488 "fmla v23.8h, v11.8h, v1.h[2]\n"
1489 "fmla v25.8h, v11.8h, v2.h[2]\n"
1490 "fmla v27.8h, v11.8h, v3.h[2]\n"
1491 "fmla v20.8h, v12.8h, v0.h[3]\n"
1492 "fmla v22.8h, v12.8h, v1.h[3]\n"
1493 "fmla v24.8h, v12.8h, v2.h[3]\n"
1494 "fmla v26.8h, v12.8h, v3.h[3]\n"
1495 "fmla v21.8h, v13.8h, v0.h[3]\n"
1496 "fmla v23.8h, v13.8h, v1.h[3]\n"
1497 "fmla v25.8h, v13.8h, v2.h[3]\n"
1498 "fmla v27.8h, v13.8h, v3.h[3]\n"
1499 "fmla v20.8h, v14.8h, v0.h[4]\n"
1500 "fmla v22.8h, v14.8h, v1.h[4]\n"
1501 "fmla v24.8h, v14.8h, v2.h[4]\n"
1502 "fmla v26.8h, v14.8h, v3.h[4]\n"
1503 "fmla v21.8h, v15.8h, v0.h[4]\n"
1504 "fmla v23.8h, v15.8h, v1.h[4]\n"
1505 "fmla v25.8h, v15.8h, v2.h[4]\n"
1506 "fmla v27.8h, v15.8h, v3.h[4]\n"
1507 "fmla v20.8h, v16.8h, v0.h[5]\n"
1508 "fmla v22.8h, v16.8h, v1.h[5]\n"
1509 "fmla v24.8h, v16.8h, v2.h[5]\n"
1510 "fmla v26.8h, v16.8h, v3.h[5]\n"
1511 "fmla v21.8h, v17.8h, v0.h[5]\n"
1512 "fmla v23.8h, v17.8h, v1.h[5]\n"
1513 "fmla v25.8h, v17.8h, v2.h[5]\n"
1514 "fmla v27.8h, v17.8h, v3.h[5]\n"
1515 "fmla v20.8h, v18.8h, v0.h[6]\n"
1516 "fmla v22.8h, v18.8h, v1.h[6]\n"
1517 "fmla v24.8h, v18.8h, v2.h[6]\n"
1518 "fmla v26.8h, v18.8h, v3.h[6]\n"
1519 "fmla v21.8h, v19.8h, v0.h[6]\n"
1520 "fmla v23.8h, v19.8h, v1.h[6]\n"
1521 "fmla v25.8h, v19.8h, v2.h[6]\n"
1522 "fmla v27.8h, v19.8h, v3.h[6]\n"
1523 "fmla v20.8h, v6.8h, v0.h[7]\n"
1524 "fmla v22.8h, v6.8h, v1.h[7]\n"
1525 "fmla v24.8h, v6.8h, v2.h[7]\n"
1526 "fmla v26.8h, v6.8h, v3.h[7]\n"
1527 "fmla v21.8h, v7.8h, v0.h[7]\n"
1528 "fmla v23.8h, v7.8h, v1.h[7]\n"
1529 "fmla v25.8h, v7.8h, v2.h[7]\n"
1530 "fmla v27.8h, v7.8h, v3.h[7]\n"
1531 "119:" // Height 4: Multiply loop: Main loop skip
1532 "cbz x27, 121f\n"
1533 "120:" // Height 4: Multiply loop: Odd block loop
1534 "ldr h0, [x26], #0x2\n"
1535 "ldr h1, [x25], #0x2\n"
1536 "sub x27, x27, #0x1\n"
1537 "ldr h2, [x24], #0x2\n"
1538 "ldr h3, [x23], #0x2\n"
1539 "ldr q8, [x10, #0x0]\n"
1540 "ldr q9, [x10, #0x10]\n"
1541 "add x10, x10, #0x20\n"
1542 "fmla v20.8h, v8.8h, v0.h[0]\n"
1543 "fmla v22.8h, v8.8h, v1.h[0]\n"
1544 "fmla v24.8h, v8.8h, v2.h[0]\n"
1545 "fmla v26.8h, v8.8h, v3.h[0]\n"
1546 "fmla v21.8h, v9.8h, v0.h[0]\n"
1547 "fmla v23.8h, v9.8h, v1.h[0]\n"
1548 "fmla v25.8h, v9.8h, v2.h[0]\n"
1549 "fmla v27.8h, v9.8h, v3.h[0]\n"
1550 "cbnz x27, 120b\n"
1551 "121:" // Height 4: Multiply loop: No odd multiplies
1552 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
1553 "add x28, x28, #0x1\n"
1554 "cmp x28, x20\n"
1555 "bne 114b\n"
1556 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1557 "prfm pstl1keep, [x9, #0x0]\n"
1558 "add x26, x9, x20, LSL #1\n"
1559 "prfm pstl1keep, [x26, #0x0]\n"
1560 "add x25, x26, x20, LSL #1\n"
1561 "prfm pstl1keep, [x25, #0x0]\n"
1562 "add x24, x25, x20, LSL #1\n"
1563 "prfm pstl1keep, [x24, #0x0]\n"
1564 "tbz %x[flags], #1, 122f\n"
1565 "add x21, %x[args_ptr], %[offset_max]\n"
1566 "add x20, %x[args_ptr], %[offset_min]\n"
1567 "ld1r { v17.8h }, [x21]\n"
1568 "ld1r { v16.8h }, [x20]\n"
1569 "fmin v20.8h, v20.8h, v17.8h\n"
1570 "fmin v21.8h, v21.8h, v17.8h\n"
1571 "fmin v22.8h, v22.8h, v17.8h\n"
1572 "fmin v23.8h, v23.8h, v17.8h\n"
1573 "fmin v24.8h, v24.8h, v17.8h\n"
1574 "fmin v25.8h, v25.8h, v17.8h\n"
1575 "fmin v26.8h, v26.8h, v17.8h\n"
1576 "fmin v27.8h, v27.8h, v17.8h\n"
1577 "fmax v20.8h, v20.8h, v16.8h\n"
1578 "fmax v21.8h, v21.8h, v16.8h\n"
1579 "fmax v22.8h, v22.8h, v16.8h\n"
1580 "fmax v23.8h, v23.8h, v16.8h\n"
1581 "fmax v24.8h, v24.8h, v16.8h\n"
1582 "fmax v25.8h, v25.8h, v16.8h\n"
1583 "fmax v26.8h, v26.8h, v16.8h\n"
1584 "fmax v27.8h, v27.8h, v16.8h\n"
1585 "122:" // Height 4: No activation
1586 "cmp x11, #0x10\n"
1587 "bge 131f\n"
1588 "tbz x11, #3, 126f\n"
1589 "st1 { v20.8h }, [x9], #0x10\n"
1590 "st1 { v22.8h }, [x26], #0x10\n"
1591 "st1 { v24.8h }, [x25], #0x10\n"
1592 "st1 { v26.8h }, [x24], #0x10\n"
1593 "tbz x11, #2, 124f\n"
1594 "str d21, [x9], #0x8\n"
1595 "str d23, [x26], #0x8\n"
1596 "str d25, [x25], #0x8\n"
1597 "str d27, [x24], #0x8\n"
1598 "tbz x11, #1, 123f\n"
1599 "st1 { v21.s }[2], [x9], #0x4\n"
1600 "st1 { v23.s }[2], [x26], #0x4\n"
1601 "st1 { v25.s }[2], [x25], #0x4\n"
1602 "st1 { v27.s }[2], [x24], #0x4\n"
1603 "tbz x11, #0, 130f\n"
1604 "st1 { v21.h }[6], [x9]\n"
1605 "st1 { v23.h }[6], [x26]\n"
1606 "st1 { v25.h }[6], [x25]\n"
1607 "st1 { v27.h }[6], [x24]\n"
1608 "b 130f\n"
1609 "123:" // Height 4: Partial direct writeback: partial_1_12
1610 "tbz x11, #0, 130f\n"
1611 "st1 { v21.h }[4], [x9]\n"
1612 "st1 { v23.h }[4], [x26]\n"
1613 "st1 { v25.h }[4], [x25]\n"
1614 "st1 { v27.h }[4], [x24]\n"
1615 "b 130f\n"
1616 "124:" // Height 4: Partial direct writeback: partial_2_8
1617 "tbz x11, #1, 125f\n"
1618 "str s21, [x9], #0x4\n"
1619 "str s23, [x26], #0x4\n"
1620 "str s25, [x25], #0x4\n"
1621 "str s27, [x24], #0x4\n"
1622 "tbz x11, #0, 130f\n"
1623 "st1 { v21.h }[2], [x9]\n"
1624 "st1 { v23.h }[2], [x26]\n"
1625 "st1 { v25.h }[2], [x25]\n"
1626 "st1 { v27.h }[2], [x24]\n"
1627 "b 130f\n"
1628 "125:" // Height 4: Partial direct writeback: partial_1_8
1629 "tbz x11, #0, 130f\n"
1630 "str h21, [x9, #0x0]\n"
1631 "str h23, [x26, #0x0]\n"
1632 "str h25, [x25, #0x0]\n"
1633 "str h27, [x24, #0x0]\n"
1634 "b 130f\n"
1635 "126:" // Height 4: Partial direct writeback: partial_4_0
1636 "tbz x11, #2, 128f\n"
1637 "str d20, [x9], #0x8\n"
1638 "str d22, [x26], #0x8\n"
1639 "str d24, [x25], #0x8\n"
1640 "str d26, [x24], #0x8\n"
1641 "tbz x11, #1, 127f\n"
1642 "st1 { v20.s }[2], [x9], #0x4\n"
1643 "st1 { v22.s }[2], [x26], #0x4\n"
1644 "st1 { v24.s }[2], [x25], #0x4\n"
1645 "st1 { v26.s }[2], [x24], #0x4\n"
1646 "tbz x11, #0, 130f\n"
1647 "st1 { v20.h }[6], [x9]\n"
1648 "st1 { v22.h }[6], [x26]\n"
1649 "st1 { v24.h }[6], [x25]\n"
1650 "st1 { v26.h }[6], [x24]\n"
1651 "b 130f\n"
1652 "127:" // Height 4: Partial direct writeback: partial_1_4
1653 "tbz x11, #0, 130f\n"
1654 "st1 { v20.h }[4], [x9]\n"
1655 "st1 { v22.h }[4], [x26]\n"
1656 "st1 { v24.h }[4], [x25]\n"
1657 "st1 { v26.h }[4], [x24]\n"
1658 "b 130f\n"
1659 "128:" // Height 4: Partial direct writeback: partial_2_0
1660 "tbz x11, #1, 129f\n"
1661 "str s20, [x9], #0x4\n"
1662 "str s22, [x26], #0x4\n"
1663 "str s24, [x25], #0x4\n"
1664 "str s26, [x24], #0x4\n"
1665 "tbz x11, #0, 130f\n"
1666 "st1 { v20.h }[2], [x9]\n"
1667 "st1 { v22.h }[2], [x26]\n"
1668 "st1 { v24.h }[2], [x25]\n"
1669 "st1 { v26.h }[2], [x24]\n"
1670 "b 130f\n"
1671 "129:" // Height 4: Partial direct writeback: partial_1_0
1672 "str h20, [x9, #0x0]\n"
1673 "str h22, [x26, #0x0]\n"
1674 "str h24, [x25, #0x0]\n"
1675 "str h26, [x24, #0x0]\n"
1676 "130:" // Height 4: Partial direct writeback: Done
1677 "b 132f\n"
1678 "131:" // Height 4: Full writeback
1679 "str q20, [x9, #0x0]\n"
1680 "str q21, [x9, #0x10]\n"
1681 "add x9, x9, #0x20\n"
1682 "str q22, [x26, #0x0]\n"
1683 "str q23, [x26, #0x10]\n"
1684 "str q24, [x25, #0x0]\n"
1685 "str q25, [x25, #0x10]\n"
1686 "str q26, [x24, #0x0]\n"
1687 "str q27, [x24, #0x10]\n"
1688 "132:" // Height 4: Writeback done
1689 "subs x11, x11, #0x10\n"
1690 "bgt 101b\n"
1691 "b 200f\n"
1692 "133:" // Height 5
1693 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
1694 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1695 "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
1696 "134:" // Height 5: Column loop
1697 "cbz x10, 135f\n"
1698 "ldr q20, [x10, #0x0]\n"
1699 "ldr q21, [x10, #0x10]\n"
1700 "add x10, x10, #0x20\n"
1701 "mov v22.16b, v20.16b\n"
1702 "mov v23.16b, v21.16b\n"
1703 "mov v24.16b, v20.16b\n"
1704 "mov v25.16b, v21.16b\n"
1705 "mov v26.16b, v20.16b\n"
1706 "mov v27.16b, v21.16b\n"
1707 "mov v28.16b, v20.16b\n"
1708 "mov v29.16b, v21.16b\n"
1709 "b 146f\n"
1710 "135:" // Height 5: no bias
1711 "tbz %x[flags], #0, 145f\n"
1712 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1713 "cmp x11, #0x10\n"
1714 "add x26, x9, x20, LSL #1\n"
1715 "add x25, x26, x20, LSL #1\n"
1716 "add x24, x25, x20, LSL #1\n"
1717 "add x23, x24, x20, LSL #1\n"
1718 "bge 144f\n"
1719 "tbz x11, #3, 139f\n"
1720 "ld1 { v20.8h }, [x9], #0x10\n"
1721 "ld1 { v22.8h }, [x26], #0x10\n"
1722 "ld1 { v24.8h }, [x25], #0x10\n"
1723 "ld1 { v26.8h }, [x24], #0x10\n"
1724 "ld1 { v28.8h }, [x23], #0x10\n"
1725 "tbz x11, #2, 137f\n"
1726 "ldr d21, [x9], #0x8\n"
1727 "ldr d23, [x26], #0x8\n"
1728 "ldr d25, [x25], #0x8\n"
1729 "ldr d27, [x24], #0x8\n"
1730 "ldr d29, [x23], #0x8\n"
1731 "tbz x11, #1, 136f\n"
1732 "ld1 { v21.s }[2], [x9], #0x4\n"
1733 "ld1 { v23.s }[2], [x26], #0x4\n"
1734 "mov x20, #0x1c\n"
1735 "ld1 { v25.s }[2], [x25], #0x4\n"
1736 "ld1 { v27.s }[2], [x24], #0x4\n"
1737 "ld1 { v29.s }[2], [x23], #0x4\n"
1738 "tbz x11, #0, 143f\n"
1739 "ld1 { v21.h }[6], [x9]\n"
1740 "ld1 { v23.h }[6], [x26]\n"
1741 "ld1 { v25.h }[6], [x25]\n"
1742 "ld1 { v27.h }[6], [x24]\n"
1743 "ld1 { v29.h }[6], [x23]\n"
1744 "b 143f\n"
1745 "136:" // Height 5: Partial accumulate: partial_1_12
1746 "mov x20, #0x18\n"
1747 "tbz x11, #0, 143f\n"
1748 "ld1 { v21.h }[4], [x9]\n"
1749 "ld1 { v23.h }[4], [x26]\n"
1750 "ld1 { v25.h }[4], [x25]\n"
1751 "ld1 { v27.h }[4], [x24]\n"
1752 "ld1 { v29.h }[4], [x23]\n"
1753 "b 143f\n"
1754 "137:" // Height 5: Partial accumulate: partial_2_8
1755 "tbz x11, #1, 138f\n"
1756 "ldr s21, [x9], #0x4\n"
1757 "ldr s23, [x26], #0x4\n"
1758 "mov x20, #0x14\n"
1759 "ldr s25, [x25], #0x4\n"
1760 "ldr s27, [x24], #0x4\n"
1761 "ldr s29, [x23], #0x4\n"
1762 "tbz x11, #0, 143f\n"
1763 "ld1 { v21.h }[2], [x9]\n"
1764 "ld1 { v23.h }[2], [x26]\n"
1765 "ld1 { v25.h }[2], [x25]\n"
1766 "ld1 { v27.h }[2], [x24]\n"
1767 "ld1 { v29.h }[2], [x23]\n"
1768 "b 143f\n"
1769 "138:" // Height 5: Partial accumulate: partial_1_8
1770 "mov x20, #0x10\n"
1771 "tbz x11, #0, 143f\n"
1772 "ldr h21, [x9, #0x0]\n"
1773 "ldr h23, [x26, #0x0]\n"
1774 "ldr h25, [x25, #0x0]\n"
1775 "ldr h27, [x24, #0x0]\n"
1776 "ldr h29, [x23, #0x0]\n"
1777 "b 143f\n"
1778 "139:" // Height 5: Partial accumulate: partial_4_0
1779 "tbz x11, #2, 141f\n"
1780 "ldr d20, [x9], #0x8\n"
1781 "ldr d22, [x26], #0x8\n"
1782 "ldr d24, [x25], #0x8\n"
1783 "ldr d26, [x24], #0x8\n"
1784 "ldr d28, [x23], #0x8\n"
1785 "tbz x11, #1, 140f\n"
1786 "ld1 { v20.s }[2], [x9], #0x4\n"
1787 "ld1 { v22.s }[2], [x26], #0x4\n"
1788 "mov x20, #0xc\n"
1789 "ld1 { v24.s }[2], [x25], #0x4\n"
1790 "ld1 { v26.s }[2], [x24], #0x4\n"
1791 "ld1 { v28.s }[2], [x23], #0x4\n"
1792 "tbz x11, #0, 143f\n"
1793 "ld1 { v20.h }[6], [x9]\n"
1794 "ld1 { v22.h }[6], [x26]\n"
1795 "ld1 { v24.h }[6], [x25]\n"
1796 "ld1 { v26.h }[6], [x24]\n"
1797 "ld1 { v28.h }[6], [x23]\n"
1798 "b 143f\n"
1799 "140:" // Height 5: Partial accumulate: partial_1_4
1800 "mov x20, #0x8\n"
1801 "tbz x11, #0, 143f\n"
1802 "ld1 { v20.h }[4], [x9]\n"
1803 "ld1 { v22.h }[4], [x26]\n"
1804 "ld1 { v24.h }[4], [x25]\n"
1805 "ld1 { v26.h }[4], [x24]\n"
1806 "ld1 { v28.h }[4], [x23]\n"
1807 "b 143f\n"
1808 "141:" // Height 5: Partial accumulate: partial_2_0
1809 "tbz x11, #1, 142f\n"
1810 "ldr s20, [x9], #0x4\n"
1811 "ldr s22, [x26], #0x4\n"
1812 "mov x20, #0x4\n"
1813 "ldr s24, [x25], #0x4\n"
1814 "ldr s26, [x24], #0x4\n"
1815 "ldr s28, [x23], #0x4\n"
1816 "tbz x11, #0, 143f\n"
1817 "ld1 { v20.h }[2], [x9]\n"
1818 "ld1 { v22.h }[2], [x26]\n"
1819 "ld1 { v24.h }[2], [x25]\n"
1820 "ld1 { v26.h }[2], [x24]\n"
1821 "ld1 { v28.h }[2], [x23]\n"
1822 "b 143f\n"
1823 "142:" // Height 5: Partial accumulate: partial_1_0
1824 "ldr h20, [x9, #0x0]\n"
1825 "ldr h22, [x26, #0x0]\n"
1826 "mov x20, #0x0\n"
1827 "ldr h24, [x25, #0x0]\n"
1828 "ldr h26, [x24, #0x0]\n"
1829 "ldr h28, [x23, #0x0]\n"
1830 "143:" // Height 5: Partial accumulate: Done
1831 "sub x9, x9, x20\n"
1832 "b 146f\n"
1833 "144:" // Height 5: full accumulate
1834 "ldr q20, [x9, #0x0]\n"
1835 "ldr q21, [x9, #0x10]\n"
1836 "ldr q22, [x26, #0x0]\n"
1837 "ldr q23, [x26, #0x10]\n"
1838 "ldr q24, [x25, #0x0]\n"
1839 "ldr q25, [x25, #0x10]\n"
1840 "ldr q26, [x24, #0x0]\n"
1841 "ldr q27, [x24, #0x10]\n"
1842 "ldr q28, [x23, #0x0]\n"
1843 "ldr q29, [x23, #0x10]\n"
1844 "b 146f\n"
1845 "145:" // Height 5: no accumulate
1846 "movi v20.16b, #0x0\n"
1847 "movi v21.16b, #0x0\n"
1848 "movi v22.16b, #0x0\n"
1849 "movi v23.16b, #0x0\n"
1850 "movi v24.16b, #0x0\n"
1851 "movi v25.16b, #0x0\n"
1852 "movi v26.16b, #0x0\n"
1853 "movi v27.16b, #0x0\n"
1854 "movi v28.16b, #0x0\n"
1855 "movi v29.16b, #0x0\n"
1856 "146:" // Height 5: setup done
1857 "mov x28, #0x0\n"
1858 "147:" // Height 5: String loop
1859 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1860 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
1861 "ldr w27, [x20, x28, LSL #0x2]\n"
1862 "tbz %x[flags], #3, 148f\n"
1863 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
1864 "add x20, x20, x21, LSL #3\n"
1865 "ldr x26, [x20, #0x0]\n"
1866 "ldr x25, [x20, #0x8]\n"
1867 "ldr x24, [x20, #0x10]\n"
1868 "ldr x23, [x20, #0x18]\n"
1869 "ldr x22, [x20, #0x20]\n"
1870 "cbnz x28, 149f\n"
1871 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1872 "add x26, x26, x20, LSL #1\n"
1873 "add x25, x25, x20, LSL #1\n"
1874 "add x24, x24, x20, LSL #1\n"
1875 "add x23, x23, x20, LSL #1\n"
1876 "add x22, x22, x20, LSL #1\n"
1877 "b 149f\n"
1878 "148:" // Height 5: setup direct input
1879 "mov x26, %x[input_ptr]\n"
1880 "add x25, x26, x21, LSL #1\n"
1881 "add x24, x25, x21, LSL #1\n"
1882 "add x23, x24, x21, LSL #1\n"
1883 "add x22, x23, x21, LSL #1\n"
1884 "149:" // Height 5: input setup done
1885 "cmp x27, #0x8\n"
1886 "blt 152f\n"
1887 "ldr q0, [x26, #0x0]\n"
1888 "ldr q1, [x25, #0x0]\n"
1889 "cmp x27, #0x10\n"
1890 "ldr q2, [x24, #0x0]\n"
1891 "ldr q3, [x23, #0x0]\n"
1892 "ldr q4, [x22, #0x0]\n"
1893 "ldr q6, [x10, #0x0]\n"
1894 "ldr q7, [x10, #0x10]\n"
1895 "ldr q8, [x10, #0x20]\n"
1896 "ldr q9, [x10, #0x30]\n"
1897 "ldr q10, [x10, #0x40]\n"
1898 "ldr q11, [x10, #0x50]\n"
1899 "ldr q12, [x10, #0x60]\n"
1900 "ldr q13, [x10, #0x70]\n"
1901 "ldr q14, [x10, #0x80]\n"
1902 "ldr q15, [x10, #0x90]\n"
1903 "ldr q16, [x10, #0xa0]\n"
1904 "ldr q17, [x10, #0xb0]\n"
1905 "ldr q18, [x10, #0xc0]\n"
1906 "ldr q19, [x10, #0xd0]\n"
1907 "blt 151f\n"
1908 "150:" // Height 5: Multiply loop: Main loop head
1909 "fmla v20.8h, v6.8h, v0.h[0]\n"
1910 "fmla v22.8h, v6.8h, v1.h[0]\n"
1911 "sub x27, x27, #0x8\n"
1912 "add x26, x26, #0x10\n"
1913 "fmla v24.8h, v6.8h, v2.h[0]\n"
1914 "fmla v26.8h, v6.8h, v3.h[0]\n"
1915 "add x25, x25, #0x10\n"
1916 "add x24, x24, #0x10\n"
1917 "fmla v28.8h, v6.8h, v4.h[0]\n"
1918 "ldr q6, [x10, #0xe0]\n"
1919 "fmla v21.8h, v7.8h, v0.h[0]\n"
1920 "add x23, x23, #0x10\n"
1921 "fmla v23.8h, v7.8h, v1.h[0]\n"
1922 "fmla v25.8h, v7.8h, v2.h[0]\n"
1923 "add x22, x22, #0x10\n"
1924 "cmp x27, #0x10\n"
1925 "fmla v27.8h, v7.8h, v3.h[0]\n"
1926 "fmla v29.8h, v7.8h, v4.h[0]\n"
1927 "ldr q7, [x10, #0xf0]\n"
1928 "add x10, x10, #0x100\n"
1929 "fmla v20.8h, v8.8h, v0.h[1]\n"
1930 "fmla v22.8h, v8.8h, v1.h[1]\n"
1931 "prfm pldl1keep, [x26, #0x80]\n"
1932 "prfm pldl1keep, [x25, #0x80]\n"
1933 "fmla v24.8h, v8.8h, v2.h[1]\n"
1934 "fmla v26.8h, v8.8h, v3.h[1]\n"
1935 "prfm pldl1keep, [x24, #0x80]\n"
1936 "prfm pldl1keep, [x23, #0x80]\n"
1937 "fmla v28.8h, v8.8h, v4.h[1]\n"
1938 "ldr q8, [x10, #0x20]\n"
1939 "fmla v21.8h, v9.8h, v0.h[1]\n"
1940 "prfm pldl1keep, [x22, #0x80]\n"
1941 "fmla v23.8h, v9.8h, v1.h[1]\n"
1942 "fmla v25.8h, v9.8h, v2.h[1]\n"
1943 "fmla v27.8h, v9.8h, v3.h[1]\n"
1944 "fmla v29.8h, v9.8h, v4.h[1]\n"
1945 "ldr q9, [x10, #0x30]\n"
1946 "fmla v20.8h, v10.8h, v0.h[2]\n"
1947 "fmla v22.8h, v10.8h, v1.h[2]\n"
1948 "fmla v24.8h, v10.8h, v2.h[2]\n"
1949 "fmla v26.8h, v10.8h, v3.h[2]\n"
1950 "fmla v28.8h, v10.8h, v4.h[2]\n"
1951 "ldr q10, [x10, #0x40]\n"
1952 "fmla v21.8h, v11.8h, v0.h[2]\n"
1953 "fmla v23.8h, v11.8h, v1.h[2]\n"
1954 "fmla v25.8h, v11.8h, v2.h[2]\n"
1955 "fmla v27.8h, v11.8h, v3.h[2]\n"
1956 "fmla v29.8h, v11.8h, v4.h[2]\n"
1957 "ldr q11, [x10, #0x50]\n"
1958 "fmla v20.8h, v12.8h, v0.h[3]\n"
1959 "fmla v22.8h, v12.8h, v1.h[3]\n"
1960 "fmla v24.8h, v12.8h, v2.h[3]\n"
1961 "fmla v26.8h, v12.8h, v3.h[3]\n"
1962 "fmla v28.8h, v12.8h, v4.h[3]\n"
1963 "ldr q12, [x10, #0x60]\n"
1964 "fmla v21.8h, v13.8h, v0.h[3]\n"
1965 "fmla v23.8h, v13.8h, v1.h[3]\n"
1966 "fmla v25.8h, v13.8h, v2.h[3]\n"
1967 "fmla v27.8h, v13.8h, v3.h[3]\n"
1968 "fmla v29.8h, v13.8h, v4.h[3]\n"
1969 "ldr q13, [x10, #0x70]\n"
1970 "fmla v20.8h, v14.8h, v0.h[4]\n"
1971 "fmla v22.8h, v14.8h, v1.h[4]\n"
1972 "fmla v24.8h, v14.8h, v2.h[4]\n"
1973 "fmla v26.8h, v14.8h, v3.h[4]\n"
1974 "fmla v28.8h, v14.8h, v4.h[4]\n"
1975 "ldr q14, [x10, #0x80]\n"
1976 "fmla v21.8h, v15.8h, v0.h[4]\n"
1977 "fmla v23.8h, v15.8h, v1.h[4]\n"
1978 "fmla v25.8h, v15.8h, v2.h[4]\n"
1979 "fmla v27.8h, v15.8h, v3.h[4]\n"
1980 "fmla v29.8h, v15.8h, v4.h[4]\n"
1981 "ldr q15, [x10, #0x90]\n"
1982 "fmla v20.8h, v16.8h, v0.h[5]\n"
1983 "fmla v22.8h, v16.8h, v1.h[5]\n"
1984 "fmla v24.8h, v16.8h, v2.h[5]\n"
1985 "fmla v26.8h, v16.8h, v3.h[5]\n"
1986 "fmla v28.8h, v16.8h, v4.h[5]\n"
1987 "ldr q16, [x10, #0xa0]\n"
1988 "fmla v21.8h, v17.8h, v0.h[5]\n"
1989 "fmla v23.8h, v17.8h, v1.h[5]\n"
1990 "fmla v25.8h, v17.8h, v2.h[5]\n"
1991 "fmla v27.8h, v17.8h, v3.h[5]\n"
1992 "fmla v29.8h, v17.8h, v4.h[5]\n"
1993 "ldr q17, [x10, #0xb0]\n"
1994 "fmla v20.8h, v18.8h, v0.h[6]\n"
1995 "fmla v22.8h, v18.8h, v1.h[6]\n"
1996 "fmla v24.8h, v18.8h, v2.h[6]\n"
1997 "fmla v26.8h, v18.8h, v3.h[6]\n"
1998 "fmla v28.8h, v18.8h, v4.h[6]\n"
1999 "ldr q18, [x10, #0xc0]\n"
2000 "fmla v21.8h, v19.8h, v0.h[6]\n"
2001 "fmla v23.8h, v19.8h, v1.h[6]\n"
2002 "fmla v25.8h, v19.8h, v2.h[6]\n"
2003 "fmla v27.8h, v19.8h, v3.h[6]\n"
2004 "fmla v29.8h, v19.8h, v4.h[6]\n"
2005 "ldr q19, [x10, #0xd0]\n"
2006 "fmla v20.8h, v6.8h, v0.h[7]\n"
2007 "fmla v22.8h, v6.8h, v1.h[7]\n"
2008 "fmla v24.8h, v6.8h, v2.h[7]\n"
2009 "fmla v26.8h, v6.8h, v3.h[7]\n"
2010 "fmla v28.8h, v6.8h, v4.h[7]\n"
2011 "ldr q6, [x10, #0x0]\n"
2012 "fmla v21.8h, v7.8h, v0.h[7]\n"
2013 "ldr q0, [x26, #0x0]\n"
2014 "fmla v23.8h, v7.8h, v1.h[7]\n"
2015 "ldr q1, [x25, #0x0]\n"
2016 "fmla v25.8h, v7.8h, v2.h[7]\n"
2017 "ldr q2, [x24, #0x0]\n"
2018 "fmla v27.8h, v7.8h, v3.h[7]\n"
2019 "ldr q3, [x23, #0x0]\n"
2020 "fmla v29.8h, v7.8h, v4.h[7]\n"
2021 "ldr q4, [x22, #0x0]\n"
2022 "ldr q7, [x10, #0x10]\n"
2023 "bge 150b\n"
2024 "151:" // Height 5: Multiply loop: Single iteration only
2025 "fmla v20.8h, v6.8h, v0.h[0]\n"
2026 "fmla v22.8h, v6.8h, v1.h[0]\n"
2027 "add x26, x26, #0x10\n"
2028 "add x25, x25, #0x10\n"
2029 "fmla v24.8h, v6.8h, v2.h[0]\n"
2030 "fmla v26.8h, v6.8h, v3.h[0]\n"
2031 "add x24, x24, #0x10\n"
2032 "add x23, x23, #0x10\n"
2033 "fmla v28.8h, v6.8h, v4.h[0]\n"
2034 "ldr q6, [x10, #0xe0]\n"
2035 "fmla v21.8h, v7.8h, v0.h[0]\n"
2036 "add x22, x22, #0x10\n"
2037 "fmla v23.8h, v7.8h, v1.h[0]\n"
2038 "fmla v25.8h, v7.8h, v2.h[0]\n"
2039 "sub x27, x27, #0x8\n"
2040 "prfm pldl1keep, [x26, #0x80]\n"
2041 "fmla v27.8h, v7.8h, v3.h[0]\n"
2042 "fmla v29.8h, v7.8h, v4.h[0]\n"
2043 "ldr q7, [x10, #0xf0]\n"
2044 "prfm pldl1keep, [x25, #0x80]\n"
2045 "fmla v20.8h, v8.8h, v0.h[1]\n"
2046 "fmla v22.8h, v8.8h, v1.h[1]\n"
2047 "prfm pldl1keep, [x24, #0x80]\n"
2048 "prfm pldl1keep, [x23, #0x80]\n"
2049 "fmla v24.8h, v8.8h, v2.h[1]\n"
2050 "fmla v26.8h, v8.8h, v3.h[1]\n"
2051 "prfm pldl1keep, [x22, #0x80]\n"
2052 "add x10, x10, #0x100\n"
2053 "fmla v28.8h, v8.8h, v4.h[1]\n"
2054 "fmla v21.8h, v9.8h, v0.h[1]\n"
2055 "fmla v23.8h, v9.8h, v1.h[1]\n"
2056 "fmla v25.8h, v9.8h, v2.h[1]\n"
2057 "fmla v27.8h, v9.8h, v3.h[1]\n"
2058 "fmla v29.8h, v9.8h, v4.h[1]\n"
2059 "fmla v20.8h, v10.8h, v0.h[2]\n"
2060 "fmla v22.8h, v10.8h, v1.h[2]\n"
2061 "fmla v24.8h, v10.8h, v2.h[2]\n"
2062 "fmla v26.8h, v10.8h, v3.h[2]\n"
2063 "fmla v28.8h, v10.8h, v4.h[2]\n"
2064 "fmla v21.8h, v11.8h, v0.h[2]\n"
2065 "fmla v23.8h, v11.8h, v1.h[2]\n"
2066 "fmla v25.8h, v11.8h, v2.h[2]\n"
2067 "fmla v27.8h, v11.8h, v3.h[2]\n"
2068 "fmla v29.8h, v11.8h, v4.h[2]\n"
2069 "fmla v20.8h, v12.8h, v0.h[3]\n"
2070 "fmla v22.8h, v12.8h, v1.h[3]\n"
2071 "fmla v24.8h, v12.8h, v2.h[3]\n"
2072 "fmla v26.8h, v12.8h, v3.h[3]\n"
2073 "fmla v28.8h, v12.8h, v4.h[3]\n"
2074 "fmla v21.8h, v13.8h, v0.h[3]\n"
2075 "fmla v23.8h, v13.8h, v1.h[3]\n"
2076 "fmla v25.8h, v13.8h, v2.h[3]\n"
2077 "fmla v27.8h, v13.8h, v3.h[3]\n"
2078 "fmla v29.8h, v13.8h, v4.h[3]\n"
2079 "fmla v20.8h, v14.8h, v0.h[4]\n"
2080 "fmla v22.8h, v14.8h, v1.h[4]\n"
2081 "fmla v24.8h, v14.8h, v2.h[4]\n"
2082 "fmla v26.8h, v14.8h, v3.h[4]\n"
2083 "fmla v28.8h, v14.8h, v4.h[4]\n"
2084 "fmla v21.8h, v15.8h, v0.h[4]\n"
2085 "fmla v23.8h, v15.8h, v1.h[4]\n"
2086 "fmla v25.8h, v15.8h, v2.h[4]\n"
2087 "fmla v27.8h, v15.8h, v3.h[4]\n"
2088 "fmla v29.8h, v15.8h, v4.h[4]\n"
2089 "fmla v20.8h, v16.8h, v0.h[5]\n"
2090 "fmla v22.8h, v16.8h, v1.h[5]\n"
2091 "fmla v24.8h, v16.8h, v2.h[5]\n"
2092 "fmla v26.8h, v16.8h, v3.h[5]\n"
2093 "fmla v28.8h, v16.8h, v4.h[5]\n"
2094 "fmla v21.8h, v17.8h, v0.h[5]\n"
2095 "fmla v23.8h, v17.8h, v1.h[5]\n"
2096 "fmla v25.8h, v17.8h, v2.h[5]\n"
2097 "fmla v27.8h, v17.8h, v3.h[5]\n"
2098 "fmla v29.8h, v17.8h, v4.h[5]\n"
2099 "fmla v20.8h, v18.8h, v0.h[6]\n"
2100 "fmla v22.8h, v18.8h, v1.h[6]\n"
2101 "fmla v24.8h, v18.8h, v2.h[6]\n"
2102 "fmla v26.8h, v18.8h, v3.h[6]\n"
2103 "fmla v28.8h, v18.8h, v4.h[6]\n"
2104 "fmla v21.8h, v19.8h, v0.h[6]\n"
2105 "fmla v23.8h, v19.8h, v1.h[6]\n"
2106 "fmla v25.8h, v19.8h, v2.h[6]\n"
2107 "fmla v27.8h, v19.8h, v3.h[6]\n"
2108 "fmla v29.8h, v19.8h, v4.h[6]\n"
2109 "fmla v20.8h, v6.8h, v0.h[7]\n"
2110 "fmla v22.8h, v6.8h, v1.h[7]\n"
2111 "fmla v24.8h, v6.8h, v2.h[7]\n"
2112 "fmla v26.8h, v6.8h, v3.h[7]\n"
2113 "fmla v28.8h, v6.8h, v4.h[7]\n"
2114 "fmla v21.8h, v7.8h, v0.h[7]\n"
2115 "fmla v23.8h, v7.8h, v1.h[7]\n"
2116 "fmla v25.8h, v7.8h, v2.h[7]\n"
2117 "fmla v27.8h, v7.8h, v3.h[7]\n"
2118 "fmla v29.8h, v7.8h, v4.h[7]\n"
2119 "152:" // Height 5: Multiply loop: Main loop skip
2120 "cbz x27, 154f\n"
2121 "153:" // Height 5: Multiply loop: Odd block loop
2122 "ldr h0, [x26], #0x2\n"
2123 "ldr h1, [x25], #0x2\n"
2124 "sub x27, x27, #0x1\n"
2125 "ldr h2, [x24], #0x2\n"
2126 "ldr h3, [x23], #0x2\n"
2127 "ldr h4, [x22], #0x2\n"
2128 "ldr q8, [x10, #0x0]\n"
2129 "ldr q9, [x10, #0x10]\n"
2130 "add x10, x10, #0x20\n"
2131 "fmla v20.8h, v8.8h, v0.h[0]\n"
2132 "fmla v22.8h, v8.8h, v1.h[0]\n"
2133 "fmla v24.8h, v8.8h, v2.h[0]\n"
2134 "fmla v26.8h, v8.8h, v3.h[0]\n"
2135 "fmla v28.8h, v8.8h, v4.h[0]\n"
2136 "fmla v21.8h, v9.8h, v0.h[0]\n"
2137 "fmla v23.8h, v9.8h, v1.h[0]\n"
2138 "fmla v25.8h, v9.8h, v2.h[0]\n"
2139 "fmla v27.8h, v9.8h, v3.h[0]\n"
2140 "fmla v29.8h, v9.8h, v4.h[0]\n"
2141 "cbnz x27, 153b\n"
2142 "154:" // Height 5: Multiply loop: No odd multiplies
2143 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
2144 "add x28, x28, #0x1\n"
2145 "cmp x28, x20\n"
2146 "bne 147b\n"
2147 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2148 "prfm pstl1keep, [x9, #0x0]\n"
2149 "add x26, x9, x20, LSL #1\n"
2150 "prfm pstl1keep, [x26, #0x0]\n"
2151 "add x25, x26, x20, LSL #1\n"
2152 "prfm pstl1keep, [x25, #0x0]\n"
2153 "add x24, x25, x20, LSL #1\n"
2154 "prfm pstl1keep, [x24, #0x0]\n"
2155 "add x23, x24, x20, LSL #1\n"
2156 "prfm pstl1keep, [x23, #0x0]\n"
2157 "tbz %x[flags], #1, 155f\n"
2158 "add x21, %x[args_ptr], %[offset_max]\n"
2159 "add x20, %x[args_ptr], %[offset_min]\n"
2160 "ld1r { v17.8h }, [x21]\n"
2161 "ld1r { v16.8h }, [x20]\n"
2162 "fmin v20.8h, v20.8h, v17.8h\n"
2163 "fmin v21.8h, v21.8h, v17.8h\n"
2164 "fmin v22.8h, v22.8h, v17.8h\n"
2165 "fmin v23.8h, v23.8h, v17.8h\n"
2166 "fmin v24.8h, v24.8h, v17.8h\n"
2167 "fmin v25.8h, v25.8h, v17.8h\n"
2168 "fmin v26.8h, v26.8h, v17.8h\n"
2169 "fmin v27.8h, v27.8h, v17.8h\n"
2170 "fmin v28.8h, v28.8h, v17.8h\n"
2171 "fmin v29.8h, v29.8h, v17.8h\n"
2172 "fmax v20.8h, v20.8h, v16.8h\n"
2173 "fmax v21.8h, v21.8h, v16.8h\n"
2174 "fmax v22.8h, v22.8h, v16.8h\n"
2175 "fmax v23.8h, v23.8h, v16.8h\n"
2176 "fmax v24.8h, v24.8h, v16.8h\n"
2177 "fmax v25.8h, v25.8h, v16.8h\n"
2178 "fmax v26.8h, v26.8h, v16.8h\n"
2179 "fmax v27.8h, v27.8h, v16.8h\n"
2180 "fmax v28.8h, v28.8h, v16.8h\n"
2181 "fmax v29.8h, v29.8h, v16.8h\n"
2182 "155:" // Height 5: No activation
2183 "cmp x11, #0x10\n"
2184 "bge 164f\n"
2185 "tbz x11, #3, 159f\n"
2186 "st1 { v20.8h }, [x9], #0x10\n"
2187 "st1 { v22.8h }, [x26], #0x10\n"
2188 "st1 { v24.8h }, [x25], #0x10\n"
2189 "st1 { v26.8h }, [x24], #0x10\n"
2190 "st1 { v28.8h }, [x23], #0x10\n"
2191 "tbz x11, #2, 157f\n"
2192 "str d21, [x9], #0x8\n"
2193 "str d23, [x26], #0x8\n"
2194 "str d25, [x25], #0x8\n"
2195 "str d27, [x24], #0x8\n"
2196 "str d29, [x23], #0x8\n"
2197 "tbz x11, #1, 156f\n"
2198 "st1 { v21.s }[2], [x9], #0x4\n"
2199 "st1 { v23.s }[2], [x26], #0x4\n"
2200 "st1 { v25.s }[2], [x25], #0x4\n"
2201 "st1 { v27.s }[2], [x24], #0x4\n"
2202 "st1 { v29.s }[2], [x23], #0x4\n"
2203 "tbz x11, #0, 163f\n"
2204 "st1 { v21.h }[6], [x9]\n"
2205 "st1 { v23.h }[6], [x26]\n"
2206 "st1 { v25.h }[6], [x25]\n"
2207 "st1 { v27.h }[6], [x24]\n"
2208 "st1 { v29.h }[6], [x23]\n"
2209 "b 163f\n"
2210 "156:" // Height 5: Partial direct writeback: partial_1_12
2211 "tbz x11, #0, 163f\n"
2212 "st1 { v21.h }[4], [x9]\n"
2213 "st1 { v23.h }[4], [x26]\n"
2214 "st1 { v25.h }[4], [x25]\n"
2215 "st1 { v27.h }[4], [x24]\n"
2216 "st1 { v29.h }[4], [x23]\n"
2217 "b 163f\n"
2218 "157:" // Height 5: Partial direct writeback: partial_2_8
2219 "tbz x11, #1, 158f\n"
2220 "str s21, [x9], #0x4\n"
2221 "str s23, [x26], #0x4\n"
2222 "str s25, [x25], #0x4\n"
2223 "str s27, [x24], #0x4\n"
2224 "str s29, [x23], #0x4\n"
2225 "tbz x11, #0, 163f\n"
2226 "st1 { v21.h }[2], [x9]\n"
2227 "st1 { v23.h }[2], [x26]\n"
2228 "st1 { v25.h }[2], [x25]\n"
2229 "st1 { v27.h }[2], [x24]\n"
2230 "st1 { v29.h }[2], [x23]\n"
2231 "b 163f\n"
2232 "158:" // Height 5: Partial direct writeback: partial_1_8
2233 "tbz x11, #0, 163f\n"
2234 "str h21, [x9, #0x0]\n"
2235 "str h23, [x26, #0x0]\n"
2236 "str h25, [x25, #0x0]\n"
2237 "str h27, [x24, #0x0]\n"
2238 "str h29, [x23, #0x0]\n"
2239 "b 163f\n"
2240 "159:" // Height 5: Partial direct writeback: partial_4_0
2241 "tbz x11, #2, 161f\n"
2242 "str d20, [x9], #0x8\n"
2243 "str d22, [x26], #0x8\n"
2244 "str d24, [x25], #0x8\n"
2245 "str d26, [x24], #0x8\n"
2246 "str d28, [x23], #0x8\n"
2247 "tbz x11, #1, 160f\n"
2248 "st1 { v20.s }[2], [x9], #0x4\n"
2249 "st1 { v22.s }[2], [x26], #0x4\n"
2250 "st1 { v24.s }[2], [x25], #0x4\n"
2251 "st1 { v26.s }[2], [x24], #0x4\n"
2252 "st1 { v28.s }[2], [x23], #0x4\n"
2253 "tbz x11, #0, 163f\n"
2254 "st1 { v20.h }[6], [x9]\n"
2255 "st1 { v22.h }[6], [x26]\n"
2256 "st1 { v24.h }[6], [x25]\n"
2257 "st1 { v26.h }[6], [x24]\n"
2258 "st1 { v28.h }[6], [x23]\n"
2259 "b 163f\n"
2260 "160:" // Height 5: Partial direct writeback: partial_1_4
2261 "tbz x11, #0, 163f\n"
2262 "st1 { v20.h }[4], [x9]\n"
2263 "st1 { v22.h }[4], [x26]\n"
2264 "st1 { v24.h }[4], [x25]\n"
2265 "st1 { v26.h }[4], [x24]\n"
2266 "st1 { v28.h }[4], [x23]\n"
2267 "b 163f\n"
2268 "161:" // Height 5: Partial direct writeback: partial_2_0
2269 "tbz x11, #1, 162f\n"
2270 "str s20, [x9], #0x4\n"
2271 "str s22, [x26], #0x4\n"
2272 "str s24, [x25], #0x4\n"
2273 "str s26, [x24], #0x4\n"
2274 "str s28, [x23], #0x4\n"
2275 "tbz x11, #0, 163f\n"
2276 "st1 { v20.h }[2], [x9]\n"
2277 "st1 { v22.h }[2], [x26]\n"
2278 "st1 { v24.h }[2], [x25]\n"
2279 "st1 { v26.h }[2], [x24]\n"
2280 "st1 { v28.h }[2], [x23]\n"
2281 "b 163f\n"
2282 "162:" // Height 5: Partial direct writeback: partial_1_0
2283 "str h20, [x9, #0x0]\n"
2284 "str h22, [x26, #0x0]\n"
2285 "str h24, [x25, #0x0]\n"
2286 "str h26, [x24, #0x0]\n"
2287 "str h28, [x23, #0x0]\n"
2288 "163:" // Height 5: Partial direct writeback: Done
2289 "b 165f\n"
2290 "164:" // Height 5: Full writeback
2291 "str q20, [x9, #0x0]\n"
2292 "str q21, [x9, #0x10]\n"
2293 "add x9, x9, #0x20\n"
2294 "str q22, [x26, #0x0]\n"
2295 "str q23, [x26, #0x10]\n"
2296 "str q24, [x25, #0x0]\n"
2297 "str q25, [x25, #0x10]\n"
2298 "str q26, [x24, #0x0]\n"
2299 "str q27, [x24, #0x10]\n"
2300 "str q28, [x23, #0x0]\n"
2301 "str q29, [x23, #0x10]\n"
2302 "165:" // Height 5: Writeback done
2303 "subs x11, x11, #0x10\n"
2304 "bgt 134b\n"
2305 "b 200f\n"
2306 "166:" // Height 6
2307 "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
2308 "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n"
2309 "mov x20, #0xc\n"
2310 "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
2311 "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
2312 "madd x20, x21, x20, x9\n"
2313 "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
2314 "167:" // Height 6: Column loop
2315 "cbz x10, 168f\n"
2316 "ldr q20, [x10, #0x0]\n"
2317 "ldr q21, [x10, #0x10]\n"
2318 "add x10, x10, #0x20\n"
2319 "mov v22.16b, v20.16b\n"
2320 "mov v23.16b, v21.16b\n"
2321 "mov v24.16b, v20.16b\n"
2322 "mov v25.16b, v21.16b\n"
2323 "mov v26.16b, v20.16b\n"
2324 "mov v27.16b, v21.16b\n"
2325 "mov v28.16b, v20.16b\n"
2326 "mov v29.16b, v21.16b\n"
2327 "mov v30.16b, v20.16b\n"
2328 "mov v31.16b, v21.16b\n"
2329 "b 179f\n"
2330 "168:" // Height 6: no bias
2331 "tbz %x[flags], #0, 178f\n"
2332 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2333 "cmp x11, #0x10\n"
2334 "add x26, x9, x20, LSL #1\n"
2335 "add x25, x26, x20, LSL #1\n"
2336 "add x24, x25, x20, LSL #1\n"
2337 "add x23, x24, x20, LSL #1\n"
2338 "add x22, x23, x20, LSL #1\n"
2339 "bge 177f\n"
2340 "tbz x11, #3, 172f\n"
2341 "ld1 { v20.8h }, [x9], #0x10\n"
2342 "ld1 { v22.8h }, [x26], #0x10\n"
2343 "ld1 { v24.8h }, [x25], #0x10\n"
2344 "ld1 { v26.8h }, [x24], #0x10\n"
2345 "ld1 { v28.8h }, [x23], #0x10\n"
2346 "ld1 { v30.8h }, [x22], #0x10\n"
2347 "tbz x11, #2, 170f\n"
2348 "ldr d21, [x9], #0x8\n"
2349 "ldr d23, [x26], #0x8\n"
2350 "ldr d25, [x25], #0x8\n"
2351 "ldr d27, [x24], #0x8\n"
2352 "ldr d29, [x23], #0x8\n"
2353 "ldr d31, [x22], #0x8\n"
2354 "tbz x11, #1, 169f\n"
2355 "ld1 { v21.s }[2], [x9], #0x4\n"
2356 "ld1 { v23.s }[2], [x26], #0x4\n"
2357 "mov x20, #0x1c\n"
2358 "ld1 { v25.s }[2], [x25], #0x4\n"
2359 "ld1 { v27.s }[2], [x24], #0x4\n"
2360 "ld1 { v29.s }[2], [x23], #0x4\n"
2361 "ld1 { v31.s }[2], [x22], #0x4\n"
2362 "tbz x11, #0, 176f\n"
2363 "ld1 { v21.h }[6], [x9]\n"
2364 "ld1 { v23.h }[6], [x26]\n"
2365 "ld1 { v25.h }[6], [x25]\n"
2366 "ld1 { v27.h }[6], [x24]\n"
2367 "ld1 { v29.h }[6], [x23]\n"
2368 "ld1 { v31.h }[6], [x22]\n"
2369 "b 176f\n"
2370 "169:" // Height 6: Partial accumulate: partial_1_12
2371 "mov x20, #0x18\n"
2372 "tbz x11, #0, 176f\n"
2373 "ld1 { v21.h }[4], [x9]\n"
2374 "ld1 { v23.h }[4], [x26]\n"
2375 "ld1 { v25.h }[4], [x25]\n"
2376 "ld1 { v27.h }[4], [x24]\n"
2377 "ld1 { v29.h }[4], [x23]\n"
2378 "ld1 { v31.h }[4], [x22]\n"
2379 "b 176f\n"
2380 "170:" // Height 6: Partial accumulate: partial_2_8
2381 "tbz x11, #1, 171f\n"
2382 "ldr s21, [x9], #0x4\n"
2383 "ldr s23, [x26], #0x4\n"
2384 "mov x20, #0x14\n"
2385 "ldr s25, [x25], #0x4\n"
2386 "ldr s27, [x24], #0x4\n"
2387 "ldr s29, [x23], #0x4\n"
2388 "ldr s31, [x22], #0x4\n"
2389 "tbz x11, #0, 176f\n"
2390 "ld1 { v21.h }[2], [x9]\n"
2391 "ld1 { v23.h }[2], [x26]\n"
2392 "ld1 { v25.h }[2], [x25]\n"
2393 "ld1 { v27.h }[2], [x24]\n"
2394 "ld1 { v29.h }[2], [x23]\n"
2395 "ld1 { v31.h }[2], [x22]\n"
2396 "b 176f\n"
2397 "171:" // Height 6: Partial accumulate: partial_1_8
2398 "mov x20, #0x10\n"
2399 "tbz x11, #0, 176f\n"
2400 "ldr h21, [x9, #0x0]\n"
2401 "ldr h23, [x26, #0x0]\n"
2402 "ldr h25, [x25, #0x0]\n"
2403 "ldr h27, [x24, #0x0]\n"
2404 "ldr h29, [x23, #0x0]\n"
2405 "ldr h31, [x22, #0x0]\n"
2406 "b 176f\n"
2407 "172:" // Height 6: Partial accumulate: partial_4_0
2408 "tbz x11, #2, 174f\n"
2409 "ldr d20, [x9], #0x8\n"
2410 "ldr d22, [x26], #0x8\n"
2411 "ldr d24, [x25], #0x8\n"
2412 "ldr d26, [x24], #0x8\n"
2413 "ldr d28, [x23], #0x8\n"
2414 "ldr d30, [x22], #0x8\n"
2415 "tbz x11, #1, 173f\n"
2416 "ld1 { v20.s }[2], [x9], #0x4\n"
2417 "ld1 { v22.s }[2], [x26], #0x4\n"
2418 "mov x20, #0xc\n"
2419 "ld1 { v24.s }[2], [x25], #0x4\n"
2420 "ld1 { v26.s }[2], [x24], #0x4\n"
2421 "ld1 { v28.s }[2], [x23], #0x4\n"
2422 "ld1 { v30.s }[2], [x22], #0x4\n"
2423 "tbz x11, #0, 176f\n"
2424 "ld1 { v20.h }[6], [x9]\n"
2425 "ld1 { v22.h }[6], [x26]\n"
2426 "ld1 { v24.h }[6], [x25]\n"
2427 "ld1 { v26.h }[6], [x24]\n"
2428 "ld1 { v28.h }[6], [x23]\n"
2429 "ld1 { v30.h }[6], [x22]\n"
2430 "b 176f\n"
2431 "173:" // Height 6: Partial accumulate: partial_1_4
2432 "mov x20, #0x8\n"
2433 "tbz x11, #0, 176f\n"
2434 "ld1 { v20.h }[4], [x9]\n"
2435 "ld1 { v22.h }[4], [x26]\n"
2436 "ld1 { v24.h }[4], [x25]\n"
2437 "ld1 { v26.h }[4], [x24]\n"
2438 "ld1 { v28.h }[4], [x23]\n"
2439 "ld1 { v30.h }[4], [x22]\n"
2440 "b 176f\n"
2441 "174:" // Height 6: Partial accumulate: partial_2_0
2442 "tbz x11, #1, 175f\n"
2443 "ldr s20, [x9], #0x4\n"
2444 "ldr s22, [x26], #0x4\n"
2445 "mov x20, #0x4\n"
2446 "ldr s24, [x25], #0x4\n"
2447 "ldr s26, [x24], #0x4\n"
2448 "ldr s28, [x23], #0x4\n"
2449 "ldr s30, [x22], #0x4\n"
2450 "tbz x11, #0, 176f\n"
2451 "ld1 { v20.h }[2], [x9]\n"
2452 "ld1 { v22.h }[2], [x26]\n"
2453 "ld1 { v24.h }[2], [x25]\n"
2454 "ld1 { v26.h }[2], [x24]\n"
2455 "ld1 { v28.h }[2], [x23]\n"
2456 "ld1 { v30.h }[2], [x22]\n"
2457 "b 176f\n"
2458 "175:" // Height 6: Partial accumulate: partial_1_0
2459 "ldr h20, [x9, #0x0]\n"
2460 "ldr h22, [x26, #0x0]\n"
2461 "mov x20, #0x0\n"
2462 "ldr h24, [x25, #0x0]\n"
2463 "ldr h26, [x24, #0x0]\n"
2464 "ldr h28, [x23, #0x0]\n"
2465 "ldr h30, [x22, #0x0]\n"
2466 "176:" // Height 6: Partial accumulate: Done
2467 "sub x9, x9, x20\n"
2468 "b 179f\n"
2469 "177:" // Height 6: full accumulate
2470 "ldr q20, [x9, #0x0]\n"
2471 "ldr q21, [x9, #0x10]\n"
2472 "ldr q22, [x26, #0x0]\n"
2473 "ldr q23, [x26, #0x10]\n"
2474 "ldr q24, [x25, #0x0]\n"
2475 "ldr q25, [x25, #0x10]\n"
2476 "ldr q26, [x24, #0x0]\n"
2477 "ldr q27, [x24, #0x10]\n"
2478 "ldr q28, [x23, #0x0]\n"
2479 "ldr q29, [x23, #0x10]\n"
2480 "ldr q30, [x22, #0x0]\n"
2481 "ldr q31, [x22, #0x10]\n"
2482 "b 179f\n"
2483 "178:" // Height 6: no accumulate
2484 "movi v20.16b, #0x0\n"
2485 "movi v21.16b, #0x0\n"
2486 "movi v22.16b, #0x0\n"
2487 "movi v23.16b, #0x0\n"
2488 "movi v24.16b, #0x0\n"
2489 "movi v25.16b, #0x0\n"
2490 "movi v26.16b, #0x0\n"
2491 "movi v27.16b, #0x0\n"
2492 "movi v28.16b, #0x0\n"
2493 "movi v29.16b, #0x0\n"
2494 "movi v30.16b, #0x0\n"
2495 "movi v31.16b, #0x0\n"
2496 "179:" // Height 6: setup done
2497 "mov x28, #0x0\n"
2498 "180:" // Height 6: String loop
2499 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
2500 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
2501 "ldr w27, [x20, x28, LSL #0x2]\n"
2502 "tbz %x[flags], #3, 181f\n"
2503 "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
2504 "add x20, x20, x21, LSL #3\n"
2505 "ldr x26, [x20, #0x0]\n"
2506 "ldr x25, [x20, #0x8]\n"
2507 "ldr x24, [x20, #0x10]\n"
2508 "ldr x23, [x20, #0x18]\n"
2509 "ldr x22, [x20, #0x20]\n"
2510 "ldr x21, [x20, #0x28]\n"
2511 "cbnz x28, 182f\n"
2512 "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
2513 "add x26, x26, x20, LSL #1\n"
2514 "add x25, x25, x20, LSL #1\n"
2515 "add x24, x24, x20, LSL #1\n"
2516 "add x23, x23, x20, LSL #1\n"
2517 "add x22, x22, x20, LSL #1\n"
2518 "add x21, x21, x20, LSL #1\n"
2519 "b 182f\n"
2520 "181:" // Height 6: setup direct input
2521 "mov x26, %x[input_ptr]\n"
2522 "add x25, x26, x21, LSL #1\n"
2523 "add x24, x25, x21, LSL #1\n"
2524 "add x23, x24, x21, LSL #1\n"
2525 "add x22, x23, x21, LSL #1\n"
2526 "add x21, x22, x21, LSL #1\n"
2527 "182:" // Height 6: input setup done
2528 "cmp x27, #0x8\n"
2529 "blt 185f\n"
2530 "ldr q0, [x26, #0x0]\n"
2531 "ldr q1, [x25, #0x0]\n"
2532 "cmp x27, #0x10\n"
2533 "ldr q2, [x24, #0x0]\n"
2534 "ldr q3, [x23, #0x0]\n"
2535 "ldr q4, [x22, #0x0]\n"
2536 "ldr q5, [x21, #0x0]\n"
2537 "ldr q6, [x10, #0x0]\n"
2538 "ldr q7, [x10, #0x10]\n"
2539 "ldr q8, [x10, #0x20]\n"
2540 "ldr q9, [x10, #0x30]\n"
2541 "ldr q10, [x10, #0x40]\n"
2542 "ldr q11, [x10, #0x50]\n"
2543 "ldr q12, [x10, #0x60]\n"
2544 "ldr q13, [x10, #0x70]\n"
2545 "ldr q14, [x10, #0x80]\n"
2546 "ldr q15, [x10, #0x90]\n"
2547 "ldr q16, [x10, #0xa0]\n"
2548 "ldr q17, [x10, #0xb0]\n"
2549 "ldr q18, [x10, #0xc0]\n"
2550 "ldr q19, [x10, #0xd0]\n"
2551 "blt 184f\n"
2552 "183:" // Height 6: Multiply loop: Main loop head
2553 "fmla v20.8h, v6.8h, v0.h[0]\n"
2554 "fmla v22.8h, v6.8h, v1.h[0]\n"
2555 "sub x27, x27, #0x8\n"
2556 "add x26, x26, #0x10\n"
2557 "fmla v24.8h, v6.8h, v2.h[0]\n"
2558 "fmla v26.8h, v6.8h, v3.h[0]\n"
2559 "add x25, x25, #0x10\n"
2560 "add x24, x24, #0x10\n"
2561 "fmla v28.8h, v6.8h, v4.h[0]\n"
2562 "fmla v30.8h, v6.8h, v5.h[0]\n"
2563 "ldr q6, [x10, #0xe0]\n"
2564 "add x23, x23, #0x10\n"
2565 "fmla v21.8h, v7.8h, v0.h[0]\n"
2566 "fmla v23.8h, v7.8h, v1.h[0]\n"
2567 "add x22, x22, #0x10\n"
2568 "add x21, x21, #0x10\n"
2569 "fmla v25.8h, v7.8h, v2.h[0]\n"
2570 "fmla v27.8h, v7.8h, v3.h[0]\n"
2571 "cmp x27, #0x10\n"
2572 "prfm pldl1keep, [x26, #0x80]\n"
2573 "fmla v29.8h, v7.8h, v4.h[0]\n"
2574 "fmla v31.8h, v7.8h, v5.h[0]\n"
2575 "ldr q7, [x10, #0xf0]\n"
2576 "add x10, x10, #0x100\n"
2577 "fmla v20.8h, v8.8h, v0.h[1]\n"
2578 "fmla v22.8h, v8.8h, v1.h[1]\n"
2579 "prfm pldl1keep, [x25, #0x80]\n"
2580 "prfm pldl1keep, [x24, #0x80]\n"
2581 "fmla v24.8h, v8.8h, v2.h[1]\n"
2582 "fmla v26.8h, v8.8h, v3.h[1]\n"
2583 "prfm pldl1keep, [x23, #0x80]\n"
2584 "prfm pldl1keep, [x22, #0x80]\n"
2585 "fmla v28.8h, v8.8h, v4.h[1]\n"
2586 "fmla v30.8h, v8.8h, v5.h[1]\n"
2587 "ldr q8, [x10, #0x20]\n"
2588 "prfm pldl1keep, [x21, #0x80]\n"
2589 "fmla v21.8h, v9.8h, v0.h[1]\n"
2590 "fmla v23.8h, v9.8h, v1.h[1]\n"
2591 "fmla v25.8h, v9.8h, v2.h[1]\n"
2592 "fmla v27.8h, v9.8h, v3.h[1]\n"
2593 "fmla v29.8h, v9.8h, v4.h[1]\n"
2594 "fmla v31.8h, v9.8h, v5.h[1]\n"
2595 "ldr q9, [x10, #0x30]\n"
2596 "fmla v20.8h, v10.8h, v0.h[2]\n"
2597 "fmla v22.8h, v10.8h, v1.h[2]\n"
2598 "fmla v24.8h, v10.8h, v2.h[2]\n"
2599 "fmla v26.8h, v10.8h, v3.h[2]\n"
2600 "fmla v28.8h, v10.8h, v4.h[2]\n"
2601 "fmla v30.8h, v10.8h, v5.h[2]\n"
2602 "ldr q10, [x10, #0x40]\n"
2603 "fmla v21.8h, v11.8h, v0.h[2]\n"
2604 "fmla v23.8h, v11.8h, v1.h[2]\n"
2605 "fmla v25.8h, v11.8h, v2.h[2]\n"
2606 "fmla v27.8h, v11.8h, v3.h[2]\n"
2607 "fmla v29.8h, v11.8h, v4.h[2]\n"
2608 "fmla v31.8h, v11.8h, v5.h[2]\n"
2609 "ldr q11, [x10, #0x50]\n"
2610 "fmla v20.8h, v12.8h, v0.h[3]\n"
2611 "fmla v22.8h, v12.8h, v1.h[3]\n"
2612 "fmla v24.8h, v12.8h, v2.h[3]\n"
2613 "fmla v26.8h, v12.8h, v3.h[3]\n"
2614 "fmla v28.8h, v12.8h, v4.h[3]\n"
2615 "fmla v30.8h, v12.8h, v5.h[3]\n"
2616 "ldr q12, [x10, #0x60]\n"
2617 "fmla v21.8h, v13.8h, v0.h[3]\n"
2618 "fmla v23.8h, v13.8h, v1.h[3]\n"
2619 "fmla v25.8h, v13.8h, v2.h[3]\n"
2620 "fmla v27.8h, v13.8h, v3.h[3]\n"
2621 "fmla v29.8h, v13.8h, v4.h[3]\n"
2622 "fmla v31.8h, v13.8h, v5.h[3]\n"
2623 "ldr q13, [x10, #0x70]\n"
2624 "fmla v20.8h, v14.8h, v0.h[4]\n"
2625 "fmla v22.8h, v14.8h, v1.h[4]\n"
2626 "fmla v24.8h, v14.8h, v2.h[4]\n"
2627 "fmla v26.8h, v14.8h, v3.h[4]\n"
2628 "fmla v28.8h, v14.8h, v4.h[4]\n"
2629 "fmla v30.8h, v14.8h, v5.h[4]\n"
2630 "ldr q14, [x10, #0x80]\n"
2631 "fmla v21.8h, v15.8h, v0.h[4]\n"
2632 "fmla v23.8h, v15.8h, v1.h[4]\n"
2633 "fmla v25.8h, v15.8h, v2.h[4]\n"
2634 "fmla v27.8h, v15.8h, v3.h[4]\n"
2635 "fmla v29.8h, v15.8h, v4.h[4]\n"
2636 "fmla v31.8h, v15.8h, v5.h[4]\n"
2637 "ldr q15, [x10, #0x90]\n"
2638 "fmla v20.8h, v16.8h, v0.h[5]\n"
2639 "fmla v22.8h, v16.8h, v1.h[5]\n"
2640 "fmla v24.8h, v16.8h, v2.h[5]\n"
2641 "fmla v26.8h, v16.8h, v3.h[5]\n"
2642 "fmla v28.8h, v16.8h, v4.h[5]\n"
2643 "fmla v30.8h, v16.8h, v5.h[5]\n"
2644 "ldr q16, [x10, #0xa0]\n"
2645 "fmla v21.8h, v17.8h, v0.h[5]\n"
2646 "fmla v23.8h, v17.8h, v1.h[5]\n"
2647 "fmla v25.8h, v17.8h, v2.h[5]\n"
2648 "fmla v27.8h, v17.8h, v3.h[5]\n"
2649 "fmla v29.8h, v17.8h, v4.h[5]\n"
2650 "fmla v31.8h, v17.8h, v5.h[5]\n"
2651 "ldr q17, [x10, #0xb0]\n"
2652 "fmla v20.8h, v18.8h, v0.h[6]\n"
2653 "fmla v22.8h, v18.8h, v1.h[6]\n"
2654 "fmla v24.8h, v18.8h, v2.h[6]\n"
2655 "fmla v26.8h, v18.8h, v3.h[6]\n"
2656 "fmla v28.8h, v18.8h, v4.h[6]\n"
2657 "fmla v30.8h, v18.8h, v5.h[6]\n"
2658 "ldr q18, [x10, #0xc0]\n"
2659 "fmla v21.8h, v19.8h, v0.h[6]\n"
2660 "fmla v23.8h, v19.8h, v1.h[6]\n"
2661 "fmla v25.8h, v19.8h, v2.h[6]\n"
2662 "fmla v27.8h, v19.8h, v3.h[6]\n"
2663 "fmla v29.8h, v19.8h, v4.h[6]\n"
2664 "fmla v31.8h, v19.8h, v5.h[6]\n"
2665 "ldr q19, [x10, #0xd0]\n"
2666 "fmla v20.8h, v6.8h, v0.h[7]\n"
2667 "fmla v22.8h, v6.8h, v1.h[7]\n"
2668 "fmla v24.8h, v6.8h, v2.h[7]\n"
2669 "fmla v26.8h, v6.8h, v3.h[7]\n"
2670 "fmla v28.8h, v6.8h, v4.h[7]\n"
2671 "fmla v30.8h, v6.8h, v5.h[7]\n"
2672 "ldr q6, [x10, #0x0]\n"
2673 "fmla v21.8h, v7.8h, v0.h[7]\n"
2674 "ldr q0, [x26, #0x0]\n"
2675 "fmla v23.8h, v7.8h, v1.h[7]\n"
2676 "ldr q1, [x25, #0x0]\n"
2677 "fmla v25.8h, v7.8h, v2.h[7]\n"
2678 "ldr q2, [x24, #0x0]\n"
2679 "fmla v27.8h, v7.8h, v3.h[7]\n"
2680 "ldr q3, [x23, #0x0]\n"
2681 "fmla v29.8h, v7.8h, v4.h[7]\n"
2682 "ldr q4, [x22, #0x0]\n"
2683 "fmla v31.8h, v7.8h, v5.h[7]\n"
2684 "ldr q5, [x21, #0x0]\n"
2685 "ldr q7, [x10, #0x10]\n"
2686 "bge 183b\n"
2687 "184:" // Height 6: Multiply loop: Single iteration only
2688 "fmla v20.8h, v6.8h, v0.h[0]\n"
2689 "fmla v22.8h, v6.8h, v1.h[0]\n"
2690 "add x26, x26, #0x10\n"
2691 "add x25, x25, #0x10\n"
2692 "fmla v24.8h, v6.8h, v2.h[0]\n"
2693 "fmla v26.8h, v6.8h, v3.h[0]\n"
2694 "add x24, x24, #0x10\n"
2695 "add x23, x23, #0x10\n"
2696 "fmla v28.8h, v6.8h, v4.h[0]\n"
2697 "fmla v30.8h, v6.8h, v5.h[0]\n"
2698 "ldr q6, [x10, #0xe0]\n"
2699 "add x22, x22, #0x10\n"
2700 "fmla v21.8h, v7.8h, v0.h[0]\n"
2701 "fmla v23.8h, v7.8h, v1.h[0]\n"
2702 "add x21, x21, #0x10\n"
2703 "sub x27, x27, #0x8\n"
2704 "fmla v25.8h, v7.8h, v2.h[0]\n"
2705 "fmla v27.8h, v7.8h, v3.h[0]\n"
2706 "prfm pldl1keep, [x26, #0x80]\n"
2707 "prfm pldl1keep, [x25, #0x80]\n"
2708 "fmla v29.8h, v7.8h, v4.h[0]\n"
2709 "fmla v31.8h, v7.8h, v5.h[0]\n"
2710 "ldr q7, [x10, #0xf0]\n"
2711 "prfm pldl1keep, [x24, #0x80]\n"
2712 "fmla v20.8h, v8.8h, v0.h[1]\n"
2713 "fmla v22.8h, v8.8h, v1.h[1]\n"
2714 "prfm pldl1keep, [x23, #0x80]\n"
2715 "prfm pldl1keep, [x22, #0x80]\n"
2716 "fmla v24.8h, v8.8h, v2.h[1]\n"
2717 "fmla v26.8h, v8.8h, v3.h[1]\n"
2718 "prfm pldl1keep, [x21, #0x80]\n"
2719 "add x10, x10, #0x100\n"
2720 "fmla v28.8h, v8.8h, v4.h[1]\n"
2721 "fmla v30.8h, v8.8h, v5.h[1]\n"
2722 "fmla v21.8h, v9.8h, v0.h[1]\n"
2723 "fmla v23.8h, v9.8h, v1.h[1]\n"
2724 "fmla v25.8h, v9.8h, v2.h[1]\n"
2725 "fmla v27.8h, v9.8h, v3.h[1]\n"
2726 "fmla v29.8h, v9.8h, v4.h[1]\n"
2727 "fmla v31.8h, v9.8h, v5.h[1]\n"
2728 "fmla v20.8h, v10.8h, v0.h[2]\n"
2729 "fmla v22.8h, v10.8h, v1.h[2]\n"
2730 "fmla v24.8h, v10.8h, v2.h[2]\n"
2731 "fmla v26.8h, v10.8h, v3.h[2]\n"
2732 "fmla v28.8h, v10.8h, v4.h[2]\n"
2733 "fmla v30.8h, v10.8h, v5.h[2]\n"
2734 "fmla v21.8h, v11.8h, v0.h[2]\n"
2735 "fmla v23.8h, v11.8h, v1.h[2]\n"
2736 "fmla v25.8h, v11.8h, v2.h[2]\n"
2737 "fmla v27.8h, v11.8h, v3.h[2]\n"
2738 "fmla v29.8h, v11.8h, v4.h[2]\n"
2739 "fmla v31.8h, v11.8h, v5.h[2]\n"
2740 "fmla v20.8h, v12.8h, v0.h[3]\n"
2741 "fmla v22.8h, v12.8h, v1.h[3]\n"
2742 "fmla v24.8h, v12.8h, v2.h[3]\n"
2743 "fmla v26.8h, v12.8h, v3.h[3]\n"
2744 "fmla v28.8h, v12.8h, v4.h[3]\n"
2745 "fmla v30.8h, v12.8h, v5.h[3]\n"
2746 "fmla v21.8h, v13.8h, v0.h[3]\n"
2747 "fmla v23.8h, v13.8h, v1.h[3]\n"
2748 "fmla v25.8h, v13.8h, v2.h[3]\n"
2749 "fmla v27.8h, v13.8h, v3.h[3]\n"
2750 "fmla v29.8h, v13.8h, v4.h[3]\n"
2751 "fmla v31.8h, v13.8h, v5.h[3]\n"
2752 "fmla v20.8h, v14.8h, v0.h[4]\n"
2753 "fmla v22.8h, v14.8h, v1.h[4]\n"
2754 "fmla v24.8h, v14.8h, v2.h[4]\n"
2755 "fmla v26.8h, v14.8h, v3.h[4]\n"
2756 "fmla v28.8h, v14.8h, v4.h[4]\n"
2757 "fmla v30.8h, v14.8h, v5.h[4]\n"
2758 "fmla v21.8h, v15.8h, v0.h[4]\n"
2759 "fmla v23.8h, v15.8h, v1.h[4]\n"
2760 "fmla v25.8h, v15.8h, v2.h[4]\n"
2761 "fmla v27.8h, v15.8h, v3.h[4]\n"
2762 "fmla v29.8h, v15.8h, v4.h[4]\n"
2763 "fmla v31.8h, v15.8h, v5.h[4]\n"
2764 "fmla v20.8h, v16.8h, v0.h[5]\n"
2765 "fmla v22.8h, v16.8h, v1.h[5]\n"
2766 "fmla v24.8h, v16.8h, v2.h[5]\n"
2767 "fmla v26.8h, v16.8h, v3.h[5]\n"
2768 "fmla v28.8h, v16.8h, v4.h[5]\n"
2769 "fmla v30.8h, v16.8h, v5.h[5]\n"
2770 "fmla v21.8h, v17.8h, v0.h[5]\n"
2771 "fmla v23.8h, v17.8h, v1.h[5]\n"
2772 "fmla v25.8h, v17.8h, v2.h[5]\n"
2773 "fmla v27.8h, v17.8h, v3.h[5]\n"
2774 "fmla v29.8h, v17.8h, v4.h[5]\n"
2775 "fmla v31.8h, v17.8h, v5.h[5]\n"
2776 "fmla v20.8h, v18.8h, v0.h[6]\n"
2777 "fmla v22.8h, v18.8h, v1.h[6]\n"
2778 "fmla v24.8h, v18.8h, v2.h[6]\n"
2779 "fmla v26.8h, v18.8h, v3.h[6]\n"
2780 "fmla v28.8h, v18.8h, v4.h[6]\n"
2781 "fmla v30.8h, v18.8h, v5.h[6]\n"
2782 "fmla v21.8h, v19.8h, v0.h[6]\n"
2783 "fmla v23.8h, v19.8h, v1.h[6]\n"
2784 "fmla v25.8h, v19.8h, v2.h[6]\n"
2785 "fmla v27.8h, v19.8h, v3.h[6]\n"
2786 "fmla v29.8h, v19.8h, v4.h[6]\n"
2787 "fmla v31.8h, v19.8h, v5.h[6]\n"
2788 "fmla v20.8h, v6.8h, v0.h[7]\n"
2789 "fmla v22.8h, v6.8h, v1.h[7]\n"
2790 "fmla v24.8h, v6.8h, v2.h[7]\n"
2791 "fmla v26.8h, v6.8h, v3.h[7]\n"
2792 "fmla v28.8h, v6.8h, v4.h[7]\n"
2793 "fmla v30.8h, v6.8h, v5.h[7]\n"
2794 "fmla v21.8h, v7.8h, v0.h[7]\n"
2795 "fmla v23.8h, v7.8h, v1.h[7]\n"
2796 "fmla v25.8h, v7.8h, v2.h[7]\n"
2797 "fmla v27.8h, v7.8h, v3.h[7]\n"
2798 "fmla v29.8h, v7.8h, v4.h[7]\n"
2799 "fmla v31.8h, v7.8h, v5.h[7]\n"
2800 "185:" // Height 6: Multiply loop: Main loop skip
2801 "cbz x27, 187f\n"
2802 "186:" // Height 6: Multiply loop: Odd block loop
2803 "ldr h0, [x26], #0x2\n"
2804 "ldr h1, [x25], #0x2\n"
2805 "sub x27, x27, #0x1\n"
2806 "ldr h2, [x24], #0x2\n"
2807 "ldr h3, [x23], #0x2\n"
2808 "ldr h4, [x22], #0x2\n"
2809 "ldr h5, [x21], #0x2\n"
2810 "ldr q8, [x10, #0x0]\n"
2811 "ldr q9, [x10, #0x10]\n"
2812 "add x10, x10, #0x20\n"
2813 "fmla v20.8h, v8.8h, v0.h[0]\n"
2814 "fmla v22.8h, v8.8h, v1.h[0]\n"
2815 "fmla v24.8h, v8.8h, v2.h[0]\n"
2816 "fmla v26.8h, v8.8h, v3.h[0]\n"
2817 "fmla v28.8h, v8.8h, v4.h[0]\n"
2818 "fmla v30.8h, v8.8h, v5.h[0]\n"
2819 "fmla v21.8h, v9.8h, v0.h[0]\n"
2820 "fmla v23.8h, v9.8h, v1.h[0]\n"
2821 "fmla v25.8h, v9.8h, v2.h[0]\n"
2822 "fmla v27.8h, v9.8h, v3.h[0]\n"
2823 "fmla v29.8h, v9.8h, v4.h[0]\n"
2824 "fmla v31.8h, v9.8h, v5.h[0]\n"
2825 "cbnz x27, 186b\n"
2826 "187:" // Height 6: Multiply loop: No odd multiplies
2827 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
2828 "add x28, x28, #0x1\n"
2829 "cmp x28, x20\n"
2830 "bne 180b\n"
2831 "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
2832 "prfm pstl1keep, [x9, #0x0]\n"
2833 "add x26, x9, x20, LSL #1\n"
2834 "prfm pstl1keep, [x26, #0x0]\n"
2835 "add x25, x26, x20, LSL #1\n"
2836 "prfm pstl1keep, [x25, #0x0]\n"
2837 "add x24, x25, x20, LSL #1\n"
2838 "prfm pstl1keep, [x24, #0x0]\n"
2839 "add x23, x24, x20, LSL #1\n"
2840 "add x22, x23, x20, LSL #1\n"
2841 "prfm pstl1keep, [x23, #0x0]\n"
2842 "prfm pstl1keep, [x22, #0x0]\n"
2843 "tbz %x[flags], #1, 188f\n"
2844 "add x21, %x[args_ptr], %[offset_max]\n"
2845 "add x20, %x[args_ptr], %[offset_min]\n"
2846 "ld1r { v17.8h }, [x21]\n"
2847 "ld1r { v16.8h }, [x20]\n"
2848 "fmin v20.8h, v20.8h, v17.8h\n"
2849 "fmin v21.8h, v21.8h, v17.8h\n"
2850 "fmin v22.8h, v22.8h, v17.8h\n"
2851 "fmin v23.8h, v23.8h, v17.8h\n"
2852 "fmin v24.8h, v24.8h, v17.8h\n"
2853 "fmin v25.8h, v25.8h, v17.8h\n"
2854 "fmin v26.8h, v26.8h, v17.8h\n"
2855 "fmin v27.8h, v27.8h, v17.8h\n"
2856 "fmin v28.8h, v28.8h, v17.8h\n"
2857 "fmin v29.8h, v29.8h, v17.8h\n"
2858 "fmin v30.8h, v30.8h, v17.8h\n"
2859 "fmin v31.8h, v31.8h, v17.8h\n"
2860 "fmax v20.8h, v20.8h, v16.8h\n"
2861 "fmax v21.8h, v21.8h, v16.8h\n"
2862 "fmax v22.8h, v22.8h, v16.8h\n"
2863 "fmax v23.8h, v23.8h, v16.8h\n"
2864 "fmax v24.8h, v24.8h, v16.8h\n"
2865 "fmax v25.8h, v25.8h, v16.8h\n"
2866 "fmax v26.8h, v26.8h, v16.8h\n"
2867 "fmax v27.8h, v27.8h, v16.8h\n"
2868 "fmax v28.8h, v28.8h, v16.8h\n"
2869 "fmax v29.8h, v29.8h, v16.8h\n"
2870 "fmax v30.8h, v30.8h, v16.8h\n"
2871 "fmax v31.8h, v31.8h, v16.8h\n"
2872 "188:" // Height 6: No activation
2873 "cmp x11, #0x10\n"
2874 "bge 197f\n"
2875 "tbz x11, #3, 192f\n"
2876 "st1 { v20.8h }, [x9], #0x10\n"
2877 "st1 { v22.8h }, [x26], #0x10\n"
2878 "st1 { v24.8h }, [x25], #0x10\n"
2879 "st1 { v26.8h }, [x24], #0x10\n"
2880 "st1 { v28.8h }, [x23], #0x10\n"
2881 "st1 { v30.8h }, [x22], #0x10\n"
2882 "tbz x11, #2, 190f\n"
2883 "str d21, [x9], #0x8\n"
2884 "str d23, [x26], #0x8\n"
2885 "str d25, [x25], #0x8\n"
2886 "str d27, [x24], #0x8\n"
2887 "str d29, [x23], #0x8\n"
2888 "str d31, [x22], #0x8\n"
2889 "tbz x11, #1, 189f\n"
2890 "st1 { v21.s }[2], [x9], #0x4\n"
2891 "st1 { v23.s }[2], [x26], #0x4\n"
2892 "st1 { v25.s }[2], [x25], #0x4\n"
2893 "st1 { v27.s }[2], [x24], #0x4\n"
2894 "st1 { v29.s }[2], [x23], #0x4\n"
2895 "st1 { v31.s }[2], [x22], #0x4\n"
2896 "tbz x11, #0, 196f\n"
2897 "st1 { v21.h }[6], [x9]\n"
2898 "st1 { v23.h }[6], [x26]\n"
2899 "st1 { v25.h }[6], [x25]\n"
2900 "st1 { v27.h }[6], [x24]\n"
2901 "st1 { v29.h }[6], [x23]\n"
2902 "st1 { v31.h }[6], [x22]\n"
2903 "b 196f\n"
2904 "189:" // Height 6: Partial direct writeback: partial_1_12
2905 "tbz x11, #0, 196f\n"
2906 "st1 { v21.h }[4], [x9]\n"
2907 "st1 { v23.h }[4], [x26]\n"
2908 "st1 { v25.h }[4], [x25]\n"
2909 "st1 { v27.h }[4], [x24]\n"
2910 "st1 { v29.h }[4], [x23]\n"
2911 "st1 { v31.h }[4], [x22]\n"
2912 "b 196f\n"
2913 "190:" // Height 6: Partial direct writeback: partial_2_8
2914 "tbz x11, #1, 191f\n"
2915 "str s21, [x9], #0x4\n"
2916 "str s23, [x26], #0x4\n"
2917 "str s25, [x25], #0x4\n"
2918 "str s27, [x24], #0x4\n"
2919 "str s29, [x23], #0x4\n"
2920 "str s31, [x22], #0x4\n"
2921 "tbz x11, #0, 196f\n"
2922 "st1 { v21.h }[2], [x9]\n"
2923 "st1 { v23.h }[2], [x26]\n"
2924 "st1 { v25.h }[2], [x25]\n"
2925 "st1 { v27.h }[2], [x24]\n"
2926 "st1 { v29.h }[2], [x23]\n"
2927 "st1 { v31.h }[2], [x22]\n"
2928 "b 196f\n"
2929 "191:" // Height 6: Partial direct writeback: partial_1_8
2930 "tbz x11, #0, 196f\n"
2931 "str h21, [x9, #0x0]\n"
2932 "str h23, [x26, #0x0]\n"
2933 "str h25, [x25, #0x0]\n"
2934 "str h27, [x24, #0x0]\n"
2935 "str h29, [x23, #0x0]\n"
2936 "str h31, [x22, #0x0]\n"
2937 "b 196f\n"
2938 "192:" // Height 6: Partial direct writeback: partial_4_0
2939 "tbz x11, #2, 194f\n"
2940 "str d20, [x9], #0x8\n"
2941 "str d22, [x26], #0x8\n"
2942 "str d24, [x25], #0x8\n"
2943 "str d26, [x24], #0x8\n"
2944 "str d28, [x23], #0x8\n"
2945 "str d30, [x22], #0x8\n"
2946 "tbz x11, #1, 193f\n"
2947 "st1 { v20.s }[2], [x9], #0x4\n"
2948 "st1 { v22.s }[2], [x26], #0x4\n"
2949 "st1 { v24.s }[2], [x25], #0x4\n"
2950 "st1 { v26.s }[2], [x24], #0x4\n"
2951 "st1 { v28.s }[2], [x23], #0x4\n"
2952 "st1 { v30.s }[2], [x22], #0x4\n"
2953 "tbz x11, #0, 196f\n"
2954 "st1 { v20.h }[6], [x9]\n"
2955 "st1 { v22.h }[6], [x26]\n"
2956 "st1 { v24.h }[6], [x25]\n"
2957 "st1 { v26.h }[6], [x24]\n"
2958 "st1 { v28.h }[6], [x23]\n"
2959 "st1 { v30.h }[6], [x22]\n"
2960 "b 196f\n"
2961 "193:" // Height 6: Partial direct writeback: partial_1_4
2962 "tbz x11, #0, 196f\n"
2963 "st1 { v20.h }[4], [x9]\n"
2964 "st1 { v22.h }[4], [x26]\n"
2965 "st1 { v24.h }[4], [x25]\n"
2966 "st1 { v26.h }[4], [x24]\n"
2967 "st1 { v28.h }[4], [x23]\n"
2968 "st1 { v30.h }[4], [x22]\n"
2969 "b 196f\n"
2970 "194:" // Height 6: Partial direct writeback: partial_2_0
2971 "tbz x11, #1, 195f\n"
2972 "str s20, [x9], #0x4\n"
2973 "str s22, [x26], #0x4\n"
2974 "str s24, [x25], #0x4\n"
2975 "str s26, [x24], #0x4\n"
2976 "str s28, [x23], #0x4\n"
2977 "str s30, [x22], #0x4\n"
2978 "tbz x11, #0, 196f\n"
2979 "st1 { v20.h }[2], [x9]\n"
2980 "st1 { v22.h }[2], [x26]\n"
2981 "st1 { v24.h }[2], [x25]\n"
2982 "st1 { v26.h }[2], [x24]\n"
2983 "st1 { v28.h }[2], [x23]\n"
2984 "st1 { v30.h }[2], [x22]\n"
2985 "b 196f\n"
2986 "195:" // Height 6: Partial direct writeback: partial_1_0
2987 "str h20, [x9, #0x0]\n"
2988 "str h22, [x26, #0x0]\n"
2989 "str h24, [x25, #0x0]\n"
2990 "str h26, [x24, #0x0]\n"
2991 "str h28, [x23, #0x0]\n"
2992 "str h30, [x22, #0x0]\n"
2993 "196:" // Height 6: Partial direct writeback: Done
2994 "b 198f\n"
2995 "197:" // Height 6: Full writeback
2996 "str q20, [x9, #0x0]\n"
2997 "str q21, [x9, #0x10]\n"
2998 "add x9, x9, #0x20\n"
2999 "str q22, [x26, #0x0]\n"
3000 "str q23, [x26, #0x10]\n"
3001 "str q24, [x25, #0x0]\n"
3002 "str q25, [x25, #0x10]\n"
3003 "str q26, [x24, #0x0]\n"
3004 "str q27, [x24, #0x10]\n"
3005 "str q28, [x23, #0x0]\n"
3006 "str q29, [x23, #0x10]\n"
3007 "str q30, [x22, #0x0]\n"
3008 "str q31, [x22, #0x10]\n"
3009 "198:" // Height 6: Writeback done
3010 "subs x11, x11, #0x10\n"
3011 "bgt 167b\n"
3012 "subs %x[m], %x[m], #0x6\n"
3013 "beq 200f\n"
3014 "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
3015 "tbz %x[flags], #3, 199f\n"
3016 "add x21, x21, #0x6\n"
3017 "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
3018 "b 1b\n"
3019 "199:" // Update direct input
3020 "mov x20, #0xc\n"
3021 "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
3022 "b 1b\n"
3023 "200:" // Exit
3024 : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m)
3025 17 : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)),
3026 [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)),
3027 [offsetof_N] "I"(offsetof(KernelArgs, N)),
3028 [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)),
3029 [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)),
3030 [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)),
3031 [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)),
3032 [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)),
3033 [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths))
3034 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
3035 "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
3036 "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
3037 17 }
3038
3039 #endif // Architectural features check.
3040