Line |
Branch |
Exec |
Source |
1 |
|
|
// |
2 |
|
|
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com> |
3 |
|
|
// |
4 |
|
|
// SPDX-License-Identifier: Apache-2.0 |
5 |
|
|
// |
6 |
|
|
|
7 |
|
|
// Do not flag up inline assembly blocks |
8 |
|
|
#pragma GCC diagnostic ignored "-Woverlength-strings" |
9 |
|
|
|
10 |
|
|
#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ |
11 |
|
|
!defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |
12 |
|
|
#error This file must be compiled for AArch64, FEAT_FP16. |
13 |
|
|
#else // Architectural features check. |
14 |
|
|
|
15 |
|
|
#include "kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h" |
16 |
|
|
|
17 |
|
|
#include <arm_neon.h> |
18 |
|
|
#include <stddef.h> |
19 |
|
|
#include <stdint.h> |
20 |
|
|
|
21 |
|
|
#include "kai/kai_common.h" |
22 |
|
|
|
23 |
|
|
// Blocking constants of this micro-kernel. Each one is handed back unchanged
// by the matching kai_get_* accessor defined in this file.

// Row step: returned by the m-step accessor; the LHS/DST offset helpers
// require m_idx to be a multiple of it.
static const size_t kai_mr = 6;

// Column step: returned by both the n-step and the nr accessors; the packed-RHS
// and DST offset helpers require n_idx to be a multiple of it.
static const size_t kai_nr = 16;

// Value returned by the kr accessor.
static const size_t kai_kr = 1;

// Value returned by the sr accessor.
static const size_t kai_sr = 1;
27 |
|
|
|
28 |
|
18 |
size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { |
29 |
|
18 |
return kai_mr; |
30 |
|
|
} |
31 |
|
|
|
32 |
|
18 |
size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { |
33 |
|
18 |
return kai_nr; |
34 |
|
|
} |
35 |
|
|
|
36 |
|
17 |
size_t kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { |
37 |
|
17 |
return kai_nr; |
38 |
|
|
} |
39 |
|
|
|
40 |
|
17 |
size_t kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { |
41 |
|
17 |
return kai_kr; |
42 |
|
|
} |
43 |
|
|
|
44 |
|
17 |
size_t kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { |
45 |
|
17 |
return kai_sr; |
46 |
|
|
} |
47 |
|
|
|
48 |
|
16 |
size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m_idx, size_t stride) { |
49 |
|
− |
KAI_ASSUME(m_idx % kai_mr == 0); |
50 |
|
|
|
51 |
|
16 |
return m_idx * stride; |
52 |
|
|
} |
53 |
|
|
|
54 |
|
16 |
size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t n_idx, size_t k) { |
55 |
|
− |
KAI_ASSUME(n_idx % kai_nr == 0); |
56 |
|
|
|
57 |
|
16 |
return n_idx / kai_nr * (kai_nr * sizeof(uint16_t) + kai_nr * k * sizeof(uint16_t)); |
58 |
|
|
} |
59 |
|
|
|
60 |
|
16 |
size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla( |
61 |
|
|
size_t m_idx, size_t n_idx, size_t stride) { |
62 |
|
− |
KAI_ASSUME(m_idx % kai_mr == 0); |
63 |
|
− |
KAI_ASSUME(n_idx % kai_nr == 0); |
64 |
|
|
|
65 |
|
16 |
return m_idx * stride + n_idx * sizeof(uint16_t); |
66 |
|
|
} |
67 |
|
|
|
68 |
|
16 |
// Computes the size in bytes of an m x n destination matrix whose elements
// are 16 bits wide.
//
//   m  Number of rows.
//   n  Number of columns.
//
// Returns m * n * sizeof(uint16_t).
size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m, size_t n) {
    const size_t element_count = m * n;
    return element_count * sizeof(uint16_t);
}
71 |
|
|
|
72 |
|
17 |
void kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla( |
73 |
|
|
size_t m, size_t n, size_t k, // |
74 |
|
|
const void* lhs, size_t lhs_stride, // |
75 |
|
|
const void* rhs_packed, // |
76 |
|
|
void* dst, size_t dst_stride_row, size_t dst_stride_col, // |
77 |
|
|
float clamp_min, float clamp_max) { |
78 |
|
− |
KAI_ASSERT(dst_stride_col == sizeof(uint16_t)); |
79 |
|
|
|
80 |
|
|
typedef struct { |
81 |
|
|
float16_t maxval; |
82 |
|
|
float16_t minval; |
83 |
|
|
unsigned int num_strings; |
84 |
|
|
const unsigned int* string_lengths; |
85 |
|
|
size_t N; |
86 |
|
|
const void* B_ptr; |
87 |
|
|
size_t output_offset; |
88 |
|
|
size_t input_initial_col; |
89 |
|
|
size_t input_offset; |
90 |
|
|
void* output_ptr; |
91 |
|
|
const void* bias; |
92 |
|
|
} KernelArgs; |
93 |
|
|
|
94 |
|
17 |
KernelArgs ka; |
95 |
|
|
|
96 |
|
17 |
unsigned long flags = 0; |
97 |
|
|
|
98 |
|
17 |
unsigned int string_length = k; |
99 |
|
17 |
ka.num_strings = 1; |
100 |
|
17 |
ka.string_lengths = &string_length; |
101 |
|
17 |
ka.N = n; |
102 |
|
17 |
ka.B_ptr = rhs_packed; |
103 |
|
17 |
ka.bias = NULL; |
104 |
|
|
|
105 |
|
|
// Direct input. |
106 |
|
17 |
const void* input_ptr = lhs; |
107 |
|
17 |
ka.input_offset = lhs_stride / sizeof(uint16_t); |
108 |
|
17 |
ka.input_initial_col = 0; |
109 |
|
|
|
110 |
|
|
// Direct output. |
111 |
|
17 |
ka.output_ptr = dst; |
112 |
|
17 |
ka.output_offset = dst_stride_row / sizeof(uint16_t); |
113 |
|
|
|
114 |
|
|
// Clamping output. |
115 |
|
17 |
flags |= 0x2; |
116 |
|
17 |
ka.maxval = (float16_t)clamp_max; |
117 |
|
17 |
ka.minval = (float16_t)clamp_min; |
118 |
|
|
|
119 |
|
34 |
__asm__ __volatile__( |
120 |
|
|
"1:" // Row loop |
121 |
|
|
"cmp %x[m], #0x6\n" |
122 |
|
|
"bge 166f\n" |
123 |
|
|
"cmp %x[m], #0x4\n" |
124 |
|
|
"bgt 133f\n" |
125 |
|
|
"beq 100f\n" |
126 |
|
|
"cmp %x[m], #0x2\n" |
127 |
|
|
"bgt 67f\n" |
128 |
|
|
"beq 34f\n" |
129 |
|
|
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n" |
130 |
|
|
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" |
131 |
|
|
"ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
132 |
|
|
"2:" // Height 1: Column loop |
133 |
|
|
"cbz x10, 3f\n" |
134 |
|
|
"ldr q20, [x10, #0x0]\n" |
135 |
|
|
"ldr q21, [x10, #0x10]\n" |
136 |
|
|
"add x10, x10, #0x20\n" |
137 |
|
|
"b 14f\n" |
138 |
|
|
"3:" // Height 1: no bias |
139 |
|
|
"tbz %x[flags], #0, 13f\n" |
140 |
|
|
"cmp x11, #0x10\n" |
141 |
|
|
"bge 12f\n" |
142 |
|
|
"tbz x11, #3, 7f\n" |
143 |
|
|
"ld1 { v20.8h }, [x9], #0x10\n" |
144 |
|
|
"tbz x11, #2, 5f\n" |
145 |
|
|
"ldr d21, [x9], #0x8\n" |
146 |
|
|
"tbz x11, #1, 4f\n" |
147 |
|
|
"ld1 { v21.s }[2], [x9], #0x4\n" |
148 |
|
|
"mov x20, #0x1c\n" |
149 |
|
|
"tbz x11, #0, 11f\n" |
150 |
|
|
"ld1 { v21.h }[6], [x9]\n" |
151 |
|
|
"b 11f\n" |
152 |
|
|
"4:" // Height 1: Partial accumulate: partial_1_12 |
153 |
|
|
"mov x20, #0x18\n" |
154 |
|
|
"tbz x11, #0, 11f\n" |
155 |
|
|
"ld1 { v21.h }[4], [x9]\n" |
156 |
|
|
"b 11f\n" |
157 |
|
|
"5:" // Height 1: Partial accumulate: partial_2_8 |
158 |
|
|
"tbz x11, #1, 6f\n" |
159 |
|
|
"ldr s21, [x9], #0x4\n" |
160 |
|
|
"mov x20, #0x14\n" |
161 |
|
|
"tbz x11, #0, 11f\n" |
162 |
|
|
"ld1 { v21.h }[2], [x9]\n" |
163 |
|
|
"b 11f\n" |
164 |
|
|
"6:" // Height 1: Partial accumulate: partial_1_8 |
165 |
|
|
"mov x20, #0x10\n" |
166 |
|
|
"tbz x11, #0, 11f\n" |
167 |
|
|
"ldr h21, [x9, #0x0]\n" |
168 |
|
|
"b 11f\n" |
169 |
|
|
"7:" // Height 1: Partial accumulate: partial_4_0 |
170 |
|
|
"tbz x11, #2, 9f\n" |
171 |
|
|
"ldr d20, [x9], #0x8\n" |
172 |
|
|
"tbz x11, #1, 8f\n" |
173 |
|
|
"ld1 { v20.s }[2], [x9], #0x4\n" |
174 |
|
|
"mov x20, #0xc\n" |
175 |
|
|
"tbz x11, #0, 11f\n" |
176 |
|
|
"ld1 { v20.h }[6], [x9]\n" |
177 |
|
|
"b 11f\n" |
178 |
|
|
"8:" // Height 1: Partial accumulate: partial_1_4 |
179 |
|
|
"mov x20, #0x8\n" |
180 |
|
|
"tbz x11, #0, 11f\n" |
181 |
|
|
"ld1 { v20.h }[4], [x9]\n" |
182 |
|
|
"b 11f\n" |
183 |
|
|
"9:" // Height 1: Partial accumulate: partial_2_0 |
184 |
|
|
"tbz x11, #1, 10f\n" |
185 |
|
|
"ldr s20, [x9], #0x4\n" |
186 |
|
|
"mov x20, #0x4\n" |
187 |
|
|
"tbz x11, #0, 11f\n" |
188 |
|
|
"ld1 { v20.h }[2], [x9]\n" |
189 |
|
|
"b 11f\n" |
190 |
|
|
"10:" // Height 1: Partial accumulate: partial_1_0 |
191 |
|
|
"ldr h20, [x9, #0x0]\n" |
192 |
|
|
"mov x20, #0x0\n" |
193 |
|
|
"11:" // Height 1: Partial accumulate: Done |
194 |
|
|
"sub x9, x9, x20\n" |
195 |
|
|
"b 14f\n" |
196 |
|
|
"12:" // Height 1: full accumulate |
197 |
|
|
"ldr q20, [x9, #0x0]\n" |
198 |
|
|
"ldr q21, [x9, #0x10]\n" |
199 |
|
|
"b 14f\n" |
200 |
|
|
"13:" // Height 1: no accumulate |
201 |
|
|
"movi v20.16b, #0x0\n" |
202 |
|
|
"movi v21.16b, #0x0\n" |
203 |
|
|
"14:" // Height 1: setup done |
204 |
|
|
"mov x28, #0x0\n" |
205 |
|
|
"15:" // Height 1: String loop |
206 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" |
207 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
208 |
|
|
"ldr w27, [x20, x28, LSL #0x2]\n" |
209 |
|
|
"tbz %x[flags], #3, 16f\n" |
210 |
|
|
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" |
211 |
|
|
"add x20, x20, x21, LSL #3\n" |
212 |
|
|
"ldr x26, [x20, #0x0]\n" |
213 |
|
|
"cbnz x28, 17f\n" |
214 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" |
215 |
|
|
"add x26, x26, x20, LSL #1\n" |
216 |
|
|
"b 17f\n" |
217 |
|
|
"16:" // Height 1: setup direct input |
218 |
|
|
"mov x26, %x[input_ptr]\n" |
219 |
|
|
"17:" // Height 1: input setup done |
220 |
|
|
"cmp x27, #0x8\n" |
221 |
|
|
"blt 20f\n" |
222 |
|
|
"ldr q0, [x26, #0x0]\n" |
223 |
|
|
"ldr q6, [x10, #0x0]\n" |
224 |
|
|
"cmp x27, #0x10\n" |
225 |
|
|
"ldr q7, [x10, #0x10]\n" |
226 |
|
|
"ldr q8, [x10, #0x20]\n" |
227 |
|
|
"ldr q9, [x10, #0x30]\n" |
228 |
|
|
"ldr q10, [x10, #0x40]\n" |
229 |
|
|
"ldr q11, [x10, #0x50]\n" |
230 |
|
|
"ldr q12, [x10, #0x60]\n" |
231 |
|
|
"ldr q13, [x10, #0x70]\n" |
232 |
|
|
"ldr q14, [x10, #0x80]\n" |
233 |
|
|
"ldr q15, [x10, #0x90]\n" |
234 |
|
|
"ldr q16, [x10, #0xa0]\n" |
235 |
|
|
"ldr q17, [x10, #0xb0]\n" |
236 |
|
|
"ldr q18, [x10, #0xc0]\n" |
237 |
|
|
"ldr q19, [x10, #0xd0]\n" |
238 |
|
|
"blt 19f\n" |
239 |
|
|
"18:" // Height 1: Multiply loop: Main loop head |
240 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
241 |
|
|
"ldr q6, [x10, #0xe0]\n" |
242 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
243 |
|
|
"ldr q7, [x10, #0xf0]\n" |
244 |
|
|
"sub x27, x27, #0x8\n" |
245 |
|
|
"add x26, x26, #0x10\n" |
246 |
|
|
"cmp x27, #0x10\n" |
247 |
|
|
"add x10, x10, #0x100\n" |
248 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
249 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
250 |
|
|
"ldr q8, [x10, #0x20]\n" |
251 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
252 |
|
|
"ldr q9, [x10, #0x30]\n" |
253 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
254 |
|
|
"ldr q10, [x10, #0x40]\n" |
255 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
256 |
|
|
"ldr q11, [x10, #0x50]\n" |
257 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
258 |
|
|
"ldr q12, [x10, #0x60]\n" |
259 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
260 |
|
|
"ldr q13, [x10, #0x70]\n" |
261 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
262 |
|
|
"ldr q14, [x10, #0x80]\n" |
263 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
264 |
|
|
"ldr q15, [x10, #0x90]\n" |
265 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
266 |
|
|
"ldr q16, [x10, #0xa0]\n" |
267 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
268 |
|
|
"ldr q17, [x10, #0xb0]\n" |
269 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
270 |
|
|
"ldr q18, [x10, #0xc0]\n" |
271 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
272 |
|
|
"ldr q19, [x10, #0xd0]\n" |
273 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
274 |
|
|
"ldr q6, [x10, #0x0]\n" |
275 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
276 |
|
|
"ldr q0, [x26, #0x0]\n" |
277 |
|
|
"ldr q7, [x10, #0x10]\n" |
278 |
|
|
"bge 18b\n" |
279 |
|
|
"19:" // Height 1: Multiply loop: Single iteration only |
280 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
281 |
|
|
"ldr q6, [x10, #0xe0]\n" |
282 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
283 |
|
|
"ldr q7, [x10, #0xf0]\n" |
284 |
|
|
"add x26, x26, #0x10\n" |
285 |
|
|
"sub x27, x27, #0x8\n" |
286 |
|
|
"add x10, x10, #0x100\n" |
287 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
288 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
289 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
290 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
291 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
292 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
293 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
294 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
295 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
296 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
297 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
298 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
299 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
300 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
301 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
302 |
|
|
"20:" // Height 1: Multiply loop: Main loop skip |
303 |
|
|
"cbz x27, 22f\n" |
304 |
|
|
"21:" // Height 1: Multiply loop: Odd block loop |
305 |
|
|
"ldr h0, [x26], #0x2\n" |
306 |
|
|
"ldr q8, [x10, #0x0]\n" |
307 |
|
|
"sub x27, x27, #0x1\n" |
308 |
|
|
"ldr q9, [x10, #0x10]\n" |
309 |
|
|
"add x10, x10, #0x20\n" |
310 |
|
|
"fmla v20.8h, v8.8h, v0.h[0]\n" |
311 |
|
|
"fmla v21.8h, v9.8h, v0.h[0]\n" |
312 |
|
|
"cbnz x27, 21b\n" |
313 |
|
|
"22:" // Height 1: Multiply loop: No odd multiplies |
314 |
|
|
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" |
315 |
|
|
"add x28, x28, #0x1\n" |
316 |
|
|
"cmp x28, x20\n" |
317 |
|
|
"bne 15b\n" |
318 |
|
|
"prfm pstl1keep, [x9, #0x0]\n" |
319 |
|
|
"tbz %x[flags], #1, 23f\n" |
320 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
321 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
322 |
|
|
"ld1r { v17.8h }, [x21]\n" |
323 |
|
|
"ld1r { v16.8h }, [x20]\n" |
324 |
|
|
"fmin v20.8h, v20.8h, v17.8h\n" |
325 |
|
|
"fmin v21.8h, v21.8h, v17.8h\n" |
326 |
|
|
"fmax v20.8h, v20.8h, v16.8h\n" |
327 |
|
|
"fmax v21.8h, v21.8h, v16.8h\n" |
328 |
|
|
"23:" // Height 1: No activation |
329 |
|
|
"cmp x11, #0x10\n" |
330 |
|
|
"bge 32f\n" |
331 |
|
|
"tbz x11, #3, 27f\n" |
332 |
|
|
"st1 { v20.8h }, [x9], #0x10\n" |
333 |
|
|
"tbz x11, #2, 25f\n" |
334 |
|
|
"str d21, [x9], #0x8\n" |
335 |
|
|
"tbz x11, #1, 24f\n" |
336 |
|
|
"st1 { v21.s }[2], [x9], #0x4\n" |
337 |
|
|
"tbz x11, #0, 31f\n" |
338 |
|
|
"st1 { v21.h }[6], [x9]\n" |
339 |
|
|
"b 31f\n" |
340 |
|
|
"24:" // Height 1: Partial direct writeback: partial_1_12 |
341 |
|
|
"tbz x11, #0, 31f\n" |
342 |
|
|
"st1 { v21.h }[4], [x9]\n" |
343 |
|
|
"b 31f\n" |
344 |
|
|
"25:" // Height 1: Partial direct writeback: partial_2_8 |
345 |
|
|
"tbz x11, #1, 26f\n" |
346 |
|
|
"str s21, [x9], #0x4\n" |
347 |
|
|
"tbz x11, #0, 31f\n" |
348 |
|
|
"st1 { v21.h }[2], [x9]\n" |
349 |
|
|
"b 31f\n" |
350 |
|
|
"26:" // Height 1: Partial direct writeback: partial_1_8 |
351 |
|
|
"tbz x11, #0, 31f\n" |
352 |
|
|
"str h21, [x9, #0x0]\n" |
353 |
|
|
"b 31f\n" |
354 |
|
|
"27:" // Height 1: Partial direct writeback: partial_4_0 |
355 |
|
|
"tbz x11, #2, 29f\n" |
356 |
|
|
"str d20, [x9], #0x8\n" |
357 |
|
|
"tbz x11, #1, 28f\n" |
358 |
|
|
"st1 { v20.s }[2], [x9], #0x4\n" |
359 |
|
|
"tbz x11, #0, 31f\n" |
360 |
|
|
"st1 { v20.h }[6], [x9]\n" |
361 |
|
|
"b 31f\n" |
362 |
|
|
"28:" // Height 1: Partial direct writeback: partial_1_4 |
363 |
|
|
"tbz x11, #0, 31f\n" |
364 |
|
|
"st1 { v20.h }[4], [x9]\n" |
365 |
|
|
"b 31f\n" |
366 |
|
|
"29:" // Height 1: Partial direct writeback: partial_2_0 |
367 |
|
|
"tbz x11, #1, 30f\n" |
368 |
|
|
"str s20, [x9], #0x4\n" |
369 |
|
|
"tbz x11, #0, 31f\n" |
370 |
|
|
"st1 { v20.h }[2], [x9]\n" |
371 |
|
|
"b 31f\n" |
372 |
|
|
"30:" // Height 1: Partial direct writeback: partial_1_0 |
373 |
|
|
"str h20, [x9, #0x0]\n" |
374 |
|
|
"31:" // Height 1: Partial direct writeback: Done |
375 |
|
|
"b 33f\n" |
376 |
|
|
"32:" // Height 1: Full writeback |
377 |
|
|
"str q20, [x9, #0x0]\n" |
378 |
|
|
"str q21, [x9, #0x10]\n" |
379 |
|
|
"add x9, x9, #0x20\n" |
380 |
|
|
"33:" // Height 1: Writeback done |
381 |
|
|
"subs x11, x11, #0x10\n" |
382 |
|
|
"bgt 2b\n" |
383 |
|
|
"b 200f\n" |
384 |
|
|
"34:" // Height 2 |
385 |
|
|
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n" |
386 |
|
|
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" |
387 |
|
|
"ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
388 |
|
|
"35:" // Height 2: Column loop |
389 |
|
|
"cbz x10, 36f\n" |
390 |
|
|
"ldr q20, [x10, #0x0]\n" |
391 |
|
|
"ldr q21, [x10, #0x10]\n" |
392 |
|
|
"add x10, x10, #0x20\n" |
393 |
|
|
"mov v22.16b, v20.16b\n" |
394 |
|
|
"mov v23.16b, v21.16b\n" |
395 |
|
|
"b 47f\n" |
396 |
|
|
"36:" // Height 2: no bias |
397 |
|
|
"tbz %x[flags], #0, 46f\n" |
398 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
399 |
|
|
"cmp x11, #0x10\n" |
400 |
|
|
"add x26, x9, x20, LSL #1\n" |
401 |
|
|
"bge 45f\n" |
402 |
|
|
"tbz x11, #3, 40f\n" |
403 |
|
|
"ld1 { v20.8h }, [x9], #0x10\n" |
404 |
|
|
"ld1 { v22.8h }, [x26], #0x10\n" |
405 |
|
|
"tbz x11, #2, 38f\n" |
406 |
|
|
"ldr d21, [x9], #0x8\n" |
407 |
|
|
"ldr d23, [x26], #0x8\n" |
408 |
|
|
"tbz x11, #1, 37f\n" |
409 |
|
|
"ld1 { v21.s }[2], [x9], #0x4\n" |
410 |
|
|
"ld1 { v23.s }[2], [x26], #0x4\n" |
411 |
|
|
"mov x20, #0x1c\n" |
412 |
|
|
"tbz x11, #0, 44f\n" |
413 |
|
|
"ld1 { v21.h }[6], [x9]\n" |
414 |
|
|
"ld1 { v23.h }[6], [x26]\n" |
415 |
|
|
"b 44f\n" |
416 |
|
|
"37:" // Height 2: Partial accumulate: partial_1_12 |
417 |
|
|
"mov x20, #0x18\n" |
418 |
|
|
"tbz x11, #0, 44f\n" |
419 |
|
|
"ld1 { v21.h }[4], [x9]\n" |
420 |
|
|
"ld1 { v23.h }[4], [x26]\n" |
421 |
|
|
"b 44f\n" |
422 |
|
|
"38:" // Height 2: Partial accumulate: partial_2_8 |
423 |
|
|
"tbz x11, #1, 39f\n" |
424 |
|
|
"ldr s21, [x9], #0x4\n" |
425 |
|
|
"ldr s23, [x26], #0x4\n" |
426 |
|
|
"mov x20, #0x14\n" |
427 |
|
|
"tbz x11, #0, 44f\n" |
428 |
|
|
"ld1 { v21.h }[2], [x9]\n" |
429 |
|
|
"ld1 { v23.h }[2], [x26]\n" |
430 |
|
|
"b 44f\n" |
431 |
|
|
"39:" // Height 2: Partial accumulate: partial_1_8 |
432 |
|
|
"mov x20, #0x10\n" |
433 |
|
|
"tbz x11, #0, 44f\n" |
434 |
|
|
"ldr h21, [x9, #0x0]\n" |
435 |
|
|
"ldr h23, [x26, #0x0]\n" |
436 |
|
|
"b 44f\n" |
437 |
|
|
"40:" // Height 2: Partial accumulate: partial_4_0 |
438 |
|
|
"tbz x11, #2, 42f\n" |
439 |
|
|
"ldr d20, [x9], #0x8\n" |
440 |
|
|
"ldr d22, [x26], #0x8\n" |
441 |
|
|
"tbz x11, #1, 41f\n" |
442 |
|
|
"ld1 { v20.s }[2], [x9], #0x4\n" |
443 |
|
|
"ld1 { v22.s }[2], [x26], #0x4\n" |
444 |
|
|
"mov x20, #0xc\n" |
445 |
|
|
"tbz x11, #0, 44f\n" |
446 |
|
|
"ld1 { v20.h }[6], [x9]\n" |
447 |
|
|
"ld1 { v22.h }[6], [x26]\n" |
448 |
|
|
"b 44f\n" |
449 |
|
|
"41:" // Height 2: Partial accumulate: partial_1_4 |
450 |
|
|
"mov x20, #0x8\n" |
451 |
|
|
"tbz x11, #0, 44f\n" |
452 |
|
|
"ld1 { v20.h }[4], [x9]\n" |
453 |
|
|
"ld1 { v22.h }[4], [x26]\n" |
454 |
|
|
"b 44f\n" |
455 |
|
|
"42:" // Height 2: Partial accumulate: partial_2_0 |
456 |
|
|
"tbz x11, #1, 43f\n" |
457 |
|
|
"ldr s20, [x9], #0x4\n" |
458 |
|
|
"ldr s22, [x26], #0x4\n" |
459 |
|
|
"mov x20, #0x4\n" |
460 |
|
|
"tbz x11, #0, 44f\n" |
461 |
|
|
"ld1 { v20.h }[2], [x9]\n" |
462 |
|
|
"ld1 { v22.h }[2], [x26]\n" |
463 |
|
|
"b 44f\n" |
464 |
|
|
"43:" // Height 2: Partial accumulate: partial_1_0 |
465 |
|
|
"ldr h20, [x9, #0x0]\n" |
466 |
|
|
"ldr h22, [x26, #0x0]\n" |
467 |
|
|
"mov x20, #0x0\n" |
468 |
|
|
"44:" // Height 2: Partial accumulate: Done |
469 |
|
|
"sub x9, x9, x20\n" |
470 |
|
|
"b 47f\n" |
471 |
|
|
"45:" // Height 2: full accumulate |
472 |
|
|
"ldr q20, [x9, #0x0]\n" |
473 |
|
|
"ldr q21, [x9, #0x10]\n" |
474 |
|
|
"ldr q22, [x26, #0x0]\n" |
475 |
|
|
"ldr q23, [x26, #0x10]\n" |
476 |
|
|
"b 47f\n" |
477 |
|
|
"46:" // Height 2: no accumulate |
478 |
|
|
"movi v20.16b, #0x0\n" |
479 |
|
|
"movi v21.16b, #0x0\n" |
480 |
|
|
"movi v22.16b, #0x0\n" |
481 |
|
|
"movi v23.16b, #0x0\n" |
482 |
|
|
"47:" // Height 2: setup done |
483 |
|
|
"mov x28, #0x0\n" |
484 |
|
|
"48:" // Height 2: String loop |
485 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" |
486 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
487 |
|
|
"ldr w27, [x20, x28, LSL #0x2]\n" |
488 |
|
|
"tbz %x[flags], #3, 49f\n" |
489 |
|
|
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" |
490 |
|
|
"add x20, x20, x21, LSL #3\n" |
491 |
|
|
"ldr x26, [x20, #0x0]\n" |
492 |
|
|
"ldr x25, [x20, #0x8]\n" |
493 |
|
|
"cbnz x28, 50f\n" |
494 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" |
495 |
|
|
"add x26, x26, x20, LSL #1\n" |
496 |
|
|
"add x25, x25, x20, LSL #1\n" |
497 |
|
|
"b 50f\n" |
498 |
|
|
"49:" // Height 2: setup direct input |
499 |
|
|
"mov x26, %x[input_ptr]\n" |
500 |
|
|
"add x25, x26, x21, LSL #1\n" |
501 |
|
|
"50:" // Height 2: input setup done |
502 |
|
|
"cmp x27, #0x8\n" |
503 |
|
|
"blt 53f\n" |
504 |
|
|
"ldr q0, [x26, #0x0]\n" |
505 |
|
|
"ldr q1, [x25, #0x0]\n" |
506 |
|
|
"cmp x27, #0x10\n" |
507 |
|
|
"ldr q6, [x10, #0x0]\n" |
508 |
|
|
"ldr q7, [x10, #0x10]\n" |
509 |
|
|
"ldr q8, [x10, #0x20]\n" |
510 |
|
|
"ldr q9, [x10, #0x30]\n" |
511 |
|
|
"ldr q10, [x10, #0x40]\n" |
512 |
|
|
"ldr q11, [x10, #0x50]\n" |
513 |
|
|
"ldr q12, [x10, #0x60]\n" |
514 |
|
|
"ldr q13, [x10, #0x70]\n" |
515 |
|
|
"ldr q14, [x10, #0x80]\n" |
516 |
|
|
"ldr q15, [x10, #0x90]\n" |
517 |
|
|
"ldr q16, [x10, #0xa0]\n" |
518 |
|
|
"ldr q17, [x10, #0xb0]\n" |
519 |
|
|
"ldr q18, [x10, #0xc0]\n" |
520 |
|
|
"ldr q19, [x10, #0xd0]\n" |
521 |
|
|
"blt 52f\n" |
522 |
|
|
"51:" // Height 2: Multiply loop: Main loop head |
523 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
524 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
525 |
|
|
"ldr q6, [x10, #0xe0]\n" |
526 |
|
|
"sub x27, x27, #0x8\n" |
527 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
528 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
529 |
|
|
"ldr q7, [x10, #0xf0]\n" |
530 |
|
|
"add x26, x26, #0x10\n" |
531 |
|
|
"add x25, x25, #0x10\n" |
532 |
|
|
"cmp x27, #0x10\n" |
533 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
534 |
|
|
"add x10, x10, #0x100\n" |
535 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
536 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
537 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
538 |
|
|
"ldr q8, [x10, #0x20]\n" |
539 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
540 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
541 |
|
|
"ldr q9, [x10, #0x30]\n" |
542 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
543 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
544 |
|
|
"ldr q10, [x10, #0x40]\n" |
545 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
546 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
547 |
|
|
"ldr q11, [x10, #0x50]\n" |
548 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
549 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
550 |
|
|
"ldr q12, [x10, #0x60]\n" |
551 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
552 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
553 |
|
|
"ldr q13, [x10, #0x70]\n" |
554 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
555 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
556 |
|
|
"ldr q14, [x10, #0x80]\n" |
557 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
558 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
559 |
|
|
"ldr q15, [x10, #0x90]\n" |
560 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
561 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
562 |
|
|
"ldr q16, [x10, #0xa0]\n" |
563 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
564 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
565 |
|
|
"ldr q17, [x10, #0xb0]\n" |
566 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
567 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
568 |
|
|
"ldr q18, [x10, #0xc0]\n" |
569 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
570 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
571 |
|
|
"ldr q19, [x10, #0xd0]\n" |
572 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
573 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
574 |
|
|
"ldr q6, [x10, #0x0]\n" |
575 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
576 |
|
|
"ldr q0, [x26, #0x0]\n" |
577 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
578 |
|
|
"ldr q1, [x25, #0x0]\n" |
579 |
|
|
"ldr q7, [x10, #0x10]\n" |
580 |
|
|
"bge 51b\n" |
581 |
|
|
"52:" // Height 2: Multiply loop: Single iteration only |
582 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
583 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
584 |
|
|
"ldr q6, [x10, #0xe0]\n" |
585 |
|
|
"add x26, x26, #0x10\n" |
586 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
587 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
588 |
|
|
"ldr q7, [x10, #0xf0]\n" |
589 |
|
|
"add x25, x25, #0x10\n" |
590 |
|
|
"sub x27, x27, #0x8\n" |
591 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
592 |
|
|
"add x10, x10, #0x100\n" |
593 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
594 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
595 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
596 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
597 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
598 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
599 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
600 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
601 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
602 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
603 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
604 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
605 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
606 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
607 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
608 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
609 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
610 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
611 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
612 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
613 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
614 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
615 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
616 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
617 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
618 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
619 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
620 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
621 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
622 |
|
|
"53:" // Height 2: Multiply loop: Main loop skip |
623 |
|
|
"cbz x27, 55f\n" |
624 |
|
|
"54:" // Height 2: Multiply loop: Odd block loop |
625 |
|
|
"ldr h0, [x26], #0x2\n" |
626 |
|
|
"ldr h1, [x25], #0x2\n" |
627 |
|
|
"sub x27, x27, #0x1\n" |
628 |
|
|
"ldr q8, [x10, #0x0]\n" |
629 |
|
|
"ldr q9, [x10, #0x10]\n" |
630 |
|
|
"add x10, x10, #0x20\n" |
631 |
|
|
"fmla v20.8h, v8.8h, v0.h[0]\n" |
632 |
|
|
"fmla v22.8h, v8.8h, v1.h[0]\n" |
633 |
|
|
"fmla v21.8h, v9.8h, v0.h[0]\n" |
634 |
|
|
"fmla v23.8h, v9.8h, v1.h[0]\n" |
635 |
|
|
"cbnz x27, 54b\n" |
636 |
|
|
"55:" // Height 2: Multiply loop: No odd multiplies |
637 |
|
|
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" |
638 |
|
|
"add x28, x28, #0x1\n" |
639 |
|
|
"cmp x28, x20\n" |
640 |
|
|
"bne 48b\n" |
641 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
642 |
|
|
"prfm pstl1keep, [x9, #0x0]\n" |
643 |
|
|
"add x26, x9, x20, LSL #1\n" |
644 |
|
|
"prfm pstl1keep, [x26, #0x0]\n" |
645 |
|
|
"tbz %x[flags], #1, 56f\n" |
646 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
647 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
648 |
|
|
"ld1r { v17.8h }, [x21]\n" |
649 |
|
|
"ld1r { v16.8h }, [x20]\n" |
650 |
|
|
"fmin v20.8h, v20.8h, v17.8h\n" |
651 |
|
|
"fmin v21.8h, v21.8h, v17.8h\n" |
652 |
|
|
"fmin v22.8h, v22.8h, v17.8h\n" |
653 |
|
|
"fmin v23.8h, v23.8h, v17.8h\n" |
654 |
|
|
"fmax v20.8h, v20.8h, v16.8h\n" |
655 |
|
|
"fmax v21.8h, v21.8h, v16.8h\n" |
656 |
|
|
"fmax v22.8h, v22.8h, v16.8h\n" |
657 |
|
|
"fmax v23.8h, v23.8h, v16.8h\n" |
658 |
|
|
"56:" // Height 2: No activation |
659 |
|
|
"cmp x11, #0x10\n" |
660 |
|
|
"bge 65f\n" |
661 |
|
|
"tbz x11, #3, 60f\n" |
662 |
|
|
"st1 { v20.8h }, [x9], #0x10\n" |
663 |
|
|
"st1 { v22.8h }, [x26], #0x10\n" |
664 |
|
|
"tbz x11, #2, 58f\n" |
665 |
|
|
"str d21, [x9], #0x8\n" |
666 |
|
|
"str d23, [x26], #0x8\n" |
667 |
|
|
"tbz x11, #1, 57f\n" |
668 |
|
|
"st1 { v21.s }[2], [x9], #0x4\n" |
669 |
|
|
"st1 { v23.s }[2], [x26], #0x4\n" |
670 |
|
|
"tbz x11, #0, 64f\n" |
671 |
|
|
"st1 { v21.h }[6], [x9]\n" |
672 |
|
|
"st1 { v23.h }[6], [x26]\n" |
673 |
|
|
"b 64f\n" |
674 |
|
|
"57:" // Height 2: Partial direct writeback: partial_1_12 |
675 |
|
|
"tbz x11, #0, 64f\n" |
676 |
|
|
"st1 { v21.h }[4], [x9]\n" |
677 |
|
|
"st1 { v23.h }[4], [x26]\n" |
678 |
|
|
"b 64f\n" |
679 |
|
|
"58:" // Height 2: Partial direct writeback: partial_2_8 |
680 |
|
|
"tbz x11, #1, 59f\n" |
681 |
|
|
"str s21, [x9], #0x4\n" |
682 |
|
|
"str s23, [x26], #0x4\n" |
683 |
|
|
"tbz x11, #0, 64f\n" |
684 |
|
|
"st1 { v21.h }[2], [x9]\n" |
685 |
|
|
"st1 { v23.h }[2], [x26]\n" |
686 |
|
|
"b 64f\n" |
687 |
|
|
"59:" // Height 2: Partial direct writeback: partial_1_8 |
688 |
|
|
"tbz x11, #0, 64f\n" |
689 |
|
|
"str h21, [x9, #0x0]\n" |
690 |
|
|
"str h23, [x26, #0x0]\n" |
691 |
|
|
"b 64f\n" |
692 |
|
|
"60:" // Height 2: Partial direct writeback: partial_4_0 |
693 |
|
|
"tbz x11, #2, 62f\n" |
694 |
|
|
"str d20, [x9], #0x8\n" |
695 |
|
|
"str d22, [x26], #0x8\n" |
696 |
|
|
"tbz x11, #1, 61f\n" |
697 |
|
|
"st1 { v20.s }[2], [x9], #0x4\n" |
698 |
|
|
"st1 { v22.s }[2], [x26], #0x4\n" |
699 |
|
|
"tbz x11, #0, 64f\n" |
700 |
|
|
"st1 { v20.h }[6], [x9]\n" |
701 |
|
|
"st1 { v22.h }[6], [x26]\n" |
702 |
|
|
"b 64f\n" |
703 |
|
|
"61:" // Height 2: Partial direct writeback: partial_1_4 |
704 |
|
|
"tbz x11, #0, 64f\n" |
705 |
|
|
"st1 { v20.h }[4], [x9]\n" |
706 |
|
|
"st1 { v22.h }[4], [x26]\n" |
707 |
|
|
"b 64f\n" |
708 |
|
|
"62:" // Height 2: Partial direct writeback: partial_2_0 |
709 |
|
|
"tbz x11, #1, 63f\n" |
710 |
|
|
"str s20, [x9], #0x4\n" |
711 |
|
|
"str s22, [x26], #0x4\n" |
712 |
|
|
"tbz x11, #0, 64f\n" |
713 |
|
|
"st1 { v20.h }[2], [x9]\n" |
714 |
|
|
"st1 { v22.h }[2], [x26]\n" |
715 |
|
|
"b 64f\n" |
716 |
|
|
"63:" // Height 2: Partial direct writeback: partial_1_0 |
717 |
|
|
"str h20, [x9, #0x0]\n" |
718 |
|
|
"str h22, [x26, #0x0]\n" |
719 |
|
|
"64:" // Height 2: Partial direct writeback: Done |
720 |
|
|
"b 66f\n" |
721 |
|
|
"65:" // Height 2: Full writeback |
722 |
|
|
"str q20, [x9, #0x0]\n" |
723 |
|
|
"str q21, [x9, #0x10]\n" |
724 |
|
|
"add x9, x9, #0x20\n" |
725 |
|
|
"str q22, [x26, #0x0]\n" |
726 |
|
|
"str q23, [x26, #0x10]\n" |
727 |
|
|
"66:" // Height 2: Writeback done |
728 |
|
|
"subs x11, x11, #0x10\n" |
729 |
|
|
"bgt 35b\n" |
730 |
|
|
"b 200f\n" |
731 |
|
|
"67:" // Height 3 |
732 |
|
|
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n" |
733 |
|
|
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" |
734 |
|
|
"ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
735 |
|
|
"68:" // Height 3: Column loop |
736 |
|
|
"cbz x10, 69f\n" |
737 |
|
|
"ldr q20, [x10, #0x0]\n" |
738 |
|
|
"ldr q21, [x10, #0x10]\n" |
739 |
|
|
"add x10, x10, #0x20\n" |
740 |
|
|
"mov v22.16b, v20.16b\n" |
741 |
|
|
"mov v23.16b, v21.16b\n" |
742 |
|
|
"mov v24.16b, v20.16b\n" |
743 |
|
|
"mov v25.16b, v21.16b\n" |
744 |
|
|
"b 80f\n" |
745 |
|
|
"69:" // Height 3: no bias |
746 |
|
|
"tbz %x[flags], #0, 79f\n" |
747 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
748 |
|
|
"cmp x11, #0x10\n" |
749 |
|
|
"add x26, x9, x20, LSL #1\n" |
750 |
|
|
"add x25, x26, x20, LSL #1\n" |
751 |
|
|
"bge 78f\n" |
752 |
|
|
"tbz x11, #3, 73f\n" |
753 |
|
|
"ld1 { v20.8h }, [x9], #0x10\n" |
754 |
|
|
"ld1 { v22.8h }, [x26], #0x10\n" |
755 |
|
|
"ld1 { v24.8h }, [x25], #0x10\n" |
756 |
|
|
"tbz x11, #2, 71f\n" |
757 |
|
|
"ldr d21, [x9], #0x8\n" |
758 |
|
|
"ldr d23, [x26], #0x8\n" |
759 |
|
|
"ldr d25, [x25], #0x8\n" |
760 |
|
|
"tbz x11, #1, 70f\n" |
761 |
|
|
"ld1 { v21.s }[2], [x9], #0x4\n" |
762 |
|
|
"ld1 { v23.s }[2], [x26], #0x4\n" |
763 |
|
|
"mov x20, #0x1c\n" |
764 |
|
|
"ld1 { v25.s }[2], [x25], #0x4\n" |
765 |
|
|
"tbz x11, #0, 77f\n" |
766 |
|
|
"ld1 { v21.h }[6], [x9]\n" |
767 |
|
|
"ld1 { v23.h }[6], [x26]\n" |
768 |
|
|
"ld1 { v25.h }[6], [x25]\n" |
769 |
|
|
"b 77f\n" |
770 |
|
|
"70:" // Height 3: Partial accumulate: partial_1_12 |
771 |
|
|
"mov x20, #0x18\n" |
772 |
|
|
"tbz x11, #0, 77f\n" |
773 |
|
|
"ld1 { v21.h }[4], [x9]\n" |
774 |
|
|
"ld1 { v23.h }[4], [x26]\n" |
775 |
|
|
"ld1 { v25.h }[4], [x25]\n" |
776 |
|
|
"b 77f\n" |
777 |
|
|
"71:" // Height 3: Partial accumulate: partial_2_8 |
778 |
|
|
"tbz x11, #1, 72f\n" |
779 |
|
|
"ldr s21, [x9], #0x4\n" |
780 |
|
|
"ldr s23, [x26], #0x4\n" |
781 |
|
|
"mov x20, #0x14\n" |
782 |
|
|
"ldr s25, [x25], #0x4\n" |
783 |
|
|
"tbz x11, #0, 77f\n" |
784 |
|
|
"ld1 { v21.h }[2], [x9]\n" |
785 |
|
|
"ld1 { v23.h }[2], [x26]\n" |
786 |
|
|
"ld1 { v25.h }[2], [x25]\n" |
787 |
|
|
"b 77f\n" |
788 |
|
|
"72:" // Height 3: Partial accumulate: partial_1_8 |
789 |
|
|
"mov x20, #0x10\n" |
790 |
|
|
"tbz x11, #0, 77f\n" |
791 |
|
|
"ldr h21, [x9, #0x0]\n" |
792 |
|
|
"ldr h23, [x26, #0x0]\n" |
793 |
|
|
"ldr h25, [x25, #0x0]\n" |
794 |
|
|
"b 77f\n" |
795 |
|
|
"73:" // Height 3: Partial accumulate: partial_4_0 |
796 |
|
|
"tbz x11, #2, 75f\n" |
797 |
|
|
"ldr d20, [x9], #0x8\n" |
798 |
|
|
"ldr d22, [x26], #0x8\n" |
799 |
|
|
"ldr d24, [x25], #0x8\n" |
800 |
|
|
"tbz x11, #1, 74f\n" |
801 |
|
|
"ld1 { v20.s }[2], [x9], #0x4\n" |
802 |
|
|
"ld1 { v22.s }[2], [x26], #0x4\n" |
803 |
|
|
"mov x20, #0xc\n" |
804 |
|
|
"ld1 { v24.s }[2], [x25], #0x4\n" |
805 |
|
|
"tbz x11, #0, 77f\n" |
806 |
|
|
"ld1 { v20.h }[6], [x9]\n" |
807 |
|
|
"ld1 { v22.h }[6], [x26]\n" |
808 |
|
|
"ld1 { v24.h }[6], [x25]\n" |
809 |
|
|
"b 77f\n" |
810 |
|
|
"74:" // Height 3: Partial accumulate: partial_1_4 |
811 |
|
|
"mov x20, #0x8\n" |
812 |
|
|
"tbz x11, #0, 77f\n" |
813 |
|
|
"ld1 { v20.h }[4], [x9]\n" |
814 |
|
|
"ld1 { v22.h }[4], [x26]\n" |
815 |
|
|
"ld1 { v24.h }[4], [x25]\n" |
816 |
|
|
"b 77f\n" |
817 |
|
|
"75:" // Height 3: Partial accumulate: partial_2_0 |
818 |
|
|
"tbz x11, #1, 76f\n" |
819 |
|
|
"ldr s20, [x9], #0x4\n" |
820 |
|
|
"ldr s22, [x26], #0x4\n" |
821 |
|
|
"mov x20, #0x4\n" |
822 |
|
|
"ldr s24, [x25], #0x4\n" |
823 |
|
|
"tbz x11, #0, 77f\n" |
824 |
|
|
"ld1 { v20.h }[2], [x9]\n" |
825 |
|
|
"ld1 { v22.h }[2], [x26]\n" |
826 |
|
|
"ld1 { v24.h }[2], [x25]\n" |
827 |
|
|
"b 77f\n" |
828 |
|
|
"76:" // Height 3: Partial accumulate: partial_1_0 |
829 |
|
|
"ldr h20, [x9, #0x0]\n" |
830 |
|
|
"ldr h22, [x26, #0x0]\n" |
831 |
|
|
"mov x20, #0x0\n" |
832 |
|
|
"ldr h24, [x25, #0x0]\n" |
833 |
|
|
"77:" // Height 3: Partial accumulate: Done |
834 |
|
|
"sub x9, x9, x20\n" |
835 |
|
|
"b 80f\n" |
836 |
|
|
"78:" // Height 3: full accumulate |
837 |
|
|
"ldr q20, [x9, #0x0]\n" |
838 |
|
|
"ldr q21, [x9, #0x10]\n" |
839 |
|
|
"ldr q22, [x26, #0x0]\n" |
840 |
|
|
"ldr q23, [x26, #0x10]\n" |
841 |
|
|
"ldr q24, [x25, #0x0]\n" |
842 |
|
|
"ldr q25, [x25, #0x10]\n" |
843 |
|
|
"b 80f\n" |
844 |
|
|
"79:" // Height 3: no accumulate |
845 |
|
|
"movi v20.16b, #0x0\n" |
846 |
|
|
"movi v21.16b, #0x0\n" |
847 |
|
|
"movi v22.16b, #0x0\n" |
848 |
|
|
"movi v23.16b, #0x0\n" |
849 |
|
|
"movi v24.16b, #0x0\n" |
850 |
|
|
"movi v25.16b, #0x0\n" |
851 |
|
|
"80:" // Height 3: setup done |
852 |
|
|
"mov x28, #0x0\n" |
853 |
|
|
"81:" // Height 3: String loop |
854 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" |
855 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
856 |
|
|
"ldr w27, [x20, x28, LSL #0x2]\n" |
857 |
|
|
"tbz %x[flags], #3, 82f\n" |
858 |
|
|
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" |
859 |
|
|
"add x20, x20, x21, LSL #3\n" |
860 |
|
|
"ldr x26, [x20, #0x0]\n" |
861 |
|
|
"ldr x25, [x20, #0x8]\n" |
862 |
|
|
"ldr x24, [x20, #0x10]\n" |
863 |
|
|
"cbnz x28, 83f\n" |
864 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" |
865 |
|
|
"add x26, x26, x20, LSL #1\n" |
866 |
|
|
"add x25, x25, x20, LSL #1\n" |
867 |
|
|
"add x24, x24, x20, LSL #1\n" |
868 |
|
|
"b 83f\n" |
869 |
|
|
"82:" // Height 3: setup direct input |
870 |
|
|
"mov x26, %x[input_ptr]\n" |
871 |
|
|
"add x25, x26, x21, LSL #1\n" |
872 |
|
|
"add x24, x25, x21, LSL #1\n" |
873 |
|
|
"83:" // Height 3: input setup done |
874 |
|
|
"cmp x27, #0x8\n" |
875 |
|
|
"blt 86f\n" |
876 |
|
|
"ldr q0, [x26, #0x0]\n" |
877 |
|
|
"ldr q1, [x25, #0x0]\n" |
878 |
|
|
"cmp x27, #0x10\n" |
879 |
|
|
"ldr q2, [x24, #0x0]\n" |
880 |
|
|
"ldr q6, [x10, #0x0]\n" |
881 |
|
|
"ldr q7, [x10, #0x10]\n" |
882 |
|
|
"ldr q8, [x10, #0x20]\n" |
883 |
|
|
"ldr q9, [x10, #0x30]\n" |
884 |
|
|
"ldr q10, [x10, #0x40]\n" |
885 |
|
|
"ldr q11, [x10, #0x50]\n" |
886 |
|
|
"ldr q12, [x10, #0x60]\n" |
887 |
|
|
"ldr q13, [x10, #0x70]\n" |
888 |
|
|
"ldr q14, [x10, #0x80]\n" |
889 |
|
|
"ldr q15, [x10, #0x90]\n" |
890 |
|
|
"ldr q16, [x10, #0xa0]\n" |
891 |
|
|
"ldr q17, [x10, #0xb0]\n" |
892 |
|
|
"ldr q18, [x10, #0xc0]\n" |
893 |
|
|
"ldr q19, [x10, #0xd0]\n" |
894 |
|
|
"blt 85f\n" |
895 |
|
|
"84:" // Height 3: Multiply loop: Main loop head |
896 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
897 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
898 |
|
|
"sub x27, x27, #0x8\n" |
899 |
|
|
"add x26, x26, #0x10\n" |
900 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
901 |
|
|
"ldr q6, [x10, #0xe0]\n" |
902 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
903 |
|
|
"add x25, x25, #0x10\n" |
904 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
905 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
906 |
|
|
"ldr q7, [x10, #0xf0]\n" |
907 |
|
|
"add x24, x24, #0x10\n" |
908 |
|
|
"cmp x27, #0x10\n" |
909 |
|
|
"add x10, x10, #0x100\n" |
910 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
911 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
912 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
913 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
914 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
915 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
916 |
|
|
"ldr q8, [x10, #0x20]\n" |
917 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
918 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
919 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
920 |
|
|
"ldr q9, [x10, #0x30]\n" |
921 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
922 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
923 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
924 |
|
|
"ldr q10, [x10, #0x40]\n" |
925 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
926 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
927 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
928 |
|
|
"ldr q11, [x10, #0x50]\n" |
929 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
930 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
931 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
932 |
|
|
"ldr q12, [x10, #0x60]\n" |
933 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
934 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
935 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
936 |
|
|
"ldr q13, [x10, #0x70]\n" |
937 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
938 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
939 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
940 |
|
|
"ldr q14, [x10, #0x80]\n" |
941 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
942 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
943 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
944 |
|
|
"ldr q15, [x10, #0x90]\n" |
945 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
946 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
947 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
948 |
|
|
"ldr q16, [x10, #0xa0]\n" |
949 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
950 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
951 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
952 |
|
|
"ldr q17, [x10, #0xb0]\n" |
953 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
954 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
955 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
956 |
|
|
"ldr q18, [x10, #0xc0]\n" |
957 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
958 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
959 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
960 |
|
|
"ldr q19, [x10, #0xd0]\n" |
961 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
962 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
963 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
964 |
|
|
"ldr q6, [x10, #0x0]\n" |
965 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
966 |
|
|
"ldr q0, [x26, #0x0]\n" |
967 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
968 |
|
|
"ldr q1, [x25, #0x0]\n" |
969 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
970 |
|
|
"ldr q2, [x24, #0x0]\n" |
971 |
|
|
"ldr q7, [x10, #0x10]\n" |
972 |
|
|
"bge 84b\n" |
973 |
|
|
"85:" // Height 3: Multiply loop: Single iteration only |
974 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
975 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
976 |
|
|
"add x26, x26, #0x10\n" |
977 |
|
|
"add x25, x25, #0x10\n" |
978 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
979 |
|
|
"ldr q6, [x10, #0xe0]\n" |
980 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
981 |
|
|
"add x24, x24, #0x10\n" |
982 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
983 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
984 |
|
|
"ldr q7, [x10, #0xf0]\n" |
985 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
986 |
|
|
"sub x27, x27, #0x8\n" |
987 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
988 |
|
|
"add x10, x10, #0x100\n" |
989 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
990 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
991 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
992 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
993 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
994 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
995 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
996 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
997 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
998 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
999 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
1000 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
1001 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
1002 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
1003 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
1004 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
1005 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
1006 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
1007 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
1008 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
1009 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
1010 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
1011 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
1012 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
1013 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
1014 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
1015 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
1016 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
1017 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
1018 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
1019 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
1020 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
1021 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
1022 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
1023 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
1024 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
1025 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
1026 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
1027 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
1028 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
1029 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
1030 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
1031 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
1032 |
|
|
"86:" // Height 3: Multiply loop: Main loop skip |
1033 |
|
|
"cbz x27, 88f\n" |
1034 |
|
|
"87:" // Height 3: Multiply loop: Odd block loop |
1035 |
|
|
"ldr h0, [x26], #0x2\n" |
1036 |
|
|
"ldr h1, [x25], #0x2\n" |
1037 |
|
|
"sub x27, x27, #0x1\n" |
1038 |
|
|
"ldr h2, [x24], #0x2\n" |
1039 |
|
|
"ldr q8, [x10, #0x0]\n" |
1040 |
|
|
"ldr q9, [x10, #0x10]\n" |
1041 |
|
|
"add x10, x10, #0x20\n" |
1042 |
|
|
"fmla v20.8h, v8.8h, v0.h[0]\n" |
1043 |
|
|
"fmla v22.8h, v8.8h, v1.h[0]\n" |
1044 |
|
|
"fmla v24.8h, v8.8h, v2.h[0]\n" |
1045 |
|
|
"fmla v21.8h, v9.8h, v0.h[0]\n" |
1046 |
|
|
"fmla v23.8h, v9.8h, v1.h[0]\n" |
1047 |
|
|
"fmla v25.8h, v9.8h, v2.h[0]\n" |
1048 |
|
|
"cbnz x27, 87b\n" |
1049 |
|
|
"88:" // Height 3: Multiply loop: No odd multiplies |
1050 |
|
|
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" |
1051 |
|
|
"add x28, x28, #0x1\n" |
1052 |
|
|
"cmp x28, x20\n" |
1053 |
|
|
"bne 81b\n" |
1054 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
1055 |
|
|
"prfm pstl1keep, [x9, #0x0]\n" |
1056 |
|
|
"add x26, x9, x20, LSL #1\n" |
1057 |
|
|
"prfm pstl1keep, [x26, #0x0]\n" |
1058 |
|
|
"add x25, x26, x20, LSL #1\n" |
1059 |
|
|
"prfm pstl1keep, [x25, #0x0]\n" |
1060 |
|
|
"tbz %x[flags], #1, 89f\n" |
1061 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
1062 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
1063 |
|
|
"ld1r { v17.8h }, [x21]\n" |
1064 |
|
|
"ld1r { v16.8h }, [x20]\n" |
1065 |
|
|
"fmin v20.8h, v20.8h, v17.8h\n" |
1066 |
|
|
"fmin v21.8h, v21.8h, v17.8h\n" |
1067 |
|
|
"fmin v22.8h, v22.8h, v17.8h\n" |
1068 |
|
|
"fmin v23.8h, v23.8h, v17.8h\n" |
1069 |
|
|
"fmin v24.8h, v24.8h, v17.8h\n" |
1070 |
|
|
"fmin v25.8h, v25.8h, v17.8h\n" |
1071 |
|
|
"fmax v20.8h, v20.8h, v16.8h\n" |
1072 |
|
|
"fmax v21.8h, v21.8h, v16.8h\n" |
1073 |
|
|
"fmax v22.8h, v22.8h, v16.8h\n" |
1074 |
|
|
"fmax v23.8h, v23.8h, v16.8h\n" |
1075 |
|
|
"fmax v24.8h, v24.8h, v16.8h\n" |
1076 |
|
|
"fmax v25.8h, v25.8h, v16.8h\n" |
1077 |
|
|
"89:" // Height 3: No activation |
1078 |
|
|
"cmp x11, #0x10\n" |
1079 |
|
|
"bge 98f\n" |
1080 |
|
|
"tbz x11, #3, 93f\n" |
1081 |
|
|
"st1 { v20.8h }, [x9], #0x10\n" |
1082 |
|
|
"st1 { v22.8h }, [x26], #0x10\n" |
1083 |
|
|
"st1 { v24.8h }, [x25], #0x10\n" |
1084 |
|
|
"tbz x11, #2, 91f\n" |
1085 |
|
|
"str d21, [x9], #0x8\n" |
1086 |
|
|
"str d23, [x26], #0x8\n" |
1087 |
|
|
"str d25, [x25], #0x8\n" |
1088 |
|
|
"tbz x11, #1, 90f\n" |
1089 |
|
|
"st1 { v21.s }[2], [x9], #0x4\n" |
1090 |
|
|
"st1 { v23.s }[2], [x26], #0x4\n" |
1091 |
|
|
"st1 { v25.s }[2], [x25], #0x4\n" |
1092 |
|
|
"tbz x11, #0, 97f\n" |
1093 |
|
|
"st1 { v21.h }[6], [x9]\n" |
1094 |
|
|
"st1 { v23.h }[6], [x26]\n" |
1095 |
|
|
"st1 { v25.h }[6], [x25]\n" |
1096 |
|
|
"b 97f\n" |
1097 |
|
|
"90:" // Height 3: Partial direct writeback: partial_1_12 |
1098 |
|
|
"tbz x11, #0, 97f\n" |
1099 |
|
|
"st1 { v21.h }[4], [x9]\n" |
1100 |
|
|
"st1 { v23.h }[4], [x26]\n" |
1101 |
|
|
"st1 { v25.h }[4], [x25]\n" |
1102 |
|
|
"b 97f\n" |
1103 |
|
|
"91:" // Height 3: Partial direct writeback: partial_2_8 |
1104 |
|
|
"tbz x11, #1, 92f\n" |
1105 |
|
|
"str s21, [x9], #0x4\n" |
1106 |
|
|
"str s23, [x26], #0x4\n" |
1107 |
|
|
"str s25, [x25], #0x4\n" |
1108 |
|
|
"tbz x11, #0, 97f\n" |
1109 |
|
|
"st1 { v21.h }[2], [x9]\n" |
1110 |
|
|
"st1 { v23.h }[2], [x26]\n" |
1111 |
|
|
"st1 { v25.h }[2], [x25]\n" |
1112 |
|
|
"b 97f\n" |
1113 |
|
|
"92:" // Height 3: Partial direct writeback: partial_1_8 |
1114 |
|
|
"tbz x11, #0, 97f\n" |
1115 |
|
|
"str h21, [x9, #0x0]\n" |
1116 |
|
|
"str h23, [x26, #0x0]\n" |
1117 |
|
|
"str h25, [x25, #0x0]\n" |
1118 |
|
|
"b 97f\n" |
1119 |
|
|
"93:" // Height 3: Partial direct writeback: partial_4_0 |
1120 |
|
|
"tbz x11, #2, 95f\n" |
1121 |
|
|
"str d20, [x9], #0x8\n" |
1122 |
|
|
"str d22, [x26], #0x8\n" |
1123 |
|
|
"str d24, [x25], #0x8\n" |
1124 |
|
|
"tbz x11, #1, 94f\n" |
1125 |
|
|
"st1 { v20.s }[2], [x9], #0x4\n" |
1126 |
|
|
"st1 { v22.s }[2], [x26], #0x4\n" |
1127 |
|
|
"st1 { v24.s }[2], [x25], #0x4\n" |
1128 |
|
|
"tbz x11, #0, 97f\n" |
1129 |
|
|
"st1 { v20.h }[6], [x9]\n" |
1130 |
|
|
"st1 { v22.h }[6], [x26]\n" |
1131 |
|
|
"st1 { v24.h }[6], [x25]\n" |
1132 |
|
|
"b 97f\n" |
1133 |
|
|
"94:" // Height 3: Partial direct writeback: partial_1_4 |
1134 |
|
|
"tbz x11, #0, 97f\n" |
1135 |
|
|
"st1 { v20.h }[4], [x9]\n" |
1136 |
|
|
"st1 { v22.h }[4], [x26]\n" |
1137 |
|
|
"st1 { v24.h }[4], [x25]\n" |
1138 |
|
|
"b 97f\n" |
1139 |
|
|
"95:" // Height 3: Partial direct writeback: partial_2_0 |
1140 |
|
|
"tbz x11, #1, 96f\n" |
1141 |
|
|
"str s20, [x9], #0x4\n" |
1142 |
|
|
"str s22, [x26], #0x4\n" |
1143 |
|
|
"str s24, [x25], #0x4\n" |
1144 |
|
|
"tbz x11, #0, 97f\n" |
1145 |
|
|
"st1 { v20.h }[2], [x9]\n" |
1146 |
|
|
"st1 { v22.h }[2], [x26]\n" |
1147 |
|
|
"st1 { v24.h }[2], [x25]\n" |
1148 |
|
|
"b 97f\n" |
1149 |
|
|
"96:" // Height 3: Partial direct writeback: partial_1_0 |
1150 |
|
|
"str h20, [x9, #0x0]\n" |
1151 |
|
|
"str h22, [x26, #0x0]\n" |
1152 |
|
|
"str h24, [x25, #0x0]\n" |
1153 |
|
|
"97:" // Height 3: Partial direct writeback: Done |
1154 |
|
|
"b 99f\n" |
1155 |
|
|
"98:" // Height 3: Full writeback |
1156 |
|
|
"str q20, [x9, #0x0]\n" |
1157 |
|
|
"str q21, [x9, #0x10]\n" |
1158 |
|
|
"add x9, x9, #0x20\n" |
1159 |
|
|
"str q22, [x26, #0x0]\n" |
1160 |
|
|
"str q23, [x26, #0x10]\n" |
1161 |
|
|
"str q24, [x25, #0x0]\n" |
1162 |
|
|
"str q25, [x25, #0x10]\n" |
1163 |
|
|
"99:" // Height 3: Writeback done |
1164 |
|
|
"subs x11, x11, #0x10\n" |
1165 |
|
|
"bgt 68b\n" |
1166 |
|
|
"b 200f\n" |
1167 |
|
|
"100:" // Height 4 |
1168 |
|
|
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n" |
1169 |
|
|
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" |
1170 |
|
|
"ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
1171 |
|
|
"101:" // Height 4: Column loop |
1172 |
|
|
"cbz x10, 102f\n" |
1173 |
|
|
"ldr q20, [x10, #0x0]\n" |
1174 |
|
|
"ldr q21, [x10, #0x10]\n" |
1175 |
|
|
"add x10, x10, #0x20\n" |
1176 |
|
|
"mov v22.16b, v20.16b\n" |
1177 |
|
|
"mov v23.16b, v21.16b\n" |
1178 |
|
|
"mov v24.16b, v20.16b\n" |
1179 |
|
|
"mov v25.16b, v21.16b\n" |
1180 |
|
|
"mov v26.16b, v20.16b\n" |
1181 |
|
|
"mov v27.16b, v21.16b\n" |
1182 |
|
|
"b 113f\n" |
1183 |
|
|
"102:" // Height 4: no bias |
1184 |
|
|
"tbz %x[flags], #0, 112f\n" |
1185 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
1186 |
|
|
"cmp x11, #0x10\n" |
1187 |
|
|
"add x26, x9, x20, LSL #1\n" |
1188 |
|
|
"add x25, x26, x20, LSL #1\n" |
1189 |
|
|
"add x24, x25, x20, LSL #1\n" |
1190 |
|
|
"bge 111f\n" |
1191 |
|
|
"tbz x11, #3, 106f\n" |
1192 |
|
|
"ld1 { v20.8h }, [x9], #0x10\n" |
1193 |
|
|
"ld1 { v22.8h }, [x26], #0x10\n" |
1194 |
|
|
"ld1 { v24.8h }, [x25], #0x10\n" |
1195 |
|
|
"ld1 { v26.8h }, [x24], #0x10\n" |
1196 |
|
|
"tbz x11, #2, 104f\n" |
1197 |
|
|
"ldr d21, [x9], #0x8\n" |
1198 |
|
|
"ldr d23, [x26], #0x8\n" |
1199 |
|
|
"ldr d25, [x25], #0x8\n" |
1200 |
|
|
"ldr d27, [x24], #0x8\n" |
1201 |
|
|
"tbz x11, #1, 103f\n" |
1202 |
|
|
"ld1 { v21.s }[2], [x9], #0x4\n" |
1203 |
|
|
"ld1 { v23.s }[2], [x26], #0x4\n" |
1204 |
|
|
"mov x20, #0x1c\n" |
1205 |
|
|
"ld1 { v25.s }[2], [x25], #0x4\n" |
1206 |
|
|
"ld1 { v27.s }[2], [x24], #0x4\n" |
1207 |
|
|
"tbz x11, #0, 110f\n" |
1208 |
|
|
"ld1 { v21.h }[6], [x9]\n" |
1209 |
|
|
"ld1 { v23.h }[6], [x26]\n" |
1210 |
|
|
"ld1 { v25.h }[6], [x25]\n" |
1211 |
|
|
"ld1 { v27.h }[6], [x24]\n" |
1212 |
|
|
"b 110f\n" |
1213 |
|
|
"103:" // Height 4: Partial accumulate: partial_1_12 |
1214 |
|
|
"mov x20, #0x18\n" |
1215 |
|
|
"tbz x11, #0, 110f\n" |
1216 |
|
|
"ld1 { v21.h }[4], [x9]\n" |
1217 |
|
|
"ld1 { v23.h }[4], [x26]\n" |
1218 |
|
|
"ld1 { v25.h }[4], [x25]\n" |
1219 |
|
|
"ld1 { v27.h }[4], [x24]\n" |
1220 |
|
|
"b 110f\n" |
1221 |
|
|
"104:" // Height 4: Partial accumulate: partial_2_8 |
1222 |
|
|
"tbz x11, #1, 105f\n" |
1223 |
|
|
"ldr s21, [x9], #0x4\n" |
1224 |
|
|
"ldr s23, [x26], #0x4\n" |
1225 |
|
|
"mov x20, #0x14\n" |
1226 |
|
|
"ldr s25, [x25], #0x4\n" |
1227 |
|
|
"ldr s27, [x24], #0x4\n" |
1228 |
|
|
"tbz x11, #0, 110f\n" |
1229 |
|
|
"ld1 { v21.h }[2], [x9]\n" |
1230 |
|
|
"ld1 { v23.h }[2], [x26]\n" |
1231 |
|
|
"ld1 { v25.h }[2], [x25]\n" |
1232 |
|
|
"ld1 { v27.h }[2], [x24]\n" |
1233 |
|
|
"b 110f\n" |
1234 |
|
|
"105:" // Height 4: Partial accumulate: partial_1_8 |
1235 |
|
|
"mov x20, #0x10\n" |
1236 |
|
|
"tbz x11, #0, 110f\n" |
1237 |
|
|
"ldr h21, [x9, #0x0]\n" |
1238 |
|
|
"ldr h23, [x26, #0x0]\n" |
1239 |
|
|
"ldr h25, [x25, #0x0]\n" |
1240 |
|
|
"ldr h27, [x24, #0x0]\n" |
1241 |
|
|
"b 110f\n" |
1242 |
|
|
"106:" // Height 4: Partial accumulate: partial_4_0 |
1243 |
|
|
"tbz x11, #2, 108f\n" |
1244 |
|
|
"ldr d20, [x9], #0x8\n" |
1245 |
|
|
"ldr d22, [x26], #0x8\n" |
1246 |
|
|
"ldr d24, [x25], #0x8\n" |
1247 |
|
|
"ldr d26, [x24], #0x8\n" |
1248 |
|
|
"tbz x11, #1, 107f\n" |
1249 |
|
|
"ld1 { v20.s }[2], [x9], #0x4\n" |
1250 |
|
|
"ld1 { v22.s }[2], [x26], #0x4\n" |
1251 |
|
|
"mov x20, #0xc\n" |
1252 |
|
|
"ld1 { v24.s }[2], [x25], #0x4\n" |
1253 |
|
|
"ld1 { v26.s }[2], [x24], #0x4\n" |
1254 |
|
|
"tbz x11, #0, 110f\n" |
1255 |
|
|
"ld1 { v20.h }[6], [x9]\n" |
1256 |
|
|
"ld1 { v22.h }[6], [x26]\n" |
1257 |
|
|
"ld1 { v24.h }[6], [x25]\n" |
1258 |
|
|
"ld1 { v26.h }[6], [x24]\n" |
1259 |
|
|
"b 110f\n" |
1260 |
|
|
"107:" // Height 4: Partial accumulate: partial_1_4 |
1261 |
|
|
"mov x20, #0x8\n" |
1262 |
|
|
"tbz x11, #0, 110f\n" |
1263 |
|
|
"ld1 { v20.h }[4], [x9]\n" |
1264 |
|
|
"ld1 { v22.h }[4], [x26]\n" |
1265 |
|
|
"ld1 { v24.h }[4], [x25]\n" |
1266 |
|
|
"ld1 { v26.h }[4], [x24]\n" |
1267 |
|
|
"b 110f\n" |
1268 |
|
|
"108:" // Height 4: Partial accumulate: partial_2_0 |
1269 |
|
|
"tbz x11, #1, 109f\n" |
1270 |
|
|
"ldr s20, [x9], #0x4\n" |
1271 |
|
|
"ldr s22, [x26], #0x4\n" |
1272 |
|
|
"mov x20, #0x4\n" |
1273 |
|
|
"ldr s24, [x25], #0x4\n" |
1274 |
|
|
"ldr s26, [x24], #0x4\n" |
1275 |
|
|
"tbz x11, #0, 110f\n" |
1276 |
|
|
"ld1 { v20.h }[2], [x9]\n" |
1277 |
|
|
"ld1 { v22.h }[2], [x26]\n" |
1278 |
|
|
"ld1 { v24.h }[2], [x25]\n" |
1279 |
|
|
"ld1 { v26.h }[2], [x24]\n" |
1280 |
|
|
"b 110f\n" |
1281 |
|
|
"109:" // Height 4: Partial accumulate: partial_1_0 |
1282 |
|
|
"ldr h20, [x9, #0x0]\n" |
1283 |
|
|
"ldr h22, [x26, #0x0]\n" |
1284 |
|
|
"mov x20, #0x0\n" |
1285 |
|
|
"ldr h24, [x25, #0x0]\n" |
1286 |
|
|
"ldr h26, [x24, #0x0]\n" |
1287 |
|
|
"110:" // Height 4: Partial accumulate: Done |
1288 |
|
|
"sub x9, x9, x20\n" |
1289 |
|
|
"b 113f\n" |
1290 |
|
|
"111:" // Height 4: full accumulate |
1291 |
|
|
"ldr q20, [x9, #0x0]\n" |
1292 |
|
|
"ldr q21, [x9, #0x10]\n" |
1293 |
|
|
"ldr q22, [x26, #0x0]\n" |
1294 |
|
|
"ldr q23, [x26, #0x10]\n" |
1295 |
|
|
"ldr q24, [x25, #0x0]\n" |
1296 |
|
|
"ldr q25, [x25, #0x10]\n" |
1297 |
|
|
"ldr q26, [x24, #0x0]\n" |
1298 |
|
|
"ldr q27, [x24, #0x10]\n" |
1299 |
|
|
"b 113f\n" |
1300 |
|
|
"112:" // Height 4: no accumulate |
1301 |
|
|
"movi v20.16b, #0x0\n" |
1302 |
|
|
"movi v21.16b, #0x0\n" |
1303 |
|
|
"movi v22.16b, #0x0\n" |
1304 |
|
|
"movi v23.16b, #0x0\n" |
1305 |
|
|
"movi v24.16b, #0x0\n" |
1306 |
|
|
"movi v25.16b, #0x0\n" |
1307 |
|
|
"movi v26.16b, #0x0\n" |
1308 |
|
|
"movi v27.16b, #0x0\n" |
1309 |
|
|
"113:" // Height 4: setup done |
1310 |
|
|
"mov x28, #0x0\n" |
1311 |
|
|
"114:" // Height 4: String loop |
1312 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" |
1313 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
1314 |
|
|
"ldr w27, [x20, x28, LSL #0x2]\n" |
1315 |
|
|
"tbz %x[flags], #3, 115f\n" |
1316 |
|
|
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" |
1317 |
|
|
"add x20, x20, x21, LSL #3\n" |
1318 |
|
|
"ldr x26, [x20, #0x0]\n" |
1319 |
|
|
"ldr x25, [x20, #0x8]\n" |
1320 |
|
|
"ldr x24, [x20, #0x10]\n" |
1321 |
|
|
"ldr x23, [x20, #0x18]\n" |
1322 |
|
|
"cbnz x28, 116f\n" |
1323 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" |
1324 |
|
|
"add x26, x26, x20, LSL #1\n" |
1325 |
|
|
"add x25, x25, x20, LSL #1\n" |
1326 |
|
|
"add x24, x24, x20, LSL #1\n" |
1327 |
|
|
"add x23, x23, x20, LSL #1\n" |
1328 |
|
|
"b 116f\n" |
1329 |
|
|
"115:" // Height 4: setup direct input |
1330 |
|
|
"mov x26, %x[input_ptr]\n" |
1331 |
|
|
"add x25, x26, x21, LSL #1\n" |
1332 |
|
|
"add x24, x25, x21, LSL #1\n" |
1333 |
|
|
"add x23, x24, x21, LSL #1\n" |
1334 |
|
|
"116:" // Height 4: input setup done |
1335 |
|
|
"cmp x27, #0x8\n" |
1336 |
|
|
"blt 119f\n" |
1337 |
|
|
"ldr q0, [x26, #0x0]\n" |
1338 |
|
|
"ldr q1, [x25, #0x0]\n" |
1339 |
|
|
"cmp x27, #0x10\n" |
1340 |
|
|
"ldr q2, [x24, #0x0]\n" |
1341 |
|
|
"ldr q3, [x23, #0x0]\n" |
1342 |
|
|
"ldr q6, [x10, #0x0]\n" |
1343 |
|
|
"ldr q7, [x10, #0x10]\n" |
1344 |
|
|
"ldr q8, [x10, #0x20]\n" |
1345 |
|
|
"ldr q9, [x10, #0x30]\n" |
1346 |
|
|
"ldr q10, [x10, #0x40]\n" |
1347 |
|
|
"ldr q11, [x10, #0x50]\n" |
1348 |
|
|
"ldr q12, [x10, #0x60]\n" |
1349 |
|
|
"ldr q13, [x10, #0x70]\n" |
1350 |
|
|
"ldr q14, [x10, #0x80]\n" |
1351 |
|
|
"ldr q15, [x10, #0x90]\n" |
1352 |
|
|
"ldr q16, [x10, #0xa0]\n" |
1353 |
|
|
"ldr q17, [x10, #0xb0]\n" |
1354 |
|
|
"ldr q18, [x10, #0xc0]\n" |
1355 |
|
|
"ldr q19, [x10, #0xd0]\n" |
1356 |
|
|
"blt 118f\n" |
1357 |
|
|
"117:" // Height 4: Multiply loop: Main loop head |
1358 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
1359 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
1360 |
|
|
"sub x27, x27, #0x8\n" |
1361 |
|
|
"add x26, x26, #0x10\n" |
1362 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
1363 |
|
|
"fmla v26.8h, v6.8h, v3.h[0]\n" |
1364 |
|
|
"ldr q6, [x10, #0xe0]\n" |
1365 |
|
|
"add x25, x25, #0x10\n" |
1366 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
1367 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
1368 |
|
|
"add x24, x24, #0x10\n" |
1369 |
|
|
"add x23, x23, #0x10\n" |
1370 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
1371 |
|
|
"fmla v27.8h, v7.8h, v3.h[0]\n" |
1372 |
|
|
"ldr q7, [x10, #0xf0]\n" |
1373 |
|
|
"cmp x27, #0x10\n" |
1374 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
1375 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
1376 |
|
|
"add x10, x10, #0x100\n" |
1377 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
1378 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
1379 |
|
|
"fmla v26.8h, v8.8h, v3.h[1]\n" |
1380 |
|
|
"ldr q8, [x10, #0x20]\n" |
1381 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
1382 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
1383 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
1384 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
1385 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1386 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
1387 |
|
|
"fmla v27.8h, v9.8h, v3.h[1]\n" |
1388 |
|
|
"ldr q9, [x10, #0x30]\n" |
1389 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
1390 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
1391 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
1392 |
|
|
"fmla v26.8h, v10.8h, v3.h[2]\n" |
1393 |
|
|
"ldr q10, [x10, #0x40]\n" |
1394 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
1395 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
1396 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
1397 |
|
|
"fmla v27.8h, v11.8h, v3.h[2]\n" |
1398 |
|
|
"ldr q11, [x10, #0x50]\n" |
1399 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
1400 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
1401 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
1402 |
|
|
"fmla v26.8h, v12.8h, v3.h[3]\n" |
1403 |
|
|
"ldr q12, [x10, #0x60]\n" |
1404 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
1405 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
1406 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
1407 |
|
|
"fmla v27.8h, v13.8h, v3.h[3]\n" |
1408 |
|
|
"ldr q13, [x10, #0x70]\n" |
1409 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
1410 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
1411 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
1412 |
|
|
"fmla v26.8h, v14.8h, v3.h[4]\n" |
1413 |
|
|
"ldr q14, [x10, #0x80]\n" |
1414 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
1415 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
1416 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
1417 |
|
|
"fmla v27.8h, v15.8h, v3.h[4]\n" |
1418 |
|
|
"ldr q15, [x10, #0x90]\n" |
1419 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
1420 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
1421 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
1422 |
|
|
"fmla v26.8h, v16.8h, v3.h[5]\n" |
1423 |
|
|
"ldr q16, [x10, #0xa0]\n" |
1424 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
1425 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
1426 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
1427 |
|
|
"fmla v27.8h, v17.8h, v3.h[5]\n" |
1428 |
|
|
"ldr q17, [x10, #0xb0]\n" |
1429 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
1430 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
1431 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
1432 |
|
|
"fmla v26.8h, v18.8h, v3.h[6]\n" |
1433 |
|
|
"ldr q18, [x10, #0xc0]\n" |
1434 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
1435 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
1436 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
1437 |
|
|
"fmla v27.8h, v19.8h, v3.h[6]\n" |
1438 |
|
|
"ldr q19, [x10, #0xd0]\n" |
1439 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
1440 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
1441 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
1442 |
|
|
"fmla v26.8h, v6.8h, v3.h[7]\n" |
1443 |
|
|
"ldr q6, [x10, #0x0]\n" |
1444 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
1445 |
|
|
"ldr q0, [x26, #0x0]\n" |
1446 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
1447 |
|
|
"ldr q1, [x25, #0x0]\n" |
1448 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
1449 |
|
|
"ldr q2, [x24, #0x0]\n" |
1450 |
|
|
"fmla v27.8h, v7.8h, v3.h[7]\n" |
1451 |
|
|
"ldr q3, [x23, #0x0]\n" |
1452 |
|
|
"ldr q7, [x10, #0x10]\n" |
1453 |
|
|
"bge 117b\n" |
1454 |
|
|
"118:" // Height 4: Multiply loop: Single iteration only |
1455 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
1456 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
1457 |
|
|
"add x26, x26, #0x10\n" |
1458 |
|
|
"add x25, x25, #0x10\n" |
1459 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
1460 |
|
|
"fmla v26.8h, v6.8h, v3.h[0]\n" |
1461 |
|
|
"ldr q6, [x10, #0xe0]\n" |
1462 |
|
|
"add x24, x24, #0x10\n" |
1463 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
1464 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
1465 |
|
|
"add x23, x23, #0x10\n" |
1466 |
|
|
"sub x27, x27, #0x8\n" |
1467 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
1468 |
|
|
"fmla v27.8h, v7.8h, v3.h[0]\n" |
1469 |
|
|
"ldr q7, [x10, #0xf0]\n" |
1470 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
1471 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
1472 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
1473 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
1474 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
1475 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
1476 |
|
|
"fmla v26.8h, v8.8h, v3.h[1]\n" |
1477 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1478 |
|
|
"add x10, x10, #0x100\n" |
1479 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
1480 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
1481 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
1482 |
|
|
"fmla v27.8h, v9.8h, v3.h[1]\n" |
1483 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
1484 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
1485 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
1486 |
|
|
"fmla v26.8h, v10.8h, v3.h[2]\n" |
1487 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
1488 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
1489 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
1490 |
|
|
"fmla v27.8h, v11.8h, v3.h[2]\n" |
1491 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
1492 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
1493 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
1494 |
|
|
"fmla v26.8h, v12.8h, v3.h[3]\n" |
1495 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
1496 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
1497 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
1498 |
|
|
"fmla v27.8h, v13.8h, v3.h[3]\n" |
1499 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
1500 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
1501 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
1502 |
|
|
"fmla v26.8h, v14.8h, v3.h[4]\n" |
1503 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
1504 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
1505 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
1506 |
|
|
"fmla v27.8h, v15.8h, v3.h[4]\n" |
1507 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
1508 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
1509 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
1510 |
|
|
"fmla v26.8h, v16.8h, v3.h[5]\n" |
1511 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
1512 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
1513 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
1514 |
|
|
"fmla v27.8h, v17.8h, v3.h[5]\n" |
1515 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
1516 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
1517 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
1518 |
|
|
"fmla v26.8h, v18.8h, v3.h[6]\n" |
1519 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
1520 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
1521 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
1522 |
|
|
"fmla v27.8h, v19.8h, v3.h[6]\n" |
1523 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
1524 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
1525 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
1526 |
|
|
"fmla v26.8h, v6.8h, v3.h[7]\n" |
1527 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
1528 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
1529 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
1530 |
|
|
"fmla v27.8h, v7.8h, v3.h[7]\n" |
1531 |
|
|
"119:" // Height 4: Multiply loop: Main loop skip |
1532 |
|
|
"cbz x27, 121f\n" |
1533 |
|
|
"120:" // Height 4: Multiply loop: Odd block loop |
1534 |
|
|
"ldr h0, [x26], #0x2\n" |
1535 |
|
|
"ldr h1, [x25], #0x2\n" |
1536 |
|
|
"sub x27, x27, #0x1\n" |
1537 |
|
|
"ldr h2, [x24], #0x2\n" |
1538 |
|
|
"ldr h3, [x23], #0x2\n" |
1539 |
|
|
"ldr q8, [x10, #0x0]\n" |
1540 |
|
|
"ldr q9, [x10, #0x10]\n" |
1541 |
|
|
"add x10, x10, #0x20\n" |
1542 |
|
|
"fmla v20.8h, v8.8h, v0.h[0]\n" |
1543 |
|
|
"fmla v22.8h, v8.8h, v1.h[0]\n" |
1544 |
|
|
"fmla v24.8h, v8.8h, v2.h[0]\n" |
1545 |
|
|
"fmla v26.8h, v8.8h, v3.h[0]\n" |
1546 |
|
|
"fmla v21.8h, v9.8h, v0.h[0]\n" |
1547 |
|
|
"fmla v23.8h, v9.8h, v1.h[0]\n" |
1548 |
|
|
"fmla v25.8h, v9.8h, v2.h[0]\n" |
1549 |
|
|
"fmla v27.8h, v9.8h, v3.h[0]\n" |
1550 |
|
|
"cbnz x27, 120b\n" |
1551 |
|
|
"121:" // Height 4: Multiply loop: No odd multiplies |
1552 |
|
|
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" |
1553 |
|
|
"add x28, x28, #0x1\n" |
1554 |
|
|
"cmp x28, x20\n" |
1555 |
|
|
"bne 114b\n" |
1556 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
1557 |
|
|
"prfm pstl1keep, [x9, #0x0]\n" |
1558 |
|
|
"add x26, x9, x20, LSL #1\n" |
1559 |
|
|
"prfm pstl1keep, [x26, #0x0]\n" |
1560 |
|
|
"add x25, x26, x20, LSL #1\n" |
1561 |
|
|
"prfm pstl1keep, [x25, #0x0]\n" |
1562 |
|
|
"add x24, x25, x20, LSL #1\n" |
1563 |
|
|
"prfm pstl1keep, [x24, #0x0]\n" |
1564 |
|
|
"tbz %x[flags], #1, 122f\n" |
1565 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
1566 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
1567 |
|
|
"ld1r { v17.8h }, [x21]\n" |
1568 |
|
|
"ld1r { v16.8h }, [x20]\n" |
1569 |
|
|
"fmin v20.8h, v20.8h, v17.8h\n" |
1570 |
|
|
"fmin v21.8h, v21.8h, v17.8h\n" |
1571 |
|
|
"fmin v22.8h, v22.8h, v17.8h\n" |
1572 |
|
|
"fmin v23.8h, v23.8h, v17.8h\n" |
1573 |
|
|
"fmin v24.8h, v24.8h, v17.8h\n" |
1574 |
|
|
"fmin v25.8h, v25.8h, v17.8h\n" |
1575 |
|
|
"fmin v26.8h, v26.8h, v17.8h\n" |
1576 |
|
|
"fmin v27.8h, v27.8h, v17.8h\n" |
1577 |
|
|
"fmax v20.8h, v20.8h, v16.8h\n" |
1578 |
|
|
"fmax v21.8h, v21.8h, v16.8h\n" |
1579 |
|
|
"fmax v22.8h, v22.8h, v16.8h\n" |
1580 |
|
|
"fmax v23.8h, v23.8h, v16.8h\n" |
1581 |
|
|
"fmax v24.8h, v24.8h, v16.8h\n" |
1582 |
|
|
"fmax v25.8h, v25.8h, v16.8h\n" |
1583 |
|
|
"fmax v26.8h, v26.8h, v16.8h\n" |
1584 |
|
|
"fmax v27.8h, v27.8h, v16.8h\n" |
1585 |
|
|
"122:" // Height 4: No activation |
1586 |
|
|
"cmp x11, #0x10\n" |
1587 |
|
|
"bge 131f\n" |
1588 |
|
|
"tbz x11, #3, 126f\n" |
1589 |
|
|
"st1 { v20.8h }, [x9], #0x10\n" |
1590 |
|
|
"st1 { v22.8h }, [x26], #0x10\n" |
1591 |
|
|
"st1 { v24.8h }, [x25], #0x10\n" |
1592 |
|
|
"st1 { v26.8h }, [x24], #0x10\n" |
1593 |
|
|
"tbz x11, #2, 124f\n" |
1594 |
|
|
"str d21, [x9], #0x8\n" |
1595 |
|
|
"str d23, [x26], #0x8\n" |
1596 |
|
|
"str d25, [x25], #0x8\n" |
1597 |
|
|
"str d27, [x24], #0x8\n" |
1598 |
|
|
"tbz x11, #1, 123f\n" |
1599 |
|
|
"st1 { v21.s }[2], [x9], #0x4\n" |
1600 |
|
|
"st1 { v23.s }[2], [x26], #0x4\n" |
1601 |
|
|
"st1 { v25.s }[2], [x25], #0x4\n" |
1602 |
|
|
"st1 { v27.s }[2], [x24], #0x4\n" |
1603 |
|
|
"tbz x11, #0, 130f\n" |
1604 |
|
|
"st1 { v21.h }[6], [x9]\n" |
1605 |
|
|
"st1 { v23.h }[6], [x26]\n" |
1606 |
|
|
"st1 { v25.h }[6], [x25]\n" |
1607 |
|
|
"st1 { v27.h }[6], [x24]\n" |
1608 |
|
|
"b 130f\n" |
1609 |
|
|
"123:" // Height 4: Partial direct writeback: partial_1_12 |
1610 |
|
|
"tbz x11, #0, 130f\n" |
1611 |
|
|
"st1 { v21.h }[4], [x9]\n" |
1612 |
|
|
"st1 { v23.h }[4], [x26]\n" |
1613 |
|
|
"st1 { v25.h }[4], [x25]\n" |
1614 |
|
|
"st1 { v27.h }[4], [x24]\n" |
1615 |
|
|
"b 130f\n" |
1616 |
|
|
"124:" // Height 4: Partial direct writeback: partial_2_8 |
1617 |
|
|
"tbz x11, #1, 125f\n" |
1618 |
|
|
"str s21, [x9], #0x4\n" |
1619 |
|
|
"str s23, [x26], #0x4\n" |
1620 |
|
|
"str s25, [x25], #0x4\n" |
1621 |
|
|
"str s27, [x24], #0x4\n" |
1622 |
|
|
"tbz x11, #0, 130f\n" |
1623 |
|
|
"st1 { v21.h }[2], [x9]\n" |
1624 |
|
|
"st1 { v23.h }[2], [x26]\n" |
1625 |
|
|
"st1 { v25.h }[2], [x25]\n" |
1626 |
|
|
"st1 { v27.h }[2], [x24]\n" |
1627 |
|
|
"b 130f\n" |
1628 |
|
|
"125:" // Height 4: Partial direct writeback: partial_1_8 |
1629 |
|
|
"tbz x11, #0, 130f\n" |
1630 |
|
|
"str h21, [x9, #0x0]\n" |
1631 |
|
|
"str h23, [x26, #0x0]\n" |
1632 |
|
|
"str h25, [x25, #0x0]\n" |
1633 |
|
|
"str h27, [x24, #0x0]\n" |
1634 |
|
|
"b 130f\n" |
1635 |
|
|
"126:" // Height 4: Partial direct writeback: partial_4_0 |
1636 |
|
|
"tbz x11, #2, 128f\n" |
1637 |
|
|
"str d20, [x9], #0x8\n" |
1638 |
|
|
"str d22, [x26], #0x8\n" |
1639 |
|
|
"str d24, [x25], #0x8\n" |
1640 |
|
|
"str d26, [x24], #0x8\n" |
1641 |
|
|
"tbz x11, #1, 127f\n" |
1642 |
|
|
"st1 { v20.s }[2], [x9], #0x4\n" |
1643 |
|
|
"st1 { v22.s }[2], [x26], #0x4\n" |
1644 |
|
|
"st1 { v24.s }[2], [x25], #0x4\n" |
1645 |
|
|
"st1 { v26.s }[2], [x24], #0x4\n" |
1646 |
|
|
"tbz x11, #0, 130f\n" |
1647 |
|
|
"st1 { v20.h }[6], [x9]\n" |
1648 |
|
|
"st1 { v22.h }[6], [x26]\n" |
1649 |
|
|
"st1 { v24.h }[6], [x25]\n" |
1650 |
|
|
"st1 { v26.h }[6], [x24]\n" |
1651 |
|
|
"b 130f\n" |
1652 |
|
|
"127:" // Height 4: Partial direct writeback: partial_1_4 |
1653 |
|
|
"tbz x11, #0, 130f\n" |
1654 |
|
|
"st1 { v20.h }[4], [x9]\n" |
1655 |
|
|
"st1 { v22.h }[4], [x26]\n" |
1656 |
|
|
"st1 { v24.h }[4], [x25]\n" |
1657 |
|
|
"st1 { v26.h }[4], [x24]\n" |
1658 |
|
|
"b 130f\n" |
1659 |
|
|
"128:" // Height 4: Partial direct writeback: partial_2_0 |
1660 |
|
|
"tbz x11, #1, 129f\n" |
1661 |
|
|
"str s20, [x9], #0x4\n" |
1662 |
|
|
"str s22, [x26], #0x4\n" |
1663 |
|
|
"str s24, [x25], #0x4\n" |
1664 |
|
|
"str s26, [x24], #0x4\n" |
1665 |
|
|
"tbz x11, #0, 130f\n" |
1666 |
|
|
"st1 { v20.h }[2], [x9]\n" |
1667 |
|
|
"st1 { v22.h }[2], [x26]\n" |
1668 |
|
|
"st1 { v24.h }[2], [x25]\n" |
1669 |
|
|
"st1 { v26.h }[2], [x24]\n" |
1670 |
|
|
"b 130f\n" |
1671 |
|
|
"129:" // Height 4: Partial direct writeback: partial_1_0 |
1672 |
|
|
"str h20, [x9, #0x0]\n" |
1673 |
|
|
"str h22, [x26, #0x0]\n" |
1674 |
|
|
"str h24, [x25, #0x0]\n" |
1675 |
|
|
"str h26, [x24, #0x0]\n" |
1676 |
|
|
"130:" // Height 4: Partial direct writeback: Done |
1677 |
|
|
"b 132f\n" |
1678 |
|
|
"131:" // Height 4: Full writeback |
1679 |
|
|
"str q20, [x9, #0x0]\n" |
1680 |
|
|
"str q21, [x9, #0x10]\n" |
1681 |
|
|
"add x9, x9, #0x20\n" |
1682 |
|
|
"str q22, [x26, #0x0]\n" |
1683 |
|
|
"str q23, [x26, #0x10]\n" |
1684 |
|
|
"str q24, [x25, #0x0]\n" |
1685 |
|
|
"str q25, [x25, #0x10]\n" |
1686 |
|
|
"str q26, [x24, #0x0]\n" |
1687 |
|
|
"str q27, [x24, #0x10]\n" |
1688 |
|
|
"132:" // Height 4: Writeback done |
1689 |
|
|
"subs x11, x11, #0x10\n" |
1690 |
|
|
"bgt 101b\n" |
1691 |
|
|
"b 200f\n" |
1692 |
|
|
"133:" // Height 5 |
1693 |
|
|
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n" |
1694 |
|
|
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" |
1695 |
|
|
"ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
1696 |
|
|
"134:" // Height 5: Column loop |
1697 |
|
|
"cbz x10, 135f\n" |
1698 |
|
|
"ldr q20, [x10, #0x0]\n" |
1699 |
|
|
"ldr q21, [x10, #0x10]\n" |
1700 |
|
|
"add x10, x10, #0x20\n" |
1701 |
|
|
"mov v22.16b, v20.16b\n" |
1702 |
|
|
"mov v23.16b, v21.16b\n" |
1703 |
|
|
"mov v24.16b, v20.16b\n" |
1704 |
|
|
"mov v25.16b, v21.16b\n" |
1705 |
|
|
"mov v26.16b, v20.16b\n" |
1706 |
|
|
"mov v27.16b, v21.16b\n" |
1707 |
|
|
"mov v28.16b, v20.16b\n" |
1708 |
|
|
"mov v29.16b, v21.16b\n" |
1709 |
|
|
"b 146f\n" |
1710 |
|
|
"135:" // Height 5: no bias |
1711 |
|
|
"tbz %x[flags], #0, 145f\n" |
1712 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
1713 |
|
|
"cmp x11, #0x10\n" |
1714 |
|
|
"add x26, x9, x20, LSL #1\n" |
1715 |
|
|
"add x25, x26, x20, LSL #1\n" |
1716 |
|
|
"add x24, x25, x20, LSL #1\n" |
1717 |
|
|
"add x23, x24, x20, LSL #1\n" |
1718 |
|
|
"bge 144f\n" |
1719 |
|
|
"tbz x11, #3, 139f\n" |
1720 |
|
|
"ld1 { v20.8h }, [x9], #0x10\n" |
1721 |
|
|
"ld1 { v22.8h }, [x26], #0x10\n" |
1722 |
|
|
"ld1 { v24.8h }, [x25], #0x10\n" |
1723 |
|
|
"ld1 { v26.8h }, [x24], #0x10\n" |
1724 |
|
|
"ld1 { v28.8h }, [x23], #0x10\n" |
1725 |
|
|
"tbz x11, #2, 137f\n" |
1726 |
|
|
"ldr d21, [x9], #0x8\n" |
1727 |
|
|
"ldr d23, [x26], #0x8\n" |
1728 |
|
|
"ldr d25, [x25], #0x8\n" |
1729 |
|
|
"ldr d27, [x24], #0x8\n" |
1730 |
|
|
"ldr d29, [x23], #0x8\n" |
1731 |
|
|
"tbz x11, #1, 136f\n" |
1732 |
|
|
"ld1 { v21.s }[2], [x9], #0x4\n" |
1733 |
|
|
"ld1 { v23.s }[2], [x26], #0x4\n" |
1734 |
|
|
"mov x20, #0x1c\n" |
1735 |
|
|
"ld1 { v25.s }[2], [x25], #0x4\n" |
1736 |
|
|
"ld1 { v27.s }[2], [x24], #0x4\n" |
1737 |
|
|
"ld1 { v29.s }[2], [x23], #0x4\n" |
1738 |
|
|
"tbz x11, #0, 143f\n" |
1739 |
|
|
"ld1 { v21.h }[6], [x9]\n" |
1740 |
|
|
"ld1 { v23.h }[6], [x26]\n" |
1741 |
|
|
"ld1 { v25.h }[6], [x25]\n" |
1742 |
|
|
"ld1 { v27.h }[6], [x24]\n" |
1743 |
|
|
"ld1 { v29.h }[6], [x23]\n" |
1744 |
|
|
"b 143f\n" |
1745 |
|
|
"136:" // Height 5: Partial accumulate: partial_1_12 |
1746 |
|
|
"mov x20, #0x18\n" |
1747 |
|
|
"tbz x11, #0, 143f\n" |
1748 |
|
|
"ld1 { v21.h }[4], [x9]\n" |
1749 |
|
|
"ld1 { v23.h }[4], [x26]\n" |
1750 |
|
|
"ld1 { v25.h }[4], [x25]\n" |
1751 |
|
|
"ld1 { v27.h }[4], [x24]\n" |
1752 |
|
|
"ld1 { v29.h }[4], [x23]\n" |
1753 |
|
|
"b 143f\n" |
1754 |
|
|
"137:" // Height 5: Partial accumulate: partial_2_8 |
1755 |
|
|
"tbz x11, #1, 138f\n" |
1756 |
|
|
"ldr s21, [x9], #0x4\n" |
1757 |
|
|
"ldr s23, [x26], #0x4\n" |
1758 |
|
|
"mov x20, #0x14\n" |
1759 |
|
|
"ldr s25, [x25], #0x4\n" |
1760 |
|
|
"ldr s27, [x24], #0x4\n" |
1761 |
|
|
"ldr s29, [x23], #0x4\n" |
1762 |
|
|
"tbz x11, #0, 143f\n" |
1763 |
|
|
"ld1 { v21.h }[2], [x9]\n" |
1764 |
|
|
"ld1 { v23.h }[2], [x26]\n" |
1765 |
|
|
"ld1 { v25.h }[2], [x25]\n" |
1766 |
|
|
"ld1 { v27.h }[2], [x24]\n" |
1767 |
|
|
"ld1 { v29.h }[2], [x23]\n" |
1768 |
|
|
"b 143f\n" |
1769 |
|
|
"138:" // Height 5: Partial accumulate: partial_1_8 |
1770 |
|
|
"mov x20, #0x10\n" |
1771 |
|
|
"tbz x11, #0, 143f\n" |
1772 |
|
|
"ldr h21, [x9, #0x0]\n" |
1773 |
|
|
"ldr h23, [x26, #0x0]\n" |
1774 |
|
|
"ldr h25, [x25, #0x0]\n" |
1775 |
|
|
"ldr h27, [x24, #0x0]\n" |
1776 |
|
|
"ldr h29, [x23, #0x0]\n" |
1777 |
|
|
"b 143f\n" |
1778 |
|
|
"139:" // Height 5: Partial accumulate: partial_4_0 |
1779 |
|
|
"tbz x11, #2, 141f\n" |
1780 |
|
|
"ldr d20, [x9], #0x8\n" |
1781 |
|
|
"ldr d22, [x26], #0x8\n" |
1782 |
|
|
"ldr d24, [x25], #0x8\n" |
1783 |
|
|
"ldr d26, [x24], #0x8\n" |
1784 |
|
|
"ldr d28, [x23], #0x8\n" |
1785 |
|
|
"tbz x11, #1, 140f\n" |
1786 |
|
|
"ld1 { v20.s }[2], [x9], #0x4\n" |
1787 |
|
|
"ld1 { v22.s }[2], [x26], #0x4\n" |
1788 |
|
|
"mov x20, #0xc\n" |
1789 |
|
|
"ld1 { v24.s }[2], [x25], #0x4\n" |
1790 |
|
|
"ld1 { v26.s }[2], [x24], #0x4\n" |
1791 |
|
|
"ld1 { v28.s }[2], [x23], #0x4\n" |
1792 |
|
|
"tbz x11, #0, 143f\n" |
1793 |
|
|
"ld1 { v20.h }[6], [x9]\n" |
1794 |
|
|
"ld1 { v22.h }[6], [x26]\n" |
1795 |
|
|
"ld1 { v24.h }[6], [x25]\n" |
1796 |
|
|
"ld1 { v26.h }[6], [x24]\n" |
1797 |
|
|
"ld1 { v28.h }[6], [x23]\n" |
1798 |
|
|
"b 143f\n" |
1799 |
|
|
"140:" // Height 5: Partial accumulate: partial_1_4 |
1800 |
|
|
"mov x20, #0x8\n" |
1801 |
|
|
"tbz x11, #0, 143f\n" |
1802 |
|
|
"ld1 { v20.h }[4], [x9]\n" |
1803 |
|
|
"ld1 { v22.h }[4], [x26]\n" |
1804 |
|
|
"ld1 { v24.h }[4], [x25]\n" |
1805 |
|
|
"ld1 { v26.h }[4], [x24]\n" |
1806 |
|
|
"ld1 { v28.h }[4], [x23]\n" |
1807 |
|
|
"b 143f\n" |
1808 |
|
|
"141:" // Height 5: Partial accumulate: partial_2_0 |
1809 |
|
|
"tbz x11, #1, 142f\n" |
1810 |
|
|
"ldr s20, [x9], #0x4\n" |
1811 |
|
|
"ldr s22, [x26], #0x4\n" |
1812 |
|
|
"mov x20, #0x4\n" |
1813 |
|
|
"ldr s24, [x25], #0x4\n" |
1814 |
|
|
"ldr s26, [x24], #0x4\n" |
1815 |
|
|
"ldr s28, [x23], #0x4\n" |
1816 |
|
|
"tbz x11, #0, 143f\n" |
1817 |
|
|
"ld1 { v20.h }[2], [x9]\n" |
1818 |
|
|
"ld1 { v22.h }[2], [x26]\n" |
1819 |
|
|
"ld1 { v24.h }[2], [x25]\n" |
1820 |
|
|
"ld1 { v26.h }[2], [x24]\n" |
1821 |
|
|
"ld1 { v28.h }[2], [x23]\n" |
1822 |
|
|
"b 143f\n" |
1823 |
|
|
"142:" // Height 5: Partial accumulate: partial_1_0 |
1824 |
|
|
"ldr h20, [x9, #0x0]\n" |
1825 |
|
|
"ldr h22, [x26, #0x0]\n" |
1826 |
|
|
"mov x20, #0x0\n" |
1827 |
|
|
"ldr h24, [x25, #0x0]\n" |
1828 |
|
|
"ldr h26, [x24, #0x0]\n" |
1829 |
|
|
"ldr h28, [x23, #0x0]\n" |
1830 |
|
|
"143:" // Height 5: Partial accumulate: Done |
1831 |
|
|
"sub x9, x9, x20\n" |
1832 |
|
|
"b 146f\n" |
1833 |
|
|
"144:" // Height 5: full accumulate |
1834 |
|
|
"ldr q20, [x9, #0x0]\n" |
1835 |
|
|
"ldr q21, [x9, #0x10]\n" |
1836 |
|
|
"ldr q22, [x26, #0x0]\n" |
1837 |
|
|
"ldr q23, [x26, #0x10]\n" |
1838 |
|
|
"ldr q24, [x25, #0x0]\n" |
1839 |
|
|
"ldr q25, [x25, #0x10]\n" |
1840 |
|
|
"ldr q26, [x24, #0x0]\n" |
1841 |
|
|
"ldr q27, [x24, #0x10]\n" |
1842 |
|
|
"ldr q28, [x23, #0x0]\n" |
1843 |
|
|
"ldr q29, [x23, #0x10]\n" |
1844 |
|
|
"b 146f\n" |
1845 |
|
|
"145:" // Height 5: no accumulate |
1846 |
|
|
"movi v20.16b, #0x0\n" |
1847 |
|
|
"movi v21.16b, #0x0\n" |
1848 |
|
|
"movi v22.16b, #0x0\n" |
1849 |
|
|
"movi v23.16b, #0x0\n" |
1850 |
|
|
"movi v24.16b, #0x0\n" |
1851 |
|
|
"movi v25.16b, #0x0\n" |
1852 |
|
|
"movi v26.16b, #0x0\n" |
1853 |
|
|
"movi v27.16b, #0x0\n" |
1854 |
|
|
"movi v28.16b, #0x0\n" |
1855 |
|
|
"movi v29.16b, #0x0\n" |
1856 |
|
|
"146:" // Height 5: setup done |
1857 |
|
|
"mov x28, #0x0\n" |
1858 |
|
|
"147:" // Height 5: String loop |
1859 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" |
1860 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
1861 |
|
|
"ldr w27, [x20, x28, LSL #0x2]\n" |
1862 |
|
|
"tbz %x[flags], #3, 148f\n" |
1863 |
|
|
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" |
1864 |
|
|
"add x20, x20, x21, LSL #3\n" |
1865 |
|
|
"ldr x26, [x20, #0x0]\n" |
1866 |
|
|
"ldr x25, [x20, #0x8]\n" |
1867 |
|
|
"ldr x24, [x20, #0x10]\n" |
1868 |
|
|
"ldr x23, [x20, #0x18]\n" |
1869 |
|
|
"ldr x22, [x20, #0x20]\n" |
1870 |
|
|
"cbnz x28, 149f\n" |
1871 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" |
1872 |
|
|
"add x26, x26, x20, LSL #1\n" |
1873 |
|
|
"add x25, x25, x20, LSL #1\n" |
1874 |
|
|
"add x24, x24, x20, LSL #1\n" |
1875 |
|
|
"add x23, x23, x20, LSL #1\n" |
1876 |
|
|
"add x22, x22, x20, LSL #1\n" |
1877 |
|
|
"b 149f\n" |
1878 |
|
|
"148:" // Height 5: setup direct input |
1879 |
|
|
"mov x26, %x[input_ptr]\n" |
1880 |
|
|
"add x25, x26, x21, LSL #1\n" |
1881 |
|
|
"add x24, x25, x21, LSL #1\n" |
1882 |
|
|
"add x23, x24, x21, LSL #1\n" |
1883 |
|
|
"add x22, x23, x21, LSL #1\n" |
1884 |
|
|
"149:" // Height 5: input setup done |
1885 |
|
|
"cmp x27, #0x8\n" |
1886 |
|
|
"blt 152f\n" |
1887 |
|
|
"ldr q0, [x26, #0x0]\n" |
1888 |
|
|
"ldr q1, [x25, #0x0]\n" |
1889 |
|
|
"cmp x27, #0x10\n" |
1890 |
|
|
"ldr q2, [x24, #0x0]\n" |
1891 |
|
|
"ldr q3, [x23, #0x0]\n" |
1892 |
|
|
"ldr q4, [x22, #0x0]\n" |
1893 |
|
|
"ldr q6, [x10, #0x0]\n" |
1894 |
|
|
"ldr q7, [x10, #0x10]\n" |
1895 |
|
|
"ldr q8, [x10, #0x20]\n" |
1896 |
|
|
"ldr q9, [x10, #0x30]\n" |
1897 |
|
|
"ldr q10, [x10, #0x40]\n" |
1898 |
|
|
"ldr q11, [x10, #0x50]\n" |
1899 |
|
|
"ldr q12, [x10, #0x60]\n" |
1900 |
|
|
"ldr q13, [x10, #0x70]\n" |
1901 |
|
|
"ldr q14, [x10, #0x80]\n" |
1902 |
|
|
"ldr q15, [x10, #0x90]\n" |
1903 |
|
|
"ldr q16, [x10, #0xa0]\n" |
1904 |
|
|
"ldr q17, [x10, #0xb0]\n" |
1905 |
|
|
"ldr q18, [x10, #0xc0]\n" |
1906 |
|
|
"ldr q19, [x10, #0xd0]\n" |
1907 |
|
|
"blt 151f\n" |
1908 |
|
|
"150:" // Height 5: Multiply loop: Main loop head |
1909 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
1910 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
1911 |
|
|
"sub x27, x27, #0x8\n" |
1912 |
|
|
"add x26, x26, #0x10\n" |
1913 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
1914 |
|
|
"fmla v26.8h, v6.8h, v3.h[0]\n" |
1915 |
|
|
"add x25, x25, #0x10\n" |
1916 |
|
|
"add x24, x24, #0x10\n" |
1917 |
|
|
"fmla v28.8h, v6.8h, v4.h[0]\n" |
1918 |
|
|
"ldr q6, [x10, #0xe0]\n" |
1919 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
1920 |
|
|
"add x23, x23, #0x10\n" |
1921 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
1922 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
1923 |
|
|
"add x22, x22, #0x10\n" |
1924 |
|
|
"cmp x27, #0x10\n" |
1925 |
|
|
"fmla v27.8h, v7.8h, v3.h[0]\n" |
1926 |
|
|
"fmla v29.8h, v7.8h, v4.h[0]\n" |
1927 |
|
|
"ldr q7, [x10, #0xf0]\n" |
1928 |
|
|
"add x10, x10, #0x100\n" |
1929 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
1930 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
1931 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
1932 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
1933 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
1934 |
|
|
"fmla v26.8h, v8.8h, v3.h[1]\n" |
1935 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
1936 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1937 |
|
|
"fmla v28.8h, v8.8h, v4.h[1]\n" |
1938 |
|
|
"ldr q8, [x10, #0x20]\n" |
1939 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
1940 |
|
|
"prfm pldl1keep, [x22, #0x80]\n" |
1941 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
1942 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
1943 |
|
|
"fmla v27.8h, v9.8h, v3.h[1]\n" |
1944 |
|
|
"fmla v29.8h, v9.8h, v4.h[1]\n" |
1945 |
|
|
"ldr q9, [x10, #0x30]\n" |
1946 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
1947 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
1948 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
1949 |
|
|
"fmla v26.8h, v10.8h, v3.h[2]\n" |
1950 |
|
|
"fmla v28.8h, v10.8h, v4.h[2]\n" |
1951 |
|
|
"ldr q10, [x10, #0x40]\n" |
1952 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
1953 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
1954 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
1955 |
|
|
"fmla v27.8h, v11.8h, v3.h[2]\n" |
1956 |
|
|
"fmla v29.8h, v11.8h, v4.h[2]\n" |
1957 |
|
|
"ldr q11, [x10, #0x50]\n" |
1958 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
1959 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
1960 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
1961 |
|
|
"fmla v26.8h, v12.8h, v3.h[3]\n" |
1962 |
|
|
"fmla v28.8h, v12.8h, v4.h[3]\n" |
1963 |
|
|
"ldr q12, [x10, #0x60]\n" |
1964 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
1965 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
1966 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
1967 |
|
|
"fmla v27.8h, v13.8h, v3.h[3]\n" |
1968 |
|
|
"fmla v29.8h, v13.8h, v4.h[3]\n" |
1969 |
|
|
"ldr q13, [x10, #0x70]\n" |
1970 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
1971 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
1972 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
1973 |
|
|
"fmla v26.8h, v14.8h, v3.h[4]\n" |
1974 |
|
|
"fmla v28.8h, v14.8h, v4.h[4]\n" |
1975 |
|
|
"ldr q14, [x10, #0x80]\n" |
1976 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
1977 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
1978 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
1979 |
|
|
"fmla v27.8h, v15.8h, v3.h[4]\n" |
1980 |
|
|
"fmla v29.8h, v15.8h, v4.h[4]\n" |
1981 |
|
|
"ldr q15, [x10, #0x90]\n" |
1982 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
1983 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
1984 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
1985 |
|
|
"fmla v26.8h, v16.8h, v3.h[5]\n" |
1986 |
|
|
"fmla v28.8h, v16.8h, v4.h[5]\n" |
1987 |
|
|
"ldr q16, [x10, #0xa0]\n" |
1988 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
1989 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
1990 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
1991 |
|
|
"fmla v27.8h, v17.8h, v3.h[5]\n" |
1992 |
|
|
"fmla v29.8h, v17.8h, v4.h[5]\n" |
1993 |
|
|
"ldr q17, [x10, #0xb0]\n" |
1994 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
1995 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
1996 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
1997 |
|
|
"fmla v26.8h, v18.8h, v3.h[6]\n" |
1998 |
|
|
"fmla v28.8h, v18.8h, v4.h[6]\n" |
1999 |
|
|
"ldr q18, [x10, #0xc0]\n" |
2000 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
2001 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
2002 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
2003 |
|
|
"fmla v27.8h, v19.8h, v3.h[6]\n" |
2004 |
|
|
"fmla v29.8h, v19.8h, v4.h[6]\n" |
2005 |
|
|
"ldr q19, [x10, #0xd0]\n" |
2006 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
2007 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
2008 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
2009 |
|
|
"fmla v26.8h, v6.8h, v3.h[7]\n" |
2010 |
|
|
"fmla v28.8h, v6.8h, v4.h[7]\n" |
2011 |
|
|
"ldr q6, [x10, #0x0]\n" |
2012 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
2013 |
|
|
"ldr q0, [x26, #0x0]\n" |
2014 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
2015 |
|
|
"ldr q1, [x25, #0x0]\n" |
2016 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
2017 |
|
|
"ldr q2, [x24, #0x0]\n" |
2018 |
|
|
"fmla v27.8h, v7.8h, v3.h[7]\n" |
2019 |
|
|
"ldr q3, [x23, #0x0]\n" |
2020 |
|
|
"fmla v29.8h, v7.8h, v4.h[7]\n" |
2021 |
|
|
"ldr q4, [x22, #0x0]\n" |
2022 |
|
|
"ldr q7, [x10, #0x10]\n" |
2023 |
|
|
"bge 150b\n" |
2024 |
|
|
"151:" // Height 5: Multiply loop: Single iteration only |
2025 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
2026 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
2027 |
|
|
"add x26, x26, #0x10\n" |
2028 |
|
|
"add x25, x25, #0x10\n" |
2029 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
2030 |
|
|
"fmla v26.8h, v6.8h, v3.h[0]\n" |
2031 |
|
|
"add x24, x24, #0x10\n" |
2032 |
|
|
"add x23, x23, #0x10\n" |
2033 |
|
|
"fmla v28.8h, v6.8h, v4.h[0]\n" |
2034 |
|
|
"ldr q6, [x10, #0xe0]\n" |
2035 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
2036 |
|
|
"add x22, x22, #0x10\n" |
2037 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
2038 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
2039 |
|
|
"sub x27, x27, #0x8\n" |
2040 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
2041 |
|
|
"fmla v27.8h, v7.8h, v3.h[0]\n" |
2042 |
|
|
"fmla v29.8h, v7.8h, v4.h[0]\n" |
2043 |
|
|
"ldr q7, [x10, #0xf0]\n" |
2044 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
2045 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
2046 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
2047 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
2048 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
2049 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
2050 |
|
|
"fmla v26.8h, v8.8h, v3.h[1]\n" |
2051 |
|
|
"prfm pldl1keep, [x22, #0x80]\n" |
2052 |
|
|
"add x10, x10, #0x100\n" |
2053 |
|
|
"fmla v28.8h, v8.8h, v4.h[1]\n" |
2054 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
2055 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
2056 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
2057 |
|
|
"fmla v27.8h, v9.8h, v3.h[1]\n" |
2058 |
|
|
"fmla v29.8h, v9.8h, v4.h[1]\n" |
2059 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
2060 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
2061 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
2062 |
|
|
"fmla v26.8h, v10.8h, v3.h[2]\n" |
2063 |
|
|
"fmla v28.8h, v10.8h, v4.h[2]\n" |
2064 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
2065 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
2066 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
2067 |
|
|
"fmla v27.8h, v11.8h, v3.h[2]\n" |
2068 |
|
|
"fmla v29.8h, v11.8h, v4.h[2]\n" |
2069 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
2070 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
2071 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
2072 |
|
|
"fmla v26.8h, v12.8h, v3.h[3]\n" |
2073 |
|
|
"fmla v28.8h, v12.8h, v4.h[3]\n" |
2074 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
2075 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
2076 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
2077 |
|
|
"fmla v27.8h, v13.8h, v3.h[3]\n" |
2078 |
|
|
"fmla v29.8h, v13.8h, v4.h[3]\n" |
2079 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
2080 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
2081 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
2082 |
|
|
"fmla v26.8h, v14.8h, v3.h[4]\n" |
2083 |
|
|
"fmla v28.8h, v14.8h, v4.h[4]\n" |
2084 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
2085 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
2086 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
2087 |
|
|
"fmla v27.8h, v15.8h, v3.h[4]\n" |
2088 |
|
|
"fmla v29.8h, v15.8h, v4.h[4]\n" |
2089 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
2090 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
2091 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
2092 |
|
|
"fmla v26.8h, v16.8h, v3.h[5]\n" |
2093 |
|
|
"fmla v28.8h, v16.8h, v4.h[5]\n" |
2094 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
2095 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
2096 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
2097 |
|
|
"fmla v27.8h, v17.8h, v3.h[5]\n" |
2098 |
|
|
"fmla v29.8h, v17.8h, v4.h[5]\n" |
2099 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
2100 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
2101 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
2102 |
|
|
"fmla v26.8h, v18.8h, v3.h[6]\n" |
2103 |
|
|
"fmla v28.8h, v18.8h, v4.h[6]\n" |
2104 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
2105 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
2106 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
2107 |
|
|
"fmla v27.8h, v19.8h, v3.h[6]\n" |
2108 |
|
|
"fmla v29.8h, v19.8h, v4.h[6]\n" |
2109 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
2110 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
2111 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
2112 |
|
|
"fmla v26.8h, v6.8h, v3.h[7]\n" |
2113 |
|
|
"fmla v28.8h, v6.8h, v4.h[7]\n" |
2114 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
2115 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
2116 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
2117 |
|
|
"fmla v27.8h, v7.8h, v3.h[7]\n" |
2118 |
|
|
"fmla v29.8h, v7.8h, v4.h[7]\n" |
2119 |
|
|
"152:" // Height 5: Multiply loop: Main loop skip |
2120 |
|
|
"cbz x27, 154f\n" |
2121 |
|
|
"153:" // Height 5: Multiply loop: Odd block loop |
2122 |
|
|
"ldr h0, [x26], #0x2\n" |
2123 |
|
|
"ldr h1, [x25], #0x2\n" |
2124 |
|
|
"sub x27, x27, #0x1\n" |
2125 |
|
|
"ldr h2, [x24], #0x2\n" |
2126 |
|
|
"ldr h3, [x23], #0x2\n" |
2127 |
|
|
"ldr h4, [x22], #0x2\n" |
2128 |
|
|
"ldr q8, [x10, #0x0]\n" |
2129 |
|
|
"ldr q9, [x10, #0x10]\n" |
2130 |
|
|
"add x10, x10, #0x20\n" |
2131 |
|
|
"fmla v20.8h, v8.8h, v0.h[0]\n" |
2132 |
|
|
"fmla v22.8h, v8.8h, v1.h[0]\n" |
2133 |
|
|
"fmla v24.8h, v8.8h, v2.h[0]\n" |
2134 |
|
|
"fmla v26.8h, v8.8h, v3.h[0]\n" |
2135 |
|
|
"fmla v28.8h, v8.8h, v4.h[0]\n" |
2136 |
|
|
"fmla v21.8h, v9.8h, v0.h[0]\n" |
2137 |
|
|
"fmla v23.8h, v9.8h, v1.h[0]\n" |
2138 |
|
|
"fmla v25.8h, v9.8h, v2.h[0]\n" |
2139 |
|
|
"fmla v27.8h, v9.8h, v3.h[0]\n" |
2140 |
|
|
"fmla v29.8h, v9.8h, v4.h[0]\n" |
2141 |
|
|
"cbnz x27, 153b\n" |
2142 |
|
|
"154:" // Height 5: Multiply loop: No odd multiplies |
2143 |
|
|
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" |
2144 |
|
|
"add x28, x28, #0x1\n" |
2145 |
|
|
"cmp x28, x20\n" |
2146 |
|
|
"bne 147b\n" |
2147 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
2148 |
|
|
"prfm pstl1keep, [x9, #0x0]\n" |
2149 |
|
|
"add x26, x9, x20, LSL #1\n" |
2150 |
|
|
"prfm pstl1keep, [x26, #0x0]\n" |
2151 |
|
|
"add x25, x26, x20, LSL #1\n" |
2152 |
|
|
"prfm pstl1keep, [x25, #0x0]\n" |
2153 |
|
|
"add x24, x25, x20, LSL #1\n" |
2154 |
|
|
"prfm pstl1keep, [x24, #0x0]\n" |
2155 |
|
|
"add x23, x24, x20, LSL #1\n" |
2156 |
|
|
"prfm pstl1keep, [x23, #0x0]\n" |
2157 |
|
|
"tbz %x[flags], #1, 155f\n" |
2158 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
2159 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
2160 |
|
|
"ld1r { v17.8h }, [x21]\n" |
2161 |
|
|
"ld1r { v16.8h }, [x20]\n" |
2162 |
|
|
"fmin v20.8h, v20.8h, v17.8h\n" |
2163 |
|
|
"fmin v21.8h, v21.8h, v17.8h\n" |
2164 |
|
|
"fmin v22.8h, v22.8h, v17.8h\n" |
2165 |
|
|
"fmin v23.8h, v23.8h, v17.8h\n" |
2166 |
|
|
"fmin v24.8h, v24.8h, v17.8h\n" |
2167 |
|
|
"fmin v25.8h, v25.8h, v17.8h\n" |
2168 |
|
|
"fmin v26.8h, v26.8h, v17.8h\n" |
2169 |
|
|
"fmin v27.8h, v27.8h, v17.8h\n" |
2170 |
|
|
"fmin v28.8h, v28.8h, v17.8h\n" |
2171 |
|
|
"fmin v29.8h, v29.8h, v17.8h\n" |
2172 |
|
|
"fmax v20.8h, v20.8h, v16.8h\n" |
2173 |
|
|
"fmax v21.8h, v21.8h, v16.8h\n" |
2174 |
|
|
"fmax v22.8h, v22.8h, v16.8h\n" |
2175 |
|
|
"fmax v23.8h, v23.8h, v16.8h\n" |
2176 |
|
|
"fmax v24.8h, v24.8h, v16.8h\n" |
2177 |
|
|
"fmax v25.8h, v25.8h, v16.8h\n" |
2178 |
|
|
"fmax v26.8h, v26.8h, v16.8h\n" |
2179 |
|
|
"fmax v27.8h, v27.8h, v16.8h\n" |
2180 |
|
|
"fmax v28.8h, v28.8h, v16.8h\n" |
2181 |
|
|
"fmax v29.8h, v29.8h, v16.8h\n" |
2182 |
|
|
"155:" // Height 5: No activation |
2183 |
|
|
"cmp x11, #0x10\n" |
2184 |
|
|
"bge 164f\n" |
2185 |
|
|
"tbz x11, #3, 159f\n" |
2186 |
|
|
"st1 { v20.8h }, [x9], #0x10\n" |
2187 |
|
|
"st1 { v22.8h }, [x26], #0x10\n" |
2188 |
|
|
"st1 { v24.8h }, [x25], #0x10\n" |
2189 |
|
|
"st1 { v26.8h }, [x24], #0x10\n" |
2190 |
|
|
"st1 { v28.8h }, [x23], #0x10\n" |
2191 |
|
|
"tbz x11, #2, 157f\n" |
2192 |
|
|
"str d21, [x9], #0x8\n" |
2193 |
|
|
"str d23, [x26], #0x8\n" |
2194 |
|
|
"str d25, [x25], #0x8\n" |
2195 |
|
|
"str d27, [x24], #0x8\n" |
2196 |
|
|
"str d29, [x23], #0x8\n" |
2197 |
|
|
"tbz x11, #1, 156f\n" |
2198 |
|
|
"st1 { v21.s }[2], [x9], #0x4\n" |
2199 |
|
|
"st1 { v23.s }[2], [x26], #0x4\n" |
2200 |
|
|
"st1 { v25.s }[2], [x25], #0x4\n" |
2201 |
|
|
"st1 { v27.s }[2], [x24], #0x4\n" |
2202 |
|
|
"st1 { v29.s }[2], [x23], #0x4\n" |
2203 |
|
|
"tbz x11, #0, 163f\n" |
2204 |
|
|
"st1 { v21.h }[6], [x9]\n" |
2205 |
|
|
"st1 { v23.h }[6], [x26]\n" |
2206 |
|
|
"st1 { v25.h }[6], [x25]\n" |
2207 |
|
|
"st1 { v27.h }[6], [x24]\n" |
2208 |
|
|
"st1 { v29.h }[6], [x23]\n" |
2209 |
|
|
"b 163f\n" |
2210 |
|
|
"156:" // Height 5: Partial direct writeback: partial_1_12 |
2211 |
|
|
"tbz x11, #0, 163f\n" |
2212 |
|
|
"st1 { v21.h }[4], [x9]\n" |
2213 |
|
|
"st1 { v23.h }[4], [x26]\n" |
2214 |
|
|
"st1 { v25.h }[4], [x25]\n" |
2215 |
|
|
"st1 { v27.h }[4], [x24]\n" |
2216 |
|
|
"st1 { v29.h }[4], [x23]\n" |
2217 |
|
|
"b 163f\n" |
2218 |
|
|
"157:" // Height 5: Partial direct writeback: partial_2_8 |
2219 |
|
|
"tbz x11, #1, 158f\n" |
2220 |
|
|
"str s21, [x9], #0x4\n" |
2221 |
|
|
"str s23, [x26], #0x4\n" |
2222 |
|
|
"str s25, [x25], #0x4\n" |
2223 |
|
|
"str s27, [x24], #0x4\n" |
2224 |
|
|
"str s29, [x23], #0x4\n" |
2225 |
|
|
"tbz x11, #0, 163f\n" |
2226 |
|
|
"st1 { v21.h }[2], [x9]\n" |
2227 |
|
|
"st1 { v23.h }[2], [x26]\n" |
2228 |
|
|
"st1 { v25.h }[2], [x25]\n" |
2229 |
|
|
"st1 { v27.h }[2], [x24]\n" |
2230 |
|
|
"st1 { v29.h }[2], [x23]\n" |
2231 |
|
|
"b 163f\n" |
2232 |
|
|
"158:" // Height 5: Partial direct writeback: partial_1_8 |
2233 |
|
|
"tbz x11, #0, 163f\n" |
2234 |
|
|
"str h21, [x9, #0x0]\n" |
2235 |
|
|
"str h23, [x26, #0x0]\n" |
2236 |
|
|
"str h25, [x25, #0x0]\n" |
2237 |
|
|
"str h27, [x24, #0x0]\n" |
2238 |
|
|
"str h29, [x23, #0x0]\n" |
2239 |
|
|
"b 163f\n" |
2240 |
|
|
"159:" // Height 5: Partial direct writeback: partial_4_0 |
2241 |
|
|
"tbz x11, #2, 161f\n" |
2242 |
|
|
"str d20, [x9], #0x8\n" |
2243 |
|
|
"str d22, [x26], #0x8\n" |
2244 |
|
|
"str d24, [x25], #0x8\n" |
2245 |
|
|
"str d26, [x24], #0x8\n" |
2246 |
|
|
"str d28, [x23], #0x8\n" |
2247 |
|
|
"tbz x11, #1, 160f\n" |
2248 |
|
|
"st1 { v20.s }[2], [x9], #0x4\n" |
2249 |
|
|
"st1 { v22.s }[2], [x26], #0x4\n" |
2250 |
|
|
"st1 { v24.s }[2], [x25], #0x4\n" |
2251 |
|
|
"st1 { v26.s }[2], [x24], #0x4\n" |
2252 |
|
|
"st1 { v28.s }[2], [x23], #0x4\n" |
2253 |
|
|
"tbz x11, #0, 163f\n" |
2254 |
|
|
"st1 { v20.h }[6], [x9]\n" |
2255 |
|
|
"st1 { v22.h }[6], [x26]\n" |
2256 |
|
|
"st1 { v24.h }[6], [x25]\n" |
2257 |
|
|
"st1 { v26.h }[6], [x24]\n" |
2258 |
|
|
"st1 { v28.h }[6], [x23]\n" |
2259 |
|
|
"b 163f\n" |
2260 |
|
|
"160:" // Height 5: Partial direct writeback: partial_1_4 |
2261 |
|
|
"tbz x11, #0, 163f\n" |
2262 |
|
|
"st1 { v20.h }[4], [x9]\n" |
2263 |
|
|
"st1 { v22.h }[4], [x26]\n" |
2264 |
|
|
"st1 { v24.h }[4], [x25]\n" |
2265 |
|
|
"st1 { v26.h }[4], [x24]\n" |
2266 |
|
|
"st1 { v28.h }[4], [x23]\n" |
2267 |
|
|
"b 163f\n" |
2268 |
|
|
"161:" // Height 5: Partial direct writeback: partial_2_0 |
2269 |
|
|
"tbz x11, #1, 162f\n" |
2270 |
|
|
"str s20, [x9], #0x4\n" |
2271 |
|
|
"str s22, [x26], #0x4\n" |
2272 |
|
|
"str s24, [x25], #0x4\n" |
2273 |
|
|
"str s26, [x24], #0x4\n" |
2274 |
|
|
"str s28, [x23], #0x4\n" |
2275 |
|
|
"tbz x11, #0, 163f\n" |
2276 |
|
|
"st1 { v20.h }[2], [x9]\n" |
2277 |
|
|
"st1 { v22.h }[2], [x26]\n" |
2278 |
|
|
"st1 { v24.h }[2], [x25]\n" |
2279 |
|
|
"st1 { v26.h }[2], [x24]\n" |
2280 |
|
|
"st1 { v28.h }[2], [x23]\n" |
2281 |
|
|
"b 163f\n" |
2282 |
|
|
"162:" // Height 5: Partial direct writeback: partial_1_0 |
2283 |
|
|
"str h20, [x9, #0x0]\n" |
2284 |
|
|
"str h22, [x26, #0x0]\n" |
2285 |
|
|
"str h24, [x25, #0x0]\n" |
2286 |
|
|
"str h26, [x24, #0x0]\n" |
2287 |
|
|
"str h28, [x23, #0x0]\n" |
2288 |
|
|
"163:" // Height 5: Partial direct writeback: Done |
2289 |
|
|
"b 165f\n" |
2290 |
|
|
"164:" // Height 5: Full writeback |
2291 |
|
|
"str q20, [x9, #0x0]\n" |
2292 |
|
|
"str q21, [x9, #0x10]\n" |
2293 |
|
|
"add x9, x9, #0x20\n" |
2294 |
|
|
"str q22, [x26, #0x0]\n" |
2295 |
|
|
"str q23, [x26, #0x10]\n" |
2296 |
|
|
"str q24, [x25, #0x0]\n" |
2297 |
|
|
"str q25, [x25, #0x10]\n" |
2298 |
|
|
"str q26, [x24, #0x0]\n" |
2299 |
|
|
"str q27, [x24, #0x10]\n" |
2300 |
|
|
"str q28, [x23, #0x0]\n" |
2301 |
|
|
"str q29, [x23, #0x10]\n" |
2302 |
|
|
"165:" // Height 5: Writeback done |
2303 |
|
|
"subs x11, x11, #0x10\n" |
2304 |
|
|
"bgt 134b\n" |
2305 |
|
|
"b 200f\n" |
2306 |
|
|
"166:" // Height 6 |
2307 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" |
2308 |
|
|
"ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
2309 |
|
|
"mov x20, #0xc\n" |
2310 |
|
|
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n" |
2311 |
|
|
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" |
2312 |
|
|
"madd x20, x21, x20, x9\n" |
2313 |
|
|
"str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" |
2314 |
|
|
"167:" // Height 6: Column loop |
2315 |
|
|
"cbz x10, 168f\n" |
2316 |
|
|
"ldr q20, [x10, #0x0]\n" |
2317 |
|
|
"ldr q21, [x10, #0x10]\n" |
2318 |
|
|
"add x10, x10, #0x20\n" |
2319 |
|
|
"mov v22.16b, v20.16b\n" |
2320 |
|
|
"mov v23.16b, v21.16b\n" |
2321 |
|
|
"mov v24.16b, v20.16b\n" |
2322 |
|
|
"mov v25.16b, v21.16b\n" |
2323 |
|
|
"mov v26.16b, v20.16b\n" |
2324 |
|
|
"mov v27.16b, v21.16b\n" |
2325 |
|
|
"mov v28.16b, v20.16b\n" |
2326 |
|
|
"mov v29.16b, v21.16b\n" |
2327 |
|
|
"mov v30.16b, v20.16b\n" |
2328 |
|
|
"mov v31.16b, v21.16b\n" |
2329 |
|
|
"b 179f\n" |
2330 |
|
|
"168:" // Height 6: no bias |
2331 |
|
|
"tbz %x[flags], #0, 178f\n" |
2332 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
2333 |
|
|
"cmp x11, #0x10\n" |
2334 |
|
|
"add x26, x9, x20, LSL #1\n" |
2335 |
|
|
"add x25, x26, x20, LSL #1\n" |
2336 |
|
|
"add x24, x25, x20, LSL #1\n" |
2337 |
|
|
"add x23, x24, x20, LSL #1\n" |
2338 |
|
|
"add x22, x23, x20, LSL #1\n" |
2339 |
|
|
"bge 177f\n" |
2340 |
|
|
"tbz x11, #3, 172f\n" |
2341 |
|
|
"ld1 { v20.8h }, [x9], #0x10\n" |
2342 |
|
|
"ld1 { v22.8h }, [x26], #0x10\n" |
2343 |
|
|
"ld1 { v24.8h }, [x25], #0x10\n" |
2344 |
|
|
"ld1 { v26.8h }, [x24], #0x10\n" |
2345 |
|
|
"ld1 { v28.8h }, [x23], #0x10\n" |
2346 |
|
|
"ld1 { v30.8h }, [x22], #0x10\n" |
2347 |
|
|
"tbz x11, #2, 170f\n" |
2348 |
|
|
"ldr d21, [x9], #0x8\n" |
2349 |
|
|
"ldr d23, [x26], #0x8\n" |
2350 |
|
|
"ldr d25, [x25], #0x8\n" |
2351 |
|
|
"ldr d27, [x24], #0x8\n" |
2352 |
|
|
"ldr d29, [x23], #0x8\n" |
2353 |
|
|
"ldr d31, [x22], #0x8\n" |
2354 |
|
|
"tbz x11, #1, 169f\n" |
2355 |
|
|
"ld1 { v21.s }[2], [x9], #0x4\n" |
2356 |
|
|
"ld1 { v23.s }[2], [x26], #0x4\n" |
2357 |
|
|
"mov x20, #0x1c\n" |
2358 |
|
|
"ld1 { v25.s }[2], [x25], #0x4\n" |
2359 |
|
|
"ld1 { v27.s }[2], [x24], #0x4\n" |
2360 |
|
|
"ld1 { v29.s }[2], [x23], #0x4\n" |
2361 |
|
|
"ld1 { v31.s }[2], [x22], #0x4\n" |
2362 |
|
|
"tbz x11, #0, 176f\n" |
2363 |
|
|
"ld1 { v21.h }[6], [x9]\n" |
2364 |
|
|
"ld1 { v23.h }[6], [x26]\n" |
2365 |
|
|
"ld1 { v25.h }[6], [x25]\n" |
2366 |
|
|
"ld1 { v27.h }[6], [x24]\n" |
2367 |
|
|
"ld1 { v29.h }[6], [x23]\n" |
2368 |
|
|
"ld1 { v31.h }[6], [x22]\n" |
2369 |
|
|
"b 176f\n" |
2370 |
|
|
"169:" // Height 6: Partial accumulate: partial_1_12 |
2371 |
|
|
"mov x20, #0x18\n" |
2372 |
|
|
"tbz x11, #0, 176f\n" |
2373 |
|
|
"ld1 { v21.h }[4], [x9]\n" |
2374 |
|
|
"ld1 { v23.h }[4], [x26]\n" |
2375 |
|
|
"ld1 { v25.h }[4], [x25]\n" |
2376 |
|
|
"ld1 { v27.h }[4], [x24]\n" |
2377 |
|
|
"ld1 { v29.h }[4], [x23]\n" |
2378 |
|
|
"ld1 { v31.h }[4], [x22]\n" |
2379 |
|
|
"b 176f\n" |
2380 |
|
|
"170:" // Height 6: Partial accumulate: partial_2_8 |
2381 |
|
|
"tbz x11, #1, 171f\n" |
2382 |
|
|
"ldr s21, [x9], #0x4\n" |
2383 |
|
|
"ldr s23, [x26], #0x4\n" |
2384 |
|
|
"mov x20, #0x14\n" |
2385 |
|
|
"ldr s25, [x25], #0x4\n" |
2386 |
|
|
"ldr s27, [x24], #0x4\n" |
2387 |
|
|
"ldr s29, [x23], #0x4\n" |
2388 |
|
|
"ldr s31, [x22], #0x4\n" |
2389 |
|
|
"tbz x11, #0, 176f\n" |
2390 |
|
|
"ld1 { v21.h }[2], [x9]\n" |
2391 |
|
|
"ld1 { v23.h }[2], [x26]\n" |
2392 |
|
|
"ld1 { v25.h }[2], [x25]\n" |
2393 |
|
|
"ld1 { v27.h }[2], [x24]\n" |
2394 |
|
|
"ld1 { v29.h }[2], [x23]\n" |
2395 |
|
|
"ld1 { v31.h }[2], [x22]\n" |
2396 |
|
|
"b 176f\n" |
2397 |
|
|
"171:" // Height 6: Partial accumulate: partial_1_8 |
2398 |
|
|
"mov x20, #0x10\n" |
2399 |
|
|
"tbz x11, #0, 176f\n" |
2400 |
|
|
"ldr h21, [x9, #0x0]\n" |
2401 |
|
|
"ldr h23, [x26, #0x0]\n" |
2402 |
|
|
"ldr h25, [x25, #0x0]\n" |
2403 |
|
|
"ldr h27, [x24, #0x0]\n" |
2404 |
|
|
"ldr h29, [x23, #0x0]\n" |
2405 |
|
|
"ldr h31, [x22, #0x0]\n" |
2406 |
|
|
"b 176f\n" |
2407 |
|
|
"172:" // Height 6: Partial accumulate: partial_4_0 |
2408 |
|
|
"tbz x11, #2, 174f\n" |
2409 |
|
|
"ldr d20, [x9], #0x8\n" |
2410 |
|
|
"ldr d22, [x26], #0x8\n" |
2411 |
|
|
"ldr d24, [x25], #0x8\n" |
2412 |
|
|
"ldr d26, [x24], #0x8\n" |
2413 |
|
|
"ldr d28, [x23], #0x8\n" |
2414 |
|
|
"ldr d30, [x22], #0x8\n" |
2415 |
|
|
"tbz x11, #1, 173f\n" |
2416 |
|
|
"ld1 { v20.s }[2], [x9], #0x4\n" |
2417 |
|
|
"ld1 { v22.s }[2], [x26], #0x4\n" |
2418 |
|
|
"mov x20, #0xc\n" |
2419 |
|
|
"ld1 { v24.s }[2], [x25], #0x4\n" |
2420 |
|
|
"ld1 { v26.s }[2], [x24], #0x4\n" |
2421 |
|
|
"ld1 { v28.s }[2], [x23], #0x4\n" |
2422 |
|
|
"ld1 { v30.s }[2], [x22], #0x4\n" |
2423 |
|
|
"tbz x11, #0, 176f\n" |
2424 |
|
|
"ld1 { v20.h }[6], [x9]\n" |
2425 |
|
|
"ld1 { v22.h }[6], [x26]\n" |
2426 |
|
|
"ld1 { v24.h }[6], [x25]\n" |
2427 |
|
|
"ld1 { v26.h }[6], [x24]\n" |
2428 |
|
|
"ld1 { v28.h }[6], [x23]\n" |
2429 |
|
|
"ld1 { v30.h }[6], [x22]\n" |
2430 |
|
|
"b 176f\n" |
2431 |
|
|
"173:" // Height 6: Partial accumulate: partial_1_4 |
2432 |
|
|
"mov x20, #0x8\n" |
2433 |
|
|
"tbz x11, #0, 176f\n" |
2434 |
|
|
"ld1 { v20.h }[4], [x9]\n" |
2435 |
|
|
"ld1 { v22.h }[4], [x26]\n" |
2436 |
|
|
"ld1 { v24.h }[4], [x25]\n" |
2437 |
|
|
"ld1 { v26.h }[4], [x24]\n" |
2438 |
|
|
"ld1 { v28.h }[4], [x23]\n" |
2439 |
|
|
"ld1 { v30.h }[4], [x22]\n" |
2440 |
|
|
"b 176f\n" |
2441 |
|
|
"174:" // Height 6: Partial accumulate: partial_2_0 |
2442 |
|
|
"tbz x11, #1, 175f\n" |
2443 |
|
|
"ldr s20, [x9], #0x4\n" |
2444 |
|
|
"ldr s22, [x26], #0x4\n" |
2445 |
|
|
"mov x20, #0x4\n" |
2446 |
|
|
"ldr s24, [x25], #0x4\n" |
2447 |
|
|
"ldr s26, [x24], #0x4\n" |
2448 |
|
|
"ldr s28, [x23], #0x4\n" |
2449 |
|
|
"ldr s30, [x22], #0x4\n" |
2450 |
|
|
"tbz x11, #0, 176f\n" |
2451 |
|
|
"ld1 { v20.h }[2], [x9]\n" |
2452 |
|
|
"ld1 { v22.h }[2], [x26]\n" |
2453 |
|
|
"ld1 { v24.h }[2], [x25]\n" |
2454 |
|
|
"ld1 { v26.h }[2], [x24]\n" |
2455 |
|
|
"ld1 { v28.h }[2], [x23]\n" |
2456 |
|
|
"ld1 { v30.h }[2], [x22]\n" |
2457 |
|
|
"b 176f\n" |
2458 |
|
|
"175:" // Height 6: Partial accumulate: partial_1_0 |
2459 |
|
|
"ldr h20, [x9, #0x0]\n" |
2460 |
|
|
"ldr h22, [x26, #0x0]\n" |
2461 |
|
|
"mov x20, #0x0\n" |
2462 |
|
|
"ldr h24, [x25, #0x0]\n" |
2463 |
|
|
"ldr h26, [x24, #0x0]\n" |
2464 |
|
|
"ldr h28, [x23, #0x0]\n" |
2465 |
|
|
"ldr h30, [x22, #0x0]\n" |
2466 |
|
|
"176:" // Height 6: Partial accumulate: Done |
2467 |
|
|
"sub x9, x9, x20\n" |
2468 |
|
|
"b 179f\n" |
2469 |
|
|
"177:" // Height 6: full accumulate |
2470 |
|
|
"ldr q20, [x9, #0x0]\n" |
2471 |
|
|
"ldr q21, [x9, #0x10]\n" |
2472 |
|
|
"ldr q22, [x26, #0x0]\n" |
2473 |
|
|
"ldr q23, [x26, #0x10]\n" |
2474 |
|
|
"ldr q24, [x25, #0x0]\n" |
2475 |
|
|
"ldr q25, [x25, #0x10]\n" |
2476 |
|
|
"ldr q26, [x24, #0x0]\n" |
2477 |
|
|
"ldr q27, [x24, #0x10]\n" |
2478 |
|
|
"ldr q28, [x23, #0x0]\n" |
2479 |
|
|
"ldr q29, [x23, #0x10]\n" |
2480 |
|
|
"ldr q30, [x22, #0x0]\n" |
2481 |
|
|
"ldr q31, [x22, #0x10]\n" |
2482 |
|
|
"b 179f\n" |
2483 |
|
|
"178:" // Height 6: no accumulate |
2484 |
|
|
"movi v20.16b, #0x0\n" |
2485 |
|
|
"movi v21.16b, #0x0\n" |
2486 |
|
|
"movi v22.16b, #0x0\n" |
2487 |
|
|
"movi v23.16b, #0x0\n" |
2488 |
|
|
"movi v24.16b, #0x0\n" |
2489 |
|
|
"movi v25.16b, #0x0\n" |
2490 |
|
|
"movi v26.16b, #0x0\n" |
2491 |
|
|
"movi v27.16b, #0x0\n" |
2492 |
|
|
"movi v28.16b, #0x0\n" |
2493 |
|
|
"movi v29.16b, #0x0\n" |
2494 |
|
|
"movi v30.16b, #0x0\n" |
2495 |
|
|
"movi v31.16b, #0x0\n" |
2496 |
|
|
"179:" // Height 6: setup done |
2497 |
|
|
"mov x28, #0x0\n" |
2498 |
|
|
"180:" // Height 6: String loop |
2499 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" |
2500 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
2501 |
|
|
"ldr w27, [x20, x28, LSL #0x2]\n" |
2502 |
|
|
"tbz %x[flags], #3, 181f\n" |
2503 |
|
|
"ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" |
2504 |
|
|
"add x20, x20, x21, LSL #3\n" |
2505 |
|
|
"ldr x26, [x20, #0x0]\n" |
2506 |
|
|
"ldr x25, [x20, #0x8]\n" |
2507 |
|
|
"ldr x24, [x20, #0x10]\n" |
2508 |
|
|
"ldr x23, [x20, #0x18]\n" |
2509 |
|
|
"ldr x22, [x20, #0x20]\n" |
2510 |
|
|
"ldr x21, [x20, #0x28]\n" |
2511 |
|
|
"cbnz x28, 182f\n" |
2512 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" |
2513 |
|
|
"add x26, x26, x20, LSL #1\n" |
2514 |
|
|
"add x25, x25, x20, LSL #1\n" |
2515 |
|
|
"add x24, x24, x20, LSL #1\n" |
2516 |
|
|
"add x23, x23, x20, LSL #1\n" |
2517 |
|
|
"add x22, x22, x20, LSL #1\n" |
2518 |
|
|
"add x21, x21, x20, LSL #1\n" |
2519 |
|
|
"b 182f\n" |
2520 |
|
|
"181:" // Height 6: setup direct input |
2521 |
|
|
"mov x26, %x[input_ptr]\n" |
2522 |
|
|
"add x25, x26, x21, LSL #1\n" |
2523 |
|
|
"add x24, x25, x21, LSL #1\n" |
2524 |
|
|
"add x23, x24, x21, LSL #1\n" |
2525 |
|
|
"add x22, x23, x21, LSL #1\n" |
2526 |
|
|
"add x21, x22, x21, LSL #1\n" |
2527 |
|
|
"182:" // Height 6: input setup done |
2528 |
|
|
"cmp x27, #0x8\n" |
2529 |
|
|
"blt 185f\n" |
2530 |
|
|
"ldr q0, [x26, #0x0]\n" |
2531 |
|
|
"ldr q1, [x25, #0x0]\n" |
2532 |
|
|
"cmp x27, #0x10\n" |
2533 |
|
|
"ldr q2, [x24, #0x0]\n" |
2534 |
|
|
"ldr q3, [x23, #0x0]\n" |
2535 |
|
|
"ldr q4, [x22, #0x0]\n" |
2536 |
|
|
"ldr q5, [x21, #0x0]\n" |
2537 |
|
|
"ldr q6, [x10, #0x0]\n" |
2538 |
|
|
"ldr q7, [x10, #0x10]\n" |
2539 |
|
|
"ldr q8, [x10, #0x20]\n" |
2540 |
|
|
"ldr q9, [x10, #0x30]\n" |
2541 |
|
|
"ldr q10, [x10, #0x40]\n" |
2542 |
|
|
"ldr q11, [x10, #0x50]\n" |
2543 |
|
|
"ldr q12, [x10, #0x60]\n" |
2544 |
|
|
"ldr q13, [x10, #0x70]\n" |
2545 |
|
|
"ldr q14, [x10, #0x80]\n" |
2546 |
|
|
"ldr q15, [x10, #0x90]\n" |
2547 |
|
|
"ldr q16, [x10, #0xa0]\n" |
2548 |
|
|
"ldr q17, [x10, #0xb0]\n" |
2549 |
|
|
"ldr q18, [x10, #0xc0]\n" |
2550 |
|
|
"ldr q19, [x10, #0xd0]\n" |
2551 |
|
|
"blt 184f\n" |
2552 |
|
|
"183:" // Height 6: Multiply loop: Main loop head |
2553 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
2554 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
2555 |
|
|
"sub x27, x27, #0x8\n" |
2556 |
|
|
"add x26, x26, #0x10\n" |
2557 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
2558 |
|
|
"fmla v26.8h, v6.8h, v3.h[0]\n" |
2559 |
|
|
"add x25, x25, #0x10\n" |
2560 |
|
|
"add x24, x24, #0x10\n" |
2561 |
|
|
"fmla v28.8h, v6.8h, v4.h[0]\n" |
2562 |
|
|
"fmla v30.8h, v6.8h, v5.h[0]\n" |
2563 |
|
|
"ldr q6, [x10, #0xe0]\n" |
2564 |
|
|
"add x23, x23, #0x10\n" |
2565 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
2566 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
2567 |
|
|
"add x22, x22, #0x10\n" |
2568 |
|
|
"add x21, x21, #0x10\n" |
2569 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
2570 |
|
|
"fmla v27.8h, v7.8h, v3.h[0]\n" |
2571 |
|
|
"cmp x27, #0x10\n" |
2572 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
2573 |
|
|
"fmla v29.8h, v7.8h, v4.h[0]\n" |
2574 |
|
|
"fmla v31.8h, v7.8h, v5.h[0]\n" |
2575 |
|
|
"ldr q7, [x10, #0xf0]\n" |
2576 |
|
|
"add x10, x10, #0x100\n" |
2577 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
2578 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
2579 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
2580 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
2581 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
2582 |
|
|
"fmla v26.8h, v8.8h, v3.h[1]\n" |
2583 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
2584 |
|
|
"prfm pldl1keep, [x22, #0x80]\n" |
2585 |
|
|
"fmla v28.8h, v8.8h, v4.h[1]\n" |
2586 |
|
|
"fmla v30.8h, v8.8h, v5.h[1]\n" |
2587 |
|
|
"ldr q8, [x10, #0x20]\n" |
2588 |
|
|
"prfm pldl1keep, [x21, #0x80]\n" |
2589 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
2590 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
2591 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
2592 |
|
|
"fmla v27.8h, v9.8h, v3.h[1]\n" |
2593 |
|
|
"fmla v29.8h, v9.8h, v4.h[1]\n" |
2594 |
|
|
"fmla v31.8h, v9.8h, v5.h[1]\n" |
2595 |
|
|
"ldr q9, [x10, #0x30]\n" |
2596 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
2597 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
2598 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
2599 |
|
|
"fmla v26.8h, v10.8h, v3.h[2]\n" |
2600 |
|
|
"fmla v28.8h, v10.8h, v4.h[2]\n" |
2601 |
|
|
"fmla v30.8h, v10.8h, v5.h[2]\n" |
2602 |
|
|
"ldr q10, [x10, #0x40]\n" |
2603 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
2604 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
2605 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
2606 |
|
|
"fmla v27.8h, v11.8h, v3.h[2]\n" |
2607 |
|
|
"fmla v29.8h, v11.8h, v4.h[2]\n" |
2608 |
|
|
"fmla v31.8h, v11.8h, v5.h[2]\n" |
2609 |
|
|
"ldr q11, [x10, #0x50]\n" |
2610 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
2611 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
2612 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
2613 |
|
|
"fmla v26.8h, v12.8h, v3.h[3]\n" |
2614 |
|
|
"fmla v28.8h, v12.8h, v4.h[3]\n" |
2615 |
|
|
"fmla v30.8h, v12.8h, v5.h[3]\n" |
2616 |
|
|
"ldr q12, [x10, #0x60]\n" |
2617 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
2618 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
2619 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
2620 |
|
|
"fmla v27.8h, v13.8h, v3.h[3]\n" |
2621 |
|
|
"fmla v29.8h, v13.8h, v4.h[3]\n" |
2622 |
|
|
"fmla v31.8h, v13.8h, v5.h[3]\n" |
2623 |
|
|
"ldr q13, [x10, #0x70]\n" |
2624 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
2625 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
2626 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
2627 |
|
|
"fmla v26.8h, v14.8h, v3.h[4]\n" |
2628 |
|
|
"fmla v28.8h, v14.8h, v4.h[4]\n" |
2629 |
|
|
"fmla v30.8h, v14.8h, v5.h[4]\n" |
2630 |
|
|
"ldr q14, [x10, #0x80]\n" |
2631 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
2632 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
2633 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
2634 |
|
|
"fmla v27.8h, v15.8h, v3.h[4]\n" |
2635 |
|
|
"fmla v29.8h, v15.8h, v4.h[4]\n" |
2636 |
|
|
"fmla v31.8h, v15.8h, v5.h[4]\n" |
2637 |
|
|
"ldr q15, [x10, #0x90]\n" |
2638 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
2639 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
2640 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
2641 |
|
|
"fmla v26.8h, v16.8h, v3.h[5]\n" |
2642 |
|
|
"fmla v28.8h, v16.8h, v4.h[5]\n" |
2643 |
|
|
"fmla v30.8h, v16.8h, v5.h[5]\n" |
2644 |
|
|
"ldr q16, [x10, #0xa0]\n" |
2645 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
2646 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
2647 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
2648 |
|
|
"fmla v27.8h, v17.8h, v3.h[5]\n" |
2649 |
|
|
"fmla v29.8h, v17.8h, v4.h[5]\n" |
2650 |
|
|
"fmla v31.8h, v17.8h, v5.h[5]\n" |
2651 |
|
|
"ldr q17, [x10, #0xb0]\n" |
2652 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
2653 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
2654 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
2655 |
|
|
"fmla v26.8h, v18.8h, v3.h[6]\n" |
2656 |
|
|
"fmla v28.8h, v18.8h, v4.h[6]\n" |
2657 |
|
|
"fmla v30.8h, v18.8h, v5.h[6]\n" |
2658 |
|
|
"ldr q18, [x10, #0xc0]\n" |
2659 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
2660 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
2661 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
2662 |
|
|
"fmla v27.8h, v19.8h, v3.h[6]\n" |
2663 |
|
|
"fmla v29.8h, v19.8h, v4.h[6]\n" |
2664 |
|
|
"fmla v31.8h, v19.8h, v5.h[6]\n" |
2665 |
|
|
"ldr q19, [x10, #0xd0]\n" |
2666 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
2667 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
2668 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
2669 |
|
|
"fmla v26.8h, v6.8h, v3.h[7]\n" |
2670 |
|
|
"fmla v28.8h, v6.8h, v4.h[7]\n" |
2671 |
|
|
"fmla v30.8h, v6.8h, v5.h[7]\n" |
2672 |
|
|
"ldr q6, [x10, #0x0]\n" |
2673 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
2674 |
|
|
"ldr q0, [x26, #0x0]\n" |
2675 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
2676 |
|
|
"ldr q1, [x25, #0x0]\n" |
2677 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
2678 |
|
|
"ldr q2, [x24, #0x0]\n" |
2679 |
|
|
"fmla v27.8h, v7.8h, v3.h[7]\n" |
2680 |
|
|
"ldr q3, [x23, #0x0]\n" |
2681 |
|
|
"fmla v29.8h, v7.8h, v4.h[7]\n" |
2682 |
|
|
"ldr q4, [x22, #0x0]\n" |
2683 |
|
|
"fmla v31.8h, v7.8h, v5.h[7]\n" |
2684 |
|
|
"ldr q5, [x21, #0x0]\n" |
2685 |
|
|
"ldr q7, [x10, #0x10]\n" |
2686 |
|
|
"bge 183b\n" |
2687 |
|
|
"184:" // Height 6: Multiply loop: Single iteration only |
2688 |
|
|
"fmla v20.8h, v6.8h, v0.h[0]\n" |
2689 |
|
|
"fmla v22.8h, v6.8h, v1.h[0]\n" |
2690 |
|
|
"add x26, x26, #0x10\n" |
2691 |
|
|
"add x25, x25, #0x10\n" |
2692 |
|
|
"fmla v24.8h, v6.8h, v2.h[0]\n" |
2693 |
|
|
"fmla v26.8h, v6.8h, v3.h[0]\n" |
2694 |
|
|
"add x24, x24, #0x10\n" |
2695 |
|
|
"add x23, x23, #0x10\n" |
2696 |
|
|
"fmla v28.8h, v6.8h, v4.h[0]\n" |
2697 |
|
|
"fmla v30.8h, v6.8h, v5.h[0]\n" |
2698 |
|
|
"ldr q6, [x10, #0xe0]\n" |
2699 |
|
|
"add x22, x22, #0x10\n" |
2700 |
|
|
"fmla v21.8h, v7.8h, v0.h[0]\n" |
2701 |
|
|
"fmla v23.8h, v7.8h, v1.h[0]\n" |
2702 |
|
|
"add x21, x21, #0x10\n" |
2703 |
|
|
"sub x27, x27, #0x8\n" |
2704 |
|
|
"fmla v25.8h, v7.8h, v2.h[0]\n" |
2705 |
|
|
"fmla v27.8h, v7.8h, v3.h[0]\n" |
2706 |
|
|
"prfm pldl1keep, [x26, #0x80]\n" |
2707 |
|
|
"prfm pldl1keep, [x25, #0x80]\n" |
2708 |
|
|
"fmla v29.8h, v7.8h, v4.h[0]\n" |
2709 |
|
|
"fmla v31.8h, v7.8h, v5.h[0]\n" |
2710 |
|
|
"ldr q7, [x10, #0xf0]\n" |
2711 |
|
|
"prfm pldl1keep, [x24, #0x80]\n" |
2712 |
|
|
"fmla v20.8h, v8.8h, v0.h[1]\n" |
2713 |
|
|
"fmla v22.8h, v8.8h, v1.h[1]\n" |
2714 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
2715 |
|
|
"prfm pldl1keep, [x22, #0x80]\n" |
2716 |
|
|
"fmla v24.8h, v8.8h, v2.h[1]\n" |
2717 |
|
|
"fmla v26.8h, v8.8h, v3.h[1]\n" |
2718 |
|
|
"prfm pldl1keep, [x21, #0x80]\n" |
2719 |
|
|
"add x10, x10, #0x100\n" |
2720 |
|
|
"fmla v28.8h, v8.8h, v4.h[1]\n" |
2721 |
|
|
"fmla v30.8h, v8.8h, v5.h[1]\n" |
2722 |
|
|
"fmla v21.8h, v9.8h, v0.h[1]\n" |
2723 |
|
|
"fmla v23.8h, v9.8h, v1.h[1]\n" |
2724 |
|
|
"fmla v25.8h, v9.8h, v2.h[1]\n" |
2725 |
|
|
"fmla v27.8h, v9.8h, v3.h[1]\n" |
2726 |
|
|
"fmla v29.8h, v9.8h, v4.h[1]\n" |
2727 |
|
|
"fmla v31.8h, v9.8h, v5.h[1]\n" |
2728 |
|
|
"fmla v20.8h, v10.8h, v0.h[2]\n" |
2729 |
|
|
"fmla v22.8h, v10.8h, v1.h[2]\n" |
2730 |
|
|
"fmla v24.8h, v10.8h, v2.h[2]\n" |
2731 |
|
|
"fmla v26.8h, v10.8h, v3.h[2]\n" |
2732 |
|
|
"fmla v28.8h, v10.8h, v4.h[2]\n" |
2733 |
|
|
"fmla v30.8h, v10.8h, v5.h[2]\n" |
2734 |
|
|
"fmla v21.8h, v11.8h, v0.h[2]\n" |
2735 |
|
|
"fmla v23.8h, v11.8h, v1.h[2]\n" |
2736 |
|
|
"fmla v25.8h, v11.8h, v2.h[2]\n" |
2737 |
|
|
"fmla v27.8h, v11.8h, v3.h[2]\n" |
2738 |
|
|
"fmla v29.8h, v11.8h, v4.h[2]\n" |
2739 |
|
|
"fmla v31.8h, v11.8h, v5.h[2]\n" |
2740 |
|
|
"fmla v20.8h, v12.8h, v0.h[3]\n" |
2741 |
|
|
"fmla v22.8h, v12.8h, v1.h[3]\n" |
2742 |
|
|
"fmla v24.8h, v12.8h, v2.h[3]\n" |
2743 |
|
|
"fmla v26.8h, v12.8h, v3.h[3]\n" |
2744 |
|
|
"fmla v28.8h, v12.8h, v4.h[3]\n" |
2745 |
|
|
"fmla v30.8h, v12.8h, v5.h[3]\n" |
2746 |
|
|
"fmla v21.8h, v13.8h, v0.h[3]\n" |
2747 |
|
|
"fmla v23.8h, v13.8h, v1.h[3]\n" |
2748 |
|
|
"fmla v25.8h, v13.8h, v2.h[3]\n" |
2749 |
|
|
"fmla v27.8h, v13.8h, v3.h[3]\n" |
2750 |
|
|
"fmla v29.8h, v13.8h, v4.h[3]\n" |
2751 |
|
|
"fmla v31.8h, v13.8h, v5.h[3]\n" |
2752 |
|
|
"fmla v20.8h, v14.8h, v0.h[4]\n" |
2753 |
|
|
"fmla v22.8h, v14.8h, v1.h[4]\n" |
2754 |
|
|
"fmla v24.8h, v14.8h, v2.h[4]\n" |
2755 |
|
|
"fmla v26.8h, v14.8h, v3.h[4]\n" |
2756 |
|
|
"fmla v28.8h, v14.8h, v4.h[4]\n" |
2757 |
|
|
"fmla v30.8h, v14.8h, v5.h[4]\n" |
2758 |
|
|
"fmla v21.8h, v15.8h, v0.h[4]\n" |
2759 |
|
|
"fmla v23.8h, v15.8h, v1.h[4]\n" |
2760 |
|
|
"fmla v25.8h, v15.8h, v2.h[4]\n" |
2761 |
|
|
"fmla v27.8h, v15.8h, v3.h[4]\n" |
2762 |
|
|
"fmla v29.8h, v15.8h, v4.h[4]\n" |
2763 |
|
|
"fmla v31.8h, v15.8h, v5.h[4]\n" |
2764 |
|
|
"fmla v20.8h, v16.8h, v0.h[5]\n" |
2765 |
|
|
"fmla v22.8h, v16.8h, v1.h[5]\n" |
2766 |
|
|
"fmla v24.8h, v16.8h, v2.h[5]\n" |
2767 |
|
|
"fmla v26.8h, v16.8h, v3.h[5]\n" |
2768 |
|
|
"fmla v28.8h, v16.8h, v4.h[5]\n" |
2769 |
|
|
"fmla v30.8h, v16.8h, v5.h[5]\n" |
2770 |
|
|
"fmla v21.8h, v17.8h, v0.h[5]\n" |
2771 |
|
|
"fmla v23.8h, v17.8h, v1.h[5]\n" |
2772 |
|
|
"fmla v25.8h, v17.8h, v2.h[5]\n" |
2773 |
|
|
"fmla v27.8h, v17.8h, v3.h[5]\n" |
2774 |
|
|
"fmla v29.8h, v17.8h, v4.h[5]\n" |
2775 |
|
|
"fmla v31.8h, v17.8h, v5.h[5]\n" |
2776 |
|
|
"fmla v20.8h, v18.8h, v0.h[6]\n" |
2777 |
|
|
"fmla v22.8h, v18.8h, v1.h[6]\n" |
2778 |
|
|
"fmla v24.8h, v18.8h, v2.h[6]\n" |
2779 |
|
|
"fmla v26.8h, v18.8h, v3.h[6]\n" |
2780 |
|
|
"fmla v28.8h, v18.8h, v4.h[6]\n" |
2781 |
|
|
"fmla v30.8h, v18.8h, v5.h[6]\n" |
2782 |
|
|
"fmla v21.8h, v19.8h, v0.h[6]\n" |
2783 |
|
|
"fmla v23.8h, v19.8h, v1.h[6]\n" |
2784 |
|
|
"fmla v25.8h, v19.8h, v2.h[6]\n" |
2785 |
|
|
"fmla v27.8h, v19.8h, v3.h[6]\n" |
2786 |
|
|
"fmla v29.8h, v19.8h, v4.h[6]\n" |
2787 |
|
|
"fmla v31.8h, v19.8h, v5.h[6]\n" |
2788 |
|
|
"fmla v20.8h, v6.8h, v0.h[7]\n" |
2789 |
|
|
"fmla v22.8h, v6.8h, v1.h[7]\n" |
2790 |
|
|
"fmla v24.8h, v6.8h, v2.h[7]\n" |
2791 |
|
|
"fmla v26.8h, v6.8h, v3.h[7]\n" |
2792 |
|
|
"fmla v28.8h, v6.8h, v4.h[7]\n" |
2793 |
|
|
"fmla v30.8h, v6.8h, v5.h[7]\n" |
2794 |
|
|
"fmla v21.8h, v7.8h, v0.h[7]\n" |
2795 |
|
|
"fmla v23.8h, v7.8h, v1.h[7]\n" |
2796 |
|
|
"fmla v25.8h, v7.8h, v2.h[7]\n" |
2797 |
|
|
"fmla v27.8h, v7.8h, v3.h[7]\n" |
2798 |
|
|
"fmla v29.8h, v7.8h, v4.h[7]\n" |
2799 |
|
|
"fmla v31.8h, v7.8h, v5.h[7]\n" |
2800 |
|
|
"185:" // Height 6: Multiply loop: Main loop skip |
2801 |
|
|
"cbz x27, 187f\n" |
2802 |
|
|
"186:" // Height 6: Multiply loop: Odd block loop |
2803 |
|
|
"ldr h0, [x26], #0x2\n" |
2804 |
|
|
"ldr h1, [x25], #0x2\n" |
2805 |
|
|
"sub x27, x27, #0x1\n" |
2806 |
|
|
"ldr h2, [x24], #0x2\n" |
2807 |
|
|
"ldr h3, [x23], #0x2\n" |
2808 |
|
|
"ldr h4, [x22], #0x2\n" |
2809 |
|
|
"ldr h5, [x21], #0x2\n" |
2810 |
|
|
"ldr q8, [x10, #0x0]\n" |
2811 |
|
|
"ldr q9, [x10, #0x10]\n" |
2812 |
|
|
"add x10, x10, #0x20\n" |
2813 |
|
|
"fmla v20.8h, v8.8h, v0.h[0]\n" |
2814 |
|
|
"fmla v22.8h, v8.8h, v1.h[0]\n" |
2815 |
|
|
"fmla v24.8h, v8.8h, v2.h[0]\n" |
2816 |
|
|
"fmla v26.8h, v8.8h, v3.h[0]\n" |
2817 |
|
|
"fmla v28.8h, v8.8h, v4.h[0]\n" |
2818 |
|
|
"fmla v30.8h, v8.8h, v5.h[0]\n" |
2819 |
|
|
"fmla v21.8h, v9.8h, v0.h[0]\n" |
2820 |
|
|
"fmla v23.8h, v9.8h, v1.h[0]\n" |
2821 |
|
|
"fmla v25.8h, v9.8h, v2.h[0]\n" |
2822 |
|
|
"fmla v27.8h, v9.8h, v3.h[0]\n" |
2823 |
|
|
"fmla v29.8h, v9.8h, v4.h[0]\n" |
2824 |
|
|
"fmla v31.8h, v9.8h, v5.h[0]\n" |
2825 |
|
|
"cbnz x27, 186b\n" |
2826 |
|
|
"187:" // Height 6: Multiply loop: No odd multiplies |
2827 |
|
|
"ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" |
2828 |
|
|
"add x28, x28, #0x1\n" |
2829 |
|
|
"cmp x28, x20\n" |
2830 |
|
|
"bne 180b\n" |
2831 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" |
2832 |
|
|
"prfm pstl1keep, [x9, #0x0]\n" |
2833 |
|
|
"add x26, x9, x20, LSL #1\n" |
2834 |
|
|
"prfm pstl1keep, [x26, #0x0]\n" |
2835 |
|
|
"add x25, x26, x20, LSL #1\n" |
2836 |
|
|
"prfm pstl1keep, [x25, #0x0]\n" |
2837 |
|
|
"add x24, x25, x20, LSL #1\n" |
2838 |
|
|
"prfm pstl1keep, [x24, #0x0]\n" |
2839 |
|
|
"add x23, x24, x20, LSL #1\n" |
2840 |
|
|
"add x22, x23, x20, LSL #1\n" |
2841 |
|
|
"prfm pstl1keep, [x23, #0x0]\n" |
2842 |
|
|
"prfm pstl1keep, [x22, #0x0]\n" |
2843 |
|
|
"tbz %x[flags], #1, 188f\n" |
2844 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
2845 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
2846 |
|
|
"ld1r { v17.8h }, [x21]\n" |
2847 |
|
|
"ld1r { v16.8h }, [x20]\n" |
2848 |
|
|
"fmin v20.8h, v20.8h, v17.8h\n" |
2849 |
|
|
"fmin v21.8h, v21.8h, v17.8h\n" |
2850 |
|
|
"fmin v22.8h, v22.8h, v17.8h\n" |
2851 |
|
|
"fmin v23.8h, v23.8h, v17.8h\n" |
2852 |
|
|
"fmin v24.8h, v24.8h, v17.8h\n" |
2853 |
|
|
"fmin v25.8h, v25.8h, v17.8h\n" |
2854 |
|
|
"fmin v26.8h, v26.8h, v17.8h\n" |
2855 |
|
|
"fmin v27.8h, v27.8h, v17.8h\n" |
2856 |
|
|
"fmin v28.8h, v28.8h, v17.8h\n" |
2857 |
|
|
"fmin v29.8h, v29.8h, v17.8h\n" |
2858 |
|
|
"fmin v30.8h, v30.8h, v17.8h\n" |
2859 |
|
|
"fmin v31.8h, v31.8h, v17.8h\n" |
2860 |
|
|
"fmax v20.8h, v20.8h, v16.8h\n" |
2861 |
|
|
"fmax v21.8h, v21.8h, v16.8h\n" |
2862 |
|
|
"fmax v22.8h, v22.8h, v16.8h\n" |
2863 |
|
|
"fmax v23.8h, v23.8h, v16.8h\n" |
2864 |
|
|
"fmax v24.8h, v24.8h, v16.8h\n" |
2865 |
|
|
"fmax v25.8h, v25.8h, v16.8h\n" |
2866 |
|
|
"fmax v26.8h, v26.8h, v16.8h\n" |
2867 |
|
|
"fmax v27.8h, v27.8h, v16.8h\n" |
2868 |
|
|
"fmax v28.8h, v28.8h, v16.8h\n" |
2869 |
|
|
"fmax v29.8h, v29.8h, v16.8h\n" |
2870 |
|
|
"fmax v30.8h, v30.8h, v16.8h\n" |
2871 |
|
|
"fmax v31.8h, v31.8h, v16.8h\n" |
2872 |
|
|
"188:" // Height 6: No activation |
2873 |
|
|
"cmp x11, #0x10\n" |
2874 |
|
|
"bge 197f\n" |
2875 |
|
|
"tbz x11, #3, 192f\n" |
2876 |
|
|
"st1 { v20.8h }, [x9], #0x10\n" |
2877 |
|
|
"st1 { v22.8h }, [x26], #0x10\n" |
2878 |
|
|
"st1 { v24.8h }, [x25], #0x10\n" |
2879 |
|
|
"st1 { v26.8h }, [x24], #0x10\n" |
2880 |
|
|
"st1 { v28.8h }, [x23], #0x10\n" |
2881 |
|
|
"st1 { v30.8h }, [x22], #0x10\n" |
2882 |
|
|
"tbz x11, #2, 190f\n" |
2883 |
|
|
"str d21, [x9], #0x8\n" |
2884 |
|
|
"str d23, [x26], #0x8\n" |
2885 |
|
|
"str d25, [x25], #0x8\n" |
2886 |
|
|
"str d27, [x24], #0x8\n" |
2887 |
|
|
"str d29, [x23], #0x8\n" |
2888 |
|
|
"str d31, [x22], #0x8\n" |
2889 |
|
|
"tbz x11, #1, 189f\n" |
2890 |
|
|
"st1 { v21.s }[2], [x9], #0x4\n" |
2891 |
|
|
"st1 { v23.s }[2], [x26], #0x4\n" |
2892 |
|
|
"st1 { v25.s }[2], [x25], #0x4\n" |
2893 |
|
|
"st1 { v27.s }[2], [x24], #0x4\n" |
2894 |
|
|
"st1 { v29.s }[2], [x23], #0x4\n" |
2895 |
|
|
"st1 { v31.s }[2], [x22], #0x4\n" |
2896 |
|
|
"tbz x11, #0, 196f\n" |
2897 |
|
|
"st1 { v21.h }[6], [x9]\n" |
2898 |
|
|
"st1 { v23.h }[6], [x26]\n" |
2899 |
|
|
"st1 { v25.h }[6], [x25]\n" |
2900 |
|
|
"st1 { v27.h }[6], [x24]\n" |
2901 |
|
|
"st1 { v29.h }[6], [x23]\n" |
2902 |
|
|
"st1 { v31.h }[6], [x22]\n" |
2903 |
|
|
"b 196f\n" |
2904 |
|
|
"189:" // Height 6: Partial direct writeback: partial_1_12 |
2905 |
|
|
"tbz x11, #0, 196f\n" |
2906 |
|
|
"st1 { v21.h }[4], [x9]\n" |
2907 |
|
|
"st1 { v23.h }[4], [x26]\n" |
2908 |
|
|
"st1 { v25.h }[4], [x25]\n" |
2909 |
|
|
"st1 { v27.h }[4], [x24]\n" |
2910 |
|
|
"st1 { v29.h }[4], [x23]\n" |
2911 |
|
|
"st1 { v31.h }[4], [x22]\n" |
2912 |
|
|
"b 196f\n" |
2913 |
|
|
"190:" // Height 6: Partial direct writeback: partial_2_8 |
2914 |
|
|
"tbz x11, #1, 191f\n" |
2915 |
|
|
"str s21, [x9], #0x4\n" |
2916 |
|
|
"str s23, [x26], #0x4\n" |
2917 |
|
|
"str s25, [x25], #0x4\n" |
2918 |
|
|
"str s27, [x24], #0x4\n" |
2919 |
|
|
"str s29, [x23], #0x4\n" |
2920 |
|
|
"str s31, [x22], #0x4\n" |
2921 |
|
|
"tbz x11, #0, 196f\n" |
2922 |
|
|
"st1 { v21.h }[2], [x9]\n" |
2923 |
|
|
"st1 { v23.h }[2], [x26]\n" |
2924 |
|
|
"st1 { v25.h }[2], [x25]\n" |
2925 |
|
|
"st1 { v27.h }[2], [x24]\n" |
2926 |
|
|
"st1 { v29.h }[2], [x23]\n" |
2927 |
|
|
"st1 { v31.h }[2], [x22]\n" |
2928 |
|
|
"b 196f\n" |
2929 |
|
|
"191:" // Height 6: Partial direct writeback: partial_1_8 |
2930 |
|
|
"tbz x11, #0, 196f\n" |
2931 |
|
|
"str h21, [x9, #0x0]\n" |
2932 |
|
|
"str h23, [x26, #0x0]\n" |
2933 |
|
|
"str h25, [x25, #0x0]\n" |
2934 |
|
|
"str h27, [x24, #0x0]\n" |
2935 |
|
|
"str h29, [x23, #0x0]\n" |
2936 |
|
|
"str h31, [x22, #0x0]\n" |
2937 |
|
|
"b 196f\n" |
2938 |
|
|
"192:" // Height 6: Partial direct writeback: partial_4_0 |
2939 |
|
|
"tbz x11, #2, 194f\n" |
2940 |
|
|
"str d20, [x9], #0x8\n" |
2941 |
|
|
"str d22, [x26], #0x8\n" |
2942 |
|
|
"str d24, [x25], #0x8\n" |
2943 |
|
|
"str d26, [x24], #0x8\n" |
2944 |
|
|
"str d28, [x23], #0x8\n" |
2945 |
|
|
"str d30, [x22], #0x8\n" |
2946 |
|
|
"tbz x11, #1, 193f\n" |
2947 |
|
|
"st1 { v20.s }[2], [x9], #0x4\n" |
2948 |
|
|
"st1 { v22.s }[2], [x26], #0x4\n" |
2949 |
|
|
"st1 { v24.s }[2], [x25], #0x4\n" |
2950 |
|
|
"st1 { v26.s }[2], [x24], #0x4\n" |
2951 |
|
|
"st1 { v28.s }[2], [x23], #0x4\n" |
2952 |
|
|
"st1 { v30.s }[2], [x22], #0x4\n" |
2953 |
|
|
"tbz x11, #0, 196f\n" |
2954 |
|
|
"st1 { v20.h }[6], [x9]\n" |
2955 |
|
|
"st1 { v22.h }[6], [x26]\n" |
2956 |
|
|
"st1 { v24.h }[6], [x25]\n" |
2957 |
|
|
"st1 { v26.h }[6], [x24]\n" |
2958 |
|
|
"st1 { v28.h }[6], [x23]\n" |
2959 |
|
|
"st1 { v30.h }[6], [x22]\n" |
2960 |
|
|
"b 196f\n" |
2961 |
|
|
"193:" // Height 6: Partial direct writeback: partial_1_4 |
2962 |
|
|
"tbz x11, #0, 196f\n" |
2963 |
|
|
"st1 { v20.h }[4], [x9]\n" |
2964 |
|
|
"st1 { v22.h }[4], [x26]\n" |
2965 |
|
|
"st1 { v24.h }[4], [x25]\n" |
2966 |
|
|
"st1 { v26.h }[4], [x24]\n" |
2967 |
|
|
"st1 { v28.h }[4], [x23]\n" |
2968 |
|
|
"st1 { v30.h }[4], [x22]\n" |
2969 |
|
|
"b 196f\n" |
2970 |
|
|
"194:" // Height 6: Partial direct writeback: partial_2_0 |
2971 |
|
|
"tbz x11, #1, 195f\n" |
2972 |
|
|
"str s20, [x9], #0x4\n" |
2973 |
|
|
"str s22, [x26], #0x4\n" |
2974 |
|
|
"str s24, [x25], #0x4\n" |
2975 |
|
|
"str s26, [x24], #0x4\n" |
2976 |
|
|
"str s28, [x23], #0x4\n" |
2977 |
|
|
"str s30, [x22], #0x4\n" |
2978 |
|
|
"tbz x11, #0, 196f\n" |
2979 |
|
|
"st1 { v20.h }[2], [x9]\n" |
2980 |
|
|
"st1 { v22.h }[2], [x26]\n" |
2981 |
|
|
"st1 { v24.h }[2], [x25]\n" |
2982 |
|
|
"st1 { v26.h }[2], [x24]\n" |
2983 |
|
|
"st1 { v28.h }[2], [x23]\n" |
2984 |
|
|
"st1 { v30.h }[2], [x22]\n" |
2985 |
|
|
"b 196f\n" |
2986 |
|
|
"195:" // Height 6: Partial direct writeback: partial_1_0 |
2987 |
|
|
"str h20, [x9, #0x0]\n" |
2988 |
|
|
"str h22, [x26, #0x0]\n" |
2989 |
|
|
"str h24, [x25, #0x0]\n" |
2990 |
|
|
"str h26, [x24, #0x0]\n" |
2991 |
|
|
"str h28, [x23, #0x0]\n" |
2992 |
|
|
"str h30, [x22, #0x0]\n" |
2993 |
|
|
"196:" // Height 6: Partial direct writeback: Done |
2994 |
|
|
"b 198f\n" |
2995 |
|
|
"197:" // Height 6: Full writeback |
2996 |
|
|
"str q20, [x9, #0x0]\n" |
2997 |
|
|
"str q21, [x9, #0x10]\n" |
2998 |
|
|
"add x9, x9, #0x20\n" |
2999 |
|
|
"str q22, [x26, #0x0]\n" |
3000 |
|
|
"str q23, [x26, #0x10]\n" |
3001 |
|
|
"str q24, [x25, #0x0]\n" |
3002 |
|
|
"str q25, [x25, #0x10]\n" |
3003 |
|
|
"str q26, [x24, #0x0]\n" |
3004 |
|
|
"str q27, [x24, #0x10]\n" |
3005 |
|
|
"str q28, [x23, #0x0]\n" |
3006 |
|
|
"str q29, [x23, #0x10]\n" |
3007 |
|
|
"str q30, [x22, #0x0]\n" |
3008 |
|
|
"str q31, [x22, #0x10]\n" |
3009 |
|
|
"198:" // Height 6: Writeback done |
3010 |
|
|
"subs x11, x11, #0x10\n" |
3011 |
|
|
"bgt 167b\n" |
3012 |
|
|
"subs %x[m], %x[m], #0x6\n" |
3013 |
|
|
"beq 200f\n" |
3014 |
|
|
"ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
3015 |
|
|
"tbz %x[flags], #3, 199f\n" |
3016 |
|
|
"add x21, x21, #0x6\n" |
3017 |
|
|
"str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" |
3018 |
|
|
"b 1b\n" |
3019 |
|
|
"199:" // Update direct input |
3020 |
|
|
"mov x20, #0xc\n" |
3021 |
|
|
"madd %x[input_ptr], x20, x21, %x[input_ptr]\n" |
3022 |
|
|
"b 1b\n" |
3023 |
|
|
"200:" // Exit |
3024 |
|
|
: [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) |
3025 |
|
17 |
: [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), |
3026 |
|
|
[offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), |
3027 |
|
|
[offsetof_N] "I"(offsetof(KernelArgs, N)), |
3028 |
|
|
[offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)), |
3029 |
|
|
[offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)), |
3030 |
|
|
[offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), |
3031 |
|
|
[offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), |
3032 |
|
|
[offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), |
3033 |
|
|
[offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) |
3034 |
|
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", |
3035 |
|
|
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", |
3036 |
|
|
"v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); |
3037 |
|
17 |
} |
3038 |
|
|
|
3039 |
|
|
#endif // Architectural features check. |
3040 |
|
|
|