Line |
Branch |
Exec |
Source |
1 |
|
|
// |
2 |
|
|
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com> |
3 |
|
|
// |
4 |
|
|
// SPDX-License-Identifier: Apache-2.0 |
5 |
|
|
// |
6 |
|
|
|
7 |
|
|
// Do not flag up inline assembly blocks |
8 |
|
|
#pragma GCC diagnostic ignored "-Woverlength-strings" |
9 |
|
|
|
10 |
|
|
#if !defined(__aarch64__) || !defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) |
11 |
|
|
#error This file must be compiled for AArch64, FEAT_BF16. |
12 |
|
|
#else // Architectural features check. |
13 |
|
|
|
14 |
|
|
#include "kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.h" |
15 |
|
|
|
16 |
|
|
#include <stddef.h> |
17 |
|
|
#include <stdint.h> |
18 |
|
|
|
19 |
|
|
#include "kai/kai_common.h" |
20 |
|
|
|
21 |
|
|
// Values handed back by the kai_get_mr/nr/kr/sr getters in this file.
// kai_kr is also the multiple that k is rounded up to when the packed
// LHS/RHS byte offsets are computed.
static const size_t kai_mr = 1; |
22 |
|
|
static const size_t kai_nr = 12; |
23 |
|
|
static const size_t kai_kr = 4; |
24 |
|
|
static const size_t kai_sr = 1; |
25 |
|
|
|
26 |
|
|
// Values handed back by the kai_get_m_step/kai_get_n_step getters; the
// offset helpers assume their row/column indices are multiples of these.
static const size_t kai_m_step = 1; |
27 |
|
|
static const size_t kai_n_step = 36; |
28 |
|
|
|
29 |
|
188 |
size_t kai_get_m_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) { |
30 |
|
188 |
return kai_m_step; |
31 |
|
|
} |
32 |
|
|
|
33 |
|
280 |
size_t kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) { |
34 |
|
280 |
return kai_n_step; |
35 |
|
|
} |
36 |
|
|
|
37 |
|
✗ |
size_t kai_get_mr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) { |
38 |
|
✗ |
return kai_mr; |
39 |
|
|
} |
40 |
|
|
|
41 |
|
24 |
size_t kai_get_nr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) { |
42 |
|
24 |
return kai_nr; |
43 |
|
|
} |
44 |
|
|
|
45 |
|
24 |
size_t kai_get_kr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) { |
46 |
|
24 |
return kai_kr; |
47 |
|
|
} |
48 |
|
|
|
49 |
|
24 |
size_t kai_get_sr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) { |
50 |
|
24 |
return kai_sr; |
51 |
|
|
} |
52 |
|
|
|
53 |
|
92 |
size_t kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t m_idx, size_t k) { |
54 |
|
− |
KAI_ASSUME(m_idx == 0); |
55 |
|
|
|
56 |
|
92 |
return m_idx * kai_roundup(k, kai_kr) * sizeof(uint16_t); |
57 |
|
|
} |
58 |
|
|
|
59 |
|
92 |
size_t kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t n_idx, size_t k) { |
60 |
|
− |
KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0); |
61 |
|
92 |
return n_idx * (kai_roundup(k, kai_kr) * sizeof(uint16_t) + sizeof(float)); |
62 |
|
|
} |
63 |
|
|
|
64 |
|
92 |
// Byte offset of element (m_idx, n_idx) in the float destination matrix.
//
// m_idx      : row index; assumed (via KAI_ASSUME) to be a multiple of the m step.
// n_idx      : column index; assumed (via KAI_ASSUME) to be a multiple of the n step.
// dst_stride : multiplier applied to m_idx; used as a byte count per row here.
//
// Returns m_idx * dst_stride + n_idx * sizeof(float).
size_t kai_get_dst_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(
    size_t m_idx, size_t n_idx, size_t dst_stride) {
    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);

    const size_t row_bytes = m_idx * dst_stride;
    const size_t col_bytes = n_idx * sizeof(float);
    return row_bytes + col_bytes;
}
71 |
|
|
|
72 |
|
92 |
// Size in bytes of an m-by-n destination matrix of float elements.
//
// m : number of rows.
// n : number of columns.
//
// Returns m * n * sizeof(float).
size_t kai_get_dst_size_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t m, size_t n) {
    const size_t element_count = m * n;
    return element_count * sizeof(float);
}
75 |
|
|
|
76 |
|
93 |
void kai_run_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot( |
77 |
|
|
size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row, |
78 |
|
|
size_t dst_stride_col, float clamp_min, float clamp_max) { |
79 |
|
93 |
KAI_UNUSED(dst_stride_row); |
80 |
|
93 |
KAI_UNUSED(dst_stride_col); |
81 |
|
|
|
82 |
|
− |
KAI_ASSUME(m == 1); |
83 |
|
|
|
84 |
|
|
typedef struct { |
85 |
|
|
float maxval; |
86 |
|
|
float minval; |
87 |
|
|
} KernelArgs; |
88 |
|
|
|
89 |
|
93 |
KernelArgs ka; |
90 |
|
93 |
ka.maxval = clamp_max; |
91 |
|
93 |
ka.minval = clamp_min; |
92 |
|
|
|
93 |
|
93 |
size_t N = n; |
94 |
|
93 |
size_t K = k; |
95 |
|
|
|
96 |
|
93 |
const void* A_ptr = lhs_packed; |
97 |
|
93 |
const void* B_ptr = rhs_packed; |
98 |
|
93 |
void* output_ptr = dst; |
99 |
|
|
|
100 |
|
93 |
uint64_t flags = 0; |
101 |
|
|
|
102 |
|
186 |
__asm__ __volatile__( |
103 |
|
|
"add x26, %x[K], #0x3\n" |
104 |
|
|
"mov x20, #0xc\n" |
105 |
|
|
"bic x26, x26, #0x3\n" |
106 |
|
|
"add x25, %x[N], #0x3\n" |
107 |
|
|
"lsr x25, x25, #0x2\n" |
108 |
|
|
"lsl x26, x26, #0x1\n" |
109 |
|
|
"add x26, x26, #0x4\n" |
110 |
|
|
"mul x26, x26, x20\n" |
111 |
|
|
"1:" // Column loop |
112 |
|
|
"cmp x25, #0x9\n" |
113 |
|
|
"bge 89f\n" |
114 |
|
|
"cmp x25, #0x7\n" |
115 |
|
|
"bgt 78f\n" |
116 |
|
|
"beq 67f\n" |
117 |
|
|
"cmp x25, #0x5\n" |
118 |
|
|
"bgt 56f\n" |
119 |
|
|
"beq 45f\n" |
120 |
|
|
"cmp x25, #0x3\n" |
121 |
|
|
"bgt 34f\n" |
122 |
|
|
"beq 23f\n" |
123 |
|
|
"cmp x25, #0x1\n" |
124 |
|
|
"bgt 12f\n" |
125 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
126 |
|
|
"mov x24, %x[K]\n" |
127 |
|
|
"movi v16.16b, #0x0\n" |
128 |
|
|
"mov x23, %x[A_ptr]\n" |
129 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
130 |
|
|
"cmp x24, #0x4\n" |
131 |
|
|
"zip2 v15.4s, v14.4s, v16.4s\n" |
132 |
|
|
"zip1 v14.4s, v14.4s, v16.4s\n" |
133 |
|
|
"blt 4f\n" |
134 |
|
|
"cmp x24, #0x8\n" |
135 |
|
|
"blt 3f\n" |
136 |
|
|
"2:" // Width 1: Multiply loop: Main loop head |
137 |
|
|
"ld1r { v0.2d }, [x23]\n" |
138 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
139 |
|
|
"sub x24, x24, #0x4\n" |
140 |
|
|
"add x23, x23, #0x8\n" |
141 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
142 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
143 |
|
|
"cmp x24, #0x8\n" |
144 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
145 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
146 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
147 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
148 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
149 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
150 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
151 |
|
|
"bge 2b\n" |
152 |
|
|
"3:" // Width 1: Multiply loop: Single iteration only |
153 |
|
|
"ld1r { v0.2d }, [x23]\n" |
154 |
|
|
"ldr q3, [%x[B_ptr], #0x0]\n" |
155 |
|
|
"add x23, x23, #0x8\n" |
156 |
|
|
"sub x24, x24, #0x4\n" |
157 |
|
|
"ldr q4, [%x[B_ptr], #0x10]\n" |
158 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
159 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
160 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
161 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
162 |
|
|
".inst 0x6e40fc6e // bfdot v14.4s, v3.8h, v0.8h\n" |
163 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
164 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
165 |
|
|
".inst 0x6e40fc8f // bfdot v15.4s, v4.8h, v0.8h\n" |
166 |
|
|
"4:" // Width 1: Multiply loop: Main loop skip |
167 |
|
|
"cbz x24, 7f\n" |
168 |
|
|
"tbz x24, #1, 5f\n" |
169 |
|
|
"ldr s0, [x23], #0x4\n" |
170 |
|
|
"tbz x24, #0, 6f\n" |
171 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
172 |
|
|
"b 6f\n" |
173 |
|
|
"5:" // Width 1: Multiply loop: Ragged operand read: partial_1_0 |
174 |
|
|
"ldr h0, [x23, #0x0]\n" |
175 |
|
|
"6:" // Width 1: Multiply loop: Ragged operand read: Done |
176 |
|
|
"ldr q5, [%x[B_ptr], #0x0]\n" |
177 |
|
|
"ldr q6, [%x[B_ptr], #0x10]\n" |
178 |
|
|
"dup v0.2d, v0.d[0]\n" |
179 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
180 |
|
|
".inst 0x6e40fcae // bfdot v14.4s, v5.8h, v0.8h\n" |
181 |
|
|
".inst 0x6e40fccf // bfdot v15.4s, v6.8h, v0.8h\n" |
182 |
|
|
"7:" // Width 1: Multiply loop: No odd multiplies |
183 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
184 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
185 |
|
|
"tbz %x[flags], #1, 8f\n" |
186 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
187 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
188 |
|
|
"ld1r { v17.4s }, [x21]\n" |
189 |
|
|
"ld1r { v16.4s }, [x20]\n" |
190 |
|
|
"fmin v14.4s, v14.4s, v17.4s\n" |
191 |
|
|
"fmax v14.4s, v14.4s, v16.4s\n" |
192 |
|
|
"8:" // Width 1: No activation |
193 |
|
|
"cmp %x[N], #0x4\n" |
194 |
|
|
"blt 9f\n" |
195 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
196 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
197 |
|
|
"b 11f\n" |
198 |
|
|
"9:" // Width 1: Partial writeback |
199 |
|
|
"tbz %x[N], #1, 10f\n" |
200 |
|
|
"str d14, [%x[output_ptr]], #0x8\n" |
201 |
|
|
"tbz %x[N], #0, 11f\n" |
202 |
|
|
"st1 { v14.s }[2], [%x[output_ptr]]\n" |
203 |
|
|
"b 11f\n" |
204 |
|
|
"10:" // Width 1: Partial direct writeback: partial_1_0 |
205 |
|
|
"str s14, [%x[output_ptr], #0x0]\n" |
206 |
|
|
"11:" // Width 1: Writeback done |
207 |
|
|
"b 100f\n" |
208 |
|
|
"12:" // Width 2 |
209 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
210 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
211 |
|
|
"mov x24, %x[K]\n" |
212 |
|
|
"movi v18.16b, #0x0\n" |
213 |
|
|
"mov x23, %x[A_ptr]\n" |
214 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
215 |
|
|
"cmp x24, #0x4\n" |
216 |
|
|
"zip2 v17.4s, v15.4s, v18.4s\n" |
217 |
|
|
"zip1 v16.4s, v15.4s, v18.4s\n" |
218 |
|
|
"zip2 v15.4s, v14.4s, v18.4s\n" |
219 |
|
|
"zip1 v14.4s, v14.4s, v18.4s\n" |
220 |
|
|
"blt 15f\n" |
221 |
|
|
"cmp x24, #0x8\n" |
222 |
|
|
"blt 14f\n" |
223 |
|
|
"13:" // Width 2: Multiply loop: Main loop head |
224 |
|
|
"ld1r { v0.2d }, [x23]\n" |
225 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
226 |
|
|
"sub x24, x24, #0x4\n" |
227 |
|
|
"add x23, x23, #0x8\n" |
228 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
229 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
230 |
|
|
"cmp x24, #0x8\n" |
231 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
232 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
233 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
234 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
235 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
236 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
237 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
238 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
239 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
240 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
241 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
242 |
|
|
"bge 13b\n" |
243 |
|
|
"14:" // Width 2: Multiply loop: Single iteration only |
244 |
|
|
"ld1r { v0.2d }, [x23]\n" |
245 |
|
|
"ldr q5, [%x[B_ptr], #0x0]\n" |
246 |
|
|
"add x23, x23, #0x8\n" |
247 |
|
|
"sub x24, x24, #0x4\n" |
248 |
|
|
"ldr q6, [%x[B_ptr], #0x10]\n" |
249 |
|
|
"ldr q7, [%x[B_ptr], #0x20]\n" |
250 |
|
|
"ldr q8, [%x[B_ptr], #0x30]\n" |
251 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
252 |
|
|
".inst 0x6e40fcae // bfdot v14.4s, v5.8h, v0.8h\n" |
253 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
254 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
255 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
256 |
|
|
".inst 0x6e40fccf // bfdot v15.4s, v6.8h, v0.8h\n" |
257 |
|
|
".inst 0x6e40fcf0 // bfdot v16.4s, v7.8h, v0.8h\n" |
258 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
259 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
260 |
|
|
".inst 0x6e40fd11 // bfdot v17.4s, v8.8h, v0.8h\n" |
261 |
|
|
"15:" // Width 2: Multiply loop: Main loop skip |
262 |
|
|
"cbz x24, 18f\n" |
263 |
|
|
"tbz x24, #1, 16f\n" |
264 |
|
|
"ldr s0, [x23], #0x4\n" |
265 |
|
|
"tbz x24, #0, 17f\n" |
266 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
267 |
|
|
"b 17f\n" |
268 |
|
|
"16:" // Width 2: Multiply loop: Ragged operand read: partial_1_0 |
269 |
|
|
"ldr h0, [x23, #0x0]\n" |
270 |
|
|
"17:" // Width 2: Multiply loop: Ragged operand read: Done |
271 |
|
|
"ldr q9, [%x[B_ptr], #0x0]\n" |
272 |
|
|
"ldr q10, [%x[B_ptr], #0x10]\n" |
273 |
|
|
"dup v0.2d, v0.d[0]\n" |
274 |
|
|
"ldr q11, [%x[B_ptr], #0x20]\n" |
275 |
|
|
"ldr q12, [%x[B_ptr], #0x30]\n" |
276 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
277 |
|
|
".inst 0x6e40fd2e // bfdot v14.4s, v9.8h, v0.8h\n" |
278 |
|
|
".inst 0x6e40fd4f // bfdot v15.4s, v10.8h, v0.8h\n" |
279 |
|
|
".inst 0x6e40fd70 // bfdot v16.4s, v11.8h, v0.8h\n" |
280 |
|
|
".inst 0x6e40fd91 // bfdot v17.4s, v12.8h, v0.8h\n" |
281 |
|
|
"18:" // Width 2: Multiply loop: No odd multiplies |
282 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
283 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
284 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
285 |
|
|
"tbz %x[flags], #1, 19f\n" |
286 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
287 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
288 |
|
|
"ld1r { v17.4s }, [x21]\n" |
289 |
|
|
"ld1r { v16.4s }, [x20]\n" |
290 |
|
|
"fmin v14.4s, v14.4s, v17.4s\n" |
291 |
|
|
"fmin v15.4s, v15.4s, v17.4s\n" |
292 |
|
|
"fmax v14.4s, v14.4s, v16.4s\n" |
293 |
|
|
"fmax v15.4s, v15.4s, v16.4s\n" |
294 |
|
|
"19:" // Width 2: No activation |
295 |
|
|
"cmp %x[N], #0x8\n" |
296 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
297 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
298 |
|
|
"blt 20f\n" |
299 |
|
|
"str q15, [%x[output_ptr], #0x0]\n" |
300 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
301 |
|
|
"b 22f\n" |
302 |
|
|
"20:" // Width 2: Partial writeback |
303 |
|
|
"tbz %x[N], #1, 21f\n" |
304 |
|
|
"str d15, [%x[output_ptr]], #0x8\n" |
305 |
|
|
"tbz %x[N], #0, 22f\n" |
306 |
|
|
"st1 { v15.s }[2], [%x[output_ptr]]\n" |
307 |
|
|
"b 22f\n" |
308 |
|
|
"21:" // Width 2: Partial direct writeback: partial_1_4 |
309 |
|
|
"tbz %x[N], #0, 22f\n" |
310 |
|
|
"str s15, [%x[output_ptr], #0x0]\n" |
311 |
|
|
"22:" // Width 2: Writeback done |
312 |
|
|
"b 100f\n" |
313 |
|
|
"23:" // Width 3 |
314 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
315 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
316 |
|
|
"mov x24, %x[K]\n" |
317 |
|
|
"movi v20.16b, #0x0\n" |
318 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
319 |
|
|
"mov x23, %x[A_ptr]\n" |
320 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
321 |
|
|
"cmp x24, #0x4\n" |
322 |
|
|
"zip2 v17.4s, v15.4s, v20.4s\n" |
323 |
|
|
"zip2 v19.4s, v16.4s, v20.4s\n" |
324 |
|
|
"zip1 v18.4s, v16.4s, v20.4s\n" |
325 |
|
|
"zip1 v16.4s, v15.4s, v20.4s\n" |
326 |
|
|
"zip2 v15.4s, v14.4s, v20.4s\n" |
327 |
|
|
"zip1 v14.4s, v14.4s, v20.4s\n" |
328 |
|
|
"blt 26f\n" |
329 |
|
|
"cmp x24, #0x8\n" |
330 |
|
|
"blt 25f\n" |
331 |
|
|
"24:" // Width 3: Multiply loop: Main loop head |
332 |
|
|
"ld1r { v0.2d }, [x23]\n" |
333 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
334 |
|
|
"sub x24, x24, #0x4\n" |
335 |
|
|
"add x23, x23, #0x8\n" |
336 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
337 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
338 |
|
|
"cmp x24, #0x8\n" |
339 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
340 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
341 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
342 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
343 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
344 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
345 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
346 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
347 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
348 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
349 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
350 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
351 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
352 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
353 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
354 |
|
|
"bge 24b\n" |
355 |
|
|
"25:" // Width 3: Multiply loop: Single iteration only |
356 |
|
|
"ld1r { v0.2d }, [x23]\n" |
357 |
|
|
"ldr q7, [%x[B_ptr], #0x0]\n" |
358 |
|
|
"add x23, x23, #0x8\n" |
359 |
|
|
"sub x24, x24, #0x4\n" |
360 |
|
|
"ldr q8, [%x[B_ptr], #0x10]\n" |
361 |
|
|
"ldr q9, [%x[B_ptr], #0x20]\n" |
362 |
|
|
"ldr q10, [%x[B_ptr], #0x30]\n" |
363 |
|
|
"ldr q11, [%x[B_ptr], #0x40]\n" |
364 |
|
|
"ldr q12, [%x[B_ptr], #0x50]\n" |
365 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
366 |
|
|
".inst 0x6e40fcee // bfdot v14.4s, v7.8h, v0.8h\n" |
367 |
|
|
".inst 0x6e40fd0f // bfdot v15.4s, v8.8h, v0.8h\n" |
368 |
|
|
".inst 0x6e40fd30 // bfdot v16.4s, v9.8h, v0.8h\n" |
369 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
370 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
371 |
|
|
".inst 0x6e40fd51 // bfdot v17.4s, v10.8h, v0.8h\n" |
372 |
|
|
".inst 0x6e40fd72 // bfdot v18.4s, v11.8h, v0.8h\n" |
373 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
374 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
375 |
|
|
".inst 0x6e40fd93 // bfdot v19.4s, v12.8h, v0.8h\n" |
376 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
377 |
|
|
"26:" // Width 3: Multiply loop: Main loop skip |
378 |
|
|
"cbz x24, 29f\n" |
379 |
|
|
"tbz x24, #1, 27f\n" |
380 |
|
|
"ldr s0, [x23], #0x4\n" |
381 |
|
|
"tbz x24, #0, 28f\n" |
382 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
383 |
|
|
"b 28f\n" |
384 |
|
|
"27:" // Width 3: Multiply loop: Ragged operand read: partial_1_0 |
385 |
|
|
"ldr h0, [x23, #0x0]\n" |
386 |
|
|
"28:" // Width 3: Multiply loop: Ragged operand read: Done |
387 |
|
|
"ldr q13, [%x[B_ptr], #0x0]\n" |
388 |
|
|
"ldr q1, [%x[B_ptr], #0x10]\n" |
389 |
|
|
"dup v0.2d, v0.d[0]\n" |
390 |
|
|
"ldr q2, [%x[B_ptr], #0x20]\n" |
391 |
|
|
"ldr q3, [%x[B_ptr], #0x30]\n" |
392 |
|
|
"ldr q4, [%x[B_ptr], #0x40]\n" |
393 |
|
|
"ldr q5, [%x[B_ptr], #0x50]\n" |
394 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
395 |
|
|
".inst 0x6e40fdae // bfdot v14.4s, v13.8h, v0.8h\n" |
396 |
|
|
".inst 0x6e40fc2f // bfdot v15.4s, v1.8h, v0.8h\n" |
397 |
|
|
".inst 0x6e40fc50 // bfdot v16.4s, v2.8h, v0.8h\n" |
398 |
|
|
".inst 0x6e40fc71 // bfdot v17.4s, v3.8h, v0.8h\n" |
399 |
|
|
".inst 0x6e40fc92 // bfdot v18.4s, v4.8h, v0.8h\n" |
400 |
|
|
".inst 0x6e40fcb3 // bfdot v19.4s, v5.8h, v0.8h\n" |
401 |
|
|
"29:" // Width 3: Multiply loop: No odd multiplies |
402 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
403 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
404 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
405 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
406 |
|
|
"tbz %x[flags], #1, 30f\n" |
407 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
408 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
409 |
|
|
"ld1r { v18.4s }, [x21]\n" |
410 |
|
|
"ld1r { v17.4s }, [x20]\n" |
411 |
|
|
"fmin v14.4s, v14.4s, v18.4s\n" |
412 |
|
|
"fmin v15.4s, v15.4s, v18.4s\n" |
413 |
|
|
"fmin v16.4s, v16.4s, v18.4s\n" |
414 |
|
|
"fmax v14.4s, v14.4s, v17.4s\n" |
415 |
|
|
"fmax v15.4s, v15.4s, v17.4s\n" |
416 |
|
|
"fmax v16.4s, v16.4s, v17.4s\n" |
417 |
|
|
"30:" // Width 3: No activation |
418 |
|
|
"cmp %x[N], #0xc\n" |
419 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
420 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
421 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x20\n" |
422 |
|
|
"blt 31f\n" |
423 |
|
|
"str q16, [%x[output_ptr], #0x0]\n" |
424 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
425 |
|
|
"b 33f\n" |
426 |
|
|
"31:" // Width 3: Partial writeback |
427 |
|
|
"tbz %x[N], #1, 32f\n" |
428 |
|
|
"str d16, [%x[output_ptr]], #0x8\n" |
429 |
|
|
"tbz %x[N], #0, 33f\n" |
430 |
|
|
"st1 { v16.s }[2], [%x[output_ptr]]\n" |
431 |
|
|
"b 33f\n" |
432 |
|
|
"32:" // Width 3: Partial direct writeback: partial_1_8 |
433 |
|
|
"tbz %x[N], #0, 33f\n" |
434 |
|
|
"str s16, [%x[output_ptr], #0x0]\n" |
435 |
|
|
"33:" // Width 3: Writeback done |
436 |
|
|
"b 100f\n" |
437 |
|
|
"34:" // Width 4 |
438 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
439 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
440 |
|
|
"mov x24, %x[K]\n" |
441 |
|
|
"add x20, %x[B_ptr], x26\n" |
442 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
443 |
|
|
"ldr q17, [x20, #0x0]\n" |
444 |
|
|
"movi v22.16b, #0x0\n" |
445 |
|
|
"mov x23, %x[A_ptr]\n" |
446 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
447 |
|
|
"add x20, x20, #0x30\n" |
448 |
|
|
"cmp x24, #0x4\n" |
449 |
|
|
"zip2 v21.4s, v17.4s, v22.4s\n" |
450 |
|
|
"zip1 v20.4s, v17.4s, v22.4s\n" |
451 |
|
|
"zip2 v19.4s, v16.4s, v22.4s\n" |
452 |
|
|
"zip1 v18.4s, v16.4s, v22.4s\n" |
453 |
|
|
"zip2 v17.4s, v15.4s, v22.4s\n" |
454 |
|
|
"zip1 v16.4s, v15.4s, v22.4s\n" |
455 |
|
|
"zip2 v15.4s, v14.4s, v22.4s\n" |
456 |
|
|
"zip1 v14.4s, v14.4s, v22.4s\n" |
457 |
|
|
"blt 37f\n" |
458 |
|
|
"cmp x24, #0x8\n" |
459 |
|
|
"blt 36f\n" |
460 |
|
|
"35:" // Width 4: Multiply loop: Main loop head |
461 |
|
|
"ld1r { v0.2d }, [x23]\n" |
462 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
463 |
|
|
"sub x24, x24, #0x4\n" |
464 |
|
|
"add x23, x23, #0x8\n" |
465 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
466 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
467 |
|
|
"cmp x24, #0x8\n" |
468 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
469 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
470 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
471 |
|
|
"ldr q7, [x20, #0x0]\n" |
472 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
473 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
474 |
|
|
"ldr q8, [x20, #0x10]\n" |
475 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
476 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
477 |
|
|
"add x20, x20, #0x60\n" |
478 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
479 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
480 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
481 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
482 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
483 |
|
|
".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n" |
484 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
485 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
486 |
|
|
".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n" |
487 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
488 |
|
|
"bge 35b\n" |
489 |
|
|
"36:" // Width 4: Multiply loop: Single iteration only |
490 |
|
|
"ld1r { v0.2d }, [x23]\n" |
491 |
|
|
"ldr q9, [%x[B_ptr], #0x0]\n" |
492 |
|
|
"add x23, x23, #0x8\n" |
493 |
|
|
"sub x24, x24, #0x4\n" |
494 |
|
|
"ldr q10, [%x[B_ptr], #0x10]\n" |
495 |
|
|
"ldr q11, [%x[B_ptr], #0x20]\n" |
496 |
|
|
"ldr q12, [%x[B_ptr], #0x30]\n" |
497 |
|
|
"ldr q13, [%x[B_ptr], #0x40]\n" |
498 |
|
|
"ldr q1, [%x[B_ptr], #0x50]\n" |
499 |
|
|
"ldr q2, [x20, #0x0]\n" |
500 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
501 |
|
|
".inst 0x6e40fd2e // bfdot v14.4s, v9.8h, v0.8h\n" |
502 |
|
|
"ldr q3, [x20, #0x10]\n" |
503 |
|
|
".inst 0x6e40fd4f // bfdot v15.4s, v10.8h, v0.8h\n" |
504 |
|
|
".inst 0x6e40fd70 // bfdot v16.4s, v11.8h, v0.8h\n" |
505 |
|
|
"add x20, x20, #0x60\n" |
506 |
|
|
".inst 0x6e40fd91 // bfdot v17.4s, v12.8h, v0.8h\n" |
507 |
|
|
".inst 0x6e40fdb2 // bfdot v18.4s, v13.8h, v0.8h\n" |
508 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
509 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
510 |
|
|
".inst 0x6e40fc33 // bfdot v19.4s, v1.8h, v0.8h\n" |
511 |
|
|
".inst 0x6e40fc54 // bfdot v20.4s, v2.8h, v0.8h\n" |
512 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
513 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
514 |
|
|
".inst 0x6e40fc75 // bfdot v21.4s, v3.8h, v0.8h\n" |
515 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
516 |
|
|
"37:" // Width 4: Multiply loop: Main loop skip |
517 |
|
|
"cbz x24, 40f\n" |
518 |
|
|
"tbz x24, #1, 38f\n" |
519 |
|
|
"ldr s0, [x23], #0x4\n" |
520 |
|
|
"tbz x24, #0, 39f\n" |
521 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
522 |
|
|
"b 39f\n" |
523 |
|
|
"38:" // Width 4: Multiply loop: Ragged operand read: partial_1_0 |
524 |
|
|
"ldr h0, [x23, #0x0]\n" |
525 |
|
|
"39:" // Width 4: Multiply loop: Ragged operand read: Done |
526 |
|
|
"ldr q4, [%x[B_ptr], #0x0]\n" |
527 |
|
|
"ldr q5, [%x[B_ptr], #0x10]\n" |
528 |
|
|
"dup v0.2d, v0.d[0]\n" |
529 |
|
|
"ldr q6, [%x[B_ptr], #0x20]\n" |
530 |
|
|
"ldr q7, [%x[B_ptr], #0x30]\n" |
531 |
|
|
"ldr q8, [%x[B_ptr], #0x40]\n" |
532 |
|
|
"ldr q9, [%x[B_ptr], #0x50]\n" |
533 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
534 |
|
|
"ldr q10, [x20, #0x0]\n" |
535 |
|
|
"ldr q11, [x20, #0x10]\n" |
536 |
|
|
".inst 0x6e40fc8e // bfdot v14.4s, v4.8h, v0.8h\n" |
537 |
|
|
".inst 0x6e40fcaf // bfdot v15.4s, v5.8h, v0.8h\n" |
538 |
|
|
".inst 0x6e40fcd0 // bfdot v16.4s, v6.8h, v0.8h\n" |
539 |
|
|
".inst 0x6e40fcf1 // bfdot v17.4s, v7.8h, v0.8h\n" |
540 |
|
|
".inst 0x6e40fd12 // bfdot v18.4s, v8.8h, v0.8h\n" |
541 |
|
|
".inst 0x6e40fd33 // bfdot v19.4s, v9.8h, v0.8h\n" |
542 |
|
|
".inst 0x6e40fd54 // bfdot v20.4s, v10.8h, v0.8h\n" |
543 |
|
|
".inst 0x6e40fd75 // bfdot v21.4s, v11.8h, v0.8h\n" |
544 |
|
|
"40:" // Width 4: Multiply loop: No odd multiplies |
545 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
546 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
547 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
548 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
549 |
|
|
"faddp v17.4s, v20.4s, v21.4s\n" |
550 |
|
|
"tbz %x[flags], #1, 41f\n" |
551 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
552 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
553 |
|
|
"ld1r { v19.4s }, [x21]\n" |
554 |
|
|
"ld1r { v18.4s }, [x20]\n" |
555 |
|
|
"fmin v14.4s, v14.4s, v19.4s\n" |
556 |
|
|
"fmin v15.4s, v15.4s, v19.4s\n" |
557 |
|
|
"fmin v16.4s, v16.4s, v19.4s\n" |
558 |
|
|
"fmin v17.4s, v17.4s, v19.4s\n" |
559 |
|
|
"fmax v14.4s, v14.4s, v18.4s\n" |
560 |
|
|
"fmax v15.4s, v15.4s, v18.4s\n" |
561 |
|
|
"fmax v16.4s, v16.4s, v18.4s\n" |
562 |
|
|
"fmax v17.4s, v17.4s, v18.4s\n" |
563 |
|
|
"41:" // Width 4: No activation |
564 |
|
|
"cmp %x[N], #0x10\n" |
565 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
566 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
567 |
|
|
"str q16, [%x[output_ptr], #0x20]\n" |
568 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x30\n" |
569 |
|
|
"blt 42f\n" |
570 |
|
|
"str q17, [%x[output_ptr], #0x0]\n" |
571 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
572 |
|
|
"b 44f\n" |
573 |
|
|
"42:" // Width 4: Partial writeback |
574 |
|
|
"tbz %x[N], #1, 43f\n" |
575 |
|
|
"str d17, [%x[output_ptr]], #0x8\n" |
576 |
|
|
"tbz %x[N], #0, 44f\n" |
577 |
|
|
"st1 { v17.s }[2], [%x[output_ptr]]\n" |
578 |
|
|
"b 44f\n" |
579 |
|
|
"43:" // Width 4: Partial direct writeback: partial_1_12 |
580 |
|
|
"tbz %x[N], #0, 44f\n" |
581 |
|
|
"str s17, [%x[output_ptr], #0x0]\n" |
582 |
|
|
"44:" // Width 4: Writeback done |
583 |
|
|
"b 100f\n" |
584 |
|
|
"45:" // Width 5 |
585 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
586 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
587 |
|
|
"mov x24, %x[K]\n" |
588 |
|
|
"add x20, %x[B_ptr], x26\n" |
589 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
590 |
|
|
"ldr q17, [x20, #0x0]\n" |
591 |
|
|
"movi v24.16b, #0x0\n" |
592 |
|
|
"mov x23, %x[A_ptr]\n" |
593 |
|
|
"ldr q18, [x20, #0x10]\n" |
594 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
595 |
|
|
"add x20, x20, #0x30\n" |
596 |
|
|
"cmp x24, #0x4\n" |
597 |
|
|
"zip2 v21.4s, v17.4s, v24.4s\n" |
598 |
|
|
"zip1 v20.4s, v17.4s, v24.4s\n" |
599 |
|
|
"zip2 v19.4s, v16.4s, v24.4s\n" |
600 |
|
|
"zip2 v17.4s, v15.4s, v24.4s\n" |
601 |
|
|
"zip2 v23.4s, v18.4s, v24.4s\n" |
602 |
|
|
"zip1 v22.4s, v18.4s, v24.4s\n" |
603 |
|
|
"zip1 v18.4s, v16.4s, v24.4s\n" |
604 |
|
|
"zip1 v16.4s, v15.4s, v24.4s\n" |
605 |
|
|
"zip2 v15.4s, v14.4s, v24.4s\n" |
606 |
|
|
"zip1 v14.4s, v14.4s, v24.4s\n" |
607 |
|
|
"blt 48f\n" |
608 |
|
|
"cmp x24, #0x8\n" |
609 |
|
|
"blt 47f\n" |
610 |
|
|
"46:" // Width 5: Multiply loop: Main loop head |
611 |
|
|
"ld1r { v0.2d }, [x23]\n" |
612 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
613 |
|
|
"sub x24, x24, #0x4\n" |
614 |
|
|
"add x23, x23, #0x8\n" |
615 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
616 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
617 |
|
|
"cmp x24, #0x8\n" |
618 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
619 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
620 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
621 |
|
|
"ldr q7, [x20, #0x0]\n" |
622 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
623 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
624 |
|
|
"ldr q8, [x20, #0x10]\n" |
625 |
|
|
"ldr q9, [x20, #0x20]\n" |
626 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
627 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
628 |
|
|
"ldr q10, [x20, #0x30]\n" |
629 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
630 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
631 |
|
|
"add x20, x20, #0x60\n" |
632 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
633 |
|
|
".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n" |
634 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
635 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
636 |
|
|
".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n" |
637 |
|
|
".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n" |
638 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
639 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
640 |
|
|
".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n" |
641 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
642 |
|
|
"bge 46b\n" |
643 |
|
|
"47:" // Width 5: Multiply loop: Single iteration only |
644 |
|
|
"ld1r { v0.2d }, [x23]\n" |
645 |
|
|
"ldr q11, [%x[B_ptr], #0x0]\n" |
646 |
|
|
"add x23, x23, #0x8\n" |
647 |
|
|
"sub x24, x24, #0x4\n" |
648 |
|
|
"ldr q12, [%x[B_ptr], #0x10]\n" |
649 |
|
|
"ldr q13, [%x[B_ptr], #0x20]\n" |
650 |
|
|
"ldr q1, [%x[B_ptr], #0x30]\n" |
651 |
|
|
"ldr q2, [%x[B_ptr], #0x40]\n" |
652 |
|
|
"ldr q3, [%x[B_ptr], #0x50]\n" |
653 |
|
|
"ldr q4, [x20, #0x0]\n" |
654 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
655 |
|
|
".inst 0x6e40fd6e // bfdot v14.4s, v11.8h, v0.8h\n" |
656 |
|
|
"ldr q5, [x20, #0x10]\n" |
657 |
|
|
"ldr q6, [x20, #0x20]\n" |
658 |
|
|
".inst 0x6e40fd8f // bfdot v15.4s, v12.8h, v0.8h\n" |
659 |
|
|
".inst 0x6e40fdb0 // bfdot v16.4s, v13.8h, v0.8h\n" |
660 |
|
|
"ldr q7, [x20, #0x30]\n" |
661 |
|
|
".inst 0x6e40fc31 // bfdot v17.4s, v1.8h, v0.8h\n" |
662 |
|
|
".inst 0x6e40fc52 // bfdot v18.4s, v2.8h, v0.8h\n" |
663 |
|
|
"add x20, x20, #0x60\n" |
664 |
|
|
".inst 0x6e40fc73 // bfdot v19.4s, v3.8h, v0.8h\n" |
665 |
|
|
".inst 0x6e40fc94 // bfdot v20.4s, v4.8h, v0.8h\n" |
666 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
667 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
668 |
|
|
".inst 0x6e40fcb5 // bfdot v21.4s, v5.8h, v0.8h\n" |
669 |
|
|
".inst 0x6e40fcd6 // bfdot v22.4s, v6.8h, v0.8h\n" |
670 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
671 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
672 |
|
|
".inst 0x6e40fcf7 // bfdot v23.4s, v7.8h, v0.8h\n" |
673 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
674 |
|
|
"48:" // Width 5: Multiply loop: Main loop skip |
675 |
|
|
"cbz x24, 51f\n" |
676 |
|
|
"tbz x24, #1, 49f\n" |
677 |
|
|
"ldr s0, [x23], #0x4\n" |
678 |
|
|
"tbz x24, #0, 50f\n" |
679 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
680 |
|
|
"b 50f\n" |
681 |
|
|
"49:" // Width 5: Multiply loop: Ragged operand read: partial_1_0 |
682 |
|
|
"ldr h0, [x23, #0x0]\n" |
683 |
|
|
"50:" // Width 5: Multiply loop: Ragged operand read: Done |
684 |
|
|
"ldr q8, [%x[B_ptr], #0x0]\n" |
685 |
|
|
"ldr q9, [%x[B_ptr], #0x10]\n" |
686 |
|
|
"dup v0.2d, v0.d[0]\n" |
687 |
|
|
"ldr q10, [%x[B_ptr], #0x20]\n" |
688 |
|
|
"ldr q11, [%x[B_ptr], #0x30]\n" |
689 |
|
|
"ldr q12, [%x[B_ptr], #0x40]\n" |
690 |
|
|
"ldr q13, [%x[B_ptr], #0x50]\n" |
691 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
692 |
|
|
"ldr q1, [x20, #0x0]\n" |
693 |
|
|
"ldr q2, [x20, #0x10]\n" |
694 |
|
|
".inst 0x6e40fd0e // bfdot v14.4s, v8.8h, v0.8h\n" |
695 |
|
|
".inst 0x6e40fd2f // bfdot v15.4s, v9.8h, v0.8h\n" |
696 |
|
|
"ldr q3, [x20, #0x20]\n" |
697 |
|
|
"ldr q4, [x20, #0x30]\n" |
698 |
|
|
".inst 0x6e40fd50 // bfdot v16.4s, v10.8h, v0.8h\n" |
699 |
|
|
".inst 0x6e40fd71 // bfdot v17.4s, v11.8h, v0.8h\n" |
700 |
|
|
".inst 0x6e40fd92 // bfdot v18.4s, v12.8h, v0.8h\n" |
701 |
|
|
".inst 0x6e40fdb3 // bfdot v19.4s, v13.8h, v0.8h\n" |
702 |
|
|
".inst 0x6e40fc34 // bfdot v20.4s, v1.8h, v0.8h\n" |
703 |
|
|
".inst 0x6e40fc55 // bfdot v21.4s, v2.8h, v0.8h\n" |
704 |
|
|
".inst 0x6e40fc76 // bfdot v22.4s, v3.8h, v0.8h\n" |
705 |
|
|
".inst 0x6e40fc97 // bfdot v23.4s, v4.8h, v0.8h\n" |
706 |
|
|
"51:" // Width 5: Multiply loop: No odd multiplies |
707 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
708 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
709 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
710 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
711 |
|
|
"faddp v17.4s, v20.4s, v21.4s\n" |
712 |
|
|
"faddp v18.4s, v22.4s, v23.4s\n" |
713 |
|
|
"tbz %x[flags], #1, 52f\n" |
714 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
715 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
716 |
|
|
"ld1r { v20.4s }, [x21]\n" |
717 |
|
|
"ld1r { v19.4s }, [x20]\n" |
718 |
|
|
"fmin v14.4s, v14.4s, v20.4s\n" |
719 |
|
|
"fmin v15.4s, v15.4s, v20.4s\n" |
720 |
|
|
"fmin v16.4s, v16.4s, v20.4s\n" |
721 |
|
|
"fmin v17.4s, v17.4s, v20.4s\n" |
722 |
|
|
"fmin v18.4s, v18.4s, v20.4s\n" |
723 |
|
|
"fmax v14.4s, v14.4s, v19.4s\n" |
724 |
|
|
"fmax v15.4s, v15.4s, v19.4s\n" |
725 |
|
|
"fmax v16.4s, v16.4s, v19.4s\n" |
726 |
|
|
"fmax v17.4s, v17.4s, v19.4s\n" |
727 |
|
|
"fmax v18.4s, v18.4s, v19.4s\n" |
728 |
|
|
"52:" // Width 5: No activation |
729 |
|
|
"cmp %x[N], #0x14\n" |
730 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
731 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
732 |
|
|
"str q16, [%x[output_ptr], #0x20]\n" |
733 |
|
|
"str q17, [%x[output_ptr], #0x30]\n" |
734 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x40\n" |
735 |
|
|
"blt 53f\n" |
736 |
|
|
"str q18, [%x[output_ptr], #0x0]\n" |
737 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
738 |
|
|
"b 55f\n" |
739 |
|
|
"53:" // Width 5: Partial writeback |
740 |
|
|
"tbz %x[N], #1, 54f\n" |
741 |
|
|
"str d18, [%x[output_ptr]], #0x8\n" |
742 |
|
|
"tbz %x[N], #0, 55f\n" |
743 |
|
|
"st1 { v18.s }[2], [%x[output_ptr]]\n" |
744 |
|
|
"b 55f\n" |
745 |
|
|
"54:" // Width 5: Partial direct writeback: partial_1_16 |
746 |
|
|
"tbz %x[N], #0, 55f\n" |
747 |
|
|
"str s18, [%x[output_ptr], #0x0]\n" |
748 |
|
|
"55:" // Width 5: Writeback done |
749 |
|
|
"b 100f\n" |
750 |
|
|
"56:" // Width 6 |
751 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
752 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
753 |
|
|
"mov x24, %x[K]\n" |
754 |
|
|
"add x20, %x[B_ptr], x26\n" |
755 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
756 |
|
|
"ldr q17, [x20, #0x0]\n" |
757 |
|
|
"movi v26.16b, #0x0\n" |
758 |
|
|
"mov x23, %x[A_ptr]\n" |
759 |
|
|
"ldr q18, [x20, #0x10]\n" |
760 |
|
|
"ldr q19, [x20, #0x20]\n" |
761 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
762 |
|
|
"add x20, x20, #0x30\n" |
763 |
|
|
"cmp x24, #0x4\n" |
764 |
|
|
"zip2 v21.4s, v17.4s, v26.4s\n" |
765 |
|
|
"zip1 v20.4s, v17.4s, v26.4s\n" |
766 |
|
|
"zip2 v17.4s, v15.4s, v26.4s\n" |
767 |
|
|
"zip2 v25.4s, v19.4s, v26.4s\n" |
768 |
|
|
"zip1 v24.4s, v19.4s, v26.4s\n" |
769 |
|
|
"zip2 v23.4s, v18.4s, v26.4s\n" |
770 |
|
|
"zip1 v22.4s, v18.4s, v26.4s\n" |
771 |
|
|
"zip2 v19.4s, v16.4s, v26.4s\n" |
772 |
|
|
"zip1 v18.4s, v16.4s, v26.4s\n" |
773 |
|
|
"zip1 v16.4s, v15.4s, v26.4s\n" |
774 |
|
|
"zip2 v15.4s, v14.4s, v26.4s\n" |
775 |
|
|
"zip1 v14.4s, v14.4s, v26.4s\n" |
776 |
|
|
"blt 59f\n" |
777 |
|
|
"cmp x24, #0x8\n" |
778 |
|
|
"blt 58f\n" |
779 |
|
|
"57:" // Width 6: Multiply loop: Main loop head |
780 |
|
|
"ld1r { v0.2d }, [x23]\n" |
781 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
782 |
|
|
"sub x24, x24, #0x4\n" |
783 |
|
|
"add x23, x23, #0x8\n" |
784 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
785 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
786 |
|
|
"cmp x24, #0x8\n" |
787 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
788 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
789 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
790 |
|
|
"ldr q7, [x20, #0x0]\n" |
791 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
792 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
793 |
|
|
"ldr q8, [x20, #0x10]\n" |
794 |
|
|
"ldr q9, [x20, #0x20]\n" |
795 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
796 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
797 |
|
|
"ldr q10, [x20, #0x30]\n" |
798 |
|
|
"ldr q11, [x20, #0x40]\n" |
799 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
800 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
801 |
|
|
"ldr q12, [x20, #0x50]\n" |
802 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
803 |
|
|
".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n" |
804 |
|
|
"add x20, x20, #0x60\n" |
805 |
|
|
".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n" |
806 |
|
|
".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n" |
807 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
808 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
809 |
|
|
".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n" |
810 |
|
|
".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n" |
811 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
812 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
813 |
|
|
".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n" |
814 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
815 |
|
|
"bge 57b\n" |
816 |
|
|
"58:" // Width 6: Multiply loop: Single iteration only |
817 |
|
|
"ld1r { v0.2d }, [x23]\n" |
818 |
|
|
"ldr q13, [%x[B_ptr], #0x0]\n" |
819 |
|
|
"add x23, x23, #0x8\n" |
820 |
|
|
"sub x24, x24, #0x4\n" |
821 |
|
|
"ldr q1, [%x[B_ptr], #0x10]\n" |
822 |
|
|
"ldr q2, [%x[B_ptr], #0x20]\n" |
823 |
|
|
"ldr q3, [%x[B_ptr], #0x30]\n" |
824 |
|
|
"ldr q4, [%x[B_ptr], #0x40]\n" |
825 |
|
|
"ldr q5, [%x[B_ptr], #0x50]\n" |
826 |
|
|
"ldr q6, [x20, #0x0]\n" |
827 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
828 |
|
|
".inst 0x6e40fdae // bfdot v14.4s, v13.8h, v0.8h\n" |
829 |
|
|
"ldr q7, [x20, #0x10]\n" |
830 |
|
|
"ldr q8, [x20, #0x20]\n" |
831 |
|
|
".inst 0x6e40fc2f // bfdot v15.4s, v1.8h, v0.8h\n" |
832 |
|
|
".inst 0x6e40fc50 // bfdot v16.4s, v2.8h, v0.8h\n" |
833 |
|
|
"ldr q9, [x20, #0x30]\n" |
834 |
|
|
"ldr q10, [x20, #0x40]\n" |
835 |
|
|
".inst 0x6e40fc71 // bfdot v17.4s, v3.8h, v0.8h\n" |
836 |
|
|
".inst 0x6e40fc92 // bfdot v18.4s, v4.8h, v0.8h\n" |
837 |
|
|
"ldr q11, [x20, #0x50]\n" |
838 |
|
|
".inst 0x6e40fcb3 // bfdot v19.4s, v5.8h, v0.8h\n" |
839 |
|
|
".inst 0x6e40fcd4 // bfdot v20.4s, v6.8h, v0.8h\n" |
840 |
|
|
"add x20, x20, #0x60\n" |
841 |
|
|
".inst 0x6e40fcf5 // bfdot v21.4s, v7.8h, v0.8h\n" |
842 |
|
|
".inst 0x6e40fd16 // bfdot v22.4s, v8.8h, v0.8h\n" |
843 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
844 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
845 |
|
|
".inst 0x6e40fd37 // bfdot v23.4s, v9.8h, v0.8h\n" |
846 |
|
|
".inst 0x6e40fd58 // bfdot v24.4s, v10.8h, v0.8h\n" |
847 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
848 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
849 |
|
|
".inst 0x6e40fd79 // bfdot v25.4s, v11.8h, v0.8h\n" |
850 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
851 |
|
|
"59:" // Width 6: Multiply loop: Main loop skip |
852 |
|
|
"cbz x24, 62f\n" |
853 |
|
|
"tbz x24, #1, 60f\n" |
854 |
|
|
"ldr s0, [x23], #0x4\n" |
855 |
|
|
"tbz x24, #0, 61f\n" |
856 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
857 |
|
|
"b 61f\n" |
858 |
|
|
"60:" // Width 6: Multiply loop: Ragged operand read: partial_1_0 |
859 |
|
|
"ldr h0, [x23, #0x0]\n" |
860 |
|
|
"61:" // Width 6: Multiply loop: Ragged operand read: Done |
861 |
|
|
"ldr q12, [%x[B_ptr], #0x0]\n" |
862 |
|
|
"ldr q13, [%x[B_ptr], #0x10]\n" |
863 |
|
|
"dup v0.2d, v0.d[0]\n" |
864 |
|
|
"ldr q1, [%x[B_ptr], #0x20]\n" |
865 |
|
|
"ldr q2, [%x[B_ptr], #0x30]\n" |
866 |
|
|
"ldr q3, [%x[B_ptr], #0x40]\n" |
867 |
|
|
"ldr q4, [%x[B_ptr], #0x50]\n" |
868 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
869 |
|
|
"ldr q5, [x20, #0x0]\n" |
870 |
|
|
"ldr q6, [x20, #0x10]\n" |
871 |
|
|
".inst 0x6e40fd8e // bfdot v14.4s, v12.8h, v0.8h\n" |
872 |
|
|
".inst 0x6e40fdaf // bfdot v15.4s, v13.8h, v0.8h\n" |
873 |
|
|
"ldr q7, [x20, #0x20]\n" |
874 |
|
|
"ldr q8, [x20, #0x30]\n" |
875 |
|
|
".inst 0x6e40fc30 // bfdot v16.4s, v1.8h, v0.8h\n" |
876 |
|
|
".inst 0x6e40fc51 // bfdot v17.4s, v2.8h, v0.8h\n" |
877 |
|
|
"ldr q9, [x20, #0x40]\n" |
878 |
|
|
"ldr q10, [x20, #0x50]\n" |
879 |
|
|
".inst 0x6e40fc72 // bfdot v18.4s, v3.8h, v0.8h\n" |
880 |
|
|
".inst 0x6e40fc93 // bfdot v19.4s, v4.8h, v0.8h\n" |
881 |
|
|
".inst 0x6e40fcb4 // bfdot v20.4s, v5.8h, v0.8h\n" |
882 |
|
|
".inst 0x6e40fcd5 // bfdot v21.4s, v6.8h, v0.8h\n" |
883 |
|
|
".inst 0x6e40fcf6 // bfdot v22.4s, v7.8h, v0.8h\n" |
884 |
|
|
".inst 0x6e40fd17 // bfdot v23.4s, v8.8h, v0.8h\n" |
885 |
|
|
".inst 0x6e40fd38 // bfdot v24.4s, v9.8h, v0.8h\n" |
886 |
|
|
".inst 0x6e40fd59 // bfdot v25.4s, v10.8h, v0.8h\n" |
887 |
|
|
"62:" // Width 6: Multiply loop: No odd multiplies |
888 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
889 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
890 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
891 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
892 |
|
|
"faddp v17.4s, v20.4s, v21.4s\n" |
893 |
|
|
"faddp v18.4s, v22.4s, v23.4s\n" |
894 |
|
|
"faddp v19.4s, v24.4s, v25.4s\n" |
895 |
|
|
"tbz %x[flags], #1, 63f\n" |
896 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
897 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
898 |
|
|
"ld1r { v21.4s }, [x21]\n" |
899 |
|
|
"ld1r { v20.4s }, [x20]\n" |
900 |
|
|
"fmin v14.4s, v14.4s, v21.4s\n" |
901 |
|
|
"fmin v15.4s, v15.4s, v21.4s\n" |
902 |
|
|
"fmin v16.4s, v16.4s, v21.4s\n" |
903 |
|
|
"fmin v17.4s, v17.4s, v21.4s\n" |
904 |
|
|
"fmin v18.4s, v18.4s, v21.4s\n" |
905 |
|
|
"fmin v19.4s, v19.4s, v21.4s\n" |
906 |
|
|
"fmax v14.4s, v14.4s, v20.4s\n" |
907 |
|
|
"fmax v15.4s, v15.4s, v20.4s\n" |
908 |
|
|
"fmax v16.4s, v16.4s, v20.4s\n" |
909 |
|
|
"fmax v17.4s, v17.4s, v20.4s\n" |
910 |
|
|
"fmax v18.4s, v18.4s, v20.4s\n" |
911 |
|
|
"fmax v19.4s, v19.4s, v20.4s\n" |
912 |
|
|
"63:" // Width 6: No activation |
913 |
|
|
"cmp %x[N], #0x18\n" |
914 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
915 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
916 |
|
|
"str q16, [%x[output_ptr], #0x20]\n" |
917 |
|
|
"str q17, [%x[output_ptr], #0x30]\n" |
918 |
|
|
"str q18, [%x[output_ptr], #0x40]\n" |
919 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x50\n" |
920 |
|
|
"blt 64f\n" |
921 |
|
|
"str q19, [%x[output_ptr], #0x0]\n" |
922 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
923 |
|
|
"b 66f\n" |
924 |
|
|
"64:" // Width 6: Partial writeback |
925 |
|
|
"tbz %x[N], #1, 65f\n" |
926 |
|
|
"str d19, [%x[output_ptr]], #0x8\n" |
927 |
|
|
"tbz %x[N], #0, 66f\n" |
928 |
|
|
"st1 { v19.s }[2], [%x[output_ptr]]\n" |
929 |
|
|
"b 66f\n" |
930 |
|
|
"65:" // Width 6: Partial direct writeback: partial_1_20 |
931 |
|
|
"tbz %x[N], #0, 66f\n" |
932 |
|
|
"str s19, [%x[output_ptr], #0x0]\n" |
933 |
|
|
"66:" // Width 6: Writeback done |
934 |
|
|
"b 100f\n" |
935 |
|
|
"67:" // Width 7 |
936 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
937 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
938 |
|
|
"mov x24, %x[K]\n" |
939 |
|
|
"add x21, %x[B_ptr], x26\n" |
940 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
941 |
|
|
"ldr q17, [x21, #0x0]\n" |
942 |
|
|
"add x20, %x[B_ptr], x26, LSL #1\n" |
943 |
|
|
"movi v28.16b, #0x0\n" |
944 |
|
|
"ldr q18, [x21, #0x10]\n" |
945 |
|
|
"ldr q19, [x21, #0x20]\n" |
946 |
|
|
"mov x23, %x[A_ptr]\n" |
947 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
948 |
|
|
"ldr q20, [x20, #0x0]\n" |
949 |
|
|
"cmp x24, #0x4\n" |
950 |
|
|
"add x21, x21, #0x30\n" |
951 |
|
|
"add x20, x20, #0x30\n" |
952 |
|
|
"zip2 v21.4s, v17.4s, v28.4s\n" |
953 |
|
|
"zip2 v25.4s, v19.4s, v28.4s\n" |
954 |
|
|
"zip1 v24.4s, v19.4s, v28.4s\n" |
955 |
|
|
"zip2 v23.4s, v18.4s, v28.4s\n" |
956 |
|
|
"zip2 v27.4s, v20.4s, v28.4s\n" |
957 |
|
|
"zip1 v26.4s, v20.4s, v28.4s\n" |
958 |
|
|
"zip1 v22.4s, v18.4s, v28.4s\n" |
959 |
|
|
"zip1 v20.4s, v17.4s, v28.4s\n" |
960 |
|
|
"zip2 v19.4s, v16.4s, v28.4s\n" |
961 |
|
|
"zip1 v18.4s, v16.4s, v28.4s\n" |
962 |
|
|
"zip2 v17.4s, v15.4s, v28.4s\n" |
963 |
|
|
"zip1 v16.4s, v15.4s, v28.4s\n" |
964 |
|
|
"zip2 v15.4s, v14.4s, v28.4s\n" |
965 |
|
|
"zip1 v14.4s, v14.4s, v28.4s\n" |
966 |
|
|
"blt 70f\n" |
967 |
|
|
"cmp x24, #0x8\n" |
968 |
|
|
"blt 69f\n" |
969 |
|
|
"68:" // Width 7: Multiply loop: Main loop head |
970 |
|
|
"ld1r { v0.2d }, [x23]\n" |
971 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
972 |
|
|
"sub x24, x24, #0x4\n" |
973 |
|
|
"add x23, x23, #0x8\n" |
974 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
975 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
976 |
|
|
"cmp x24, #0x8\n" |
977 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
978 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
979 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
980 |
|
|
"ldr q7, [x21, #0x0]\n" |
981 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
982 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
983 |
|
|
"ldr q8, [x21, #0x10]\n" |
984 |
|
|
"ldr q9, [x21, #0x20]\n" |
985 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
986 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
987 |
|
|
"ldr q10, [x21, #0x30]\n" |
988 |
|
|
"ldr q11, [x21, #0x40]\n" |
989 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
990 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
991 |
|
|
"ldr q12, [x21, #0x50]\n" |
992 |
|
|
"ldr q13, [x20, #0x0]\n" |
993 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
994 |
|
|
".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n" |
995 |
|
|
"ldr q1, [x20, #0x10]\n" |
996 |
|
|
".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n" |
997 |
|
|
".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n" |
998 |
|
|
"add x21, x21, #0x60\n" |
999 |
|
|
".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n" |
1000 |
|
|
".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n" |
1001 |
|
|
"add x20, x20, #0x60\n" |
1002 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
1003 |
|
|
".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n" |
1004 |
|
|
".inst 0x6e40fdba // bfdot v26.4s, v13.8h, v0.8h\n" |
1005 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
1006 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
1007 |
|
|
".inst 0x6e40fc3b // bfdot v27.4s, v1.8h, v0.8h\n" |
1008 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
1009 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1010 |
|
|
"bge 68b\n" |
1011 |
|
|
"69:" // Width 7: Multiply loop: Single iteration only |
1012 |
|
|
"ld1r { v0.2d }, [x23]\n" |
1013 |
|
|
"ldr q2, [%x[B_ptr], #0x0]\n" |
1014 |
|
|
"add x23, x23, #0x8\n" |
1015 |
|
|
"sub x24, x24, #0x4\n" |
1016 |
|
|
"ldr q3, [%x[B_ptr], #0x10]\n" |
1017 |
|
|
"ldr q4, [%x[B_ptr], #0x20]\n" |
1018 |
|
|
"ldr q5, [%x[B_ptr], #0x30]\n" |
1019 |
|
|
"ldr q6, [%x[B_ptr], #0x40]\n" |
1020 |
|
|
"ldr q7, [%x[B_ptr], #0x50]\n" |
1021 |
|
|
"ldr q8, [x21, #0x0]\n" |
1022 |
|
|
".inst 0x6e40fc4e // bfdot v14.4s, v2.8h, v0.8h\n" |
1023 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
1024 |
|
|
"ldr q9, [x21, #0x10]\n" |
1025 |
|
|
"ldr q10, [x21, #0x20]\n" |
1026 |
|
|
".inst 0x6e40fc6f // bfdot v15.4s, v3.8h, v0.8h\n" |
1027 |
|
|
".inst 0x6e40fc90 // bfdot v16.4s, v4.8h, v0.8h\n" |
1028 |
|
|
"ldr q11, [x21, #0x30]\n" |
1029 |
|
|
"ldr q12, [x21, #0x40]\n" |
1030 |
|
|
".inst 0x6e40fcb1 // bfdot v17.4s, v5.8h, v0.8h\n" |
1031 |
|
|
".inst 0x6e40fcd2 // bfdot v18.4s, v6.8h, v0.8h\n" |
1032 |
|
|
"ldr q13, [x21, #0x50]\n" |
1033 |
|
|
"ldr q1, [x20, #0x0]\n" |
1034 |
|
|
".inst 0x6e40fcf3 // bfdot v19.4s, v7.8h, v0.8h\n" |
1035 |
|
|
".inst 0x6e40fd14 // bfdot v20.4s, v8.8h, v0.8h\n" |
1036 |
|
|
"ldr q2, [x20, #0x10]\n" |
1037 |
|
|
".inst 0x6e40fd35 // bfdot v21.4s, v9.8h, v0.8h\n" |
1038 |
|
|
".inst 0x6e40fd56 // bfdot v22.4s, v10.8h, v0.8h\n" |
1039 |
|
|
"add x21, x21, #0x60\n" |
1040 |
|
|
".inst 0x6e40fd77 // bfdot v23.4s, v11.8h, v0.8h\n" |
1041 |
|
|
".inst 0x6e40fd98 // bfdot v24.4s, v12.8h, v0.8h\n" |
1042 |
|
|
"add x20, x20, #0x60\n" |
1043 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
1044 |
|
|
".inst 0x6e40fdb9 // bfdot v25.4s, v13.8h, v0.8h\n" |
1045 |
|
|
".inst 0x6e40fc3a // bfdot v26.4s, v1.8h, v0.8h\n" |
1046 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
1047 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
1048 |
|
|
".inst 0x6e40fc5b // bfdot v27.4s, v2.8h, v0.8h\n" |
1049 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
1050 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1051 |
|
|
"70:" // Width 7: Multiply loop: Main loop skip |
1052 |
|
|
"cbz x24, 73f\n" |
1053 |
|
|
"tbz x24, #1, 71f\n" |
1054 |
|
|
"ldr s0, [x23], #0x4\n" |
1055 |
|
|
"tbz x24, #0, 72f\n" |
1056 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
1057 |
|
|
"b 72f\n" |
1058 |
|
|
"71:" // Width 7: Multiply loop: Ragged operand read: partial_1_0 |
1059 |
|
|
"ldr h0, [x23, #0x0]\n" |
1060 |
|
|
"72:" // Width 7: Multiply loop: Ragged operand read: Done |
1061 |
|
|
"ldr q3, [%x[B_ptr], #0x0]\n" |
1062 |
|
|
"ldr q4, [%x[B_ptr], #0x10]\n" |
1063 |
|
|
"dup v0.2d, v0.d[0]\n" |
1064 |
|
|
"ldr q5, [%x[B_ptr], #0x20]\n" |
1065 |
|
|
"ldr q6, [%x[B_ptr], #0x30]\n" |
1066 |
|
|
"ldr q7, [%x[B_ptr], #0x40]\n" |
1067 |
|
|
"ldr q8, [%x[B_ptr], #0x50]\n" |
1068 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
1069 |
|
|
"ldr q9, [x21, #0x0]\n" |
1070 |
|
|
"ldr q10, [x21, #0x10]\n" |
1071 |
|
|
".inst 0x6e40fc6e // bfdot v14.4s, v3.8h, v0.8h\n" |
1072 |
|
|
".inst 0x6e40fc8f // bfdot v15.4s, v4.8h, v0.8h\n" |
1073 |
|
|
"ldr q11, [x21, #0x20]\n" |
1074 |
|
|
"ldr q12, [x21, #0x30]\n" |
1075 |
|
|
".inst 0x6e40fcb0 // bfdot v16.4s, v5.8h, v0.8h\n" |
1076 |
|
|
".inst 0x6e40fcd1 // bfdot v17.4s, v6.8h, v0.8h\n" |
1077 |
|
|
"ldr q13, [x21, #0x40]\n" |
1078 |
|
|
"ldr q1, [x21, #0x50]\n" |
1079 |
|
|
".inst 0x6e40fcf2 // bfdot v18.4s, v7.8h, v0.8h\n" |
1080 |
|
|
".inst 0x6e40fd13 // bfdot v19.4s, v8.8h, v0.8h\n" |
1081 |
|
|
"ldr q2, [x20, #0x0]\n" |
1082 |
|
|
"ldr q3, [x20, #0x10]\n" |
1083 |
|
|
".inst 0x6e40fd34 // bfdot v20.4s, v9.8h, v0.8h\n" |
1084 |
|
|
".inst 0x6e40fd55 // bfdot v21.4s, v10.8h, v0.8h\n" |
1085 |
|
|
".inst 0x6e40fd76 // bfdot v22.4s, v11.8h, v0.8h\n" |
1086 |
|
|
".inst 0x6e40fd97 // bfdot v23.4s, v12.8h, v0.8h\n" |
1087 |
|
|
".inst 0x6e40fdb8 // bfdot v24.4s, v13.8h, v0.8h\n" |
1088 |
|
|
".inst 0x6e40fc39 // bfdot v25.4s, v1.8h, v0.8h\n" |
1089 |
|
|
".inst 0x6e40fc5a // bfdot v26.4s, v2.8h, v0.8h\n" |
1090 |
|
|
".inst 0x6e40fc7b // bfdot v27.4s, v3.8h, v0.8h\n" |
1091 |
|
|
"73:" // Width 7: Multiply loop: No odd multiplies |
1092 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
1093 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
1094 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
1095 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
1096 |
|
|
"faddp v17.4s, v20.4s, v21.4s\n" |
1097 |
|
|
"faddp v18.4s, v22.4s, v23.4s\n" |
1098 |
|
|
"faddp v19.4s, v24.4s, v25.4s\n" |
1099 |
|
|
"faddp v20.4s, v26.4s, v27.4s\n" |
1100 |
|
|
"tbz %x[flags], #1, 74f\n" |
1101 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
1102 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
1103 |
|
|
"ld1r { v22.4s }, [x21]\n" |
1104 |
|
|
"ld1r { v21.4s }, [x20]\n" |
1105 |
|
|
"fmin v14.4s, v14.4s, v22.4s\n" |
1106 |
|
|
"fmin v15.4s, v15.4s, v22.4s\n" |
1107 |
|
|
"fmin v16.4s, v16.4s, v22.4s\n" |
1108 |
|
|
"fmin v17.4s, v17.4s, v22.4s\n" |
1109 |
|
|
"fmin v18.4s, v18.4s, v22.4s\n" |
1110 |
|
|
"fmin v19.4s, v19.4s, v22.4s\n" |
1111 |
|
|
"fmin v20.4s, v20.4s, v22.4s\n" |
1112 |
|
|
"fmax v14.4s, v14.4s, v21.4s\n" |
1113 |
|
|
"fmax v15.4s, v15.4s, v21.4s\n" |
1114 |
|
|
"fmax v16.4s, v16.4s, v21.4s\n" |
1115 |
|
|
"fmax v17.4s, v17.4s, v21.4s\n" |
1116 |
|
|
"fmax v18.4s, v18.4s, v21.4s\n" |
1117 |
|
|
"fmax v19.4s, v19.4s, v21.4s\n" |
1118 |
|
|
"fmax v20.4s, v20.4s, v21.4s\n" |
1119 |
|
|
"74:" // Width 7: No activation |
1120 |
|
|
"cmp %x[N], #0x1c\n" |
1121 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
1122 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
1123 |
|
|
"str q16, [%x[output_ptr], #0x20]\n" |
1124 |
|
|
"str q17, [%x[output_ptr], #0x30]\n" |
1125 |
|
|
"str q18, [%x[output_ptr], #0x40]\n" |
1126 |
|
|
"str q19, [%x[output_ptr], #0x50]\n" |
1127 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x60\n" |
1128 |
|
|
"blt 75f\n" |
1129 |
|
|
"str q20, [%x[output_ptr], #0x0]\n" |
1130 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
1131 |
|
|
"b 77f\n" |
1132 |
|
|
"75:" // Width 7: Partial writeback |
1133 |
|
|
"tbz %x[N], #1, 76f\n" |
1134 |
|
|
"str d20, [%x[output_ptr]], #0x8\n" |
1135 |
|
|
"tbz %x[N], #0, 77f\n" |
1136 |
|
|
"st1 { v20.s }[2], [%x[output_ptr]]\n" |
1137 |
|
|
"b 77f\n" |
1138 |
|
|
"76:" // Width 7: Partial direct writeback: partial_1_24 |
1139 |
|
|
"tbz %x[N], #0, 77f\n" |
1140 |
|
|
"str s20, [%x[output_ptr], #0x0]\n" |
1141 |
|
|
"77:" // Width 7: Writeback done |
1142 |
|
|
"b 100f\n" |
1143 |
|
|
"78:" // Width 8 |
1144 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
1145 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
1146 |
|
|
"mov x24, %x[K]\n" |
1147 |
|
|
"add x21, %x[B_ptr], x26\n" |
1148 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
1149 |
|
|
"ldr q17, [x21, #0x0]\n" |
1150 |
|
|
"add x20, %x[B_ptr], x26, LSL #1\n" |
1151 |
|
|
"movi v30.16b, #0x0\n" |
1152 |
|
|
"ldr q18, [x21, #0x10]\n" |
1153 |
|
|
"ldr q19, [x21, #0x20]\n" |
1154 |
|
|
"mov x23, %x[A_ptr]\n" |
1155 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
1156 |
|
|
"ldr q20, [x20, #0x0]\n" |
1157 |
|
|
"ldr q21, [x20, #0x10]\n" |
1158 |
|
|
"cmp x24, #0x4\n" |
1159 |
|
|
"add x21, x21, #0x30\n" |
1160 |
|
|
"add x20, x20, #0x30\n" |
1161 |
|
|
"zip2 v25.4s, v19.4s, v30.4s\n" |
1162 |
|
|
"zip1 v24.4s, v19.4s, v30.4s\n" |
1163 |
|
|
"zip2 v23.4s, v18.4s, v30.4s\n" |
1164 |
|
|
"zip1 v22.4s, v18.4s, v30.4s\n" |
1165 |
|
|
"zip2 v29.4s, v21.4s, v30.4s\n" |
1166 |
|
|
"zip1 v28.4s, v21.4s, v30.4s\n" |
1167 |
|
|
"zip2 v27.4s, v20.4s, v30.4s\n" |
1168 |
|
|
"zip1 v26.4s, v20.4s, v30.4s\n" |
1169 |
|
|
"zip2 v21.4s, v17.4s, v30.4s\n" |
1170 |
|
|
"zip1 v20.4s, v17.4s, v30.4s\n" |
1171 |
|
|
"zip2 v19.4s, v16.4s, v30.4s\n" |
1172 |
|
|
"zip1 v18.4s, v16.4s, v30.4s\n" |
1173 |
|
|
"zip2 v17.4s, v15.4s, v30.4s\n" |
1174 |
|
|
"zip1 v16.4s, v15.4s, v30.4s\n" |
1175 |
|
|
"zip2 v15.4s, v14.4s, v30.4s\n" |
1176 |
|
|
"zip1 v14.4s, v14.4s, v30.4s\n" |
1177 |
|
|
"blt 81f\n" |
1178 |
|
|
"cmp x24, #0x8\n" |
1179 |
|
|
"blt 80f\n" |
1180 |
|
|
"79:" // Width 8: Multiply loop: Main loop head |
1181 |
|
|
"ld1r { v0.2d }, [x23]\n" |
1182 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
1183 |
|
|
"sub x24, x24, #0x4\n" |
1184 |
|
|
"add x23, x23, #0x8\n" |
1185 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
1186 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
1187 |
|
|
"cmp x24, #0x8\n" |
1188 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
1189 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
1190 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
1191 |
|
|
"ldr q7, [x21, #0x0]\n" |
1192 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
1193 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
1194 |
|
|
"ldr q8, [x21, #0x10]\n" |
1195 |
|
|
"ldr q9, [x21, #0x20]\n" |
1196 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
1197 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
1198 |
|
|
"ldr q10, [x21, #0x30]\n" |
1199 |
|
|
"ldr q11, [x21, #0x40]\n" |
1200 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
1201 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
1202 |
|
|
"ldr q12, [x21, #0x50]\n" |
1203 |
|
|
"ldr q13, [x20, #0x0]\n" |
1204 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
1205 |
|
|
".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n" |
1206 |
|
|
"ldr q1, [x20, #0x10]\n" |
1207 |
|
|
"ldr q2, [x20, #0x20]\n" |
1208 |
|
|
".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n" |
1209 |
|
|
".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n" |
1210 |
|
|
"ldr q3, [x20, #0x30]\n" |
1211 |
|
|
".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n" |
1212 |
|
|
".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n" |
1213 |
|
|
"add x21, x21, #0x60\n" |
1214 |
|
|
".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n" |
1215 |
|
|
".inst 0x6e40fdba // bfdot v26.4s, v13.8h, v0.8h\n" |
1216 |
|
|
"add x20, x20, #0x60\n" |
1217 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
1218 |
|
|
".inst 0x6e40fc3b // bfdot v27.4s, v1.8h, v0.8h\n" |
1219 |
|
|
".inst 0x6e40fc5c // bfdot v28.4s, v2.8h, v0.8h\n" |
1220 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
1221 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
1222 |
|
|
".inst 0x6e40fc7d // bfdot v29.4s, v3.8h, v0.8h\n" |
1223 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
1224 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1225 |
|
|
"bge 79b\n" |
1226 |
|
|
"80:" // Width 8: Multiply loop: Single iteration only |
1227 |
|
|
"ld1r { v0.2d }, [x23]\n" |
1228 |
|
|
"ldr q4, [%x[B_ptr], #0x0]\n" |
1229 |
|
|
"add x23, x23, #0x8\n" |
1230 |
|
|
"sub x24, x24, #0x4\n" |
1231 |
|
|
"ldr q5, [%x[B_ptr], #0x10]\n" |
1232 |
|
|
"ldr q6, [%x[B_ptr], #0x20]\n" |
1233 |
|
|
"ldr q7, [%x[B_ptr], #0x30]\n" |
1234 |
|
|
"ldr q8, [%x[B_ptr], #0x40]\n" |
1235 |
|
|
"ldr q9, [%x[B_ptr], #0x50]\n" |
1236 |
|
|
"ldr q10, [x21, #0x0]\n" |
1237 |
|
|
".inst 0x6e40fc8e // bfdot v14.4s, v4.8h, v0.8h\n" |
1238 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
1239 |
|
|
"ldr q11, [x21, #0x10]\n" |
1240 |
|
|
"ldr q12, [x21, #0x20]\n" |
1241 |
|
|
".inst 0x6e40fcaf // bfdot v15.4s, v5.8h, v0.8h\n" |
1242 |
|
|
".inst 0x6e40fcd0 // bfdot v16.4s, v6.8h, v0.8h\n" |
1243 |
|
|
"ldr q13, [x21, #0x30]\n" |
1244 |
|
|
"ldr q1, [x21, #0x40]\n" |
1245 |
|
|
".inst 0x6e40fcf1 // bfdot v17.4s, v7.8h, v0.8h\n" |
1246 |
|
|
".inst 0x6e40fd12 // bfdot v18.4s, v8.8h, v0.8h\n" |
1247 |
|
|
"ldr q2, [x21, #0x50]\n" |
1248 |
|
|
"ldr q3, [x20, #0x0]\n" |
1249 |
|
|
".inst 0x6e40fd33 // bfdot v19.4s, v9.8h, v0.8h\n" |
1250 |
|
|
".inst 0x6e40fd54 // bfdot v20.4s, v10.8h, v0.8h\n" |
1251 |
|
|
"ldr q4, [x20, #0x10]\n" |
1252 |
|
|
"ldr q5, [x20, #0x20]\n" |
1253 |
|
|
".inst 0x6e40fd75 // bfdot v21.4s, v11.8h, v0.8h\n" |
1254 |
|
|
".inst 0x6e40fd96 // bfdot v22.4s, v12.8h, v0.8h\n" |
1255 |
|
|
"ldr q6, [x20, #0x30]\n" |
1256 |
|
|
".inst 0x6e40fdb7 // bfdot v23.4s, v13.8h, v0.8h\n" |
1257 |
|
|
".inst 0x6e40fc38 // bfdot v24.4s, v1.8h, v0.8h\n" |
1258 |
|
|
"add x21, x21, #0x60\n" |
1259 |
|
|
".inst 0x6e40fc59 // bfdot v25.4s, v2.8h, v0.8h\n" |
1260 |
|
|
".inst 0x6e40fc7a // bfdot v26.4s, v3.8h, v0.8h\n" |
1261 |
|
|
"add x20, x20, #0x60\n" |
1262 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
1263 |
|
|
".inst 0x6e40fc9b // bfdot v27.4s, v4.8h, v0.8h\n" |
1264 |
|
|
".inst 0x6e40fcbc // bfdot v28.4s, v5.8h, v0.8h\n" |
1265 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
1266 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
1267 |
|
|
".inst 0x6e40fcdd // bfdot v29.4s, v6.8h, v0.8h\n" |
1268 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
1269 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1270 |
|
|
"81:" // Width 8: Multiply loop: Main loop skip |
1271 |
|
|
"cbz x24, 84f\n" |
1272 |
|
|
"tbz x24, #1, 82f\n" |
1273 |
|
|
"ldr s0, [x23], #0x4\n" |
1274 |
|
|
"tbz x24, #0, 83f\n" |
1275 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
1276 |
|
|
"b 83f\n" |
1277 |
|
|
"82:" // Width 8: Multiply loop: Ragged operand read: partial_1_0 |
1278 |
|
|
"ldr h0, [x23, #0x0]\n" |
1279 |
|
|
"83:" // Width 8: Multiply loop: Ragged operand read: Done |
1280 |
|
|
"ldr q7, [%x[B_ptr], #0x0]\n" |
1281 |
|
|
"ldr q8, [%x[B_ptr], #0x10]\n" |
1282 |
|
|
"dup v0.2d, v0.d[0]\n" |
1283 |
|
|
"ldr q9, [%x[B_ptr], #0x20]\n" |
1284 |
|
|
"ldr q10, [%x[B_ptr], #0x30]\n" |
1285 |
|
|
"ldr q11, [%x[B_ptr], #0x40]\n" |
1286 |
|
|
"ldr q12, [%x[B_ptr], #0x50]\n" |
1287 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
1288 |
|
|
"ldr q13, [x21, #0x0]\n" |
1289 |
|
|
"ldr q1, [x21, #0x10]\n" |
1290 |
|
|
".inst 0x6e40fcee // bfdot v14.4s, v7.8h, v0.8h\n" |
1291 |
|
|
".inst 0x6e40fd0f // bfdot v15.4s, v8.8h, v0.8h\n" |
1292 |
|
|
"ldr q2, [x21, #0x20]\n" |
1293 |
|
|
"ldr q3, [x21, #0x30]\n" |
1294 |
|
|
".inst 0x6e40fd30 // bfdot v16.4s, v9.8h, v0.8h\n" |
1295 |
|
|
".inst 0x6e40fd51 // bfdot v17.4s, v10.8h, v0.8h\n" |
1296 |
|
|
"ldr q4, [x21, #0x40]\n" |
1297 |
|
|
"ldr q5, [x21, #0x50]\n" |
1298 |
|
|
".inst 0x6e40fd72 // bfdot v18.4s, v11.8h, v0.8h\n" |
1299 |
|
|
".inst 0x6e40fd93 // bfdot v19.4s, v12.8h, v0.8h\n" |
1300 |
|
|
"ldr q6, [x20, #0x0]\n" |
1301 |
|
|
"ldr q7, [x20, #0x10]\n" |
1302 |
|
|
".inst 0x6e40fdb4 // bfdot v20.4s, v13.8h, v0.8h\n" |
1303 |
|
|
".inst 0x6e40fc35 // bfdot v21.4s, v1.8h, v0.8h\n" |
1304 |
|
|
"ldr q8, [x20, #0x20]\n" |
1305 |
|
|
"ldr q9, [x20, #0x30]\n" |
1306 |
|
|
".inst 0x6e40fc56 // bfdot v22.4s, v2.8h, v0.8h\n" |
1307 |
|
|
".inst 0x6e40fc77 // bfdot v23.4s, v3.8h, v0.8h\n" |
1308 |
|
|
".inst 0x6e40fc98 // bfdot v24.4s, v4.8h, v0.8h\n" |
1309 |
|
|
".inst 0x6e40fcb9 // bfdot v25.4s, v5.8h, v0.8h\n" |
1310 |
|
|
".inst 0x6e40fcda // bfdot v26.4s, v6.8h, v0.8h\n" |
1311 |
|
|
".inst 0x6e40fcfb // bfdot v27.4s, v7.8h, v0.8h\n" |
1312 |
|
|
".inst 0x6e40fd1c // bfdot v28.4s, v8.8h, v0.8h\n" |
1313 |
|
|
".inst 0x6e40fd3d // bfdot v29.4s, v9.8h, v0.8h\n" |
1314 |
|
|
"84:" // Width 8: Multiply loop: No odd multiplies |
1315 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
1316 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
1317 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
1318 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
1319 |
|
|
"faddp v17.4s, v20.4s, v21.4s\n" |
1320 |
|
|
"faddp v18.4s, v22.4s, v23.4s\n" |
1321 |
|
|
"faddp v19.4s, v24.4s, v25.4s\n" |
1322 |
|
|
"faddp v20.4s, v26.4s, v27.4s\n" |
1323 |
|
|
"faddp v21.4s, v28.4s, v29.4s\n" |
1324 |
|
|
"tbz %x[flags], #1, 85f\n" |
1325 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
1326 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
1327 |
|
|
"ld1r { v23.4s }, [x21]\n" |
1328 |
|
|
"ld1r { v22.4s }, [x20]\n" |
1329 |
|
|
"fmin v14.4s, v14.4s, v23.4s\n" |
1330 |
|
|
"fmin v15.4s, v15.4s, v23.4s\n" |
1331 |
|
|
"fmin v16.4s, v16.4s, v23.4s\n" |
1332 |
|
|
"fmin v17.4s, v17.4s, v23.4s\n" |
1333 |
|
|
"fmin v18.4s, v18.4s, v23.4s\n" |
1334 |
|
|
"fmin v19.4s, v19.4s, v23.4s\n" |
1335 |
|
|
"fmin v20.4s, v20.4s, v23.4s\n" |
1336 |
|
|
"fmin v21.4s, v21.4s, v23.4s\n" |
1337 |
|
|
"fmax v14.4s, v14.4s, v22.4s\n" |
1338 |
|
|
"fmax v15.4s, v15.4s, v22.4s\n" |
1339 |
|
|
"fmax v16.4s, v16.4s, v22.4s\n" |
1340 |
|
|
"fmax v17.4s, v17.4s, v22.4s\n" |
1341 |
|
|
"fmax v18.4s, v18.4s, v22.4s\n" |
1342 |
|
|
"fmax v19.4s, v19.4s, v22.4s\n" |
1343 |
|
|
"fmax v20.4s, v20.4s, v22.4s\n" |
1344 |
|
|
"fmax v21.4s, v21.4s, v22.4s\n" |
1345 |
|
|
"85:" // Width 8: No activation |
1346 |
|
|
"cmp %x[N], #0x20\n" |
1347 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
1348 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
1349 |
|
|
"str q16, [%x[output_ptr], #0x20]\n" |
1350 |
|
|
"str q17, [%x[output_ptr], #0x30]\n" |
1351 |
|
|
"str q18, [%x[output_ptr], #0x40]\n" |
1352 |
|
|
"str q19, [%x[output_ptr], #0x50]\n" |
1353 |
|
|
"str q20, [%x[output_ptr], #0x60]\n" |
1354 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x70\n" |
1355 |
|
|
"blt 86f\n" |
1356 |
|
|
"str q21, [%x[output_ptr], #0x0]\n" |
1357 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
1358 |
|
|
"b 88f\n" |
1359 |
|
|
"86:" // Width 8: Partial writeback |
1360 |
|
|
"tbz %x[N], #1, 87f\n" |
1361 |
|
|
"str d21, [%x[output_ptr]], #0x8\n" |
1362 |
|
|
"tbz %x[N], #0, 88f\n" |
1363 |
|
|
"st1 { v21.s }[2], [%x[output_ptr]]\n" |
1364 |
|
|
"b 88f\n" |
1365 |
|
|
"87:" // Width 8: Partial direct writeback: partial_1_28 |
1366 |
|
|
"tbz %x[N], #0, 88f\n" |
1367 |
|
|
"str s21, [%x[output_ptr], #0x0]\n" |
1368 |
|
|
"88:" // Width 8: Writeback done |
1369 |
|
|
"b 100f\n" |
1370 |
|
|
"89:" // Width 9 |
1371 |
|
|
"ldr q14, [%x[B_ptr], #0x0]\n" |
1372 |
|
|
"ldr q15, [%x[B_ptr], #0x10]\n" |
1373 |
|
|
"mov x24, %x[K]\n" |
1374 |
|
|
"add x21, %x[B_ptr], x26\n" |
1375 |
|
|
"ldr q16, [%x[B_ptr], #0x20]\n" |
1376 |
|
|
"ldr q17, [x21, #0x0]\n" |
1377 |
|
|
"add x20, %x[B_ptr], x26, LSL #1\n" |
1378 |
|
|
"movi v0.16b, #0x0\n" |
1379 |
|
|
"ldr q18, [x21, #0x10]\n" |
1380 |
|
|
"ldr q19, [x21, #0x20]\n" |
1381 |
|
|
"mov x23, %x[A_ptr]\n" |
1382 |
|
|
"add x22, x20, x26\n" |
1383 |
|
|
"ldr q20, [x20, #0x0]\n" |
1384 |
|
|
"ldr q21, [x20, #0x10]\n" |
1385 |
|
|
"cmp x24, #0x4\n" |
1386 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x30\n" |
1387 |
|
|
"ldr q22, [x20, #0x20]\n" |
1388 |
|
|
"add x21, x21, #0x30\n" |
1389 |
|
|
"add x20, x20, #0x30\n" |
1390 |
|
|
"zip2 v25.4s, v19.4s, v0.4s\n" |
1391 |
|
|
"zip1 v24.4s, v19.4s, v0.4s\n" |
1392 |
|
|
"zip2 v23.4s, v18.4s, v0.4s\n" |
1393 |
|
|
"zip2 v29.4s, v21.4s, v0.4s\n" |
1394 |
|
|
"zip1 v28.4s, v21.4s, v0.4s\n" |
1395 |
|
|
"zip2 v31.4s, v22.4s, v0.4s\n" |
1396 |
|
|
"zip1 v30.4s, v22.4s, v0.4s\n" |
1397 |
|
|
"zip2 v27.4s, v20.4s, v0.4s\n" |
1398 |
|
|
"zip1 v26.4s, v20.4s, v0.4s\n" |
1399 |
|
|
"zip1 v22.4s, v18.4s, v0.4s\n" |
1400 |
|
|
"zip2 v21.4s, v17.4s, v0.4s\n" |
1401 |
|
|
"zip1 v20.4s, v17.4s, v0.4s\n" |
1402 |
|
|
"zip2 v19.4s, v16.4s, v0.4s\n" |
1403 |
|
|
"zip1 v18.4s, v16.4s, v0.4s\n" |
1404 |
|
|
"zip2 v17.4s, v15.4s, v0.4s\n" |
1405 |
|
|
"zip1 v16.4s, v15.4s, v0.4s\n" |
1406 |
|
|
"zip2 v15.4s, v14.4s, v0.4s\n" |
1407 |
|
|
"zip1 v14.4s, v14.4s, v0.4s\n" |
1408 |
|
|
"blt 92f\n" |
1409 |
|
|
"cmp x24, #0x8\n" |
1410 |
|
|
"blt 91f\n" |
1411 |
|
|
"90:" // Width 9: Multiply loop: Main loop head |
1412 |
|
|
"ld1r { v0.2d }, [x23]\n" |
1413 |
|
|
"ldr q1, [%x[B_ptr], #0x0]\n" |
1414 |
|
|
"sub x24, x24, #0x4\n" |
1415 |
|
|
"add x23, x23, #0x8\n" |
1416 |
|
|
"ldr q2, [%x[B_ptr], #0x10]\n" |
1417 |
|
|
"ldr q3, [%x[B_ptr], #0x20]\n" |
1418 |
|
|
"cmp x24, #0x8\n" |
1419 |
|
|
"ldr q4, [%x[B_ptr], #0x30]\n" |
1420 |
|
|
"ldr q5, [%x[B_ptr], #0x40]\n" |
1421 |
|
|
"ldr q6, [%x[B_ptr], #0x50]\n" |
1422 |
|
|
"ldr q7, [x21, #0x0]\n" |
1423 |
|
|
".inst 0x6e40fc2e // bfdot v14.4s, v1.8h, v0.8h\n" |
1424 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
1425 |
|
|
"ldr q8, [x21, #0x10]\n" |
1426 |
|
|
"ldr q9, [x21, #0x20]\n" |
1427 |
|
|
".inst 0x6e40fc4f // bfdot v15.4s, v2.8h, v0.8h\n" |
1428 |
|
|
".inst 0x6e40fc70 // bfdot v16.4s, v3.8h, v0.8h\n" |
1429 |
|
|
"ldr q10, [x21, #0x30]\n" |
1430 |
|
|
"ldr q11, [x21, #0x40]\n" |
1431 |
|
|
".inst 0x6e40fc91 // bfdot v17.4s, v4.8h, v0.8h\n" |
1432 |
|
|
".inst 0x6e40fcb2 // bfdot v18.4s, v5.8h, v0.8h\n" |
1433 |
|
|
"ldr q12, [x21, #0x50]\n" |
1434 |
|
|
"ldr q13, [x20, #0x0]\n" |
1435 |
|
|
".inst 0x6e40fcd3 // bfdot v19.4s, v6.8h, v0.8h\n" |
1436 |
|
|
".inst 0x6e40fcf4 // bfdot v20.4s, v7.8h, v0.8h\n" |
1437 |
|
|
"ldr q1, [x20, #0x10]\n" |
1438 |
|
|
"ldr q2, [x20, #0x20]\n" |
1439 |
|
|
".inst 0x6e40fd15 // bfdot v21.4s, v8.8h, v0.8h\n" |
1440 |
|
|
".inst 0x6e40fd36 // bfdot v22.4s, v9.8h, v0.8h\n" |
1441 |
|
|
"ldr q3, [x20, #0x30]\n" |
1442 |
|
|
"ldr q4, [x20, #0x40]\n" |
1443 |
|
|
".inst 0x6e40fd57 // bfdot v23.4s, v10.8h, v0.8h\n" |
1444 |
|
|
".inst 0x6e40fd78 // bfdot v24.4s, v11.8h, v0.8h\n" |
1445 |
|
|
"ldr q5, [x20, #0x50]\n" |
1446 |
|
|
".inst 0x6e40fd99 // bfdot v25.4s, v12.8h, v0.8h\n" |
1447 |
|
|
".inst 0x6e40fdba // bfdot v26.4s, v13.8h, v0.8h\n" |
1448 |
|
|
"add x21, x21, #0x60\n" |
1449 |
|
|
".inst 0x6e40fc3b // bfdot v27.4s, v1.8h, v0.8h\n" |
1450 |
|
|
".inst 0x6e40fc5c // bfdot v28.4s, v2.8h, v0.8h\n" |
1451 |
|
|
"add x20, x20, #0x60\n" |
1452 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
1453 |
|
|
".inst 0x6e40fc7d // bfdot v29.4s, v3.8h, v0.8h\n" |
1454 |
|
|
".inst 0x6e40fc9e // bfdot v30.4s, v4.8h, v0.8h\n" |
1455 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
1456 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
1457 |
|
|
".inst 0x6e40fcbf // bfdot v31.4s, v5.8h, v0.8h\n" |
1458 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
1459 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1460 |
|
|
"bge 90b\n" |
1461 |
|
|
"91:" // Width 9: Multiply loop: Single iteration only |
1462 |
|
|
"ld1r { v0.2d }, [x23]\n" |
1463 |
|
|
"ldr q6, [%x[B_ptr], #0x0]\n" |
1464 |
|
|
"add x23, x23, #0x8\n" |
1465 |
|
|
"sub x24, x24, #0x4\n" |
1466 |
|
|
"ldr q7, [%x[B_ptr], #0x10]\n" |
1467 |
|
|
"ldr q8, [%x[B_ptr], #0x20]\n" |
1468 |
|
|
"ldr q9, [%x[B_ptr], #0x30]\n" |
1469 |
|
|
"ldr q10, [%x[B_ptr], #0x40]\n" |
1470 |
|
|
"ldr q11, [%x[B_ptr], #0x50]\n" |
1471 |
|
|
"ldr q12, [x21, #0x0]\n" |
1472 |
|
|
".inst 0x6e40fcce // bfdot v14.4s, v6.8h, v0.8h\n" |
1473 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x60\n" |
1474 |
|
|
"ldr q13, [x21, #0x10]\n" |
1475 |
|
|
"ldr q1, [x21, #0x20]\n" |
1476 |
|
|
".inst 0x6e40fcef // bfdot v15.4s, v7.8h, v0.8h\n" |
1477 |
|
|
".inst 0x6e40fd10 // bfdot v16.4s, v8.8h, v0.8h\n" |
1478 |
|
|
"ldr q2, [x21, #0x30]\n" |
1479 |
|
|
"ldr q3, [x21, #0x40]\n" |
1480 |
|
|
".inst 0x6e40fd31 // bfdot v17.4s, v9.8h, v0.8h\n" |
1481 |
|
|
".inst 0x6e40fd52 // bfdot v18.4s, v10.8h, v0.8h\n" |
1482 |
|
|
"ldr q4, [x21, #0x50]\n" |
1483 |
|
|
"ldr q5, [x20, #0x0]\n" |
1484 |
|
|
".inst 0x6e40fd73 // bfdot v19.4s, v11.8h, v0.8h\n" |
1485 |
|
|
".inst 0x6e40fd94 // bfdot v20.4s, v12.8h, v0.8h\n" |
1486 |
|
|
"ldr q6, [x20, #0x10]\n" |
1487 |
|
|
"ldr q7, [x20, #0x20]\n" |
1488 |
|
|
".inst 0x6e40fdb5 // bfdot v21.4s, v13.8h, v0.8h\n" |
1489 |
|
|
".inst 0x6e40fc36 // bfdot v22.4s, v1.8h, v0.8h\n" |
1490 |
|
|
"ldr q8, [x20, #0x30]\n" |
1491 |
|
|
"ldr q9, [x20, #0x40]\n" |
1492 |
|
|
".inst 0x6e40fc57 // bfdot v23.4s, v2.8h, v0.8h\n" |
1493 |
|
|
".inst 0x6e40fc78 // bfdot v24.4s, v3.8h, v0.8h\n" |
1494 |
|
|
"ldr q10, [x20, #0x50]\n" |
1495 |
|
|
".inst 0x6e40fc99 // bfdot v25.4s, v4.8h, v0.8h\n" |
1496 |
|
|
".inst 0x6e40fcba // bfdot v26.4s, v5.8h, v0.8h\n" |
1497 |
|
|
"add x21, x21, #0x60\n" |
1498 |
|
|
".inst 0x6e40fcdb // bfdot v27.4s, v6.8h, v0.8h\n" |
1499 |
|
|
".inst 0x6e40fcfc // bfdot v28.4s, v7.8h, v0.8h\n" |
1500 |
|
|
"add x20, x20, #0x60\n" |
1501 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x400]\n" |
1502 |
|
|
".inst 0x6e40fd1d // bfdot v29.4s, v8.8h, v0.8h\n" |
1503 |
|
|
".inst 0x6e40fd3e // bfdot v30.4s, v9.8h, v0.8h\n" |
1504 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x440]\n" |
1505 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x480]\n" |
1506 |
|
|
".inst 0x6e40fd5f // bfdot v31.4s, v10.8h, v0.8h\n" |
1507 |
|
|
"prfm pldl1keep, [%x[B_ptr], #0x4c0]\n" |
1508 |
|
|
"prfm pldl1keep, [x23, #0x80]\n" |
1509 |
|
|
"92:" // Width 9: Multiply loop: Main loop skip |
1510 |
|
|
"cbz x24, 95f\n" |
1511 |
|
|
"tbz x24, #1, 93f\n" |
1512 |
|
|
"ldr s0, [x23], #0x4\n" |
1513 |
|
|
"tbz x24, #0, 94f\n" |
1514 |
|
|
"ld1 { v0.h }[2], [x23]\n" |
1515 |
|
|
"b 94f\n" |
1516 |
|
|
"93:" // Width 9: Multiply loop: Ragged operand read: partial_1_0 |
1517 |
|
|
"ldr h0, [x23, #0x0]\n" |
1518 |
|
|
"94:" // Width 9: Multiply loop: Ragged operand read: Done |
1519 |
|
|
"ldr q11, [%x[B_ptr], #0x0]\n" |
1520 |
|
|
"ldr q12, [%x[B_ptr], #0x10]\n" |
1521 |
|
|
"dup v0.2d, v0.d[0]\n" |
1522 |
|
|
"ldr q13, [%x[B_ptr], #0x20]\n" |
1523 |
|
|
"ldr q1, [%x[B_ptr], #0x30]\n" |
1524 |
|
|
"ldr q2, [%x[B_ptr], #0x40]\n" |
1525 |
|
|
"ldr q3, [%x[B_ptr], #0x50]\n" |
1526 |
|
|
"add %x[B_ptr], %x[B_ptr], #0x90\n" |
1527 |
|
|
"ldr q4, [x21, #0x0]\n" |
1528 |
|
|
"ldr q5, [x21, #0x10]\n" |
1529 |
|
|
".inst 0x6e40fd6e // bfdot v14.4s, v11.8h, v0.8h\n" |
1530 |
|
|
".inst 0x6e40fd8f // bfdot v15.4s, v12.8h, v0.8h\n" |
1531 |
|
|
"ldr q6, [x21, #0x20]\n" |
1532 |
|
|
"ldr q7, [x21, #0x30]\n" |
1533 |
|
|
".inst 0x6e40fdb0 // bfdot v16.4s, v13.8h, v0.8h\n" |
1534 |
|
|
".inst 0x6e40fc31 // bfdot v17.4s, v1.8h, v0.8h\n" |
1535 |
|
|
"ldr q8, [x21, #0x40]\n" |
1536 |
|
|
"ldr q9, [x21, #0x50]\n" |
1537 |
|
|
".inst 0x6e40fc52 // bfdot v18.4s, v2.8h, v0.8h\n" |
1538 |
|
|
".inst 0x6e40fc73 // bfdot v19.4s, v3.8h, v0.8h\n" |
1539 |
|
|
"ldr q10, [x20, #0x0]\n" |
1540 |
|
|
"ldr q11, [x20, #0x10]\n" |
1541 |
|
|
".inst 0x6e40fc94 // bfdot v20.4s, v4.8h, v0.8h\n" |
1542 |
|
|
".inst 0x6e40fcb5 // bfdot v21.4s, v5.8h, v0.8h\n" |
1543 |
|
|
"ldr q12, [x20, #0x20]\n" |
1544 |
|
|
"ldr q13, [x20, #0x30]\n" |
1545 |
|
|
".inst 0x6e40fcd6 // bfdot v22.4s, v6.8h, v0.8h\n" |
1546 |
|
|
".inst 0x6e40fcf7 // bfdot v23.4s, v7.8h, v0.8h\n" |
1547 |
|
|
"ldr q1, [x20, #0x40]\n" |
1548 |
|
|
"ldr q2, [x20, #0x50]\n" |
1549 |
|
|
".inst 0x6e40fd18 // bfdot v24.4s, v8.8h, v0.8h\n" |
1550 |
|
|
".inst 0x6e40fd39 // bfdot v25.4s, v9.8h, v0.8h\n" |
1551 |
|
|
".inst 0x6e40fd5a // bfdot v26.4s, v10.8h, v0.8h\n" |
1552 |
|
|
".inst 0x6e40fd7b // bfdot v27.4s, v11.8h, v0.8h\n" |
1553 |
|
|
".inst 0x6e40fd9c // bfdot v28.4s, v12.8h, v0.8h\n" |
1554 |
|
|
".inst 0x6e40fdbd // bfdot v29.4s, v13.8h, v0.8h\n" |
1555 |
|
|
".inst 0x6e40fc3e // bfdot v30.4s, v1.8h, v0.8h\n" |
1556 |
|
|
".inst 0x6e40fc5f // bfdot v31.4s, v2.8h, v0.8h\n" |
1557 |
|
|
"95:" // Width 9: Multiply loop: No odd multiplies |
1558 |
|
|
"prfm pstl1keep, [%x[output_ptr], #0x0]\n" |
1559 |
|
|
"faddp v14.4s, v14.4s, v15.4s\n" |
1560 |
|
|
"faddp v15.4s, v16.4s, v17.4s\n" |
1561 |
|
|
"faddp v16.4s, v18.4s, v19.4s\n" |
1562 |
|
|
"faddp v17.4s, v20.4s, v21.4s\n" |
1563 |
|
|
"faddp v18.4s, v22.4s, v23.4s\n" |
1564 |
|
|
"faddp v19.4s, v24.4s, v25.4s\n" |
1565 |
|
|
"faddp v20.4s, v26.4s, v27.4s\n" |
1566 |
|
|
"faddp v21.4s, v28.4s, v29.4s\n" |
1567 |
|
|
"faddp v22.4s, v30.4s, v31.4s\n" |
1568 |
|
|
"tbz %x[flags], #1, 96f\n" |
1569 |
|
|
"add x21, %x[args_ptr], %[offset_max]\n" |
1570 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
1571 |
|
|
"ld1r { v24.4s }, [x21]\n" |
1572 |
|
|
"ld1r { v23.4s }, [x20]\n" |
1573 |
|
|
"fmin v14.4s, v14.4s, v24.4s\n" |
1574 |
|
|
"fmin v15.4s, v15.4s, v24.4s\n" |
1575 |
|
|
"fmin v16.4s, v16.4s, v24.4s\n" |
1576 |
|
|
"fmin v17.4s, v17.4s, v24.4s\n" |
1577 |
|
|
"fmin v18.4s, v18.4s, v24.4s\n" |
1578 |
|
|
"fmin v19.4s, v19.4s, v24.4s\n" |
1579 |
|
|
"fmin v20.4s, v20.4s, v24.4s\n" |
1580 |
|
|
"fmin v21.4s, v21.4s, v24.4s\n" |
1581 |
|
|
"fmin v22.4s, v22.4s, v24.4s\n" |
1582 |
|
|
"fmax v14.4s, v14.4s, v23.4s\n" |
1583 |
|
|
"fmax v15.4s, v15.4s, v23.4s\n" |
1584 |
|
|
"fmax v16.4s, v16.4s, v23.4s\n" |
1585 |
|
|
"fmax v17.4s, v17.4s, v23.4s\n" |
1586 |
|
|
"fmax v18.4s, v18.4s, v23.4s\n" |
1587 |
|
|
"fmax v19.4s, v19.4s, v23.4s\n" |
1588 |
|
|
"fmax v20.4s, v20.4s, v23.4s\n" |
1589 |
|
|
"fmax v21.4s, v21.4s, v23.4s\n" |
1590 |
|
|
"fmax v22.4s, v22.4s, v23.4s\n" |
1591 |
|
|
"96:" // Width 9: No activation |
1592 |
|
|
"cmp %x[N], #0x24\n" |
1593 |
|
|
"str q14, [%x[output_ptr], #0x0]\n" |
1594 |
|
|
"str q15, [%x[output_ptr], #0x10]\n" |
1595 |
|
|
"str q16, [%x[output_ptr], #0x20]\n" |
1596 |
|
|
"str q17, [%x[output_ptr], #0x30]\n" |
1597 |
|
|
"str q18, [%x[output_ptr], #0x40]\n" |
1598 |
|
|
"str q19, [%x[output_ptr], #0x50]\n" |
1599 |
|
|
"str q20, [%x[output_ptr], #0x60]\n" |
1600 |
|
|
"str q21, [%x[output_ptr], #0x70]\n" |
1601 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x80\n" |
1602 |
|
|
"blt 97f\n" |
1603 |
|
|
"str q22, [%x[output_ptr], #0x0]\n" |
1604 |
|
|
"add %x[output_ptr], %x[output_ptr], #0x10\n" |
1605 |
|
|
"b 99f\n" |
1606 |
|
|
"97:" // Width 9: Partial writeback |
1607 |
|
|
"tbz %x[N], #1, 98f\n" |
1608 |
|
|
"str d22, [%x[output_ptr]], #0x8\n" |
1609 |
|
|
"tbz %x[N], #0, 99f\n" |
1610 |
|
|
"st1 { v22.s }[2], [%x[output_ptr]]\n" |
1611 |
|
|
"b 99f\n" |
1612 |
|
|
"98:" // Width 9: Partial direct writeback: partial_1_32 |
1613 |
|
|
"tbz %x[N], #0, 99f\n" |
1614 |
|
|
"str s22, [%x[output_ptr], #0x0]\n" |
1615 |
|
|
"99:" // Width 9: Writeback done |
1616 |
|
|
"subs x25, x25, #0x9\n" |
1617 |
|
|
"mov %x[B_ptr], x22\n" |
1618 |
|
|
"sub %x[N], %x[N], #0x24\n" |
1619 |
|
|
"bgt 1b\n" |
1620 |
|
|
"100:" // Exit |
1621 |
|
|
: [B_ptr] "+&r"(B_ptr), [N] "+&r"(N), [output_ptr] "+&r"(output_ptr) |
1622 |
|
93 |
: [A_ptr] "r"(A_ptr), [K] "r"(K), [args_ptr] "r"(&ka), [flags] "r"(flags), |
1623 |
|
|
[offset_max] "I"(offsetof(KernelArgs, maxval)), [offset_min] "I"(offsetof(KernelArgs, minval)) |
1624 |
|
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", |
1625 |
|
|
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", |
1626 |
|
|
"v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); |
1627 |
|
93 |
} |
1628 |
|
|
|
1629 |
|
|
#endif // Architectural features check. |
1630 |
|
|
|