Line |
Branch |
Exec |
Source |
1 |
|
|
// |
2 |
|
|
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com> |
3 |
|
|
// |
4 |
|
|
// SPDX-License-Identifier: Apache-2.0 |
5 |
|
|
// |
6 |
|
|
|
7 |
|
|
// Do not flag up inline assembly blocks |
8 |
|
|
#pragma GCC diagnostic ignored "-Woverlength-strings" |
9 |
|
|
|
10 |
|
|
#if !defined(__aarch64__) || !defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) |
11 |
|
|
#error This file must be compiled for AArch64, FEAT_BF16. |
12 |
|
|
#else // Architectural features check. |
13 |
|
|
|
14 |
|
|
#include "kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.h" |
15 |
|
|
|
16 |
|
|
#include <arm_bf16.h> |
17 |
|
|
#include <arm_neon.h> |
18 |
|
|
#include <stddef.h> |
19 |
|
|
#include <stdint.h> |
20 |
|
|
|
21 |
|
|
#include "kai/kai_common.h" |
22 |
|
|
|
23 |
|
|
// Row block size: returned by the m_step and mr getters, and the required
// alignment of m_idx in the offset helpers of this file.
static const size_t kai_mr = 8; |
24 |
|
|
// Column block size: returned by the n_step and nr getters, and the required
// alignment of n_idx in the offset helpers of this file.
static const size_t kai_nr = 12; |
25 |
|
|
// k is rounded up to a multiple of this value wherever packed sizes are computed.
static const size_t kai_kr = 4; |
26 |
|
|
// Returned by the sr getter; not otherwise used in the code visible here.
static const size_t kai_sr = 1; |
27 |
|
|
|
28 |
|
200 |
size_t kai_get_m_step_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) { |
29 |
|
200 |
return kai_mr; |
30 |
|
|
} |
31 |
|
|
|
32 |
|
200 |
size_t kai_get_n_step_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) { |
33 |
|
200 |
return kai_nr; |
34 |
|
|
} |
35 |
|
|
|
36 |
|
✗ |
size_t kai_get_mr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) { |
37 |
|
✗ |
return kai_mr; |
38 |
|
|
} |
39 |
|
|
|
40 |
|
40 |
size_t kai_get_nr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) { |
41 |
|
40 |
return kai_nr; |
42 |
|
|
} |
43 |
|
|
|
44 |
|
40 |
size_t kai_get_kr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) { |
45 |
|
40 |
return kai_kr; |
46 |
|
|
} |
47 |
|
|
|
48 |
|
40 |
size_t kai_get_sr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) { |
49 |
|
40 |
return kai_sr; |
50 |
|
|
} |
51 |
|
|
|
52 |
|
184 |
size_t kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(size_t m_idx, size_t k) { |
53 |
|
− |
KAI_ASSUME(m_idx % kai_mr == 0); |
54 |
|
|
|
55 |
|
184 |
return m_idx * kai_roundup(k, kai_kr) * sizeof(uint16_t); |
56 |
|
|
} |
57 |
|
|
|
58 |
|
184 |
size_t kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(size_t n_idx, size_t k) { |
59 |
|
− |
KAI_ASSUME(n_idx % kai_nr == 0); |
60 |
|
|
|
61 |
|
184 |
return n_idx * (sizeof(float) + kai_roundup(k, kai_kr) * sizeof(uint16_t)); |
62 |
|
|
} |
63 |
|
|
|
64 |
|
184 |
size_t kai_get_dst_offset_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla( |
65 |
|
|
size_t m_idx, size_t n_idx, size_t stride) { |
66 |
|
− |
KAI_ASSUME(m_idx % kai_mr == 0); |
67 |
|
− |
KAI_ASSUME(n_idx % kai_nr == 0); |
68 |
|
|
|
69 |
|
184 |
return m_idx * stride + n_idx * sizeof(float); |
70 |
|
|
} |
71 |
|
|
|
72 |
|
184 |
// Returns the size in bytes of an m x n matrix of float elements.
size_t kai_get_dst_size_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(size_t m, size_t n) {
    const size_t element_count = m * n;
    return element_count * sizeof(float);
}
75 |
|
|
|
76 |
|
185 |
void kai_run_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla( |
77 |
|
|
size_t m, size_t n, size_t k, // |
78 |
|
|
const void* lhs_packed, // |
79 |
|
|
const void* rhs_packed, // |
80 |
|
|
void* dst, size_t dst_stride_row, size_t dst_stride_col, // |
81 |
|
|
float clamp_min, float clamp_max) { |
82 |
|
− |
KAI_ASSERT(dst_stride_col == sizeof(float)); |
83 |
|
|
|
84 |
|
185 |
const void* Apanel = lhs_packed; |
85 |
|
185 |
void* Cpanel = dst; |
86 |
|
185 |
size_t ldc = dst_stride_row / sizeof(float); |
87 |
|
|
|
88 |
|
185 |
size_t M = m; |
89 |
|
|
|
90 |
|
|
typedef struct { |
91 |
|
|
float maxval; |
92 |
|
|
float minval; |
93 |
|
|
size_t N; |
94 |
|
|
size_t K; |
95 |
|
|
const void* Bpanel; |
96 |
|
|
void* output_ptr; |
97 |
|
|
} KernelArgs; |
98 |
|
|
|
99 |
|
185 |
KernelArgs ka; |
100 |
|
|
|
101 |
|
185 |
ka.N = n; |
102 |
|
185 |
ka.K = kai_roundup(k, kai_kr) / kai_kr - 1; |
103 |
|
|
|
104 |
|
185 |
ka.Bpanel = rhs_packed; |
105 |
|
|
|
106 |
|
|
// Direct output. |
107 |
|
185 |
ka.output_ptr = dst; |
108 |
|
|
|
109 |
|
|
// Clamping output. |
110 |
|
185 |
ka.maxval = clamp_max; |
111 |
|
185 |
ka.minval = clamp_min; |
112 |
|
|
|
113 |
|
370 |
__asm__ __volatile__( |
114 |
|
|
"1:" // Height loop |
115 |
|
|
"add x11, %x[Cpanel], %x[ldc], LSL #2\n" |
116 |
|
|
"add x10, %x[Cpanel], %x[ldc], LSL #1\n" |
117 |
|
|
"add x9, x11, %x[ldc], LSL #1\n" |
118 |
|
|
"cmp %x[M], #0x8\n" |
119 |
|
|
"add x28, %x[Cpanel], %x[ldc], LSL #3\n" |
120 |
|
|
"add x27, %x[Cpanel], %x[ldc]\n" |
121 |
|
|
"add x26, x10, %x[ldc]\n" |
122 |
|
|
"add x25, x11, %x[ldc]\n" |
123 |
|
|
"add x24, x9, %x[ldc]\n" |
124 |
|
|
"bge 2f\n" |
125 |
|
|
"cmp %x[M], #0x2\n" |
126 |
|
|
"mov x24, %x[Cpanel]\n" |
127 |
|
|
"csel x27, x27, %x[Cpanel], GE\n" |
128 |
|
|
"csel x10, x10, %x[Cpanel], GT\n" |
129 |
|
|
"cmp %x[M], #0x4\n" |
130 |
|
|
"csel x26, x26, %x[Cpanel], GE\n" |
131 |
|
|
"csel x11, x11, %x[Cpanel], GT\n" |
132 |
|
|
"cmp %x[M], #0x6\n" |
133 |
|
|
"csel x25, x25, %x[Cpanel], GE\n" |
134 |
|
|
"csel x9, x9, %x[Cpanel], GT\n" |
135 |
|
|
"2:" // all rows valid |
136 |
|
|
"ldr x23, [%x[args_ptr], %[offsetof_N]]\n" |
137 |
|
|
"ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n" |
138 |
|
|
"mov x21, %x[Apanel]\n" |
139 |
|
|
"3:" // Width loop |
140 |
|
|
"ldr q4, [x22, #0x0]\n" |
141 |
|
|
"ldr q5, [x22, #0x10]\n" |
142 |
|
|
"mov %x[Apanel], x21\n" |
143 |
|
|
"ldr q6, [x22, #0x20]\n" |
144 |
|
|
"ldr x20, [%x[args_ptr], %[offsetof_K]]\n" |
145 |
|
|
"add x22, x22, #0x30\n" |
146 |
|
|
"ldr q7, [x22, #0x0]\n" |
147 |
|
|
"ldr q0, [%x[Apanel], #0x0]\n" |
148 |
|
|
"ldr q1, [%x[Apanel], #0x10]\n" |
149 |
|
|
"zip1 v8.2d, v4.2d, v4.2d\n" |
150 |
|
|
"ldr q2, [%x[Apanel], #0x20]\n" |
151 |
|
|
"zip2 v11.2d, v4.2d, v4.2d\n" |
152 |
|
|
"ldr q4, [x22, #0x10]\n" |
153 |
|
|
"zip1 v9.2d, v5.2d, v5.2d\n" |
154 |
|
|
"zip2 v12.2d, v5.2d, v5.2d\n" |
155 |
|
|
"cmp x20, #0x2\n" |
156 |
|
|
"zip1 v10.2d, v6.2d, v6.2d\n" |
157 |
|
|
"zip2 v13.2d, v6.2d, v6.2d\n" |
158 |
|
|
"prfm pldl1keep, [%x[Apanel], #0x0]\n" |
159 |
|
|
"mov v14.16b, v8.16b\n" |
160 |
|
|
"mov v17.16b, v11.16b\n" |
161 |
|
|
"prfm pldl1keep, [x22, #0x0]\n" |
162 |
|
|
"mov v15.16b, v9.16b\n" |
163 |
|
|
"mov v18.16b, v12.16b\n" |
164 |
|
|
"prfm pldl1keep, [x22, #0x40]\n" |
165 |
|
|
"mov v16.16b, v10.16b\n" |
166 |
|
|
"mov v19.16b, v13.16b\n" |
167 |
|
|
"prfm pldl1keep, [%x[Apanel], #0x40]\n" |
168 |
|
|
"mov v20.16b, v8.16b\n" |
169 |
|
|
"mov v21.16b, v9.16b\n" |
170 |
|
|
"prfm pldl1keep, [x22, #0x80]\n" |
171 |
|
|
"mov v22.16b, v10.16b\n" |
172 |
|
|
"mov v23.16b, v11.16b\n" |
173 |
|
|
"prfm pldl1keep, [%x[Apanel], #0x80]\n" |
174 |
|
|
"mov v24.16b, v12.16b\n" |
175 |
|
|
"mov v25.16b, v13.16b\n" |
176 |
|
|
"prfm pldl1keep, [x22, #0xc0]\n" |
177 |
|
|
"mov v26.16b, v8.16b\n" |
178 |
|
|
"mov v27.16b, v9.16b\n" |
179 |
|
|
"prfm pldl1keep, [x22, #0x100]\n" |
180 |
|
|
"mov v28.16b, v10.16b\n" |
181 |
|
|
"mov v29.16b, v11.16b\n" |
182 |
|
|
"prfm pldl1keep, [%x[Apanel], #0xc0]\n" |
183 |
|
|
"mov v30.16b, v12.16b\n" |
184 |
|
|
"mov v31.16b, v13.16b\n" |
185 |
|
|
"prfm pldl1keep, [x22, #0x140]\n" |
186 |
|
|
"add x22, x22, #0x20\n" |
187 |
|
|
"add %x[Apanel], %x[Apanel], #0x30\n" |
188 |
|
|
"blt 5f\n" |
189 |
|
|
"4:" // main loop head |
190 |
|
|
"ldr q3, [%x[Apanel], #0x0]\n" |
191 |
|
|
"ldr q5, [x22, #0x0]\n" |
192 |
|
|
".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" |
193 |
|
|
"ldr q6, [x22, #0x10]\n" |
194 |
|
|
".inst 0x6e44ec0b // bfmmla v11.4s, v0.8h, v4.8h\n" |
195 |
|
|
".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n" |
196 |
|
|
".inst 0x6e44ec31 // bfmmla v17.4s, v1.8h, v4.8h\n" |
197 |
|
|
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" |
198 |
|
|
"sub x20, x20, #0x2\n" |
199 |
|
|
".inst 0x6e44ec57 // bfmmla v23.4s, v2.8h, v4.8h\n" |
200 |
|
|
".inst 0x6e47ec7a // bfmmla v26.4s, v3.8h, v7.8h\n" |
201 |
|
|
"ldr q7, [x22, #0x20]\n" |
202 |
|
|
".inst 0x6e44ec7d // bfmmla v29.4s, v3.8h, v4.8h\n" |
203 |
|
|
"ldr q4, [x22, #0x30]\n" |
204 |
|
|
".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" |
205 |
|
|
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" |
206 |
|
|
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" |
207 |
|
|
"cmp x20, #0x2\n" |
208 |
|
|
".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n" |
209 |
|
|
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" |
210 |
|
|
"prfm pldl1keep, [%x[Apanel], #0x100]\n" |
211 |
|
|
".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n" |
212 |
|
|
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" |
213 |
|
|
"ldr q5, [x22, #0x40]\n" |
214 |
|
|
".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n" |
215 |
|
|
"ldr q6, [x22, #0x50]\n" |
216 |
|
|
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" |
217 |
|
|
".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n" |
218 |
|
|
"ldr q0, [%x[Apanel], #0x10]\n" |
219 |
|
|
".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n" |
220 |
|
|
".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n" |
221 |
|
|
"ldr q1, [%x[Apanel], #0x20]\n" |
222 |
|
|
".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" |
223 |
|
|
".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n" |
224 |
|
|
"ldr q2, [%x[Apanel], #0x30]\n" |
225 |
|
|
".inst 0x6e47ec7c // bfmmla v28.4s, v3.8h, v7.8h\n" |
226 |
|
|
"ldr q7, [x22, #0x60]\n" |
227 |
|
|
".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n" |
228 |
|
|
"ldr q3, [%x[Apanel], #0x40]\n" |
229 |
|
|
"ldr q4, [x22, #0x70]\n" |
230 |
|
|
".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n" |
231 |
|
|
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" |
232 |
|
|
".inst 0x6e45ec2e // bfmmla v14.4s, v1.8h, v5.8h\n" |
233 |
|
|
".inst 0x6e46ec31 // bfmmla v17.4s, v1.8h, v6.8h\n" |
234 |
|
|
"prfm pldl1keep, [x22, #0x180]\n" |
235 |
|
|
".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" |
236 |
|
|
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" |
237 |
|
|
"prfm pldl1keep, [x22, #0x1c0]\n" |
238 |
|
|
".inst 0x6e45ec7a // bfmmla v26.4s, v3.8h, v5.8h\n" |
239 |
|
|
"ldr q5, [x22, #0x80]\n" |
240 |
|
|
".inst 0x6e46ec7d // bfmmla v29.4s, v3.8h, v6.8h\n" |
241 |
|
|
"ldr q6, [x22, #0x90]\n" |
242 |
|
|
"prfm pldl1keep, [%x[Apanel], #0x140]\n" |
243 |
|
|
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" |
244 |
|
|
"prfm pldl1keep, [x22, #0x200]\n" |
245 |
|
|
".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" |
246 |
|
|
".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n" |
247 |
|
|
".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n" |
248 |
|
|
".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" |
249 |
|
|
".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" |
250 |
|
|
".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n" |
251 |
|
|
"ldr q7, [x22, #0xa0]\n" |
252 |
|
|
".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n" |
253 |
|
|
"ldr q4, [x22, #0xb0]\n" |
254 |
|
|
".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" |
255 |
|
|
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" |
256 |
|
|
"ldr q0, [%x[Apanel], #0x50]\n" |
257 |
|
|
".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n" |
258 |
|
|
".inst 0x6e46ec33 // bfmmla v19.4s, v1.8h, v6.8h\n" |
259 |
|
|
"ldr q1, [%x[Apanel], #0x60]\n" |
260 |
|
|
".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" |
261 |
|
|
".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" |
262 |
|
|
"ldr q2, [%x[Apanel], #0x70]\n" |
263 |
|
|
".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n" |
264 |
|
|
".inst 0x6e46ec7f // bfmmla v31.4s, v3.8h, v6.8h\n" |
265 |
|
|
"add %x[Apanel], %x[Apanel], #0x80\n" |
266 |
|
|
"add x22, x22, #0xc0\n" |
267 |
|
|
"bge 4b\n" |
268 |
|
|
"5:" // main loop skip |
269 |
|
|
"ldr q3, [%x[Apanel], #0x0]\n" |
270 |
|
|
"ldr q5, [x22, #0x0]\n" |
271 |
|
|
".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" |
272 |
|
|
"ldr q6, [x22, #0x10]\n" |
273 |
|
|
".inst 0x6e44ec0b // bfmmla v11.4s, v0.8h, v4.8h\n" |
274 |
|
|
".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n" |
275 |
|
|
".inst 0x6e44ec31 // bfmmla v17.4s, v1.8h, v4.8h\n" |
276 |
|
|
".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" |
277 |
|
|
"add %x[Apanel], %x[Apanel], #0x10\n" |
278 |
|
|
".inst 0x6e44ec57 // bfmmla v23.4s, v2.8h, v4.8h\n" |
279 |
|
|
".inst 0x6e47ec7a // bfmmla v26.4s, v3.8h, v7.8h\n" |
280 |
|
|
"ldr q7, [x22, #0x20]\n" |
281 |
|
|
".inst 0x6e44ec7d // bfmmla v29.4s, v3.8h, v4.8h\n" |
282 |
|
|
"ldr q4, [x22, #0x30]\n" |
283 |
|
|
".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" |
284 |
|
|
".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" |
285 |
|
|
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" |
286 |
|
|
"add x22, x22, #0x40\n" |
287 |
|
|
".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n" |
288 |
|
|
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" |
289 |
|
|
".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n" |
290 |
|
|
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" |
291 |
|
|
".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n" |
292 |
|
|
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" |
293 |
|
|
".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n" |
294 |
|
|
".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n" |
295 |
|
|
".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n" |
296 |
|
|
".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" |
297 |
|
|
".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n" |
298 |
|
|
".inst 0x6e47ec7c // bfmmla v28.4s, v3.8h, v7.8h\n" |
299 |
|
|
".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n" |
300 |
|
|
"cbz x20, 6f\n" |
301 |
|
|
"ldr q5, [x22, #0x0]\n" |
302 |
|
|
"ldr q0, [%x[Apanel], #0x0]\n" |
303 |
|
|
"ldr q1, [%x[Apanel], #0x10]\n" |
304 |
|
|
"ldr q6, [x22, #0x10]\n" |
305 |
|
|
"ldr q2, [%x[Apanel], #0x20]\n" |
306 |
|
|
"ldr q3, [%x[Apanel], #0x30]\n" |
307 |
|
|
"add %x[Apanel], %x[Apanel], #0x40\n" |
308 |
|
|
"ldr q7, [x22, #0x20]\n" |
309 |
|
|
"ldr q4, [x22, #0x30]\n" |
310 |
|
|
".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n" |
311 |
|
|
".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" |
312 |
|
|
".inst 0x6e45ec2e // bfmmla v14.4s, v1.8h, v5.8h\n" |
313 |
|
|
".inst 0x6e46ec31 // bfmmla v17.4s, v1.8h, v6.8h\n" |
314 |
|
|
".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n" |
315 |
|
|
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" |
316 |
|
|
".inst 0x6e45ec7a // bfmmla v26.4s, v3.8h, v5.8h\n" |
317 |
|
|
"ldr q5, [x22, #0x40]\n" |
318 |
|
|
".inst 0x6e46ec7d // bfmmla v29.4s, v3.8h, v6.8h\n" |
319 |
|
|
"ldr q6, [x22, #0x50]\n" |
320 |
|
|
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" |
321 |
|
|
".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n" |
322 |
|
|
".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n" |
323 |
|
|
"add x22, x22, #0x60\n" |
324 |
|
|
".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n" |
325 |
|
|
".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" |
326 |
|
|
".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n" |
327 |
|
|
".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n" |
328 |
|
|
".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n" |
329 |
|
|
".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n" |
330 |
|
|
".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" |
331 |
|
|
".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n" |
332 |
|
|
".inst 0x6e46ec33 // bfmmla v19.4s, v1.8h, v6.8h\n" |
333 |
|
|
".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n" |
334 |
|
|
".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n" |
335 |
|
|
".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n" |
336 |
|
|
".inst 0x6e46ec7f // bfmmla v31.4s, v3.8h, v6.8h\n" |
337 |
|
|
"6:" // multiply loop done |
338 |
|
|
"add x20, %x[args_ptr], %[offset_max]\n" |
339 |
|
|
"uzp1 v7.2d, v8.2d, v11.2d\n" |
340 |
|
|
"uzp2 v8.2d, v8.2d, v11.2d\n" |
341 |
|
|
"ld1r { v1.4s }, [x20]\n" |
342 |
|
|
"uzp1 v11.2d, v9.2d, v12.2d\n" |
343 |
|
|
"uzp2 v9.2d, v9.2d, v12.2d\n" |
344 |
|
|
"uzp1 v12.2d, v10.2d, v13.2d\n" |
345 |
|
|
"uzp2 v10.2d, v10.2d, v13.2d\n" |
346 |
|
|
"add x20, %x[args_ptr], %[offset_min]\n" |
347 |
|
|
"ld1r { v0.4s }, [x20]\n" |
348 |
|
|
"uzp1 v13.2d, v14.2d, v17.2d\n" |
349 |
|
|
"uzp2 v14.2d, v14.2d, v17.2d\n" |
350 |
|
|
"uzp1 v17.2d, v15.2d, v18.2d\n" |
351 |
|
|
"uzp2 v15.2d, v15.2d, v18.2d\n" |
352 |
|
|
"cmp x23, #0xc\n" |
353 |
|
|
"uzp1 v18.2d, v16.2d, v19.2d\n" |
354 |
|
|
"uzp2 v16.2d, v16.2d, v19.2d\n" |
355 |
|
|
"uzp1 v19.2d, v20.2d, v23.2d\n" |
356 |
|
|
"uzp2 v20.2d, v20.2d, v23.2d\n" |
357 |
|
|
"uzp1 v23.2d, v21.2d, v24.2d\n" |
358 |
|
|
"uzp2 v21.2d, v21.2d, v24.2d\n" |
359 |
|
|
"uzp1 v24.2d, v22.2d, v25.2d\n" |
360 |
|
|
"uzp2 v22.2d, v22.2d, v25.2d\n" |
361 |
|
|
"uzp1 v25.2d, v26.2d, v29.2d\n" |
362 |
|
|
"uzp2 v26.2d, v26.2d, v29.2d\n" |
363 |
|
|
"uzp1 v29.2d, v27.2d, v30.2d\n" |
364 |
|
|
"uzp2 v27.2d, v27.2d, v30.2d\n" |
365 |
|
|
"uzp1 v30.2d, v28.2d, v31.2d\n" |
366 |
|
|
"uzp2 v28.2d, v28.2d, v31.2d\n" |
367 |
|
|
"fmin v7.4s, v7.4s, v1.4s\n" |
368 |
|
|
"fmin v11.4s, v11.4s, v1.4s\n" |
369 |
|
|
"fmin v12.4s, v12.4s, v1.4s\n" |
370 |
|
|
"fmin v8.4s, v8.4s, v1.4s\n" |
371 |
|
|
"fmin v9.4s, v9.4s, v1.4s\n" |
372 |
|
|
"fmin v10.4s, v10.4s, v1.4s\n" |
373 |
|
|
"fmin v13.4s, v13.4s, v1.4s\n" |
374 |
|
|
"fmin v17.4s, v17.4s, v1.4s\n" |
375 |
|
|
"fmin v18.4s, v18.4s, v1.4s\n" |
376 |
|
|
"fmin v14.4s, v14.4s, v1.4s\n" |
377 |
|
|
"fmin v15.4s, v15.4s, v1.4s\n" |
378 |
|
|
"fmin v16.4s, v16.4s, v1.4s\n" |
379 |
|
|
"fmin v19.4s, v19.4s, v1.4s\n" |
380 |
|
|
"fmin v23.4s, v23.4s, v1.4s\n" |
381 |
|
|
"fmin v24.4s, v24.4s, v1.4s\n" |
382 |
|
|
"fmin v20.4s, v20.4s, v1.4s\n" |
383 |
|
|
"fmin v21.4s, v21.4s, v1.4s\n" |
384 |
|
|
"fmin v22.4s, v22.4s, v1.4s\n" |
385 |
|
|
"fmin v25.4s, v25.4s, v1.4s\n" |
386 |
|
|
"fmin v29.4s, v29.4s, v1.4s\n" |
387 |
|
|
"fmin v30.4s, v30.4s, v1.4s\n" |
388 |
|
|
"fmin v26.4s, v26.4s, v1.4s\n" |
389 |
|
|
"fmin v27.4s, v27.4s, v1.4s\n" |
390 |
|
|
"fmin v28.4s, v28.4s, v1.4s\n" |
391 |
|
|
"fmax v7.4s, v7.4s, v0.4s\n" |
392 |
|
|
"fmax v11.4s, v11.4s, v0.4s\n" |
393 |
|
|
"fmax v12.4s, v12.4s, v0.4s\n" |
394 |
|
|
"fmax v8.4s, v8.4s, v0.4s\n" |
395 |
|
|
"fmax v9.4s, v9.4s, v0.4s\n" |
396 |
|
|
"fmax v10.4s, v10.4s, v0.4s\n" |
397 |
|
|
"fmax v13.4s, v13.4s, v0.4s\n" |
398 |
|
|
"fmax v17.4s, v17.4s, v0.4s\n" |
399 |
|
|
"fmax v18.4s, v18.4s, v0.4s\n" |
400 |
|
|
"fmax v14.4s, v14.4s, v0.4s\n" |
401 |
|
|
"fmax v15.4s, v15.4s, v0.4s\n" |
402 |
|
|
"fmax v16.4s, v16.4s, v0.4s\n" |
403 |
|
|
"fmax v19.4s, v19.4s, v0.4s\n" |
404 |
|
|
"fmax v23.4s, v23.4s, v0.4s\n" |
405 |
|
|
"fmax v24.4s, v24.4s, v0.4s\n" |
406 |
|
|
"fmax v20.4s, v20.4s, v0.4s\n" |
407 |
|
|
"fmax v21.4s, v21.4s, v0.4s\n" |
408 |
|
|
"fmax v22.4s, v22.4s, v0.4s\n" |
409 |
|
|
"fmax v25.4s, v25.4s, v0.4s\n" |
410 |
|
|
"fmax v29.4s, v29.4s, v0.4s\n" |
411 |
|
|
"fmax v30.4s, v30.4s, v0.4s\n" |
412 |
|
|
"fmax v26.4s, v26.4s, v0.4s\n" |
413 |
|
|
"fmax v27.4s, v27.4s, v0.4s\n" |
414 |
|
|
"fmax v28.4s, v28.4s, v0.4s\n" |
415 |
|
|
"blt 7f\n" |
416 |
|
|
"str q26, [x24, #0x0]\n" |
417 |
|
|
"str q27, [x24, #0x10]\n" |
418 |
|
|
"str q28, [x24, #0x20]\n" |
419 |
|
|
"add x24, x24, #0x30\n" |
420 |
|
|
"str q25, [x9, #0x0]\n" |
421 |
|
|
"str q29, [x9, #0x10]\n" |
422 |
|
|
"str q30, [x9, #0x20]\n" |
423 |
|
|
"add x9, x9, #0x30\n" |
424 |
|
|
"str q20, [x25, #0x0]\n" |
425 |
|
|
"str q21, [x25, #0x10]\n" |
426 |
|
|
"str q22, [x25, #0x20]\n" |
427 |
|
|
"add x25, x25, #0x30\n" |
428 |
|
|
"str q19, [x11, #0x0]\n" |
429 |
|
|
"str q23, [x11, #0x10]\n" |
430 |
|
|
"str q24, [x11, #0x20]\n" |
431 |
|
|
"add x11, x11, #0x30\n" |
432 |
|
|
"str q14, [x26, #0x0]\n" |
433 |
|
|
"str q15, [x26, #0x10]\n" |
434 |
|
|
"str q16, [x26, #0x20]\n" |
435 |
|
|
"add x26, x26, #0x30\n" |
436 |
|
|
"str q13, [x10, #0x0]\n" |
437 |
|
|
"str q17, [x10, #0x10]\n" |
438 |
|
|
"str q18, [x10, #0x20]\n" |
439 |
|
|
"add x10, x10, #0x30\n" |
440 |
|
|
"str q8, [x27, #0x0]\n" |
441 |
|
|
"str q9, [x27, #0x10]\n" |
442 |
|
|
"str q10, [x27, #0x20]\n" |
443 |
|
|
"add x27, x27, #0x30\n" |
444 |
|
|
"str q7, [%x[Cpanel], #0x0]\n" |
445 |
|
|
"str q11, [%x[Cpanel], #0x10]\n" |
446 |
|
|
"str q12, [%x[Cpanel], #0x20]\n" |
447 |
|
|
"add %x[Cpanel], %x[Cpanel], #0x30\n" |
448 |
|
|
"b 14f\n" |
449 |
|
|
"7:" // partial output |
450 |
|
|
"tbz x23, #3, 9f\n" |
451 |
|
|
"st1 { v26.4s }, [x24], #0x10\n" |
452 |
|
|
"st1 { v27.4s }, [x24], #0x10\n" |
453 |
|
|
"st1 { v25.4s }, [x9], #0x10\n" |
454 |
|
|
"st1 { v29.4s }, [x9], #0x10\n" |
455 |
|
|
"st1 { v20.4s }, [x25], #0x10\n" |
456 |
|
|
"st1 { v21.4s }, [x25], #0x10\n" |
457 |
|
|
"st1 { v19.4s }, [x11], #0x10\n" |
458 |
|
|
"st1 { v23.4s }, [x11], #0x10\n" |
459 |
|
|
"st1 { v14.4s }, [x26], #0x10\n" |
460 |
|
|
"st1 { v15.4s }, [x26], #0x10\n" |
461 |
|
|
"st1 { v13.4s }, [x10], #0x10\n" |
462 |
|
|
"st1 { v17.4s }, [x10], #0x10\n" |
463 |
|
|
"st1 { v8.4s }, [x27], #0x10\n" |
464 |
|
|
"st1 { v9.4s }, [x27], #0x10\n" |
465 |
|
|
"st1 { v7.4s }, [%x[Cpanel]], #0x10\n" |
466 |
|
|
"st1 { v11.4s }, [%x[Cpanel]], #0x10\n" |
467 |
|
|
"tbz x23, #1, 8f\n" |
468 |
|
|
"str d28, [x24], #0x8\n" |
469 |
|
|
"str d30, [x9], #0x8\n" |
470 |
|
|
"str d22, [x25], #0x8\n" |
471 |
|
|
"str d24, [x11], #0x8\n" |
472 |
|
|
"str d16, [x26], #0x8\n" |
473 |
|
|
"str d18, [x10], #0x8\n" |
474 |
|
|
"str d10, [x27], #0x8\n" |
475 |
|
|
"str d12, [%x[Cpanel]], #0x8\n" |
476 |
|
|
"tbz x23, #0, 13f\n" |
477 |
|
|
"st1 { v28.s }[2], [x24]\n" |
478 |
|
|
"st1 { v30.s }[2], [x9]\n" |
479 |
|
|
"st1 { v22.s }[2], [x25]\n" |
480 |
|
|
"st1 { v24.s }[2], [x11]\n" |
481 |
|
|
"st1 { v16.s }[2], [x26]\n" |
482 |
|
|
"st1 { v18.s }[2], [x10]\n" |
483 |
|
|
"st1 { v10.s }[2], [x27]\n" |
484 |
|
|
"st1 { v12.s }[2], [%x[Cpanel]]\n" |
485 |
|
|
"b 13f\n" |
486 |
|
|
"8:" // partial result store: partial_1_8 |
487 |
|
|
"tbz x23, #0, 13f\n" |
488 |
|
|
"str s28, [x24, #0x0]\n" |
489 |
|
|
"str s30, [x9, #0x0]\n" |
490 |
|
|
"str s22, [x25, #0x0]\n" |
491 |
|
|
"str s24, [x11, #0x0]\n" |
492 |
|
|
"str s16, [x26, #0x0]\n" |
493 |
|
|
"str s18, [x10, #0x0]\n" |
494 |
|
|
"str s10, [x27, #0x0]\n" |
495 |
|
|
"str s12, [%x[Cpanel], #0x0]\n" |
496 |
|
|
"b 13f\n" |
497 |
|
|
"9:" // partial result store: partial_4_0 |
498 |
|
|
"tbz x23, #2, 11f\n" |
499 |
|
|
"st1 { v26.4s }, [x24], #0x10\n" |
500 |
|
|
"st1 { v25.4s }, [x9], #0x10\n" |
501 |
|
|
"st1 { v20.4s }, [x25], #0x10\n" |
502 |
|
|
"st1 { v19.4s }, [x11], #0x10\n" |
503 |
|
|
"st1 { v14.4s }, [x26], #0x10\n" |
504 |
|
|
"st1 { v13.4s }, [x10], #0x10\n" |
505 |
|
|
"st1 { v8.4s }, [x27], #0x10\n" |
506 |
|
|
"st1 { v7.4s }, [%x[Cpanel]], #0x10\n" |
507 |
|
|
"tbz x23, #1, 10f\n" |
508 |
|
|
"str d27, [x24], #0x8\n" |
509 |
|
|
"str d29, [x9], #0x8\n" |
510 |
|
|
"str d21, [x25], #0x8\n" |
511 |
|
|
"str d23, [x11], #0x8\n" |
512 |
|
|
"str d15, [x26], #0x8\n" |
513 |
|
|
"str d17, [x10], #0x8\n" |
514 |
|
|
"str d9, [x27], #0x8\n" |
515 |
|
|
"str d11, [%x[Cpanel]], #0x8\n" |
516 |
|
|
"tbz x23, #0, 13f\n" |
517 |
|
|
"st1 { v27.s }[2], [x24]\n" |
518 |
|
|
"st1 { v29.s }[2], [x9]\n" |
519 |
|
|
"st1 { v21.s }[2], [x25]\n" |
520 |
|
|
"st1 { v23.s }[2], [x11]\n" |
521 |
|
|
"st1 { v15.s }[2], [x26]\n" |
522 |
|
|
"st1 { v17.s }[2], [x10]\n" |
523 |
|
|
"st1 { v9.s }[2], [x27]\n" |
524 |
|
|
"st1 { v11.s }[2], [%x[Cpanel]]\n" |
525 |
|
|
"b 13f\n" |
526 |
|
|
"10:" // partial result store: partial_1_4 |
527 |
|
|
"tbz x23, #0, 13f\n" |
528 |
|
|
"str s27, [x24, #0x0]\n" |
529 |
|
|
"str s29, [x9, #0x0]\n" |
530 |
|
|
"str s21, [x25, #0x0]\n" |
531 |
|
|
"str s23, [x11, #0x0]\n" |
532 |
|
|
"str s15, [x26, #0x0]\n" |
533 |
|
|
"str s17, [x10, #0x0]\n" |
534 |
|
|
"str s9, [x27, #0x0]\n" |
535 |
|
|
"str s11, [%x[Cpanel], #0x0]\n" |
536 |
|
|
"b 13f\n" |
537 |
|
|
"11:" // partial result store: partial_2_0 |
538 |
|
|
"tbz x23, #1, 12f\n" |
539 |
|
|
"str d26, [x24], #0x8\n" |
540 |
|
|
"str d25, [x9], #0x8\n" |
541 |
|
|
"str d20, [x25], #0x8\n" |
542 |
|
|
"str d19, [x11], #0x8\n" |
543 |
|
|
"str d14, [x26], #0x8\n" |
544 |
|
|
"str d13, [x10], #0x8\n" |
545 |
|
|
"str d8, [x27], #0x8\n" |
546 |
|
|
"str d7, [%x[Cpanel]], #0x8\n" |
547 |
|
|
"tbz x23, #0, 13f\n" |
548 |
|
|
"st1 { v26.s }[2], [x24]\n" |
549 |
|
|
"st1 { v25.s }[2], [x9]\n" |
550 |
|
|
"st1 { v20.s }[2], [x25]\n" |
551 |
|
|
"st1 { v19.s }[2], [x11]\n" |
552 |
|
|
"st1 { v14.s }[2], [x26]\n" |
553 |
|
|
"st1 { v13.s }[2], [x10]\n" |
554 |
|
|
"st1 { v8.s }[2], [x27]\n" |
555 |
|
|
"st1 { v7.s }[2], [%x[Cpanel]]\n" |
556 |
|
|
"b 13f\n" |
557 |
|
|
"12:" // partial result store: partial_1_0 |
558 |
|
|
"str s26, [x24, #0x0]\n" |
559 |
|
|
"str s25, [x9, #0x0]\n" |
560 |
|
|
"str s20, [x25, #0x0]\n" |
561 |
|
|
"str s19, [x11, #0x0]\n" |
562 |
|
|
"str s14, [x26, #0x0]\n" |
563 |
|
|
"str s13, [x10, #0x0]\n" |
564 |
|
|
"str s8, [x27, #0x0]\n" |
565 |
|
|
"str s7, [%x[Cpanel], #0x0]\n" |
566 |
|
|
"13:" // partial result store: Done |
567 |
|
|
"14:" // store done |
568 |
|
|
"subs x23, x23, #0xc\n" |
569 |
|
|
"bgt 3b\n" |
570 |
|
|
"subs %x[M], %x[M], #0x8\n" |
571 |
|
|
"mov %x[Cpanel], x28\n" |
572 |
|
|
"bgt 1b\n" |
573 |
|
|
: [Apanel] "+&r"(Apanel), [Cpanel] "+&r"(Cpanel), [M] "+&r"(M) |
574 |
|
185 |
: [args_ptr] "r"(&ka), [ldc] "r"(ldc * sizeof(float)), [offset_max] "I"(offsetof(KernelArgs, maxval)), |
575 |
|
|
[offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_Bpanel] "I"(offsetof(KernelArgs, Bpanel)), |
576 |
|
|
[offsetof_K] "I"(offsetof(KernelArgs, K)), [offsetof_N] "I"(offsetof(KernelArgs, N)) |
577 |
|
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", |
578 |
|
|
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", |
579 |
|
|
"v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); |
580 |
|
185 |
} |
581 |
|
|
|
582 |
|
|
#endif // Architectural features check. |
583 |
|
|
|