KleidiAI Coverage Report


Directory: ./
File: kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.c
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 94.3% 33 5 40
Functions: 90.9% 10 0 11
Branches: -% 0 10 10

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 // Do not flag up inline assembly blocks
8 #pragma GCC diagnostic ignored "-Woverlength-strings"
9
10 #if !defined(__aarch64__) || !defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
11 #error This file must be compiled for AArch64, FEAT_BF16.
12 #else // Architectural features check.
13
14 #include "kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.h"
15
16 #include <arm_bf16.h>
17 #include <arm_neon.h>
18 #include <stddef.h>
19 #include <stdint.h>
20
21 #include "kai/kai_common.h"
22
// Blocking constants of this micro-kernel; the getter functions in this file return them unchanged.
// kai_mr: m step; the offset helpers require m_idx to be a multiple of it.
23 static const size_t kai_mr = 8;
// kai_nr: n step; the offset helpers require n_idx to be a multiple of it.
24 static const size_t kai_nr = 12;
// kai_kr: granularity that k is rounded up to (via kai_roundup) when computing packed sizes and the K loop count.
25 static const size_t kai_kr = 4;
// kai_sr: only reported through the sr getter in this file.
26 static const size_t kai_sr = 1;
27
// Returns the m step of this micro-kernel, which is kai_mr (8).
28 200 size_t kai_get_m_step_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) {
29 200 return kai_mr;
30 }
31
// Returns the n step of this micro-kernel, which is kai_nr (12).
32 200 size_t kai_get_n_step_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) {
33 200 return kai_nr;
34 }
35
// Returns the mr blocking constant, kai_mr (8).
36 size_t kai_get_mr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) {
37 return kai_mr;
38 }
39
// Returns the nr blocking constant, kai_nr (12).
40 40 size_t kai_get_nr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) {
41 40 return kai_nr;
42 }
43
// Returns the kr blocking constant, kai_kr (4).
44 40 size_t kai_get_kr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) {
45 40 return kai_kr;
46 }
47
// Returns the sr blocking constant, kai_sr (1).
48 40 size_t kai_get_sr_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(void) {
49 40 return kai_sr;
50 }
51
// Returns the byte offset into the packed LHS buffer for row index m_idx.
//
// m_idx: row index; must be a multiple of kai_mr (8).
// k:     depth of the matmul; rounded up with kai_roundup(k, kai_kr) before use.
//
// The result is m_idx * rounded_k * sizeof(uint16_t), i.e. each row accounts for
// rounded_k 16-bit elements.
// NOTE(review): kai_roundup and KAI_ASSUME come from kai/kai_common.h, which is not
// visible here; presumably round-up-to-multiple and a precondition check - confirm.
52 184 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(size_t m_idx, size_t k) {
53 KAI_ASSUME(m_idx % kai_mr == 0);
54
55 184 return m_idx * kai_roundup(k, kai_kr) * sizeof(uint16_t);
56 }
57
// Returns the byte offset into the packed RHS buffer for column index n_idx.
//
// n_idx: column index; must be a multiple of kai_nr (12).
// k:     depth of the matmul; rounded up with kai_roundup(k, kai_kr) before use.
//
// Each column accounts for one float plus rounded_k 16-bit elements. The run function
// in this file loads the leading floats of each 12-column panel as the initial
// accumulator values.
// NOTE(review): kai_roundup and KAI_ASSUME come from kai/kai_common.h, which is not
// visible here; presumably round-up-to-multiple and a precondition check - confirm.
58 184 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(size_t n_idx, size_t k) {
59 KAI_ASSUME(n_idx % kai_nr == 0);
60
61 184 return n_idx * (sizeof(float) + kai_roundup(k, kai_kr) * sizeof(uint16_t));
62 }
63
// Returns the byte offset of element (m_idx, n_idx) in the float destination matrix.
//
// m_idx:  row index; must be a multiple of kai_mr (8).
// n_idx:  column index; must be a multiple of kai_nr (12).
// stride: distance between consecutive rows; it is added to a sizeof(float)-scaled
//         column term, so it is treated as a byte count.
64 184 size_t kai_get_dst_offset_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(
65 size_t m_idx, size_t n_idx, size_t stride) {
66 KAI_ASSUME(m_idx % kai_mr == 0);
67 KAI_ASSUME(n_idx % kai_nr == 0);
68
69 184 return m_idx * stride + n_idx * sizeof(float);
70 }
71
// Returns the size in bytes of a densely packed m x n float destination matrix.
72 184 size_t kai_get_dst_size_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(size_t m, size_t n) {
73 184 return m * n * sizeof(float);
74 }
75
// Runs the matmul micro-kernel: accumulates packed LHS x packed RHS with BFMMLA,
// clamps the float results to [clamp_min, clamp_max] and stores them to dst.
//
// m, n, k:         problem dimensions. The assembly walks dst in blocks of 8 rows and
//                  12 columns; k is rounded up to a multiple of kai_kr (4).
// lhs_packed:      packed LHS panel; 0x40 bytes are consumed per kr-block of 8 rows.
// rhs_packed:      packed RHS panel; each 12-column block starts with 0x30 bytes
//                  (12 floats) that seed the accumulators, followed by 0x60 bytes per kr-block.
// dst:             output matrix of floats.
// dst_stride_row:  byte distance between consecutive dst rows.
// dst_stride_col:  must equal sizeof(float) (checked with KAI_ASSERT).
// clamp_min/max:   lower and upper bounds applied to every output element.
//
// NOTE(review): the height and width loops test their counters only after the body,
// so the body (including stores) runs at least once; callers presumably pass
// m >= 1 and n >= 1 - confirm.
76 185 void kai_run_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla(
77 size_t m, size_t n, size_t k, //
78 const void* lhs_packed, //
79 const void* rhs_packed, //
80 void* dst, size_t dst_stride_row, size_t dst_stride_col, //
81 float clamp_min, float clamp_max) {
// Only contiguous float columns are supported.
82 KAI_ASSERT(dst_stride_col == sizeof(float));
83
84 185 const void* Apanel = lhs_packed;
85 185 void* Cpanel = dst;
// Row stride in float elements; it is scaled back to bytes in the asm operand list.
86 185 size_t ldc = dst_stride_row / sizeof(float);
87
88 185 size_t M = m;
89
// Arguments the assembly reads through args_ptr at offsetof()-derived offsets.
90 typedef struct {
91 float maxval;
92 float minval;
93 size_t N;
94 size_t K;
95 const void* Bpanel;
96 void* output_ptr;
97 } KernelArgs;
98
99 185 KernelArgs ka;
100
101 185 ka.N = n;
// Number of kr-blocks minus one: the asm always processes one block after its main loop.
// NOTE(review): for k == 0 this size_t expression would wrap; presumably k >= 1 - confirm.
102 185 ka.K = kai_roundup(k, kai_kr) / kai_kr - 1;
103
104 185 ka.Bpanel = rhs_packed;
105
106 // Direct output.
// NOTE(review): none of the asm operands below references output_ptr; dst reaches the
// assembly through the Cpanel operand instead.
107 185 ka.output_ptr = dst;
108
109 // Clamping output.
110 185 ka.maxval = clamp_max;
111 185 ka.minval = clamp_min;
112
// Register roles in the assembly below:
//   Cpanel, x27, x10, x26, x11, x25, x9, x24 -> dst rows 0..7 of the current 8-row block
//   x28 -> first row of the next 8-row block, x23 -> remaining columns,
//   x22 -> RHS panel cursor, x21 -> start of the LHS panel for this row block,
//   x20 -> K loop counter / scratch, v8-v31 -> accumulators.
113 370 __asm__ __volatile__(
114 "1:" // Height loop
115 "add x11, %x[Cpanel], %x[ldc], LSL #2\n"
116 "add x10, %x[Cpanel], %x[ldc], LSL #1\n"
117 "add x9, x11, %x[ldc], LSL #1\n"
118 "cmp %x[M], #0x8\n"
119 "add x28, %x[Cpanel], %x[ldc], LSL #3\n"
120 "add x27, %x[Cpanel], %x[ldc]\n"
121 "add x26, x10, %x[ldc]\n"
122 "add x25, x11, %x[ldc]\n"
123 "add x24, x9, %x[ldc]\n"
124 "bge 2f\n"
// Fewer than 8 rows left: every row pointer at or beyond M is redirected to row 0 (Cpanel).
125 "cmp %x[M], #0x2\n"
126 "mov x24, %x[Cpanel]\n"
127 "csel x27, x27, %x[Cpanel], GE\n"
128 "csel x10, x10, %x[Cpanel], GT\n"
129 "cmp %x[M], #0x4\n"
130 "csel x26, x26, %x[Cpanel], GE\n"
131 "csel x11, x11, %x[Cpanel], GT\n"
132 "cmp %x[M], #0x6\n"
133 "csel x25, x25, %x[Cpanel], GE\n"
134 "csel x9, x9, %x[Cpanel], GT\n"
135 "2:" // all rows valid
// N and the RHS panel pointer are reloaded for every 8-row block; x21 remembers where
// this row block's LHS panel starts so the width loop can rewind to it.
136 "ldr x23, [%x[args_ptr], %[offsetof_N]]\n"
137 "ldr x22, [%x[args_ptr], %[offsetof_Bpanel]]\n"
138 "mov x21, %x[Apanel]\n"
139 "3:" // Width loop
// The first 0x30 bytes of the RHS block are loaded into q4-q6 and spread with zip1/zip2
// and mov into all 24 accumulators (v8-v31) as their initial values.
140 "ldr q4, [x22, #0x0]\n"
141 "ldr q5, [x22, #0x10]\n"
142 "mov %x[Apanel], x21\n"
143 "ldr q6, [x22, #0x20]\n"
144 "ldr x20, [%x[args_ptr], %[offsetof_K]]\n"
145 "add x22, x22, #0x30\n"
146 "ldr q7, [x22, #0x0]\n"
147 "ldr q0, [%x[Apanel], #0x0]\n"
148 "ldr q1, [%x[Apanel], #0x10]\n"
149 "zip1 v8.2d, v4.2d, v4.2d\n"
150 "ldr q2, [%x[Apanel], #0x20]\n"
151 "zip2 v11.2d, v4.2d, v4.2d\n"
152 "ldr q4, [x22, #0x10]\n"
153 "zip1 v9.2d, v5.2d, v5.2d\n"
154 "zip2 v12.2d, v5.2d, v5.2d\n"
155 "cmp x20, #0x2\n"
156 "zip1 v10.2d, v6.2d, v6.2d\n"
157 "zip2 v13.2d, v6.2d, v6.2d\n"
158 "prfm pldl1keep, [%x[Apanel], #0x0]\n"
159 "mov v14.16b, v8.16b\n"
160 "mov v17.16b, v11.16b\n"
161 "prfm pldl1keep, [x22, #0x0]\n"
162 "mov v15.16b, v9.16b\n"
163 "mov v18.16b, v12.16b\n"
164 "prfm pldl1keep, [x22, #0x40]\n"
165 "mov v16.16b, v10.16b\n"
166 "mov v19.16b, v13.16b\n"
167 "prfm pldl1keep, [%x[Apanel], #0x40]\n"
168 "mov v20.16b, v8.16b\n"
169 "mov v21.16b, v9.16b\n"
170 "prfm pldl1keep, [x22, #0x80]\n"
171 "mov v22.16b, v10.16b\n"
172 "mov v23.16b, v11.16b\n"
173 "prfm pldl1keep, [%x[Apanel], #0x80]\n"
174 "mov v24.16b, v12.16b\n"
175 "mov v25.16b, v13.16b\n"
176 "prfm pldl1keep, [x22, #0xc0]\n"
177 "mov v26.16b, v8.16b\n"
178 "mov v27.16b, v9.16b\n"
179 "prfm pldl1keep, [x22, #0x100]\n"
180 "mov v28.16b, v10.16b\n"
181 "mov v29.16b, v11.16b\n"
182 "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
183 "mov v30.16b, v12.16b\n"
184 "mov v31.16b, v13.16b\n"
185 "prfm pldl1keep, [x22, #0x140]\n"
186 "add x22, x22, #0x20\n"
187 "add %x[Apanel], %x[Apanel], #0x30\n"
188 "blt 5f\n"
189 "4:" // main loop head
// Two kr-blocks per iteration: x20 drops by 2, the LHS cursor advances 0x80 bytes and
// the RHS cursor 0xc0 bytes. Loads for the following block are interleaved with the BFMMLAs.
190 "ldr q3, [%x[Apanel], #0x0]\n"
191 "ldr q5, [x22, #0x0]\n"
192 ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
193 "ldr q6, [x22, #0x10]\n"
194 ".inst 0x6e44ec0b // bfmmla v11.4s, v0.8h, v4.8h\n"
195 ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
196 ".inst 0x6e44ec31 // bfmmla v17.4s, v1.8h, v4.8h\n"
197 ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
198 "sub x20, x20, #0x2\n"
199 ".inst 0x6e44ec57 // bfmmla v23.4s, v2.8h, v4.8h\n"
200 ".inst 0x6e47ec7a // bfmmla v26.4s, v3.8h, v7.8h\n"
201 "ldr q7, [x22, #0x20]\n"
202 ".inst 0x6e44ec7d // bfmmla v29.4s, v3.8h, v4.8h\n"
203 "ldr q4, [x22, #0x30]\n"
204 ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
205 ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
206 ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
207 "cmp x20, #0x2\n"
208 ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
209 ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
210 "prfm pldl1keep, [%x[Apanel], #0x100]\n"
211 ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
212 ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
213 "ldr q5, [x22, #0x40]\n"
214 ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
215 "ldr q6, [x22, #0x50]\n"
216 ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
217 ".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n"
218 "ldr q0, [%x[Apanel], #0x10]\n"
219 ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
220 ".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n"
221 "ldr q1, [%x[Apanel], #0x20]\n"
222 ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
223 ".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n"
224 "ldr q2, [%x[Apanel], #0x30]\n"
225 ".inst 0x6e47ec7c // bfmmla v28.4s, v3.8h, v7.8h\n"
226 "ldr q7, [x22, #0x60]\n"
227 ".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n"
228 "ldr q3, [%x[Apanel], #0x40]\n"
229 "ldr q4, [x22, #0x70]\n"
230 ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
231 ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
232 ".inst 0x6e45ec2e // bfmmla v14.4s, v1.8h, v5.8h\n"
233 ".inst 0x6e46ec31 // bfmmla v17.4s, v1.8h, v6.8h\n"
234 "prfm pldl1keep, [x22, #0x180]\n"
235 ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
236 ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
237 "prfm pldl1keep, [x22, #0x1c0]\n"
238 ".inst 0x6e45ec7a // bfmmla v26.4s, v3.8h, v5.8h\n"
239 "ldr q5, [x22, #0x80]\n"
240 ".inst 0x6e46ec7d // bfmmla v29.4s, v3.8h, v6.8h\n"
241 "ldr q6, [x22, #0x90]\n"
242 "prfm pldl1keep, [%x[Apanel], #0x140]\n"
243 ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
244 "prfm pldl1keep, [x22, #0x200]\n"
245 ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
246 ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
247 ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
248 ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
249 ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
250 ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
251 "ldr q7, [x22, #0xa0]\n"
252 ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
253 "ldr q4, [x22, #0xb0]\n"
254 ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
255 ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
256 "ldr q0, [%x[Apanel], #0x50]\n"
257 ".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n"
258 ".inst 0x6e46ec33 // bfmmla v19.4s, v1.8h, v6.8h\n"
259 "ldr q1, [%x[Apanel], #0x60]\n"
260 ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
261 ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
262 "ldr q2, [%x[Apanel], #0x70]\n"
263 ".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n"
264 ".inst 0x6e46ec7f // bfmmla v31.4s, v3.8h, v6.8h\n"
265 "add %x[Apanel], %x[Apanel], #0x80\n"
266 "add x22, x22, #0xc0\n"
267 "bge 4b\n"
268 "5:" // main loop skip
// One kr-block is always processed here; a further full block follows when x20 is
// still non-zero (cbz below).
269 "ldr q3, [%x[Apanel], #0x0]\n"
270 "ldr q5, [x22, #0x0]\n"
271 ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
272 "ldr q6, [x22, #0x10]\n"
273 ".inst 0x6e44ec0b // bfmmla v11.4s, v0.8h, v4.8h\n"
274 ".inst 0x6e47ec2e // bfmmla v14.4s, v1.8h, v7.8h\n"
275 ".inst 0x6e44ec31 // bfmmla v17.4s, v1.8h, v4.8h\n"
276 ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
277 "add %x[Apanel], %x[Apanel], #0x10\n"
278 ".inst 0x6e44ec57 // bfmmla v23.4s, v2.8h, v4.8h\n"
279 ".inst 0x6e47ec7a // bfmmla v26.4s, v3.8h, v7.8h\n"
280 "ldr q7, [x22, #0x20]\n"
281 ".inst 0x6e44ec7d // bfmmla v29.4s, v3.8h, v4.8h\n"
282 "ldr q4, [x22, #0x30]\n"
283 ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
284 ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
285 ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
286 "add x22, x22, #0x40\n"
287 ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
288 ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
289 ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
290 ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
291 ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
292 ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
293 ".inst 0x6e44ec0d // bfmmla v13.4s, v0.8h, v4.8h\n"
294 ".inst 0x6e47ec30 // bfmmla v16.4s, v1.8h, v7.8h\n"
295 ".inst 0x6e44ec33 // bfmmla v19.4s, v1.8h, v4.8h\n"
296 ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
297 ".inst 0x6e44ec59 // bfmmla v25.4s, v2.8h, v4.8h\n"
298 ".inst 0x6e47ec7c // bfmmla v28.4s, v3.8h, v7.8h\n"
299 ".inst 0x6e44ec7f // bfmmla v31.4s, v3.8h, v4.8h\n"
300 "cbz x20, 6f\n"
301 "ldr q5, [x22, #0x0]\n"
302 "ldr q0, [%x[Apanel], #0x0]\n"
303 "ldr q1, [%x[Apanel], #0x10]\n"
304 "ldr q6, [x22, #0x10]\n"
305 "ldr q2, [%x[Apanel], #0x20]\n"
306 "ldr q3, [%x[Apanel], #0x30]\n"
307 "add %x[Apanel], %x[Apanel], #0x40\n"
308 "ldr q7, [x22, #0x20]\n"
309 "ldr q4, [x22, #0x30]\n"
310 ".inst 0x6e45ec08 // bfmmla v8.4s, v0.8h, v5.8h\n"
311 ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
312 ".inst 0x6e45ec2e // bfmmla v14.4s, v1.8h, v5.8h\n"
313 ".inst 0x6e46ec31 // bfmmla v17.4s, v1.8h, v6.8h\n"
314 ".inst 0x6e45ec54 // bfmmla v20.4s, v2.8h, v5.8h\n"
315 ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
316 ".inst 0x6e45ec7a // bfmmla v26.4s, v3.8h, v5.8h\n"
317 "ldr q5, [x22, #0x40]\n"
318 ".inst 0x6e46ec7d // bfmmla v29.4s, v3.8h, v6.8h\n"
319 "ldr q6, [x22, #0x50]\n"
320 ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
321 ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
322 ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
323 "add x22, x22, #0x60\n"
324 ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
325 ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
326 ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
327 ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
328 ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
329 ".inst 0x6e45ec0a // bfmmla v10.4s, v0.8h, v5.8h\n"
330 ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
331 ".inst 0x6e45ec30 // bfmmla v16.4s, v1.8h, v5.8h\n"
332 ".inst 0x6e46ec33 // bfmmla v19.4s, v1.8h, v6.8h\n"
333 ".inst 0x6e45ec56 // bfmmla v22.4s, v2.8h, v5.8h\n"
334 ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
335 ".inst 0x6e45ec7c // bfmmla v28.4s, v3.8h, v5.8h\n"
336 ".inst 0x6e46ec7f // bfmmla v31.4s, v3.8h, v6.8h\n"
337 "6:" // multiply loop done
// Regroup accumulator pairs with uzp1/uzp2 on 64-bit lanes, then clamp every vector:
// fmin against maxval (broadcast into v1) followed by fmax against minval (broadcast into v0).
// The "cmp x23, #0xc" issued in between sets the flags consumed by "blt 7f" after the clamp.
338 "add x20, %x[args_ptr], %[offset_max]\n"
339 "uzp1 v7.2d, v8.2d, v11.2d\n"
340 "uzp2 v8.2d, v8.2d, v11.2d\n"
341 "ld1r { v1.4s }, [x20]\n"
342 "uzp1 v11.2d, v9.2d, v12.2d\n"
343 "uzp2 v9.2d, v9.2d, v12.2d\n"
344 "uzp1 v12.2d, v10.2d, v13.2d\n"
345 "uzp2 v10.2d, v10.2d, v13.2d\n"
346 "add x20, %x[args_ptr], %[offset_min]\n"
347 "ld1r { v0.4s }, [x20]\n"
348 "uzp1 v13.2d, v14.2d, v17.2d\n"
349 "uzp2 v14.2d, v14.2d, v17.2d\n"
350 "uzp1 v17.2d, v15.2d, v18.2d\n"
351 "uzp2 v15.2d, v15.2d, v18.2d\n"
352 "cmp x23, #0xc\n"
353 "uzp1 v18.2d, v16.2d, v19.2d\n"
354 "uzp2 v16.2d, v16.2d, v19.2d\n"
355 "uzp1 v19.2d, v20.2d, v23.2d\n"
356 "uzp2 v20.2d, v20.2d, v23.2d\n"
357 "uzp1 v23.2d, v21.2d, v24.2d\n"
358 "uzp2 v21.2d, v21.2d, v24.2d\n"
359 "uzp1 v24.2d, v22.2d, v25.2d\n"
360 "uzp2 v22.2d, v22.2d, v25.2d\n"
361 "uzp1 v25.2d, v26.2d, v29.2d\n"
362 "uzp2 v26.2d, v26.2d, v29.2d\n"
363 "uzp1 v29.2d, v27.2d, v30.2d\n"
364 "uzp2 v27.2d, v27.2d, v30.2d\n"
365 "uzp1 v30.2d, v28.2d, v31.2d\n"
366 "uzp2 v28.2d, v28.2d, v31.2d\n"
367 "fmin v7.4s, v7.4s, v1.4s\n"
368 "fmin v11.4s, v11.4s, v1.4s\n"
369 "fmin v12.4s, v12.4s, v1.4s\n"
370 "fmin v8.4s, v8.4s, v1.4s\n"
371 "fmin v9.4s, v9.4s, v1.4s\n"
372 "fmin v10.4s, v10.4s, v1.4s\n"
373 "fmin v13.4s, v13.4s, v1.4s\n"
374 "fmin v17.4s, v17.4s, v1.4s\n"
375 "fmin v18.4s, v18.4s, v1.4s\n"
376 "fmin v14.4s, v14.4s, v1.4s\n"
377 "fmin v15.4s, v15.4s, v1.4s\n"
378 "fmin v16.4s, v16.4s, v1.4s\n"
379 "fmin v19.4s, v19.4s, v1.4s\n"
380 "fmin v23.4s, v23.4s, v1.4s\n"
381 "fmin v24.4s, v24.4s, v1.4s\n"
382 "fmin v20.4s, v20.4s, v1.4s\n"
383 "fmin v21.4s, v21.4s, v1.4s\n"
384 "fmin v22.4s, v22.4s, v1.4s\n"
385 "fmin v25.4s, v25.4s, v1.4s\n"
386 "fmin v29.4s, v29.4s, v1.4s\n"
387 "fmin v30.4s, v30.4s, v1.4s\n"
388 "fmin v26.4s, v26.4s, v1.4s\n"
389 "fmin v27.4s, v27.4s, v1.4s\n"
390 "fmin v28.4s, v28.4s, v1.4s\n"
391 "fmax v7.4s, v7.4s, v0.4s\n"
392 "fmax v11.4s, v11.4s, v0.4s\n"
393 "fmax v12.4s, v12.4s, v0.4s\n"
394 "fmax v8.4s, v8.4s, v0.4s\n"
395 "fmax v9.4s, v9.4s, v0.4s\n"
396 "fmax v10.4s, v10.4s, v0.4s\n"
397 "fmax v13.4s, v13.4s, v0.4s\n"
398 "fmax v17.4s, v17.4s, v0.4s\n"
399 "fmax v18.4s, v18.4s, v0.4s\n"
400 "fmax v14.4s, v14.4s, v0.4s\n"
401 "fmax v15.4s, v15.4s, v0.4s\n"
402 "fmax v16.4s, v16.4s, v0.4s\n"
403 "fmax v19.4s, v19.4s, v0.4s\n"
404 "fmax v23.4s, v23.4s, v0.4s\n"
405 "fmax v24.4s, v24.4s, v0.4s\n"
406 "fmax v20.4s, v20.4s, v0.4s\n"
407 "fmax v21.4s, v21.4s, v0.4s\n"
408 "fmax v22.4s, v22.4s, v0.4s\n"
409 "fmax v25.4s, v25.4s, v0.4s\n"
410 "fmax v29.4s, v29.4s, v0.4s\n"
411 "fmax v30.4s, v30.4s, v0.4s\n"
412 "fmax v26.4s, v26.4s, v0.4s\n"
413 "fmax v27.4s, v27.4s, v0.4s\n"
414 "fmax v28.4s, v28.4s, v0.4s\n"
415 "blt 7f\n"
// At least 12 columns remain: store three q registers (12 floats) per row.
// Rows are written from row 7 (x24) down to row 0 (Cpanel), so any pointer that was
// redirected to row 0 for a short row block is overwritten by the real row 0 data.
416 "str q26, [x24, #0x0]\n"
417 "str q27, [x24, #0x10]\n"
418 "str q28, [x24, #0x20]\n"
419 "add x24, x24, #0x30\n"
420 "str q25, [x9, #0x0]\n"
421 "str q29, [x9, #0x10]\n"
422 "str q30, [x9, #0x20]\n"
423 "add x9, x9, #0x30\n"
424 "str q20, [x25, #0x0]\n"
425 "str q21, [x25, #0x10]\n"
426 "str q22, [x25, #0x20]\n"
427 "add x25, x25, #0x30\n"
428 "str q19, [x11, #0x0]\n"
429 "str q23, [x11, #0x10]\n"
430 "str q24, [x11, #0x20]\n"
431 "add x11, x11, #0x30\n"
432 "str q14, [x26, #0x0]\n"
433 "str q15, [x26, #0x10]\n"
434 "str q16, [x26, #0x20]\n"
435 "add x26, x26, #0x30\n"
436 "str q13, [x10, #0x0]\n"
437 "str q17, [x10, #0x10]\n"
438 "str q18, [x10, #0x20]\n"
439 "add x10, x10, #0x30\n"
440 "str q8, [x27, #0x0]\n"
441 "str q9, [x27, #0x10]\n"
442 "str q10, [x27, #0x20]\n"
443 "add x27, x27, #0x30\n"
444 "str q7, [%x[Cpanel], #0x0]\n"
445 "str q11, [%x[Cpanel], #0x10]\n"
446 "str q12, [%x[Cpanel], #0x20]\n"
447 "add %x[Cpanel], %x[Cpanel], #0x30\n"
448 "b 14f\n"
449 "7:" // partial output
// Fewer than 12 columns remain: bits 3, 2, 1 and 0 of x23 select 8-, 4-, 2- and
// 1-column stores respectively, keeping the same row 7 -> row 0 store order.
450 "tbz x23, #3, 9f\n"
451 "st1 { v26.4s }, [x24], #0x10\n"
452 "st1 { v27.4s }, [x24], #0x10\n"
453 "st1 { v25.4s }, [x9], #0x10\n"
454 "st1 { v29.4s }, [x9], #0x10\n"
455 "st1 { v20.4s }, [x25], #0x10\n"
456 "st1 { v21.4s }, [x25], #0x10\n"
457 "st1 { v19.4s }, [x11], #0x10\n"
458 "st1 { v23.4s }, [x11], #0x10\n"
459 "st1 { v14.4s }, [x26], #0x10\n"
460 "st1 { v15.4s }, [x26], #0x10\n"
461 "st1 { v13.4s }, [x10], #0x10\n"
462 "st1 { v17.4s }, [x10], #0x10\n"
463 "st1 { v8.4s }, [x27], #0x10\n"
464 "st1 { v9.4s }, [x27], #0x10\n"
465 "st1 { v7.4s }, [%x[Cpanel]], #0x10\n"
466 "st1 { v11.4s }, [%x[Cpanel]], #0x10\n"
467 "tbz x23, #1, 8f\n"
468 "str d28, [x24], #0x8\n"
469 "str d30, [x9], #0x8\n"
470 "str d22, [x25], #0x8\n"
471 "str d24, [x11], #0x8\n"
472 "str d16, [x26], #0x8\n"
473 "str d18, [x10], #0x8\n"
474 "str d10, [x27], #0x8\n"
475 "str d12, [%x[Cpanel]], #0x8\n"
476 "tbz x23, #0, 13f\n"
477 "st1 { v28.s }[2], [x24]\n"
478 "st1 { v30.s }[2], [x9]\n"
479 "st1 { v22.s }[2], [x25]\n"
480 "st1 { v24.s }[2], [x11]\n"
481 "st1 { v16.s }[2], [x26]\n"
482 "st1 { v18.s }[2], [x10]\n"
483 "st1 { v10.s }[2], [x27]\n"
484 "st1 { v12.s }[2], [%x[Cpanel]]\n"
485 "b 13f\n"
486 "8:" // partial result store: partial_1_8
487 "tbz x23, #0, 13f\n"
488 "str s28, [x24, #0x0]\n"
489 "str s30, [x9, #0x0]\n"
490 "str s22, [x25, #0x0]\n"
491 "str s24, [x11, #0x0]\n"
492 "str s16, [x26, #0x0]\n"
493 "str s18, [x10, #0x0]\n"
494 "str s10, [x27, #0x0]\n"
495 "str s12, [%x[Cpanel], #0x0]\n"
496 "b 13f\n"
497 "9:" // partial result store: partial_4_0
498 "tbz x23, #2, 11f\n"
499 "st1 { v26.4s }, [x24], #0x10\n"
500 "st1 { v25.4s }, [x9], #0x10\n"
501 "st1 { v20.4s }, [x25], #0x10\n"
502 "st1 { v19.4s }, [x11], #0x10\n"
503 "st1 { v14.4s }, [x26], #0x10\n"
504 "st1 { v13.4s }, [x10], #0x10\n"
505 "st1 { v8.4s }, [x27], #0x10\n"
506 "st1 { v7.4s }, [%x[Cpanel]], #0x10\n"
507 "tbz x23, #1, 10f\n"
508 "str d27, [x24], #0x8\n"
509 "str d29, [x9], #0x8\n"
510 "str d21, [x25], #0x8\n"
511 "str d23, [x11], #0x8\n"
512 "str d15, [x26], #0x8\n"
513 "str d17, [x10], #0x8\n"
514 "str d9, [x27], #0x8\n"
515 "str d11, [%x[Cpanel]], #0x8\n"
516 "tbz x23, #0, 13f\n"
517 "st1 { v27.s }[2], [x24]\n"
518 "st1 { v29.s }[2], [x9]\n"
519 "st1 { v21.s }[2], [x25]\n"
520 "st1 { v23.s }[2], [x11]\n"
521 "st1 { v15.s }[2], [x26]\n"
522 "st1 { v17.s }[2], [x10]\n"
523 "st1 { v9.s }[2], [x27]\n"
524 "st1 { v11.s }[2], [%x[Cpanel]]\n"
525 "b 13f\n"
526 "10:" // partial result store: partial_1_4
527 "tbz x23, #0, 13f\n"
528 "str s27, [x24, #0x0]\n"
529 "str s29, [x9, #0x0]\n"
530 "str s21, [x25, #0x0]\n"
531 "str s23, [x11, #0x0]\n"
532 "str s15, [x26, #0x0]\n"
533 "str s17, [x10, #0x0]\n"
534 "str s9, [x27, #0x0]\n"
535 "str s11, [%x[Cpanel], #0x0]\n"
536 "b 13f\n"
537 "11:" // partial result store: partial_2_0
538 "tbz x23, #1, 12f\n"
539 "str d26, [x24], #0x8\n"
540 "str d25, [x9], #0x8\n"
541 "str d20, [x25], #0x8\n"
542 "str d19, [x11], #0x8\n"
543 "str d14, [x26], #0x8\n"
544 "str d13, [x10], #0x8\n"
545 "str d8, [x27], #0x8\n"
546 "str d7, [%x[Cpanel]], #0x8\n"
547 "tbz x23, #0, 13f\n"
548 "st1 { v26.s }[2], [x24]\n"
549 "st1 { v25.s }[2], [x9]\n"
550 "st1 { v20.s }[2], [x25]\n"
551 "st1 { v19.s }[2], [x11]\n"
552 "st1 { v14.s }[2], [x26]\n"
553 "st1 { v13.s }[2], [x10]\n"
554 "st1 { v8.s }[2], [x27]\n"
555 "st1 { v7.s }[2], [%x[Cpanel]]\n"
556 "b 13f\n"
557 "12:" // partial result store: partial_1_0
558 "str s26, [x24, #0x0]\n"
559 "str s25, [x9, #0x0]\n"
560 "str s20, [x25, #0x0]\n"
561 "str s19, [x11, #0x0]\n"
562 "str s14, [x26, #0x0]\n"
563 "str s13, [x10, #0x0]\n"
564 "str s8, [x27, #0x0]\n"
565 "str s7, [%x[Cpanel], #0x0]\n"
566 "13:" // partial result store: Done
567 "14:" // store done
// Advance to the next 12-column block; once the columns are exhausted, move on to the
// next 8-row block, whose first row address was saved in x28.
568 "subs x23, x23, #0xc\n"
569 "bgt 3b\n"
570 "subs %x[M], %x[M], #0x8\n"
571 "mov %x[Cpanel], x28\n"
572 "bgt 1b\n"
// Apanel, Cpanel and M are read-write operands that the assembly updates in place;
// ldc is passed in bytes; the KernelArgs field offsets are passed as immediates.
573 : [Apanel] "+&r"(Apanel), [Cpanel] "+&r"(Cpanel), [M] "+&r"(M)
574 185 : [args_ptr] "r"(&ka), [ldc] "r"(ldc * sizeof(float)), [offset_max] "I"(offsetof(KernelArgs, maxval)),
575 [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_Bpanel] "I"(offsetof(KernelArgs, Bpanel)),
576 [offsetof_K] "I"(offsetof(KernelArgs, K)), [offsetof_N] "I"(offsetof(KernelArgs, N))
577 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
578 "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
579 "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
580 185 }
581
582 #endif // Architectural features check.
583