KleidiAI Coverage Report


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 75.0% high: ≥ 90.0%
Coverage Exec / Excl / Total
Lines: 100.0% 277 / 0 / 277
Functions: 100.0% 1 / 1 / 2
Branches: 51.3% 194 / 0 / 378

benchmark/matmul/matmul_registry.cpp
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #include "matmul_registry.hpp"
8
9 #include <array>
10 #include <cstddef>
11 #include <cstdint>
12 #include <test/common/cpu_info.hpp>
13 #include <test/common/data_type.hpp>
14
15 #include "matmul_benchmark_logic.hpp"
16 #include "matmul_interface.hpp"
17
18 #ifdef __GNUC__
19 #pragma GCC diagnostic push
20 #pragma GCC diagnostic ignored "-Wswitch-default"
21 #endif // __GNUC__
22
23 #include <benchmark/benchmark.h>
24
25 #ifdef __GNUC__
26 #pragma GCC diagnostic pop
27 #endif // __GNUC__
28
29 // Micro-kernels to register for benchmarking
30
31 // matmul_clamp_f16_bf16p_bf16p
32 #include "kai/ukernels/matmul/matmul_clamp_f16_bf16p_bf16p/kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla.h"
33
34 // matmul_clamp_f16_f16_f16p
35 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h"
36 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot.h"
37 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla.h"
38 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla.h"
39 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55.h"
40
41 // matmul_clamp_f16_f16p_f16p
42 #include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
43 #include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa.h"
44
45 // matmul_clamp_f32_bf16p_bf16p
46 #include "kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.h"
47 #include "kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.h"
48
49 // matmul_clamp_f32_f32_f32p
50 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla.h"
51 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla.h"
52 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55.h"
53 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"
54 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla.h"
55 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla.h"
56 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
57
58 // matmul_clamp_f32_f32p_f32p
59 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
60 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
61
62 // matmul_clamp_f32_qai8dxp_qsi4c32p
63 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
64 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4vlx4_1x4vl_sme2_dot.h"
65 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
66 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.h"
67 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
68 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.h"
69 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.h"
70 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
71 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod.h"
72 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.h"
73 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h"
74 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm.h"
75 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm.h"
76
77 // matmul_clamp_f32_qai8dxp_qsi4cxp
78 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa.h"
79 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.h"
80 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h"
81 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h"
82 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h"
83 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod.h"
84 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h"
85 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h"
86 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h"
87 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h"
88 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h"
89
90 // matmul_clamp_f32_qai8dxp_qsi8cxp
91 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
92 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
93 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
94 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot.h"
95 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
96 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
97 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
98 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
99
100 // matmul_clamp_f32_qsi8d32p_qsi4c32p
101 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
102 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
103 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
104 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p8x4_1x8_sve_dotprod.h"
105 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
106 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h"
107 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
108 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
109 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h"
110 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h"
111
112 // matmul_clamp_fp32_bf16p_bf16p
113 #include "kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
114
115 // matmul_clamp_qai8_qai8_qsi8cxp
116 #include "kai/ukernels/matmul/matmul_clamp_qai8_qai8_qsi8cxp/kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot.h"
117
118 // matmul_clamp_qai8_qai8p_qsi8cxp
119 #include "kai/ukernels/matmul/matmul_clamp_qai8_qai8p_qsi8cxp/kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa.h"
120 #include "kai/ukernels/matmul/matmul_clamp_qai8_qai8p_qsi8cxp/kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa.h"
121
122 // matmul_clamp_f16_qai8dxp_qsi4cxp
123 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h"
124 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod.h"
125 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod.h"
126 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm.h"
127
128 // matmul_clamp_f16_qai8dxp_qsi8cxp
129 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
130 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
131 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
132 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
133
134 // matmul_clamp_f16_qsi8d32p_qai4c32p
135 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa.h"
136 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot.h"
137 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod.h"
138 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h"
139 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod.h"
140 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h"
141
142 // matmul_clamp_f32_qsi8d32p_qai4c32p
143 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa.h"
144 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot.h"
145 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod.h"
146 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h"
147 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod.h"
148 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h"
149
150 // matmul_clamp_bf16_qai8dxp_qsi4c32p
151 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4c32p/kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod.h"
152 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4c32p/kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
153
154 // matmul_clamp_bf16_qai8dxp_qsi4cxp
155 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4cxp/kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod.h"
156 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4cxp/kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm.h"
157
158 namespace kai::benchmark {
159 using DataType = test::DataType;
160
161 // matmul_clamp_f16_bf16p_bf16p
162 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface{
163 .run_matmul = kai_run_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla,
164 };
165
166 // matmul_clamp_f16_f16_f16p
167 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla_interface{
168 .run_matmul = kai_run_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla,
169 };
170
171 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot_interface{
172 .run_matmul = kai_run_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot,
173 };
174
175 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla_interface{
176 .run_matmul = kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
177 };
178
179 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_interface{
180 .run_matmul = kai_run_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla,
181 };
182
183 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55_interface{
184 .run_matmul = kai_run_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55,
185 };
186
187 // matmul_clamp_f16_f16p_f16p
188 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa_interface{
189 .run_matmul = kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa,
190 };
191
192 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa_interface{
193 .run_matmul = kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa,
194 };
195
196 // matmul_clamp_f32_bf16p_bf16p
197 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot_interface{
198 .run_matmul = kai_run_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot,
199 };
200
201 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface{
202 .run_matmul = kai_run_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla,
203 };
204
205 // matmul_clamp_f32_f32_f32p
206 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla_interface{
207 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
208 };
209
210 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla_interface{
211 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla,
212 };
213
214 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_interface{
215 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla,
216 };
217
218 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_interface{
219 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla,
220 };
221
222 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55_interface{
223 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55,
224 };
225
226 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla_interface{
227 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla,
228 };
229
230 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla_interface{
231 .run_matmul = kai_run_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla,
232 };
233
234 // matmul_clamp_f32_f32p_f32p
235 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa_interface{
236 .run_matmul = kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
237 };
238
239 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa_interface{
240 .run_matmul = kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa,
241 };
242
243 // matmul_clamp_f32_qai8dxp_qsi4c32p
244 inline constexpr MatMulBlockwiseDynamicQuantInterface
245 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod_interface{
246 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
247 };
248
249 inline constexpr MatMulBlockwiseDynamicQuantInterface
250 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod_interface{
251 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod,
252 };
253
254 inline constexpr MatMulBlockwiseDynamicQuantInterface
255 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface{
256 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
257 };
258
259 inline constexpr MatMulBlockwiseDynamicQuantInterface
260 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod_interface{
261 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod,
262 };
263 inline constexpr MatMulBlockwiseDynamicQuantInterface
264 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod_interface{
265 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod,
266 };
267
268 inline constexpr MatMulBlockwiseDynamicQuantInterface
269 kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod_interface{
270 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod,
271 };
272
273 inline constexpr MatMulBlockwiseDynamicQuantInterface
274 kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod_interface{
275 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod,
276 };
277
278 inline constexpr MatMulBlockwiseDynamicQuantInterface
279 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface{
280 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm,
281 };
282
283 inline constexpr MatMulBlockwiseDynamicQuantInterface
284 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm_interface{
285 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm,
286 };
287
288 inline constexpr MatMulBlockwiseDynamicQuantInterface
289 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm_interface{
290 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm,
291 };
292
293 inline constexpr MatMulBlockwiseDynamicQuantInterface
294 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm_interface{
295 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm,
296 };
297
298 inline constexpr MatMulBlockwiseDynamicQuantInterface
299 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4vlx4_1x4vl_sme2_dot_interface{
300 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4vlx4_1x4vl_sme2_dot,
301 };
302
303 inline constexpr MatMulBlockwiseDynamicQuantInterface
304 kai_matmul_clamp_f32_qai8dxp1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa_interface{
305 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
306 };
307
308 // matmul_clamp_f32_qai8dxp_qsi4cxp
309 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa_interface{
310 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa,
311 };
312
313 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot_interface{
314 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot,
315 };
316
317 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface{
318 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
319 };
320
321 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod_interface{
322 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod,
323 };
324
325 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod_interface{
326 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod,
327 };
328
329 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod_interface{
330 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod,
331 };
332
333 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod_interface{
334 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
335 };
336
337 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm_interface{
338 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm,
339 };
340
341 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm_interface{
342 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm,
343 };
344
345 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm_interface{
346 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm,
347 };
348
349 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm_interface{
350 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm,
351 };
352
353 // matmul_clamp_f32_qai8dxp_qsi8cxp
354 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa_interface{
355 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
356 };
357
358 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot_interface{
359 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot,
360 };
361
362 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface{
363 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
364 };
365
366 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface{
367 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
368 };
369
370 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface{
371 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
372 };
373
374 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface{
375 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
376 };
377
378 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_interface{
379 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
380 };
381
382 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_interface{
383 .run_matmul = kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
384 };
385
386 // matmul_clamp_f32_qsi8d32p_qsi4c32p
387 inline constexpr MatMulBlockwiseDynamicQuantInterface
388 kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa_interface{
389 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
390 };
391
392 inline constexpr MatMulBlockwiseDynamicQuantInterface
393 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot_interface{
394 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
395 };
396
397 inline constexpr MatMulBlockwiseDynamicQuantInterface
398 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod_interface{
399 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
400 };
401
402 inline constexpr MatMulBlockwiseDynamicQuantInterface
403 kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface{
404 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
405 };
406
407 inline constexpr MatMulBlockwiseDynamicQuantInterface
408 kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod_interface{
409 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
410 };
411
412 inline constexpr MatMulBlockwiseDynamicQuantInterface
413 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface{
414 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm,
415 };
416
417 inline constexpr MatMulBlockwiseDynamicQuantInterface
418 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm_interface{
419 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
420 };
421
422 inline constexpr MatMulBlockwiseDynamicQuantInterface
423 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p8x4_1x8_sve_dotprod_interface{
424 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p8x4_1x8_sve_dotprod,
425 };
426 inline constexpr MatMulBlockwiseDynamicQuantInterface
427 kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_interface{
428 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod,
429 };
430 inline constexpr MatMulBlockwiseDynamicQuantInterface
431 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_interface{
432 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm,
433 };
434
435 // matmul_clamp_fp32_bf16p_bf16p
436 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_interface{
437 .run_matmul = kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
438 };
439
440 // matmul_clamp_qai8_qai8_qsi8cxp
441 inline constexpr MatMulStaticQuantInterface kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot_interface{
442 .run_matmul = kai_run_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot,
443 };
444
445 // matmul_clamp_qai8_qai8p_qsi8cxp
446 inline constexpr MatMulStaticQuantInterface kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa_interface{
447 .run_matmul = kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa,
448 };
449
450 inline constexpr MatMulStaticQuantInterface kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa_interface{
451 .run_matmul = kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa,
452 };
453
454 // matmul_clamp_bf16_qai8dxp_qsi4c32p
455 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
456 kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod_interface{
457 .run_matmul = kai_run_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod,
458 };
459
460 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
461 kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm_interface{
462 .run_matmul = kai_run_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm,
463 };
464
465 // matmul_clamp_bf16_qai8dxp_qsi4cxp
466 inline constexpr MatMulBaseInterface kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod_interface{
467 .run_matmul = kai_run_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod,
468 };
469
470 inline constexpr MatMulBaseInterface kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm_interface{
471 .run_matmul = kai_run_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm,
472 };
473
474 // matmul_clamp_f16_qai8dxp_qsi4cxp
475 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface{
476 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
477 };
478
479 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod_interface{
480 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod,
481 };
482
483 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod_interface{
484 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod,
485 };
486
487 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm_interface{
488 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm,
489 };
490
491 // matmul_clamp_f16_qai8dxp_qsi8cxp
492 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface{
493 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
494 };
495
496 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface{
497 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
498 };
499
500 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface{
501 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
502 };
503
504 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface{
505 .run_matmul = kai_run_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
506 };
507
508 // matmul_clamp_f16_qsi8d32p_qai4c32p
509 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
510 kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface{
511 .run_matmul = kai_run_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa,
512 };
513
514 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
515 kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface{
516 .run_matmul = kai_run_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot,
517 };
518
519 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
520 kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface{
521 .run_matmul = kai_run_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod,
522 };
523
524 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
525 kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface{
526 .run_matmul = kai_run_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod,
527 };
528
529 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
530 kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface{
531 .run_matmul = kai_run_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod,
532 };
533
534 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
535 kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface{
536 .run_matmul = kai_run_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm,
537 };
538
539 // matmul_clamp_f32_qsi8d32p_qai4c32p
540 inline constexpr MatMulBlockwiseDynamicQuantInterface
541 kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface{
542 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa,
543 };
544
545 inline constexpr MatMulBlockwiseDynamicQuantInterface
546 kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface{
547 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot,
548 };
549
550 inline constexpr MatMulBlockwiseDynamicQuantInterface
551 kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface{
552 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod,
553 };
554
555 inline constexpr MatMulBlockwiseDynamicQuantInterface
556 kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface{
557 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod,
558 };
559
560 inline constexpr MatMulBlockwiseDynamicQuantInterface
561 kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface{
562 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod,
563 };
564
565 inline constexpr MatMulBlockwiseDynamicQuantInterface
566 kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface{
567 .run_matmul = kai_run_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm,
568 };
569
570 36 inline const std::array matmul_benchmarks{
571 // matmul_clamp_f16_bf16p_bf16p
572
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
3204 RegisterBenchmark(
573
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla", kai_benchmark_matmul<MatMulBaseInterface>,
574 36 kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface, DataType::FP16, MatMulOp::GEMM,
575 test::cpu_has_bf16),
576
577 // matmul_clamp_f16_f16_f16p
578
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
579
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
580 36 kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla_interface, DataType::FP16, MatMulOp::GEMV, test::cpu_has_sme),
581
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
582
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot", kai_benchmark_matmul<MatMulStridedLhsInterface>,
583 36 kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot_interface, DataType::FP16, MatMulOp::GEMV,
584 test::cpu_has_sme2),
585
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
586
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
587 36 kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla_interface, DataType::FP16, MatMulOp::GEMM,
588 test::cpu_has_fp16),
589
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
590
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
591 36 kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_interface, DataType::FP16, MatMulOp::GEMM, test::cpu_has_fp16),
592
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
593
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55", kai_benchmark_matmul<MatMulStridedLhsInterface>,
594 36 kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55_interface, DataType::FP16, MatMulOp::GEMM,
595 test::cpu_has_fp16),
596
597 // matmul_clamp_f16_f16p_f16p
598
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
599
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
600 36 kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa_interface, DataType::FP16, MatMulOp::GEMM,
601 test::cpu_has_sme2),
602
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
603
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
604 36 kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa_interface, DataType::FP16, MatMulOp::GEMM,
605 test::cpu_has_sme),
606
607 // matmul_clamp_f32_bf16p_bf16p
608
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
609
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot", kai_benchmark_matmul<MatMulBaseInterface>,
610 36 kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot_interface, DataType::FP32, MatMulOp::GEMV,
611 test::cpu_has_dotprod),
612
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
613
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla", kai_benchmark_matmul<MatMulBaseInterface>,
614 36 kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface, DataType::FP32, MatMulOp::GEMM,
615 test::cpu_has_i8mm),
616
617 // matmul_clamp_f32_f32_f32p
618
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
619
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
620 36 kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla_interface, DataType::FP32, MatMulOp::GEMV,
621 test::cpu_has_sme2),
622
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
623
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
624 36 kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla_interface, DataType::FP32, MatMulOp::GEMV, test::cpu_has_sme),
625
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
626
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
627 36 kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_interface, DataType::FP32, MatMulOp::GEMM,
628 test::cpu_has_advsimd),
629
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
630
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
631 36 kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_interface, DataType::FP32, MatMulOp::GEMM,
632 test::cpu_has_advsimd),
633
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
634
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55", kai_benchmark_matmul<MatMulStridedLhsInterface>,
635 36 kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55_interface, DataType::FP32, MatMulOp::GEMM,
636 test::cpu_has_advsimd),
637
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
638
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
639 36 kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla_interface, DataType::FP32, MatMulOp::GEMV,
640 test::cpu_has_sme2),
641
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
642
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
643 36 kai_matmul_clamp_f32_f32_f32p4vlx1b_6x4vl_sve_mla_interface, DataType::FP32, MatMulOp::GEMM, test::cpu_has_sve),
644
645 // matmul_clamp_f32_f32p_f32p
646
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
647
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
648 36 kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
649 test::cpu_has_sme2),
650
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
651
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
652 36 kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa_interface, DataType::FP32, MatMulOp::GEMM,
653 test::cpu_has_sme),
654
655 // matmul_clamp_f32_qai8dxp_qsi4c32p
656
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
657
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod",
658 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
659 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
660 test::cpu_has_dotprod),
661
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
662
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod",
663 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
664 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
665 test::cpu_has_dotprod),
666
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
667
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod",
668 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
669 36 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
670 test::cpu_has_dotprod),
671
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
672
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod",
673 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
674 36 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
675 test::cpu_has_dotprod),
676
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
677
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod",
678 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
679 36 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
680 test::cpu_has_dotprod),
681
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
682
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod",
683 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
684 36 kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
685 test::cpu_has_dotprod),
686
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
687
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod",
688 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
689 36 kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
690 test::cpu_has_dotprod),
691
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
692
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm",
693 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
694 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
695 test::cpu_has_i8mm),
696
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
697
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm",
698 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
699 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
700 test::cpu_has_i8mm),
701
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
702
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm",
703 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
704 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
705 test::cpu_has_i8mm),
706
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
707
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm",
708 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
709 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
710 test::cpu_has_i8mm),
711
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
712
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4vlx4_1x4vl_sme2_dot",
713 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
714 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4vlx4_1x4vl_sme2_dot_interface, DataType::FP32, MatMulOp::GEMV,
715 test::cpu_has_sme2),
716
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
717
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa",
718 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
719 36 kai_matmul_clamp_f32_qai8dxp1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
720 test::cpu_has_sme2),
721
722 // matmul_clamp_f32_qai8dxp_qsi4cxp
723
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
724
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa", kai_benchmark_matmul<MatMulFloatInterface>,
725 36 kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
726 test::cpu_has_sme2),
727
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
728
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot", kai_benchmark_matmul<MatMulFloatInterface>,
729 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot_interface, DataType::FP32, MatMulOp::GEMV,
730 test::cpu_has_sme2),
731
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
732
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
733 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
734 test::cpu_has_dotprod),
735
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
736
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
737 36 kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
738 test::cpu_has_dotprod),
739
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
740
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
741 36 kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
742 test::cpu_has_dotprod),
743
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
744
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
745 36 kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
746 test::cpu_has_dotprod),
747
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
748
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
749 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
750 test::cpu_has_dotprod),
751
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
752
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
753 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
754 test::cpu_has_i8mm),
755
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
756
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
757 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
758 test::cpu_has_i8mm),
759
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
760
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
761 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
762 test::cpu_has_i8mm),
763
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
764
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
765 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
766 test::cpu_has_i8mm),
767
768 // matmul_clamp_f32_qai8dxp_qsi8cxp
769
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
770
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa", kai_benchmark_matmul<MatMulFloatInterface>,
771 36 kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa_interface, DataType::FP32, MatMulOp::GEMM,
772 test::cpu_has_sme),
773
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
774
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot", kai_benchmark_matmul<MatMulFloatInterface>,
775 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot_interface, DataType::FP32, MatMulOp::GEMV,
776 test::cpu_has_sme),
777
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
778
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
779 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
780 test::cpu_has_dotprod),
781
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
782
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
783 36 kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
784 test::cpu_has_dotprod),
785
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
786
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
787 36 kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
788 test::cpu_has_dotprod),
789
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
790
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
791 36 kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
792 test::cpu_has_i8mm),
793
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
794
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa", kai_benchmark_matmul<MatMulFloatInterface>,
795 36 kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
796 test::cpu_has_sme2),
797
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
798
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot", kai_benchmark_matmul<MatMulFloatInterface>,
799 36 kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_interface, DataType::FP32, MatMulOp::GEMV,
800 test::cpu_has_sme2),
801
802 // matmul_clamp_f32_qsi8d32p_qsi4c32p
803
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
804
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa",
805 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
806 36 kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
807 test::cpu_has_sme2),
808
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
809
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot",
810 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
811 36 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot_interface, DataType::FP32, MatMulOp::GEMV,
812 test::cpu_has_sme2),
813
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
814
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod",
815 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
816 36 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
817 test::cpu_has_dotprod),
818
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
819
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod",
820 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
821 36 kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
822 test::cpu_has_dotprod),
823
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
824
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod",
825 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
826 36 kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
827 test::cpu_has_dotprod),
828
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
829
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm",
830 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
831 36 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
832 test::cpu_has_i8mm),
833
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
834
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm",
835 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
836 36 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
837 test::cpu_has_i8mm),
838
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
839
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p8x4_1x8_sve_dotprod",
840 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
841 36 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p8x4_1x8_sve_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
842 (test::cpu_check<test::cpu_has_sve_vl256, test::cpu_has_dotprod>)),
843
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
844
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod",
845 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
846 36 kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
847 (test::cpu_check<test::cpu_has_sve_vl256, test::cpu_has_dotprod>)),
848
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
849
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm",
850 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
851 36 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
852 (test::cpu_check<test::cpu_has_sve_vl256, test::cpu_has_i8mm>)),
853
854 // matmul_clamp_fp32_bf16p_bf16p
855
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
856
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
857 36 kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
858 test::cpu_has_sme2),
859
860 // matmul_clamp_qai8_qai8_qsi8cxp
861
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
862
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot", kai_benchmark_matmul<MatMulStaticQuantInterface>,
863 36 kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot_interface, DataType::QAI8, MatMulOp::GEMV,
864 test::cpu_has_sme2),
865
866 // matmul_clamp_qai8_qai8p_qsi8cxp
867
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
868
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa",
869 kai_benchmark_matmul<MatMulStaticQuantInterface>,
870 36 kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa_interface, DataType::QAI8, MatMulOp::GEMM,
871 test::cpu_has_sme2),
872
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
873
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa",
874 kai_benchmark_matmul<MatMulStaticQuantInterface>,
875 36 kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa_interface, DataType::QAI8, MatMulOp::GEMM,
876 test::cpu_has_sme),
877
878 // matmul_clamp_bf16_qai8dxp_qsi4c32p
879
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
880
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod",
881 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
882 36 kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod_interface, DataType::BF16, MatMulOp::GEMV,
883 test::cpu_has_dotprod_and_bf16),
884
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
885
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm",
886 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
887 36 kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm_interface, DataType::BF16, MatMulOp::GEMM,
888 test::cpu_has_i8mm_and_bf16),
889
890 // matmul_clamp_bf16_qai8dxp_qsi4cxp
891
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
892
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
893 36 kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod_interface, DataType::BF16, MatMulOp::GEMV,
894 test::cpu_has_dotprod_and_bf16),
895
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
896
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm", kai_benchmark_matmul<MatMulBaseInterface>,
897 36 kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm_interface, DataType::BF16, MatMulOp::GEMM,
898 test::cpu_has_i8mm_and_bf16),
899
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
900
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
901 36 kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
902 test::cpu_has_dotprod_and_fp16),
903
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
904
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
905 36 kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
906 test::cpu_has_dotprod_and_fp16),
907
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
908
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
909 36 kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMM,
910 test::cpu_has_dotprod_and_fp16),
911
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
912
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm", kai_benchmark_matmul<MatMulBaseInterface>,
913 36 kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm_interface, DataType::FP16, MatMulOp::GEMM,
914 test::cpu_has_i8mm_and_fp16),
915
916 // matmul_clamp_f16_qai8dxp_qsi8cxp
917
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
918
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
919 36 kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
920 test::cpu_has_dotprod_and_fp16),
921
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
922
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
923 36 kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
924 test::cpu_has_dotprod_and_fp16),
925
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
926
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
927 36 kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMM,
928 test::cpu_has_dotprod_and_fp16),
929
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
930
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm", kai_benchmark_matmul<MatMulBaseInterface>,
931 36 kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface, DataType::FP16, MatMulOp::GEMM,
932 test::cpu_has_i8mm_and_fp16),
933
934 // matmul_clamp_f16_qsi8d32p_qai4c32p
935
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
936
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa",
937 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
938 36 kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP16, MatMulOp::GEMM,
939 test::cpu_has_sme2),
940
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
941
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot",
942 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
943 36 kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface, DataType::FP16, MatMulOp::GEMV,
944 test::cpu_has_sme2),
945
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
946
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod",
947 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
948 36 kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
949 test::cpu_has_dotprod_and_fp16),
950
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
951
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod",
952 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
953 36 kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
954 test::cpu_has_dotprod_and_fp16),
955
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
956
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod",
957 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
958 36 kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMM,
959 test::cpu_has_dotprod_and_fp16),
960
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
961
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm",
962 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
963 36 kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface, DataType::FP16, MatMulOp::GEMM,
964 test::cpu_has_i8mm_and_fp16),
965
966 // matmul_clamp_f32_qsi8d32p_qai4c32p
967
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
968
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa",
969 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
970 36 kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
971 test::cpu_has_sme2),
972
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
973
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot",
974 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
975 36 kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface, DataType::FP32, MatMulOp::GEMV,
976 test::cpu_has_sme2),
977
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
978
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod",
979 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
980 36 kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
981 test::cpu_has_dotprod),
982
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
983
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod",
984 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
985 36 kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
986 test::cpu_has_dotprod),
987
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
988
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod",
989 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
990 36 kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
991 test::cpu_has_dotprod),
992
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 RegisterBenchmark(
993
1/2
✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
36 "kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm",
994 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
995 36 kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
996 test::cpu_has_i8mm),
997
998 };
999
1000 12 void RegisterMatMulBenchmarks(const MatMulShape& shape, const size_t bl) {
1001
2/2
✓ Branch 0 taken 1068 times.
✓ Branch 1 taken 12 times.
1080 for (const auto& benchmark : matmul_benchmarks) {
1002
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
2136 benchmark
1003
2/2
✓ Branch 0 taken 356 times.
✓ Branch 1 taken 712 times.
1068 ->Args(
1004
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 712 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 712 times.
2136 {static_cast<int64_t>(shape.m), static_cast<int64_t>(shape.n), static_cast<int64_t>(shape.k),
1005 1068 static_cast<int64_t>(bl)})
1006
10/12
✓ Branch 0 taken 356 times.
✓ Branch 1 taken 712 times.
✓ Branch 2 taken 356 times.
✓ Branch 3 taken 712 times.
✓ Branch 4 taken 356 times.
✓ Branch 5 taken 712 times.
✓ Branch 6 taken 356 times.
✓ Branch 7 taken 712 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 1068 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 1068 times.
1068 ->ArgNames({"m", "n", "k", "bl"});
1007 1068 }
1008 12 }
1009 } // namespace kai::benchmark
1010