KleidiAI Coverage Report


Directory: ./
File: benchmark/matmul/matmul_registry.cpp
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 100.0% 9 0 9
Functions: 100.0% 1 0 1
Branches: 56.2% 9 0 16

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #include "matmul_registry.hpp"
8
9 #include <array>
10 #include <cstddef>
11 #include <cstdint>
12 #include <test/common/cpu_info.hpp>
13 #include <test/common/data_type.hpp>
14
15 #include "matmul_benchmark_logic.hpp"
16 #include "matmul_interface.hpp"
17
18 #ifdef __GNUC__
19 #pragma GCC diagnostic push
20 #pragma GCC diagnostic ignored "-Wswitch-default"
21 #endif // __GNUC__
22
23 #include <benchmark/benchmark.h>
24
25 #ifdef __GNUC__
26 #pragma GCC diagnostic pop
27 #endif // __GNUC__
28
29 // Micro-kernels to register for benchmarking
30
31 // matmul_clamp_f16_bf16p_bf16p
32 #include "kai/ukernels/matmul/matmul_clamp_f16_bf16p_bf16p/kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla.h"
33
34 // matmul_clamp_f16_f16_f16p
35 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h"
36 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot.h"
37 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla.h"
38 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla.h"
39 #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55.h"
40
41 // matmul_clamp_f16_f16p_f16p
42 #include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
43 #include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa.h"
44
45 // matmul_clamp_f32_bf16p_bf16p
46 #include "kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.h"
47 #include "kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.h"
48
49 // matmul_clamp_f32_f32_f32p
50 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla.h"
51 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla.h"
52 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55.h"
53 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"
54 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla.h"
55 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
56
57 // matmul_clamp_f32_f32p_f32p
58 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
59 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
60
61 // matmul_clamp_f32_qai8dxp_qsi4c32p
62 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
63 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.h"
64 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
65 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.h"
66 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.h"
67 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
68 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod.h"
69 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.h"
70 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h"
71 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm.h"
72 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm.h"
73
74 // matmul_clamp_f32_qai8dxp_qsi4cxp
75 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa.h"
76 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.h"
77 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h"
78 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h"
79 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h"
80 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod.h"
81 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h"
82 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h"
83 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h"
84 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h"
85 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h"
86
87 // matmul_clamp_f32_qai8dxp_qsi8cxp
88 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
89 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
90 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
91 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot.h"
92 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
93 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
94 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
95 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
96
97 // matmul_clamp_f32_qsi8d32p_qsi4c32p
98 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
99 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
100 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
101 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
102 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
103 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
104 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h"
105
106 // matmul_clamp_fp32_bf16p_bf16p
107 #include "kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
108
109 // matmul_clamp_qai8_qai8_qsi8cxp
110 #include "kai/ukernels/matmul/matmul_clamp_qai8_qai8_qsi8cxp/kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot.h"
111
112 // matmul_clamp_qai8_qai8p_qsi8cxp
113 #include "kai/ukernels/matmul/matmul_clamp_qai8_qai8p_qsi8cxp/kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa.h"
114 #include "kai/ukernels/matmul/matmul_clamp_qai8_qai8p_qsi8cxp/kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa.h"
115
116 // matmul_clamp_f16_qai8dxp_qsi4cxp
117 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h"
118 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod.h"
119 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod.h"
120 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm.h"
121
122 // matmul_clamp_f16_qai8dxp_qsi8cxp
123 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
124 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
125 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
126 #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
127
128 // matmul_clamp_f16_qsi8d32p_qai4c32p
129 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa.h"
130 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot.h"
131 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod.h"
132 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h"
133 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod.h"
134 #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h"
135
136 // matmul_clamp_f32_qsi8d32p_qai4c32p
137 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa.h"
138 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot.h"
139 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod.h"
140 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h"
141 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod.h"
142 #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h"
143
144 // matmul_clamp_bf16_qai8dxp_qsi4c32p
145 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4c32p/kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod.h"
146 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4c32p/kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
147
148 // matmul_clamp_bf16_qai8dxp_qsi4cxp
149 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4cxp/kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod.h"
150 #include "kai/ukernels/matmul/matmul_clamp_bf16_qai8dxp_qsi4cxp/kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm.h"
151
152 namespace kai::benchmark {
153 using DataType = test::DataType;  // Local alias so test::DataType enumerators can be written as DataType::X here.
154
155 // matmul_clamp_f16_bf16p_bf16p: interface object storing the kernel's kai_run_* function in run_matmul.
156 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface = {
157     .run_matmul = &kai_run_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla,
158 };
159
160 // matmul_clamp_f16_f16_f16p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
161 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla_interface = {
162     .run_matmul = &kai_run_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla,
163 };
164
165 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot_interface = {
166     .run_matmul = &kai_run_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot,
167 };
168
169 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla_interface = {
170     .run_matmul = &kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
171 };
172
173 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_interface = {
174     .run_matmul = &kai_run_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla,
175 };
176
177 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55_interface = {
178     .run_matmul = &kai_run_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55,
179 };
180
181 // matmul_clamp_f16_f16p_f16p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
182 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa_interface = {
183     .run_matmul = &kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa,
184 };
185
186 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa_interface = {
187     .run_matmul = &kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa,
188 };
189
190 // matmul_clamp_f32_bf16p_bf16p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
191 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot_interface = {
192     .run_matmul = &kai_run_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot,
193 };
194
195 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface = {
196     .run_matmul = &kai_run_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla,
197 };
198
199 // matmul_clamp_f32_f32_f32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
200 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla_interface = {
201     .run_matmul = &kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
202 };
203
204 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla_interface = {
205     .run_matmul = &kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla,
206 };
207
208 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_interface = {
209     .run_matmul = &kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla,
210 };
211
212 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_interface = {
213     .run_matmul = &kai_run_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla,
214 };
215
216 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55_interface = {
217     .run_matmul = &kai_run_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55,
218 };
219
220 inline constexpr MatMulStridedLhsInterface kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla_interface = {
221     .run_matmul = &kai_run_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla,
222 };
223
224 // matmul_clamp_f32_f32p_f32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
225 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa_interface = {
226     .run_matmul = &kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
227 };
228
229 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa_interface = {
230     .run_matmul = &kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa,
231 };
232
233 // matmul_clamp_f32_qai8dxp_qsi4c32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
234 inline constexpr MatMulBlockwiseDynamicQuantInterface
235     kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod_interface = {
236         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
237 };
238
239 inline constexpr MatMulBlockwiseDynamicQuantInterface
240     kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod_interface = {
241         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod,
242 };
243
244 inline constexpr MatMulBlockwiseDynamicQuantInterface
245     kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface = {
246         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
247 };
248
249 inline constexpr MatMulBlockwiseDynamicQuantInterface
250     kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod_interface = {
251         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod,
252 };
253 inline constexpr MatMulBlockwiseDynamicQuantInterface
254     kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod_interface = {
255         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod,
256 };
257
258 inline constexpr MatMulBlockwiseDynamicQuantInterface
259     kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod_interface = {
260         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod,
261 };
262
263 inline constexpr MatMulBlockwiseDynamicQuantInterface
264     kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod_interface = {
265         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod,
266 };
267
268 inline constexpr MatMulBlockwiseDynamicQuantInterface
269     kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface = {
270         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm,
271 };
272
273 inline constexpr MatMulBlockwiseDynamicQuantInterface
274     kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm_interface = {
275         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm,
276 };
277
278 inline constexpr MatMulBlockwiseDynamicQuantInterface
279     kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm_interface = {
280         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm,
281 };
282
283 inline constexpr MatMulBlockwiseDynamicQuantInterface
284     kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm_interface = {
285         .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm,
286 };
287
288 // matmul_clamp_f32_qai8dxp_qsi4cxp: one interface object per kernel, each storing its kai_run_* function in run_matmul.
289 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa_interface = {
290     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa,
291 };
292
293 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot_interface = {
294     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot,
295 };
296
297 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface = {
298     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
299 };
300
301 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod_interface = {
302     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod,
303 };
304
305 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod_interface = {
306     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod,
307 };
308
309 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod_interface = {
310     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod,
311 };
312
313 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod_interface = {
314     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
315 };
316
317 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm_interface = {
318     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm,
319 };
320
321 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm_interface = {
322     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm,
323 };
324
325 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm_interface = {
326     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm,
327 };
328
329 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm_interface = {
330     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm,
331 };
332
333 // matmul_clamp_f32_qai8dxp_qsi8cxp: one interface object per kernel, each storing its kai_run_* function in run_matmul.
334 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa_interface = {
335     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
336 };
337
338 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot_interface = {
339     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot,
340 };
341
342 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface = {
343     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
344 };
345
346 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface = {
347     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
348 };
349
350 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface = {
351     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
352 };
353
354 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface = {
355     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
356 };
357
358 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_interface = {
359     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
360 };
361
362 inline constexpr MatMulFloatInterface kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_interface = {
363     .run_matmul = &kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
364 };
365
366 // matmul_clamp_f32_qsi8d32p_qsi4c32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
367 inline constexpr MatMulBlockwiseDynamicQuantInterface
368     kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa_interface = {
369         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa,
370 };
371
372 inline constexpr MatMulBlockwiseDynamicQuantInterface
373     kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot_interface = {
374         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
375 };
376
377 inline constexpr MatMulBlockwiseDynamicQuantInterface
378     kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod_interface = {
379         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod,
380 };
381
382 inline constexpr MatMulBlockwiseDynamicQuantInterface
383     kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface = {
384         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod,
385 };
386
387 inline constexpr MatMulBlockwiseDynamicQuantInterface
388     kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod_interface = {
389         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod,
390 };
391
392 inline constexpr MatMulBlockwiseDynamicQuantInterface
393     kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface = {
394         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm,
395 };
396
397 inline constexpr MatMulBlockwiseDynamicQuantInterface
398     kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm_interface = {
399         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm,
400 };
401
402 // matmul_clamp_fp32_bf16p_bf16p: interface object storing the kernel's kai_run_* function in run_matmul.
403 inline constexpr MatMulBaseInterface kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_interface = {
404     .run_matmul = &kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
405 };
406
407 // matmul_clamp_qai8_qai8_qsi8cxp: interface object storing the kernel's kai_run_* function in run_matmul.
408 inline constexpr MatMulStaticQuantInterface kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot_interface = {
409     .run_matmul = &kai_run_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot,
410 };
411
412 // matmul_clamp_qai8_qai8p_qsi8cxp: one interface object per kernel, each storing its kai_run_* function in run_matmul.
413 inline constexpr MatMulStaticQuantInterface kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa_interface = {
414     .run_matmul = &kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa,
415 };
416
417 inline constexpr MatMulStaticQuantInterface kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa_interface = {
418     .run_matmul = &kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa,
419 };
420
421 // matmul_clamp_bf16_qai8dxp_qsi4c32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
422 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
423     kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod_interface = {
424         .run_matmul = &kai_run_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod,
425 };
426
427 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
428     kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm_interface = {
429         .run_matmul = &kai_run_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm,
430 };
431
432 // matmul_clamp_bf16_qai8dxp_qsi4cxp: one interface object per kernel, each storing its kai_run_* function in run_matmul.
433 inline constexpr MatMulBaseInterface kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod_interface = {
434     .run_matmul = &kai_run_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod,
435 };
436
437 inline constexpr MatMulBaseInterface kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm_interface = {
438     .run_matmul = &kai_run_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm,
439 };
440
441 // matmul_clamp_f16_qai8dxp_qsi4cxp: one interface object per kernel, each storing its kai_run_* function in run_matmul.
442 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface = {
443     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
444 };
445
446 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod_interface = {
447     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod,
448 };
449
450 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod_interface = {
451     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod,
452 };
453
454 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm_interface = {
455     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm,
456 };
457
458 // matmul_clamp_f16_qai8dxp_qsi8cxp: one interface object per kernel, each storing its kai_run_* function in run_matmul.
459 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface = {
460     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
461 };
462
463 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface = {
464     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
465 };
466
467 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface = {
468     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
469 };
470
471 inline constexpr MatMulBaseInterface kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface = {
472     .run_matmul = &kai_run_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
473 };
474
475 // matmul_clamp_f16_qsi8d32p_qai4c32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
476 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
477     kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface = {
478         .run_matmul = &kai_run_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa,
479 };
480
481 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
482     kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface = {
483         .run_matmul = &kai_run_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot,
484 };
485
486 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
487     kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface = {
488         .run_matmul = &kai_run_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod,
489 };
490
491 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
492     kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface = {
493         .run_matmul = &kai_run_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod,
494 };
495
496 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
497     kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface = {
498         .run_matmul = &kai_run_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod,
499 };
500
501 inline constexpr MatMulBlockwiseDynamicQuantGenericDstInterface
502     kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface = {
503         .run_matmul = &kai_run_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm,
504 };
505
506 // matmul_clamp_f32_qsi8d32p_qai4c32p: one interface object per kernel, each storing its kai_run_* function in run_matmul.
507 inline constexpr MatMulBlockwiseDynamicQuantInterface
508     kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface = {
509         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa,
510 };
511
512 inline constexpr MatMulBlockwiseDynamicQuantInterface
513     kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface = {
514         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot,
515 };
516
517 inline constexpr MatMulBlockwiseDynamicQuantInterface
518     kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface = {
519         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod,
520 };
521
522 inline constexpr MatMulBlockwiseDynamicQuantInterface
523     kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface = {
524         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod,
525 };
526
527 inline constexpr MatMulBlockwiseDynamicQuantInterface
528     kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface = {
529         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod,
530 };
531
532 inline constexpr MatMulBlockwiseDynamicQuantInterface
533     kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface = {
534         .run_matmul = &kai_run_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm,
535 };
536
// Table of every matmul micro-kernel benchmark known to this file.
//
// Each entry is produced by RegisterBenchmark() from, in order:
//   1. the benchmark name (the micro-kernel name),
//   2. the kai_benchmark_matmul<> instantiation for the kernel's interface type,
//   3. the kernel's *_interface object,
//   4. a DataType tag,
//   5. the MatMulOp kind (GEMM or GEMV),
//   6. a test::cpu_has_* predicate naming the CPU feature(s) the kernel needs
//      (presumably used by the benchmark logic to skip kernels the host cannot run — confirm in
//      matmul_benchmark_logic.hpp).
//
// Entries are grouped by micro-kernel family; each group is introduced by a comment with the family name.
// RegisterMatMulBenchmarks() iterates over this table to attach the problem shape.
537 inline const std::array matmul_benchmarks{
538 // matmul_clamp_f16_bf16p_bf16p
539 RegisterBenchmark(
540 "kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla", kai_benchmark_matmul<MatMulBaseInterface>,
541 kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface, DataType::FP16, MatMulOp::GEMM,
542 test::cpu_has_bf16),
543
544 // matmul_clamp_f16_f16_f16p
545 RegisterBenchmark(
546 "kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
547 kai_matmul_clamp_f16_f16_f16p2vlx2b_1x8vl_sme_mla_interface, DataType::FP16, MatMulOp::GEMV, test::cpu_has_sme),
548 RegisterBenchmark(
549 "kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot", kai_benchmark_matmul<MatMulStridedLhsInterface>,
550 kai_matmul_clamp_f16_f16_f16p2vlx2b_1x16vl_sme2_dot_interface, DataType::FP16, MatMulOp::GEMV,
551 test::cpu_has_sme2),
552 RegisterBenchmark(
553 "kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
554 kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla_interface, DataType::FP16, MatMulOp::GEMM,
555 test::cpu_has_fp16),
556 RegisterBenchmark(
557 "kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
558 kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_interface, DataType::FP16, MatMulOp::GEMM, test::cpu_has_fp16),
559 RegisterBenchmark(
560 "kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55", kai_benchmark_matmul<MatMulStridedLhsInterface>,
561 kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla_cortexa55_interface, DataType::FP16, MatMulOp::GEMM,
562 test::cpu_has_fp16),
563
564 // matmul_clamp_f16_f16p_f16p
565 RegisterBenchmark(
566 "kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
567 kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa_interface, DataType::FP16, MatMulOp::GEMM,
568 test::cpu_has_sme2),
569 RegisterBenchmark(
570 "kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
571 kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa_interface, DataType::FP16, MatMulOp::GEMM,
572 test::cpu_has_sme),
573
574 // matmul_clamp_f32_bf16p_bf16p
// These kernels consume BF16-packed operands; BFDOT/BFMMLA are part of FEAT_BF16, so they are gated on
// cpu_has_bf16 (matching the f16-output bf16p mmla kernel above) rather than on dotprod/i8mm.
575 RegisterBenchmark(
576 "kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot", kai_benchmark_matmul<MatMulBaseInterface>,
577 kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot_interface, DataType::FP32, MatMulOp::GEMV,
578 test::cpu_has_bf16),
579 RegisterBenchmark(
580 "kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla", kai_benchmark_matmul<MatMulBaseInterface>,
581 kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla_interface, DataType::FP32, MatMulOp::GEMM,
582 test::cpu_has_bf16),
583
584 // matmul_clamp_f32_f32_f32p
585 RegisterBenchmark(
586 "kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
587 kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla_interface, DataType::FP32, MatMulOp::GEMV,
588 test::cpu_has_sme2),
589 RegisterBenchmark(
590 "kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
591 kai_matmul_clamp_f32_f32_f32p2vlx1b_1x8vl_sme_mla_interface, DataType::FP32, MatMulOp::GEMV, test::cpu_has_sme),
592 RegisterBenchmark(
593 "kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
594 kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_interface, DataType::FP32, MatMulOp::GEMM,
595 test::cpu_has_advsimd),
596 RegisterBenchmark(
597 "kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
598 kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_interface, DataType::FP32, MatMulOp::GEMM,
599 test::cpu_has_advsimd),
600 RegisterBenchmark(
601 "kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55", kai_benchmark_matmul<MatMulStridedLhsInterface>,
602 kai_matmul_clamp_f32_f32_f32p16x1b_6x16_neon_mla_cortexa55_interface, DataType::FP32, MatMulOp::GEMM,
603 test::cpu_has_advsimd),
604 RegisterBenchmark(
605 "kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla", kai_benchmark_matmul<MatMulStridedLhsInterface>,
606 kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla_interface, DataType::FP32, MatMulOp::GEMV,
607 test::cpu_has_sme2),
608
609 // matmul_clamp_f32_f32p_f32p
610 RegisterBenchmark(
611 "kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
612 kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
613 test::cpu_has_sme2),
614 RegisterBenchmark(
615 "kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
616 kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa_interface, DataType::FP32, MatMulOp::GEMM,
617 test::cpu_has_sme),
618
619 // matmul_clamp_f32_qai8dxp_qsi4c32p
620 RegisterBenchmark(
621 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod",
622 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
623 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
624 test::cpu_has_dotprod),
625 RegisterBenchmark(
626 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod",
627 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
628 kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
629 test::cpu_has_dotprod),
630 RegisterBenchmark(
631 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod",
632 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
633 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
634 test::cpu_has_dotprod),
635 RegisterBenchmark(
636 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod",
637 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
638 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
639 test::cpu_has_dotprod),
640 RegisterBenchmark(
641 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod",
642 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
643 kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
644 test::cpu_has_dotprod),
645 RegisterBenchmark(
646 "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod",
647 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
648 kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
649 test::cpu_has_dotprod),
650 RegisterBenchmark(
651 "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod",
652 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
653 kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
654 test::cpu_has_dotprod),
655 RegisterBenchmark(
656 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm",
657 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
658 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
659 test::cpu_has_i8mm),
660 RegisterBenchmark(
661 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm",
662 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
663 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
664 test::cpu_has_i8mm),
665 RegisterBenchmark(
666 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm",
667 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
668 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
669 test::cpu_has_i8mm),
670 RegisterBenchmark(
671 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm",
672 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
673 kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
674 test::cpu_has_i8mm),
675
676 // matmul_clamp_f32_qai8dxp_qsi4cxp
677 RegisterBenchmark(
678 "kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa", kai_benchmark_matmul<MatMulFloatInterface>,
679 kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
680 test::cpu_has_sme2),
681 RegisterBenchmark(
682 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot", kai_benchmark_matmul<MatMulFloatInterface>,
683 kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot_interface, DataType::FP32, MatMulOp::GEMV,
684 test::cpu_has_sme2),
685 RegisterBenchmark(
686 "kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
687 kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
688 test::cpu_has_dotprod),
689 RegisterBenchmark(
690 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
691 kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
692 test::cpu_has_dotprod),
693 RegisterBenchmark(
694 "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
695 kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
696 test::cpu_has_dotprod),
697 RegisterBenchmark(
698 "kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
699 kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
700 test::cpu_has_dotprod),
701 RegisterBenchmark(
702 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
703 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
704 test::cpu_has_dotprod),
705 RegisterBenchmark(
706 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
707 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
708 test::cpu_has_i8mm),
709 RegisterBenchmark(
710 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
711 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
712 test::cpu_has_i8mm),
713 RegisterBenchmark(
714 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
715 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
716 test::cpu_has_i8mm),
717 RegisterBenchmark(
718 "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
719 kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
720 test::cpu_has_i8mm),
721
722 // matmul_clamp_f32_qai8dxp_qsi8cxp
723 RegisterBenchmark(
724 "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa", kai_benchmark_matmul<MatMulFloatInterface>,
725 kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa_interface, DataType::FP32, MatMulOp::GEMM,
726 test::cpu_has_sme),
727 RegisterBenchmark(
728 "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot", kai_benchmark_matmul<MatMulFloatInterface>,
729 kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme_dot_interface, DataType::FP32, MatMulOp::GEMV,
730 test::cpu_has_sme),
731 RegisterBenchmark(
732 "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
733 kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
734 test::cpu_has_dotprod),
735 RegisterBenchmark(
736 "kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
737 kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
738 test::cpu_has_dotprod),
739 RegisterBenchmark(
740 "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod", kai_benchmark_matmul<MatMulFloatInterface>,
741 kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
742 test::cpu_has_dotprod),
743 RegisterBenchmark(
744 "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm", kai_benchmark_matmul<MatMulFloatInterface>,
745 kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
746 test::cpu_has_i8mm),
747 RegisterBenchmark(
748 "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa", kai_benchmark_matmul<MatMulFloatInterface>,
749 kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
750 test::cpu_has_sme2),
751 RegisterBenchmark(
752 "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot", kai_benchmark_matmul<MatMulFloatInterface>,
753 kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_interface, DataType::FP32, MatMulOp::GEMV,
754 test::cpu_has_sme2),
755
756 // matmul_clamp_f32_qsi8d32p_qsi4c32p
757 RegisterBenchmark(
758 "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa",
759 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
760 kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
761 test::cpu_has_sme2),
762 RegisterBenchmark(
763 "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot",
764 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
765 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot_interface, DataType::FP32, MatMulOp::GEMV,
766 test::cpu_has_sme2),
767 RegisterBenchmark(
768 "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod",
769 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
770 kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
771 test::cpu_has_dotprod),
772 RegisterBenchmark(
773 "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod",
774 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
775 kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
776 test::cpu_has_dotprod),
777 RegisterBenchmark(
778 "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod",
779 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
780 kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
781 test::cpu_has_dotprod),
782 RegisterBenchmark(
783 "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm",
784 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
785 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
786 test::cpu_has_i8mm),
787 RegisterBenchmark(
788 "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm",
789 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
790 kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
791 test::cpu_has_i8mm),
792
793 // matmul_clamp_fp32_bf16p_bf16p
794 RegisterBenchmark(
795 "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa", kai_benchmark_matmul<MatMulBaseInterface>,
796 kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
797 test::cpu_has_sme2),
798
799 // matmul_clamp_qai8_qai8_qsi8cxp
800 RegisterBenchmark(
801 "kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot", kai_benchmark_matmul<MatMulStaticQuantInterface>,
802 kai_matmul_clamp_qai8_qai8_qsi8cxp2vlx4sb_1x16vl_sme2_dot_interface, DataType::QAI8, MatMulOp::GEMV,
803 test::cpu_has_sme2),
804
805 // matmul_clamp_qai8_qai8p_qsi8cxp
806 RegisterBenchmark(
807 "kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa",
808 kai_benchmark_matmul<MatMulStaticQuantInterface>,
809 kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa_interface, DataType::QAI8, MatMulOp::GEMM,
810 test::cpu_has_sme2),
811 RegisterBenchmark(
812 "kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa",
813 kai_benchmark_matmul<MatMulStaticQuantInterface>,
814 kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa_interface, DataType::QAI8, MatMulOp::GEMM,
815 test::cpu_has_sme),
816
817 // matmul_clamp_bf16_qai8dxp_qsi4c32p
818 RegisterBenchmark(
819 "kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod",
820 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
821 kai_matmul_clamp_bf16_qai8dxp1x8_qsi4c32p4x8_1x4_neon_dotprod_interface, DataType::BF16, MatMulOp::GEMV,
822 test::cpu_has_dotprod_and_bf16),
823 RegisterBenchmark(
824 "kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm",
825 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
826 kai_matmul_clamp_bf16_qai8dxp4x8_qsi4c32p4x8_16x4_neon_i8mm_interface, DataType::BF16, MatMulOp::GEMM,
827 test::cpu_has_i8mm_and_bf16),
828
829 // matmul_clamp_bf16_qai8dxp_qsi4cxp
830 RegisterBenchmark(
831 "kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
832 kai_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod_interface, DataType::BF16, MatMulOp::GEMV,
833 test::cpu_has_dotprod_and_bf16),
834 RegisterBenchmark(
835 "kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm", kai_benchmark_matmul<MatMulBaseInterface>,
836 kai_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm_interface, DataType::BF16, MatMulOp::GEMM,
837 test::cpu_has_i8mm_and_bf16),
// matmul_clamp_f16_qai8dxp_qsi4cxp
838 RegisterBenchmark(
839 "kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
840 kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
841 test::cpu_has_dotprod_and_fp16),
842 RegisterBenchmark(
843 "kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
844 kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
845 test::cpu_has_dotprod_and_fp16),
846 RegisterBenchmark(
847 "kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
848 kai_matmul_clamp_f16_qai8dxp4x4_qsi4cxp4x4_16x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMM,
849 test::cpu_has_dotprod_and_fp16),
850 RegisterBenchmark(
851 "kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm", kai_benchmark_matmul<MatMulBaseInterface>,
852 kai_matmul_clamp_f16_qai8dxp4x8_qsi4cxp4x8_16x4_neon_i8mm_interface, DataType::FP16, MatMulOp::GEMM,
853 test::cpu_has_i8mm_and_fp16),
854
855 // matmul_clamp_f16_qai8dxp_qsi8cxp
856 RegisterBenchmark(
857 "kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
858 kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
859 test::cpu_has_dotprod_and_fp16),
860 RegisterBenchmark(
861 "kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
862 kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
863 test::cpu_has_dotprod_and_fp16),
864 RegisterBenchmark(
865 "kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod", kai_benchmark_matmul<MatMulBaseInterface>,
866 kai_matmul_clamp_f16_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMM,
867 test::cpu_has_dotprod_and_fp16),
868 RegisterBenchmark(
869 "kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm", kai_benchmark_matmul<MatMulBaseInterface>,
870 kai_matmul_clamp_f16_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm_interface, DataType::FP16, MatMulOp::GEMM,
871 test::cpu_has_i8mm_and_fp16),
872
873 // matmul_clamp_f16_qsi8d32p_qai4c32p
874 RegisterBenchmark(
875 "kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa",
876 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
877 kai_matmul_clamp_f16_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP16, MatMulOp::GEMM,
878 test::cpu_has_sme2),
879 RegisterBenchmark(
880 "kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot",
881 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
882 kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface, DataType::FP16, MatMulOp::GEMV,
883 test::cpu_has_sme2),
884 RegisterBenchmark(
885 "kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod",
886 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
887 kai_matmul_clamp_f16_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
888 test::cpu_has_dotprod_and_fp16),
889 RegisterBenchmark(
890 "kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod",
891 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
892 kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMV,
893 test::cpu_has_dotprod_and_fp16),
894 RegisterBenchmark(
895 "kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod",
896 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
897 kai_matmul_clamp_f16_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface, DataType::FP16, MatMulOp::GEMM,
898 test::cpu_has_dotprod_and_fp16),
899 RegisterBenchmark(
900 "kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm",
901 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantGenericDstInterface>,
902 kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface, DataType::FP16, MatMulOp::GEMM,
903 test::cpu_has_i8mm_and_fp16),
904
905 // matmul_clamp_f32_qsi8d32p_qai4c32p
906 RegisterBenchmark(
907 "kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa",
908 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
909 kai_matmul_clamp_f32_qsi8d32p1vlx4_qai4c32p4vlx4_1vlx4vl_sme2_mopa_interface, DataType::FP32, MatMulOp::GEMM,
910 test::cpu_has_sme2),
911 RegisterBenchmark(
912 "kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot",
913 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
914 kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4vlx4_1x4vl_sme2_dot_interface, DataType::FP32, MatMulOp::GEMV,
915 test::cpu_has_sme2),
916 RegisterBenchmark(
917 "kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod",
918 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
919 kai_matmul_clamp_f32_qsi8d32p1x4_qai4c32p4x4_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
920 test::cpu_has_dotprod),
921 RegisterBenchmark(
922 "kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod",
923 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
924 kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMV,
925 test::cpu_has_dotprod),
926 RegisterBenchmark(
927 "kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod",
928 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
929 kai_matmul_clamp_f32_qsi8d32p4x4_qai4c32p4x4_8x4_neon_dotprod_interface, DataType::FP32, MatMulOp::GEMM,
930 test::cpu_has_dotprod),
931 RegisterBenchmark(
932 "kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm",
933 kai_benchmark_matmul<MatMulBlockwiseDynamicQuantInterface>,
934 kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm_interface, DataType::FP32, MatMulOp::GEMM,
935 test::cpu_has_i8mm),
936
937 };
938
// Attaches one benchmark configuration to every entry of `matmul_benchmarks`.
//
// For each registered benchmark, four int64_t arguments are supplied through Args() — shape.m, shape.n,
// shape.k and bl, in that order — and labelled "m", "n", "k" and "bl" through ArgNames().
//
// shape: problem dimensions; only its m, n and k members are read here.
// bl:    fourth benchmark argument, reported as "bl" (presumably the quantization block length — confirm
//        against the benchmark logic).
939 1 void RegisterMatMulBenchmarks(const MatMulShape& shape, const size_t bl) {
940

2/2
✓ Branch 0 taken 83 times.
✓ Branch 1 taken 1 time.
84 for (const auto& benchmark : matmul_benchmarks) {
941 166 benchmark
942

1/2
✓ Branch 0 taken 83 times.
✗ Branch 1 not taken.
83 ->Args(
943 166 {static_cast<int64_t>(shape.m), static_cast<int64_t>(shape.n), static_cast<int64_t>(shape.k),
944 83 static_cast<int64_t>(bl)})
945

6/12
✓ Branch 0 taken 83 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 83 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 83 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 83 times.
✗ Branch 7 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 83 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 83 times.
83 ->ArgNames({"m", "n", "k", "bl"});
946 83 }
947 1 }
948 } // namespace kai::benchmark
949