benchmark/imatmul/imatmul_registry.cpp
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // | ||
| 2 | // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 3 | // | ||
| 4 | // SPDX-License-Identifier: Apache-2.0 | ||
| 5 | // | ||
| 6 | |||
| 7 | #include "imatmul_registry.hpp" | ||
| 8 | |||
| 9 | #include <array> | ||
| 10 | #include <cstddef> | ||
| 11 | #include <cstdint> | ||
| 12 | #include <test/common/cpu_info.hpp> | ||
| 13 | #include <test/common/data_type.hpp> | ||
| 14 | |||
| 15 | #include "imatmul_benchmark_logic.hpp" | ||
| 16 | #include "imatmul_interface.hpp" | ||
| 17 | |||
| 18 | #ifdef __GNUC__ | ||
| 19 | #pragma GCC diagnostic push | ||
| 20 | #pragma GCC diagnostic ignored "-Wswitch-default" | ||
| 21 | #endif // __GNUC__ | ||
| 22 | |||
| 23 | #include <benchmark/benchmark.h> | ||
| 24 | |||
| 25 | #ifdef __GNUC__ | ||
| 26 | #pragma GCC diagnostic pop | ||
| 27 | #endif // __GNUC__ | ||
| 28 | |||
| 29 | // Micro-kernels to register for benchmarking | ||
| 30 | |||
| 31 | // imatmul_clamp_f16_f16p_f16p | ||
| 32 | #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h" | ||
| 33 | #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa.h" | ||
| 34 | |||
| 35 | // imatmul_clamp_f32_f32p_f32p | ||
| 36 | #include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h" | ||
| 37 | #include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h" | ||
| 38 | |||
| 39 | // imatmul_clamp_qai8_qai8p_qsi8cxp | ||
| 40 | #include "kai/ukernels/matmul/imatmul_clamp_qai8_qai8p_qsi8cxp/kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa.h" | ||
| 41 | #include "kai/ukernels/matmul/imatmul_clamp_qai8_qai8p_qsi8cxp/kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa.h" | ||
| 42 | |||
| 43 | namespace kai::benchmark { | ||
| 44 | using DataType = test::DataType; | ||
| 45 | |||
| 46 | // imatmul_clamp_f16_f16p_f16p | ||
| 47 | inline constexpr ImatmulBaseInterface kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa_interface{ | ||
| 48 | .run_imatmul = kai_run_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa, | ||
| 49 | }; | ||
| 50 | |||
| 51 | inline constexpr ImatmulBaseInterface kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa_interface{ | ||
| 52 | .run_imatmul = kai_run_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa, | ||
| 53 | }; | ||
| 54 | |||
| 55 | // imatmul_clamp_f32_f32p_f32p | ||
| 56 | inline constexpr ImatmulBaseInterface kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa_interface{ | ||
| 57 | .run_imatmul = kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa, | ||
| 58 | }; | ||
| 59 | |||
| 60 | inline constexpr ImatmulBaseInterface kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa_interface{ | ||
| 61 | .run_imatmul = kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa, | ||
| 62 | }; | ||
| 63 | |||
| 64 | // imatmul_clamp_qai8_qai8p_qsi8cxp | ||
| 65 | inline constexpr ImatmulStaticQuantInterface | ||
| 66 | kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa_interface{ | ||
| 67 | .run_imatmul = kai_run_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa, | ||
| 68 | }; | ||
| 69 | |||
| 70 | inline constexpr ImatmulStaticQuantInterface | ||
| 71 | kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa_interface{ | ||
| 72 | .run_imatmul = kai_run_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa, | ||
| 73 | }; | ||
| 74 | |||
| 75 | 36 | inline const std::array imatmul_benchmarks{ | |
| 76 | // imatmul_clamp_f16_f16p_f16p | ||
| 77 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
216 | RegisterBenchmark( |
| 78 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa", kai_benchmark_imatmul<ImatmulBaseInterface>, |
| 79 | 36 | kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa_interface, DataType::FP16, test::cpu_has_sme2), | |
| 80 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | RegisterBenchmark( |
| 81 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa", kai_benchmark_imatmul<ImatmulBaseInterface>, |
| 82 | 36 | kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa_interface, DataType::FP16, test::cpu_has_sme), | |
| 83 | |||
| 84 | // imatmul_clamp_f32_f32p_f32p | ||
| 85 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | RegisterBenchmark( |
| 86 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa", kai_benchmark_imatmul<ImatmulBaseInterface>, |
| 87 | 36 | kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa_interface, DataType::FP32, test::cpu_has_sme2), | |
| 88 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | RegisterBenchmark( |
| 89 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa", kai_benchmark_imatmul<ImatmulBaseInterface>, |
| 90 | 36 | kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa_interface, DataType::FP32, test::cpu_has_sme), | |
| 91 | |||
| 92 | // imatmul_clamp_qai8_qai8p_qsi8cxp | ||
| 93 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | RegisterBenchmark( |
| 94 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa", |
| 95 | kai_benchmark_imatmul<ImatmulStaticQuantInterface>, | ||
| 96 | 36 | kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa_interface, DataType::QAI8, test::cpu_has_sme), | |
| 97 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | RegisterBenchmark( |
| 98 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa", |
| 99 | kai_benchmark_imatmul<ImatmulStaticQuantInterface>, | ||
| 100 | 36 | kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa_interface, DataType::QAI8, | |
| 101 | test::cpu_has_sme2), | ||
| 102 | |||
| 103 | }; | ||
| 104 | |||
| 105 | 9 | void RegisteriMatMulBenchmarks(size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length) { | |
| 106 |
2/2✓ Branch 0 taken 54 times.
✓ Branch 1 taken 9 times.
|
63 | for (const auto& benchmark : imatmul_benchmarks) { |
| 107 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
108 | benchmark |
| 108 |
2/2✓ Branch 0 taken 18 times.
✓ Branch 1 taken 36 times.
|
54 | ->Args( |
| 109 |
2/4✗ Branch 0 not taken.
✓ Branch 1 taken 36 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 36 times.
|
108 | {static_cast<int64_t>(m), static_cast<int64_t>(n), static_cast<int64_t>(k_chunk_count), |
| 110 | 54 | static_cast<int64_t>(k_chunk_length)}) | |
| 111 |
10/12✓ Branch 0 taken 18 times.
✓ Branch 1 taken 36 times.
✓ Branch 2 taken 18 times.
✓ Branch 3 taken 36 times.
✓ Branch 4 taken 18 times.
✓ Branch 5 taken 36 times.
✓ Branch 6 taken 18 times.
✓ Branch 7 taken 36 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 54 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 54 times.
|
54 | ->ArgNames({"m", "n", "k_chunk_count", "k_chunk_length"}); |
| 112 | 54 | } | |
| 113 | 9 | } | |
| 114 | } // namespace kai::benchmark | ||
| 115 |