benchmark/dwconv/dwconv_registry.cpp
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // | ||
| 2 | // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 3 | // | ||
| 4 | // SPDX-License-Identifier: Apache-2.0 | ||
| 5 | // | ||
| 6 | |||
| 7 | #include "dwconv_registry.hpp" | ||
| 8 | |||
| 9 | #include <array> | ||
| 10 | #include <cstddef> | ||
| 11 | #include <cstdint> | ||
| 12 | #include <optional> | ||
| 13 | #include <test/common/cpu_info.hpp> | ||
| 14 | #include <test/common/data_type.hpp> | ||
| 15 | |||
| 16 | #include "dwconv_benchmark_logic.hpp" | ||
| 17 | #include "dwconv_interface.hpp" | ||
| 18 | |||
| 19 | #ifdef __GNUC__ | ||
| 20 | #pragma GCC diagnostic push | ||
| 21 | #pragma GCC diagnostic ignored "-Wswitch-default" | ||
| 22 | #endif // __GNUC__ | ||
| 23 | |||
| 24 | #include <benchmark/benchmark.h> | ||
| 25 | |||
| 26 | #ifdef __GNUC__ | ||
| 27 | #pragma GCC diagnostic pop | ||
| 28 | #endif // __GNUC__ | ||
| 29 | |||
| 30 | // Micro-kernels to register for benchmarking | ||
| 31 | #include "kai/ukernels/dwconv/dwconv_f32_f32_f32p/kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla.h" | ||
| 32 | #include "kai/ukernels/dwconv/pack/kai_rhs_dwconv_pack_x32p1vlx1b_x32_x32_sme.h" | ||
| 33 | |||
| 34 | namespace kai::benchmark { | ||
| 35 | using DataType = test::DataType; | ||
| 36 | |||
| 37 | // Build interface + traits + RHS config for the packed FP32 kernel | ||
| 38 | inline constexpr DwConvPackedFloatInterface kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla_iface{ | ||
| 39 | .run_dwconv = kai_run_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 40 | }; | ||
| 41 | |||
| 42 | struct DwConvBenchmarkCase { | ||
| 43 | ::benchmark::internal::Benchmark* benchmark; | ||
| 44 | const DwConvTraits* traits; | ||
| 45 | }; | ||
| 46 | |||
| 47 | inline constexpr DwConvRhsConfig kai_dwconv_packed_fp32_rhs_cfg{ | ||
| 48 | .layout = DwConvRhsLayout::Packed, | ||
| 49 | .weights_elem_bits = 32, | ||
| 50 | .bias_elem_bits = 32, | ||
| 51 | .get_packed_rhs_size = kai_rhs_get_dst_size_dwconv_pack_x32p1vlx1b_x32_x32_sme, | ||
| 52 | }; | ||
| 53 | |||
| 54 | // Helper function to bundle traits | ||
| 55 | template < | ||
| 56 | typename GetMStep, typename GetFilterHeight, typename GetFilterWidth, typename GetKr, typename GetDstSize, | ||
| 57 | typename GetDstOffset, typename GetSrcOffset> | ||
| 58 | constexpr DwConvTraits BundleTraits( | ||
| 59 | GetMStep get_m_step, GetFilterHeight get_filter_height, GetFilterWidth get_filter_width, GetKr get_kr, | ||
| 60 | GetDstSize get_dst_size, GetDstOffset get_dst_offset, GetSrcOffset get_src_offset) { | ||
| 61 | DwConvTraits traits{}; | ||
| 62 | traits.get_m_step = get_m_step; | ||
| 63 | traits.get_filter_height = get_filter_height; | ||
| 64 | traits.get_filter_width = get_filter_width; | ||
| 65 | traits.get_kr = get_kr; | ||
| 66 | traits.get_dst_size = get_dst_size; | ||
| 67 | traits.get_dst_offset = get_dst_offset; | ||
| 68 | traits.get_src_offset = get_src_offset; | ||
| 69 | return traits; | ||
| 70 | } | ||
| 71 | |||
| 72 | // Usage: declare traits for the kernel | ||
| 73 | inline constexpr DwConvTraits kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla_traits = BundleTraits( | ||
| 74 | kai_get_m_step_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 75 | kai_get_filter_height_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 76 | kai_get_filter_width_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 77 | kai_get_kr_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 78 | kai_get_dst_size_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 79 | kai_get_dst_offset_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla, | ||
| 80 | kai_get_src_offset_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla); | ||
| 81 | |||
| 82 | 36 | inline std::array<DwConvBenchmarkCase, 1> dwconv_benchmarks{{ | |
| 83 | 36 | { | |
| 84 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | ::benchmark::RegisterBenchmark( |
| 85 |
1/2✓ Branch 0 taken 36 times.
✗ Branch 1 not taken.
|
36 | "kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla", kai_benchmark_dwconv, |
| 86 |
1/2✓ Branch 0 taken 12 times.
✗ Branch 1 not taken.
|
37 | RunnerFactory{[](const DwConvTraits& tr, DataType sdt, DataType ddt) { |
| 87 | 1 | return std::make_unique<DwConvPackedFloatRunner>( | |
| 88 | 1 | kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla_iface, tr, sdt, ddt); | |
| 89 | }}, | ||
| 90 | 36 | kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla_traits, DataType::FP32, DataType::FP32, | |
| 91 | kai_dwconv_packed_fp32_rhs_cfg, test::cpu_has_sme2), | ||
| 92 | 36 | &kai_dwconv_clamp_f32_f32_f32p1vlx1b_3x3_s1_4xc_sme2_mla_traits, | |
| 93 | }, | ||
| 94 | }}; | ||
| 95 | |||
| 96 | 9 | void RegisterDwConvBenchmarks(const DwConvShape& shape) { | |
| 97 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
|
9 | if (!supports_unit_stride_and_dilation(shape)) { |
| 98 | ✗ | return; | |
| 99 | } | ||
| 100 | |||
| 101 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 9 times.
|
18 | for (auto& entry : dwconv_benchmarks) { |
| 102 | 9 | const size_t filter_height = entry.traits->get_filter_height(); | |
| 103 | 9 | const size_t filter_width = entry.traits->get_filter_width(); | |
| 104 | |||
| 105 | 9 | const auto [out_h, out_w] = compute_dwconv_output_dims(shape, filter_height, filter_width); | |
| 106 |
2/4✓ Branch 0 taken 9 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 9 times.
|
9 | if (out_h == 0 || out_w == 0) { |
| 107 | ✗ | continue; | |
| 108 | } | ||
| 109 | |||
| 110 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
18 | entry.benchmark |
| 111 |
13/26✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 9 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 9 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 9 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 9 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 9 times.
✗ Branch 12 not taken.
✓ Branch 13 taken 9 times.
✗ Branch 14 not taken.
✓ Branch 15 taken 9 times.
✗ Branch 16 not taken.
✓ Branch 17 taken 9 times.
✗ Branch 18 not taken.
✓ Branch 19 taken 9 times.
✗ Branch 20 not taken.
✓ Branch 21 taken 9 times.
✗ Branch 22 not taken.
✓ Branch 23 taken 9 times.
✗ Branch 24 not taken.
✓ Branch 25 taken 6 times.
|
108 | ->Args({ |
| 112 | 9 | static_cast<int64_t>(shape.num_channels), | |
| 113 | 9 | static_cast<int64_t>(shape.input_height), | |
| 114 | 9 | static_cast<int64_t>(shape.input_width), | |
| 115 | 9 | static_cast<int64_t>(shape.stride[0]), | |
| 116 | 9 | static_cast<int64_t>(shape.stride[1]), | |
| 117 | 9 | static_cast<int64_t>(shape.padding[0]), | |
| 118 | 9 | static_cast<int64_t>(shape.padding[1]), | |
| 119 | 9 | static_cast<int64_t>(shape.padding[2]), | |
| 120 | 9 | static_cast<int64_t>(shape.padding[3]), | |
| 121 | 9 | static_cast<int64_t>(shape.dilation[0]), | |
| 122 | 9 | static_cast<int64_t>(shape.dilation[1]), | |
| 123 | }) | ||
| 124 |
2/4✗ Branch 0 not taken.
✓ Branch 1 taken 9 times.
✓ Branch 2 taken 9 times.
✗ Branch 3 not taken.
|
9 | ->ArgNames({ |
| 125 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "channels", |
| 126 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "input_height", |
| 127 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "input_width", |
| 128 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "stride_h", |
| 129 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "stride_w", |
| 130 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "pad_top", |
| 131 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "pad_bottom", |
| 132 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "pad_left", |
| 133 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "pad_right", |
| 134 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "dilation_h", |
| 135 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 6 times.
|
9 | "dilation_w", |
| 136 | }); | ||
| 137 | 9 | } | |
| 138 | 9 | } | |
| 139 | |||
| 140 | 3 | std::optional<DwConvOutputShape> InferDwConvOutputDims(const DwConvShape& shape) { | |
| 141 | if (dwconv_benchmarks.empty()) { | ||
| 142 | return std::nullopt; | ||
| 143 | } | ||
| 144 | |||
| 145 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 3 times.
|
3 | if (!supports_unit_stride_and_dilation(shape)) { |
| 146 | ✗ | return std::nullopt; | |
| 147 | } | ||
| 148 | |||
| 149 | 3 | const DwConvTraits* traits = dwconv_benchmarks.front().traits; | |
| 150 | 3 | const size_t filter_height = traits->get_filter_height(); | |
| 151 | 3 | const size_t filter_width = traits->get_filter_width(); | |
| 152 | |||
| 153 | 3 | const auto dims = compute_dwconv_output_dims(shape, filter_height, filter_width); | |
| 154 |
2/4✓ Branch 0 taken 3 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
|
3 | if (dims.height == 0 || dims.width == 0) { |
| 155 | ✗ | return std::nullopt; | |
| 156 | } | ||
| 157 | |||
| 158 | 3 | return dims; | |
| 159 | 3 | } | |
| 160 | |||
| 161 | } // namespace kai::benchmark | ||
| 162 |