KleidiAI Coverage Report


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 75.0% high: ≥ 90.0%
Coverage Exec / Excl / Total
Lines: 88.6% 78 / 1 / 89
Functions: 100.0% 6 / 1 / 7
Branches: 45.8% 44 / 2 / 98

benchmark/dwconv/dwconv_benchmark_logic.hpp
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #pragma once
8
9 #include <array>
10 #include <cstddef>
11 #include <cstdint>
12 #include <functional>
13 #include <limits>
14 #include <memory>
15 #include <test/common/cpu_info.hpp>
16 #include <test/common/data_type.hpp>
17 #include <tuple>
18 #include <utility>
19 #include <vector>
20
21 #include "dwconv_interface.hpp"
22 #include "dwconv_runner.hpp"
23 #include "kai/kai_common.h"
24
25 #ifdef __GNUC__
26 #pragma GCC diagnostic push
27 #pragma GCC diagnostic ignored "-Wswitch-default"
28 #endif // __GNUC__
29
30 #include <benchmark/benchmark.h>
31
32 #ifdef __GNUC__
33 #pragma GCC diagnostic pop
34 #endif // __GNUC__
35
36 namespace kai::benchmark {
37 using Buffer = std::vector<uint8_t>;
38 using CpuRequirement = std::function<bool()>;
39 using DataType = test::DataType;
40
41 struct DwConvShape {
42 size_t input_height;
43 size_t input_width;
44 size_t num_channels;
45 1 std::array<size_t, 2> stride{{1, 1}}; // {stride_height, stride_width}
46 1 std::array<size_t, 4> padding{{0, 0, 0, 0}}; // {pad_top, pad_bottom, pad_left, pad_right}
47 1 std::array<size_t, 2> dilation{{1, 1}}; // {dilation_height, dilation_width}
48 };
49
50 struct DwConvOutputShape {
51 size_t height;
52 size_t width;
53 };
54
55 16 inline bool supports_unit_stride_and_dilation(size_t stride_h, size_t stride_w, size_t dilation_h, size_t dilation_w) {
56
3/6
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 16 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.
16 return stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
57 }
58
59 15 inline bool supports_unit_stride_and_dilation(const DwConvShape& shape) {
60 15 return supports_unit_stride_and_dilation(shape.stride[0], shape.stride[1], shape.dilation[0], shape.dilation[1]);
61 }
62
63 13 inline DwConvOutputShape compute_dwconv_output_dims(
64 const DwConvShape& shape, size_t filter_height, size_t filter_width) {
65 39 const auto compute_dim = [&](size_t idx) -> size_t { // 0: height, 1: width
66
2/2
✓ Branch 0 taken 13 times.
✓ Branch 1 taken 13 times.
26 const size_t input = (idx == 0) ? shape.input_height : shape.input_width;
67
2/2
✓ Branch 0 taken 13 times.
✓ Branch 1 taken 13 times.
26 const size_t filter = (idx == 0) ? filter_height : filter_width;
68 26 const size_t stride = shape.stride[idx];
69 26 const size_t dilation = shape.dilation[idx];
70 26 const size_t pad_before = shape.padding[idx * 2];
71 26 const size_t pad_after = shape.padding[idx * 2 + 1];
72 26 const size_t effective_kernel = (filter - 1) * dilation + 1;
73 26 const size_t input_plus_pad = input + pad_before + pad_after;
74
75
4/8
✓ Branch 0 taken 26 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 26 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 26 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 26 times.
26 if (stride == 0 || filter == 0 || effective_kernel == 0 || input_plus_pad < effective_kernel) {
76 return 0;
77 }
78 26 const size_t numerator = input + pad_before + pad_after - effective_kernel;
79 26 return numerator / stride + 1;
80 26 };
81
82 13 return DwConvOutputShape{compute_dim(0), compute_dim(1)};
83 13 }
84
85 // Factory to construct a runner matching the registered micro-kernel
86 using RunnerFactory = std::function<std::unique_ptr<DwConvRunner>(const DwConvTraits&, DataType, DataType)>;
87
88 /// Benchmarks a depthwise convolution micro-kernel using a provided runner factory
89 3 inline void kai_benchmark_dwconv(
90 ::benchmark::State& state, const RunnerFactory& runner_factory, const DwConvTraits& traits, const DataType src_type,
91 const DataType dst_type, const DwConvRhsConfig& rhs_cfg, const CpuRequirement& is_cpu_supported) {
92
2/2
✓ Branch 0 taken 1 time.
✓ Branch 1 taken 2 times.
3 if (!is_cpu_supported()) {
93
2/4
✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.
2 state.SkipWithMessage("Unsupported CPU feature");
94 2 return;
95 }
96
97 1 const size_t num_channels = static_cast<size_t>(state.range(0));
98 1 const size_t input_height = static_cast<size_t>(state.range(1));
99 1 const size_t input_width = static_cast<size_t>(state.range(2));
100 1 const size_t stride_h = static_cast<size_t>(state.range(3));
101 1 const size_t stride_w = static_cast<size_t>(state.range(4));
102 1 const size_t pad_top = static_cast<size_t>(state.range(5));
103 1 const size_t pad_bottom = static_cast<size_t>(state.range(6));
104 1 const size_t pad_left = static_cast<size_t>(state.range(7));
105 1 const size_t pad_right = static_cast<size_t>(state.range(8));
106 1 const size_t dilation_h = static_cast<size_t>(state.range(9));
107 1 const size_t dilation_w = static_cast<size_t>(state.range(10));
108
109
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 if (!supports_unit_stride_and_dilation(stride_h, stride_w, dilation_h, dilation_w)) {
110 state.SkipWithMessage("Current DWConv micro-kernels only support stride=1 and dilation=1");
111 return;
112 }
113
114 // Buffer sizes
115 1 const size_t filter_height = traits.get_filter_height();
116 1 const size_t filter_width = traits.get_filter_width();
117 3 DwConvShape runtime_shape{};
118 1 runtime_shape.input_height = input_height;
119 1 runtime_shape.input_width = input_width;
120 1 runtime_shape.num_channels = num_channels;
121 1 runtime_shape.stride = {stride_h, stride_w};
122 1 runtime_shape.padding = {pad_top, pad_bottom, pad_left, pad_right};
123 1 runtime_shape.dilation = {dilation_h, dilation_w};
124 5 const auto [output_height, output_width] = compute_dwconv_output_dims(runtime_shape, filter_height, filter_width);
125
126
2/4
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 time.
1 if (output_height == 0 || output_width == 0) {
127 state.SkipWithMessage("Invalid DWConv dimensions derived from CLI flags");
128 return;
129 }
130
131 1 size_t input_size = input_height * input_width * num_channels * data_type_size_bytes(src_type);
132 3 size_t output_size = output_height * output_width * num_channels * data_type_size_bytes(dst_type);
133
134 // SME/SVE scaling for bandwidth accounting
135 #if defined(__ARM_FEATURE_SVE2) || defined(_M_ARM64)
136 if (test::cpu_has_sme() || test::cpu_has_sme2()) {
137 const size_t vl = kai_get_sme_vector_length_u32();
138 input_size *= vl;
139 output_size *= vl;
140 }
141 #endif
142
143 // RHS sizes by layout
144 1 size_t rhs_packed_size = 0, rhs_weights_size = 0, rhs_bias_size = 0;
145
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 time.
1 if (rhs_cfg.layout == DwConvRhsLayout::Packed) {
146 KAI_ASSERT_ALWAYS_MSG(
147 rhs_cfg.get_packed_rhs_size, "Packed DWConv benchmarks must provide get_packed_rhs_size callback");
148 1 rhs_packed_size = rhs_cfg.get_packed_rhs_size(filter_height, filter_width, num_channels);
149 1 } else {
150 rhs_weights_size = num_channels * (filter_height * filter_width) * (rhs_cfg.weights_elem_bits / 8);
151 rhs_bias_size = num_channels * (rhs_cfg.bias_elem_bits / 8);
152 }
153
154
0/2
✗ Branch 0 not taken.
✗ Branch 1 not taken.
1 const Buffer src(input_size);
155
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 Buffer dst(output_size);
156
157 // Construct runner and configure common parameters
158
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 auto runner = runner_factory(traits, src_type, dst_type);
159
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 runner->set_input_dims(input_height, input_width);
160
3/6
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 time.
✗ Branch 5 not taken.
3 runner->set_output_dims(output_height, output_width);
161
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 runner->set_channels(num_channels);
162
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 runner->set_padding(pad_top, pad_bottom, pad_left, pad_right);
163
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 runner->set_clamp(-std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity());
164
165 1 Buffer rhs_packed, rhs_weights, rhs_bias;
166
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 1 time.
1 if (rhs_cfg.layout == DwConvRhsLayout::Packed) {
167
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 rhs_packed = Buffer(rhs_packed_size);
168
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 runner->prepare(rhs_packed.data(), nullptr, nullptr, nullptr);
169 1 } else {
170 rhs_weights = Buffer(rhs_weights_size);
171 rhs_bias = Buffer(rhs_bias_size);
172 runner->prepare(nullptr, rhs_weights.data(), rhs_bias.data(), nullptr);
173 }
174
175 // This is the benchmarking loop
176
7/12
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 time.
✓ Branch 7 taken 1 time.
✓ Branch 8 taken 1 time.
✗ Branch 9 not taken.
✓ Branch 10 taken 1 time.
✗ Branch 11 not taken.
2 for (auto _ : state) {
177
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 runner->run(src.data(), dst.data());
178 1 }
179
180 3 const size_t num_ops = output_height * output_width * num_channels * filter_height * filter_width * 2; // MACs
181 2 const size_t rhs_bytes =
182
1/2
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
1 (rhs_cfg.layout == DwConvRhsLayout::Packed) ? rhs_packed_size : (rhs_weights_size + rhs_bias_size);
183
2/4
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.
1 state.SetItemsProcessed(state.iterations() * num_ops);
184
2/4
✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.
1 state.SetBytesProcessed(state.iterations() * (input_size + rhs_bytes + output_size));
185 3 }
186
187 } // namespace kai::benchmark
188