benchmark/dwconv/dwconv_benchmark_logic.hpp

Directory:	./
Coverage:	low: ≥ 0% medium: ≥ 75.0% high: ≥ 90.0%

	Coverage	Exec / Excl / Total
Lines:	88.6%	78 / 1 / 89
Functions:	100.0%	6 / 1 / 7
Branches:	45.8%	44 / 2 / 98

    benchmark/dwconv/dwconv_benchmark_logic.hpp
    
        Line
        Branch
        Exec
        Source
      
        //
      
        // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
      
        //
      
        // SPDX-License-Identifier: Apache-2.0
      
        //
      
        #pragma once
      
        #include <array>
      
        #include <cstddef>
      
        #include <cstdint>
      
        #include <functional>
      
        #include <limits>
      
        #include <memory>
      
        #include <test/common/cpu_info.hpp>
      
        #include <test/common/data_type.hpp>
      
        #include <tuple>
      
        #include <utility>
      
        #include <vector>
      
        #include "dwconv_interface.hpp"
      
        #include "dwconv_runner.hpp"
      
        #include "kai/kai_common.h"
      
        #ifdef __GNUC__
      
        #pragma GCC diagnostic push
      
        #pragma GCC diagnostic ignored "-Wswitch-default"
      
        #endif  // __GNUC__
      
        #include <benchmark/benchmark.h>
      
        #ifdef __GNUC__
      
        #pragma GCC diagnostic pop
      
        #endif  // __GNUC__
      
        namespace kai::benchmark {
      
        using Buffer = std::vector<uint8_t>;
      
        using CpuRequirement = std::function<bool()>;
      
        using DataType = test::DataType;
      
        struct DwConvShape {
      
            size_t input_height;
      
            size_t input_width;
      
            size_t num_channels;
      
        1
            std::array<size_t, 2> stride{{1, 1}};         // {stride_height, stride_width}
      
        1
            std::array<size_t, 4> padding{{0, 0, 0, 0}};  // {pad_top, pad_bottom, pad_left, pad_right}
      
        1
            std::array<size_t, 2> dilation{{1, 1}};       // {dilation_height, dilation_width}
      
        };
      
        struct DwConvOutputShape {
      
            size_t height;
      
            size_t width;
      
        };
      
        16
        inline bool supports_unit_stride_and_dilation(size_t stride_h, size_t stride_w, size_t dilation_h, size_t dilation_w) {
      
          3/6✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 16 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.

        16
            return stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
      
        }
      
        15
        inline bool supports_unit_stride_and_dilation(const DwConvShape& shape) {
      
        15
            return supports_unit_stride_and_dilation(shape.stride[0], shape.stride[1], shape.dilation[0], shape.dilation[1]);
      
        }
      
        13
        inline DwConvOutputShape compute_dwconv_output_dims(
      
            const DwConvShape& shape, size_t filter_height, size_t filter_width) {
      
        39
            const auto compute_dim = [&](size_t idx) -> size_t {  // 0: height, 1: width
      
          2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 13 times.

        26
                const size_t input = (idx == 0) ? shape.input_height : shape.input_width;
      
          2/2✓ Branch 0 taken 13 times.
✓ Branch 1 taken 13 times.

        26
                const size_t filter = (idx == 0) ? filter_height : filter_width;
      
        26
                const size_t stride = shape.stride[idx];
      
        26
                const size_t dilation = shape.dilation[idx];
      
        26
                const size_t pad_before = shape.padding[idx * 2];
      
        26
                const size_t pad_after = shape.padding[idx * 2 + 1];
      
        26
                const size_t effective_kernel = (filter - 1) * dilation + 1;
      
        26
                const size_t input_plus_pad = input + pad_before + pad_after;
      
          4/8✓ Branch 0 taken 26 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 26 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 26 times.
✗ Branch 5 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 26 times.

        26
                if (stride == 0 || filter == 0 || effective_kernel == 0 || input_plus_pad < effective_kernel) {
      
        ✗
                    return 0;
      
                }
      
        26
                const size_t numerator = input + pad_before + pad_after - effective_kernel;
      
        26
                return numerator / stride + 1;
      
        26
            };
      
        13
            return DwConvOutputShape{compute_dim(0), compute_dim(1)};
      
        13
        }
      
        // Factory to construct a runner matching the registered micro-kernel
      
        using RunnerFactory = std::function<std::unique_ptr<DwConvRunner>(const DwConvTraits&, DataType, DataType)>;
      
        /// Benchmarks a depthwise convolution micro-kernel using a provided runner factory
      
        3
        inline void kai_benchmark_dwconv(
      
            ::benchmark::State& state, const RunnerFactory& runner_factory, const DwConvTraits& traits, const DataType src_type,
      
            const DataType dst_type, const DwConvRhsConfig& rhs_cfg, const CpuRequirement& is_cpu_supported) {
      
          2/2✓ Branch 0 taken 1 time.
✓ Branch 1 taken 2 times.

        3
            if (!is_cpu_supported()) {
      
          2/4✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2 times.
✗ Branch 3 not taken.

        2
                state.SkipWithMessage("Unsupported CPU feature");
      
        2
                return;
      
            }
      
        1
            const size_t num_channels = static_cast<size_t>(state.range(0));
      
        1
            const size_t input_height = static_cast<size_t>(state.range(1));
      
        1
            const size_t input_width = static_cast<size_t>(state.range(2));
      
        1
            const size_t stride_h = static_cast<size_t>(state.range(3));
      
        1
            const size_t stride_w = static_cast<size_t>(state.range(4));
      
        1
            const size_t pad_top = static_cast<size_t>(state.range(5));
      
        1
            const size_t pad_bottom = static_cast<size_t>(state.range(6));
      
        1
            const size_t pad_left = static_cast<size_t>(state.range(7));
      
        1
            const size_t pad_right = static_cast<size_t>(state.range(8));
      
        1
            const size_t dilation_h = static_cast<size_t>(state.range(9));
      
        1
            const size_t dilation_w = static_cast<size_t>(state.range(10));
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            if (!supports_unit_stride_and_dilation(stride_h, stride_w, dilation_h, dilation_w)) {
      
        ✗
                state.SkipWithMessage("Current DWConv micro-kernels only support stride=1 and dilation=1");
      
        ✗
                return;
      
            }
      
            // Buffer sizes
      
        1
            const size_t filter_height = traits.get_filter_height();
      
        1
            const size_t filter_width = traits.get_filter_width();
      
        3
            DwConvShape runtime_shape{};
      
        1
            runtime_shape.input_height = input_height;
      
        1
            runtime_shape.input_width = input_width;
      
        1
            runtime_shape.num_channels = num_channels;
      
        1
            runtime_shape.stride = {stride_h, stride_w};
      
        1
            runtime_shape.padding = {pad_top, pad_bottom, pad_left, pad_right};
      
        1
            runtime_shape.dilation = {dilation_h, dilation_w};
      
        5
            const auto [output_height, output_width] = compute_dwconv_output_dims(runtime_shape, filter_height, filter_width);
      
          2/4✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 1 time.

        1
            if (output_height == 0 || output_width == 0) {
      
        ✗
                state.SkipWithMessage("Invalid DWConv dimensions derived from CLI flags");
      
        ✗
                return;
      
            }
      
        1
            size_t input_size = input_height * input_width * num_channels * data_type_size_bytes(src_type);
      
        3
            size_t output_size = output_height * output_width * num_channels * data_type_size_bytes(dst_type);
      
            // SME/SVE scaling for bandwidth accounting
      
        #if defined(__ARM_FEATURE_SVE2) || defined(_M_ARM64)
      
            if (test::cpu_has_sme() || test::cpu_has_sme2()) {
      
                const size_t vl = kai_get_sme_vector_length_u32();
      
                input_size *= vl;
      
                output_size *= vl;
      
            }
      
        #endif
      
            // RHS sizes by layout
      
        1
            size_t rhs_packed_size = 0, rhs_weights_size = 0, rhs_bias_size = 0;
      
          1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 time.

        1
            if (rhs_cfg.layout == DwConvRhsLayout::Packed) {
      
        −
                KAI_ASSERT_ALWAYS_MSG(
      
                    rhs_cfg.get_packed_rhs_size, "Packed DWConv benchmarks must provide get_packed_rhs_size callback");
      
        1
                rhs_packed_size = rhs_cfg.get_packed_rhs_size(filter_height, filter_width, num_channels);
      
        1
            } else {
      
        ✗
                rhs_weights_size = num_channels * (filter_height * filter_width) * (rhs_cfg.weights_elem_bits / 8);
      
        ✗
                rhs_bias_size = num_channels * (rhs_cfg.bias_elem_bits / 8);
      
            }
      
          0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.

        1
            const Buffer src(input_size);
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            Buffer dst(output_size);
      
            // Construct runner and configure common parameters
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            auto runner = runner_factory(traits, src_type, dst_type);
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            runner->set_input_dims(input_height, input_width);
      
          3/6✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.
✓ Branch 4 taken 1 time.
✗ Branch 5 not taken.

        3
            runner->set_output_dims(output_height, output_width);
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            runner->set_channels(num_channels);
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            runner->set_padding(pad_top, pad_bottom, pad_left, pad_right);
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
            runner->set_clamp(-std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity());
      
        1
            Buffer rhs_packed, rhs_weights, rhs_bias;
      
          1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 time.

        1
            if (rhs_cfg.layout == DwConvRhsLayout::Packed) {
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
                rhs_packed = Buffer(rhs_packed_size);
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
                runner->prepare(rhs_packed.data(), nullptr, nullptr, nullptr);
      
        1
            } else {
      
        ✗
                rhs_weights = Buffer(rhs_weights_size);
      
        ✗
                rhs_bias = Buffer(rhs_bias_size);
      
        ✗
                runner->prepare(nullptr, rhs_weights.data(), rhs_bias.data(), nullptr);
      
            }
      
            // This is the benchmarking loop
      
          7/12✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 6 taken 1 time.
✓ Branch 7 taken 1 time.
✓ Branch 8 taken 1 time.
✗ Branch 9 not taken.
✓ Branch 10 taken 1 time.
✗ Branch 11 not taken.

        2
            for (auto _ : state) {
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
                runner->run(src.data(), dst.data());
      
        1
            }
      
        3
            const size_t num_ops = output_height * output_width * num_channels * filter_height * filter_width * 2;  // MACs
      
        2
            const size_t rhs_bytes =
      
          1/2✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.

        1
                (rhs_cfg.layout == DwConvRhsLayout::Packed) ? rhs_packed_size : (rhs_weights_size + rhs_bias_size);
      
          2/4✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.

        1
            state.SetItemsProcessed(state.iterations() * num_ops);
      
          2/4✓ Branch 0 taken 1 time.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 time.
✗ Branch 3 not taken.

        1
            state.SetBytesProcessed(state.iterations() * (input_size + rhs_bytes + output_size));
      
        3
        }
      
        }  // namespace kai::benchmark

Line	Branch	Exec	Source
1			//
2			// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3			//
4			// SPDX-License-Identifier: Apache-2.0
5			//
6
7			#pragma once
8
9			#include <array>
10			#include <cstddef>
11			#include <cstdint>
12			#include <functional>
13			#include <limits>
14			#include <memory>
15			#include <test/common/cpu_info.hpp>
16			#include <test/common/data_type.hpp>
17			#include <tuple>
18			#include <utility>
19			#include <vector>
20
21			#include "dwconv_interface.hpp"
22			#include "dwconv_runner.hpp"
23			#include "kai/kai_common.h"
24
25			#ifdef __GNUC__
26			#pragma GCC diagnostic push
27			#pragma GCC diagnostic ignored "-Wswitch-default"
28			#endif // __GNUC__
29
30			#include <benchmark/benchmark.h>
31
32			#ifdef __GNUC__
33			#pragma GCC diagnostic pop
34			#endif // __GNUC__
35
36			namespace kai::benchmark {
37			using Buffer = std::vector<uint8_t>;
38			using CpuRequirement = std::function<bool()>;
39			using DataType = test::DataType;
40
41			struct DwConvShape {
42			size_t input_height;
43			size_t input_width;
44			size_t num_channels;
45		1	std::array<size_t, 2> stride{{1, 1}}; // {stride_height, stride_width}
46		1	std::array<size_t, 4> padding{{0, 0, 0, 0}}; // {pad_top, pad_bottom, pad_left, pad_right}
47		1	std::array<size_t, 2> dilation{{1, 1}}; // {dilation_height, dilation_width}
48			};
49
50			struct DwConvOutputShape {
51			size_t height;
52			size_t width;
53			};
54
55		16	inline bool supports_unit_stride_and_dilation(size_t stride_h, size_t stride_w, size_t dilation_h, size_t dilation_w) {
56	3/6 ✓ Branch 0 taken 16 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 16 times. ✗ Branch 3 not taken. ✗ Branch 4 not taken. ✓ Branch 5 taken 16 times.	16	return stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
57			}
58
59		15	inline bool supports_unit_stride_and_dilation(const DwConvShape& shape) {
60		15	return supports_unit_stride_and_dilation(shape.stride[0], shape.stride[1], shape.dilation[0], shape.dilation[1]);
61			}
62
63		13	inline DwConvOutputShape compute_dwconv_output_dims(
64			const DwConvShape& shape, size_t filter_height, size_t filter_width) {
65		39	const auto compute_dim = [&](size_t idx) -> size_t { // 0: height, 1: width
66	2/2 ✓ Branch 0 taken 13 times. ✓ Branch 1 taken 13 times.	26	const size_t input = (idx == 0) ? shape.input_height : shape.input_width;
67	2/2 ✓ Branch 0 taken 13 times. ✓ Branch 1 taken 13 times.	26	const size_t filter = (idx == 0) ? filter_height : filter_width;
68		26	const size_t stride = shape.stride[idx];
69		26	const size_t dilation = shape.dilation[idx];
70		26	const size_t pad_before = shape.padding[idx * 2];
71		26	const size_t pad_after = shape.padding[idx * 2 + 1];
72		26	const size_t effective_kernel = (filter - 1) * dilation + 1;
73		26	const size_t input_plus_pad = input + pad_before + pad_after;
74
75	4/8 ✓ Branch 0 taken 26 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 26 times. ✗ Branch 3 not taken. ✓ Branch 4 taken 26 times. ✗ Branch 5 not taken. ✗ Branch 6 not taken. ✓ Branch 7 taken 26 times.	26	if (stride == 0 \|\| filter == 0 \|\| effective_kernel == 0 \|\| input_plus_pad < effective_kernel) {
76		✗	return 0;
77			}
78		26	const size_t numerator = input + pad_before + pad_after - effective_kernel;
79		26	return numerator / stride + 1;
80		26	};
81
82		13	return DwConvOutputShape{compute_dim(0), compute_dim(1)};
83		13	}
84
85			// Factory to construct a runner matching the registered micro-kernel
86			using RunnerFactory = std::function<std::unique_ptr<DwConvRunner>(const DwConvTraits&, DataType, DataType)>;
87
88			/// Benchmarks a depthwise convolution micro-kernel using a provided runner factory
89		3	inline void kai_benchmark_dwconv(
90			::benchmark::State& state, const RunnerFactory& runner_factory, const DwConvTraits& traits, const DataType src_type,
91			const DataType dst_type, const DwConvRhsConfig& rhs_cfg, const CpuRequirement& is_cpu_supported) {
92	2/2 ✓ Branch 0 taken 1 time. ✓ Branch 1 taken 2 times.	3	if (!is_cpu_supported()) {
93	2/4 ✓ Branch 0 taken 2 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 2 times. ✗ Branch 3 not taken.	2	state.SkipWithMessage("Unsupported CPU feature");
94		2	return;
95			}
96
97		1	const size_t num_channels = static_cast<size_t>(state.range(0));
98		1	const size_t input_height = static_cast<size_t>(state.range(1));
99		1	const size_t input_width = static_cast<size_t>(state.range(2));
100		1	const size_t stride_h = static_cast<size_t>(state.range(3));
101		1	const size_t stride_w = static_cast<size_t>(state.range(4));
102		1	const size_t pad_top = static_cast<size_t>(state.range(5));
103		1	const size_t pad_bottom = static_cast<size_t>(state.range(6));
104		1	const size_t pad_left = static_cast<size_t>(state.range(7));
105		1	const size_t pad_right = static_cast<size_t>(state.range(8));
106		1	const size_t dilation_h = static_cast<size_t>(state.range(9));
107		1	const size_t dilation_w = static_cast<size_t>(state.range(10));
108
109	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	if (!supports_unit_stride_and_dilation(stride_h, stride_w, dilation_h, dilation_w)) {
110		✗	state.SkipWithMessage("Current DWConv micro-kernels only support stride=1 and dilation=1");
111		✗	return;
112			}
113
114			// Buffer sizes
115		1	const size_t filter_height = traits.get_filter_height();
116		1	const size_t filter_width = traits.get_filter_width();
117		3	DwConvShape runtime_shape{};
118		1	runtime_shape.input_height = input_height;
119		1	runtime_shape.input_width = input_width;
120		1	runtime_shape.num_channels = num_channels;
121		1	runtime_shape.stride = {stride_h, stride_w};
122		1	runtime_shape.padding = {pad_top, pad_bottom, pad_left, pad_right};
123		1	runtime_shape.dilation = {dilation_h, dilation_w};
124		5	const auto [output_height, output_width] = compute_dwconv_output_dims(runtime_shape, filter_height, filter_width);
125
126	2/4 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken. ✗ Branch 2 not taken. ✓ Branch 3 taken 1 time.	1	if (output_height == 0 \|\| output_width == 0) {
127		✗	state.SkipWithMessage("Invalid DWConv dimensions derived from CLI flags");
128		✗	return;
129			}
130
131		1	size_t input_size = input_height * input_width * num_channels * data_type_size_bytes(src_type);
132		3	size_t output_size = output_height * output_width * num_channels * data_type_size_bytes(dst_type);
133
134			// SME/SVE scaling for bandwidth accounting
135			#if defined(__ARM_FEATURE_SVE2) \|\| defined(_M_ARM64)
136			if (test::cpu_has_sme() \|\| test::cpu_has_sme2()) {
137			const size_t vl = kai_get_sme_vector_length_u32();
138			input_size *= vl;
139			output_size *= vl;
140			}
141			#endif
142
143			// RHS sizes by layout
144		1	size_t rhs_packed_size = 0, rhs_weights_size = 0, rhs_bias_size = 0;
145	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 1 time.	1	if (rhs_cfg.layout == DwConvRhsLayout::Packed) {
146		−	KAI_ASSERT_ALWAYS_MSG(
147			rhs_cfg.get_packed_rhs_size, "Packed DWConv benchmarks must provide get_packed_rhs_size callback");
148		1	rhs_packed_size = rhs_cfg.get_packed_rhs_size(filter_height, filter_width, num_channels);
149		1	} else {
150		✗	rhs_weights_size = num_channels * (filter_height * filter_width) * (rhs_cfg.weights_elem_bits / 8);
151		✗	rhs_bias_size = num_channels * (rhs_cfg.bias_elem_bits / 8);
152			}
153
154	0/2 ✗ Branch 0 not taken. ✗ Branch 1 not taken.	1	const Buffer src(input_size);
155	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	Buffer dst(output_size);
156
157			// Construct runner and configure common parameters
158	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	auto runner = runner_factory(traits, src_type, dst_type);
159	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	runner->set_input_dims(input_height, input_width);
160	3/6 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken. ✓ Branch 2 taken 1 time. ✗ Branch 3 not taken. ✓ Branch 4 taken 1 time. ✗ Branch 5 not taken.	3	runner->set_output_dims(output_height, output_width);
161	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	runner->set_channels(num_channels);
162	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	runner->set_padding(pad_top, pad_bottom, pad_left, pad_right);
163	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	runner->set_clamp(-std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity());
164
165		1	Buffer rhs_packed, rhs_weights, rhs_bias;
166	1/2 ✗ Branch 0 not taken. ✓ Branch 1 taken 1 time.	1	if (rhs_cfg.layout == DwConvRhsLayout::Packed) {
167	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	rhs_packed = Buffer(rhs_packed_size);
168	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	runner->prepare(rhs_packed.data(), nullptr, nullptr, nullptr);
169		1	} else {
170		✗	rhs_weights = Buffer(rhs_weights_size);
171		✗	rhs_bias = Buffer(rhs_bias_size);
172		✗	runner->prepare(nullptr, rhs_weights.data(), rhs_bias.data(), nullptr);
173			}
174
175			// This is the benchmarking loop
176	7/12 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken. ✓ Branch 2 taken 1 time. ✗ Branch 3 not taken. ✓ Branch 4 taken 2 times. ✗ Branch 5 not taken. ✓ Branch 6 taken 1 time. ✓ Branch 7 taken 1 time. ✓ Branch 8 taken 1 time. ✗ Branch 9 not taken. ✓ Branch 10 taken 1 time. ✗ Branch 11 not taken.	2	for (auto _ : state) {
177	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	runner->run(src.data(), dst.data());
178		1	}
179
180		3	const size_t num_ops = output_height * output_width * num_channels * filter_height * filter_width * 2; // MACs
181		2	const size_t rhs_bytes =
182	1/2 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken.	1	(rhs_cfg.layout == DwConvRhsLayout::Packed) ? rhs_packed_size : (rhs_weights_size + rhs_bias_size);
183	2/4 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken. ✓ Branch 2 taken 1 time. ✗ Branch 3 not taken.	1	state.SetItemsProcessed(state.iterations() * num_ops);
184	2/4 ✓ Branch 0 taken 1 time. ✗ Branch 1 not taken. ✓ Branch 2 taken 1 time. ✗ Branch 3 not taken.	1	state.SetBytesProcessed(state.iterations() * (input_size + rhs_bytes + output_size));
185		3	}
186
187			} // namespace kai::benchmark
188

KleidiAI Coverage Report