KleidiAI Coverage Report


Directory: ./
Coverage legend: low: ≥ 0%, medium: ≥ 75.0%, high: ≥ 90.0%
Metric | Coverage | Exec / Excl / Total
Lines | 100.0% | 33 / 5 / 38
Functions | 100.0% | 5 / 0 / 5
Branches | 100.0% | 10 / 10 / 20

kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.c
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #if (!defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2)) && !defined(_M_ARM64)
8 #error This file must be compiled for AArch64, FEAT_SVE2.
9 #else // Architectural features check.
10 #include "kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
11
12 #include <stddef.h>
13 #include <stdint.h>
14
15 #include "kai/kai_common.h"
16
17 enum {
18 MR = 2,
19 KR = 1,
20 MAX_M_STEP = MR * (KAI_SME_VEC_LENGTH_MAX_BYTES / sizeof(float)) / KR,
21 };
22
23 void kai_kernel_lhs_imatmul_pack_x32p2vlx1_x32p_sme(size_t height, size_t width, const void* in, void* out);
24
25 18768 static size_t kai_get_mr_lhs_imatmul_pack_x32p2vlx1_x32p_sme(void) {
26 18768 return MR * kai_get_sme_vector_length_u32() / KR;
27 }
28
29 9384 size_t kai_get_m_step_lhs_imatmul_pack_x32p2vlx1_x32p_sme(void) {
30 9384 return kai_get_mr_lhs_imatmul_pack_x32p2vlx1_x32p_sme();
31 }
32
33 9384 size_t kai_get_lhs_packed_offset_lhs_imatmul_pack_x32p2vlx1_x32p_sme(
34 size_t m_idx, size_t k_chunk_count, size_t k_chunk_length) {
35 KAI_ASSUME(m_idx % kai_get_m_step_lhs_imatmul_pack_x32p2vlx1_x32p_sme() == 0);
36
37 9384 return m_idx * k_chunk_count * kai_roundup(k_chunk_length, KR) * sizeof(float);
38 }
39
40 4692 size_t kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(
41 size_t m, size_t k_chunk_count, size_t k_chunk_length) {
42 4692 const size_t m_end = kai_roundup(m, kai_get_mr_lhs_imatmul_pack_x32p2vlx1_x32p_sme());
43 9384 return kai_get_lhs_packed_offset_lhs_imatmul_pack_x32p2vlx1_x32p_sme(m_end, k_chunk_count, k_chunk_length);
44 4692 }
45
46 4692 void kai_run_lhs_imatmul_pack_x32p2vlx1_x32p_sme(
47 size_t m, size_t k_chunk_count, size_t k_chunk_length, const void* const* lhs_ptrs, size_t lhs_ptr_offset,
48 const void* pad_ptr, void* lhs_packed) {
49 KAI_ASSUME(lhs_ptrs != NULL);
50 KAI_ASSUME(lhs_packed != NULL);
51
52 4692 const size_t m_step = kai_get_mr_lhs_imatmul_pack_x32p2vlx1_x32p_sme();
53 4692 const size_t width = k_chunk_length;
54
55 KAI_ASSERT(m_step <= MAX_M_STEP);
56 4692 const uint8_t* in[MAX_M_STEP];
57
58 4692 uint8_t* out_base = lhs_packed;
59
60 4692 kai_commit_za();
61
62
2/2
✓ Branch 0 taken 6506 times.
✓ Branch 1 taken 4692 times.
11198 for (size_t i_m = 0; i_m < m; i_m += m_step) {
63
2/2
✓ Branch 0 taken 84510 times.
✓ Branch 1 taken 6506 times.
91016 for (size_t i_k_chunk = 0; i_k_chunk < k_chunk_count; i_k_chunk += 1) {
64
2/2
✓ Branch 0 taken 52392 times.
✓ Branch 1 taken 32118 times.
84510 const size_t height = KAI_MIN(m - i_m, m_step);
65 84510 void* out = out_base;
66 84510 out_base += m_step * kai_roundup(k_chunk_length, KR) * sizeof(float);
67
2/2
✓ Branch 0 taken 1517286 times.
✓ Branch 1 taken 84510 times.
1601796 for (size_t y = 0; y < height; y += 1) {
68 KAI_ASSERT(i_k_chunk + (i_m + y) * k_chunk_count < m * k_chunk_count);
69 1517286 in[y] = *(lhs_ptrs + i_m * k_chunk_count + i_k_chunk * m_step + y);
70
2/2
✓ Branch 0 taken 47582 times.
✓ Branch 1 taken 1469704 times.
1517286 if (in[y] != pad_ptr) {
71 1469704 uintptr_t in_ptr = (uintptr_t)in[y] + lhs_ptr_offset;
72 1469704 in[y] = (const uint8_t*)in_ptr; // NOLINT(performance-no-int-to-ptr)
73 1469704 }
74 1517286 }
75
76 84510 kai_kernel_lhs_imatmul_pack_x32p2vlx1_x32p_sme(
77 84510 height, width, in, out); // NOLINT(bugprone-multi-level-implicit-pointer-conversion)
78 84510 }
79 6506 }
80 4692 }
81
82 #endif // Architectural features check.
83