KleidiAI Coverage Report


Directory: ./
File: kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.c
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 100.0% 32 5 37
Functions: 100.0% 7 0 7
Branches: -% 0 10 10

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #if (!defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2)) && !defined(_M_ARM64)
8 #error This file must be compiled for AArch64, FEAT_SVE2.
9 #else // Architectural features check.
10 #include "kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
11
12 #include <stddef.h>
13 #include <stdint.h>
14
15 #include "kai/kai_common.h"
16
17 enum {
18 NR = 2,
19 KR = 1,
20 };
21
22 typedef struct {
23 const void* bias_ptr;
24 size_t width;
25 size_t height;
26 size_t k_chunk_count;
27 size_t in_stride;
28 size_t out_stride;
29 const void* in;
30 void* out;
31 } KernelArgs;
32
33 static const size_t kai_num_bytes_input = sizeof(uint32_t);
34 static const size_t kai_num_bytes_output = sizeof(uint32_t);
35 static const size_t kai_num_bytes_bias = sizeof(float);
36
37 void kai_kernel_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(const KernelArgs* args_ptr);
38
39 102168 size_t kai_get_n_step_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(void) {
40 102168 return NR * kai_get_sme_vector_length_u32() / KR;
41 }
42
43 11352 size_t kai_get_rhs_offset_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(size_t n_idx) {
44 KAI_ASSUME(n_idx % kai_get_n_step_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme() == 0);
45
46 11352 return n_idx * kai_num_bytes_input;
47 }
48
49 11352 size_t kai_get_bias_offset_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(size_t n_idx) {
50 11352 return n_idx * kai_num_bytes_bias;
51 }
52
53 34056 static size_t kai_get_rhs_packed_stride_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
54 size_t k_chunk_count, size_t k_chunk_length) {
55 68112 return kai_get_n_step_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme() *
56 34056 (kai_num_bytes_bias + k_chunk_count * kai_roundup(k_chunk_length, KR) * kai_num_bytes_output);
57 }
58
59 22704 size_t kai_get_rhs_packed_offset_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
60 size_t n_idx, size_t k_chunk_count, size_t k_chunk_length) {
61 KAI_ASSUME(n_idx % kai_get_n_step_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme() == 0);
62
63 22704 const size_t block_idx = n_idx / kai_get_n_step_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme();
64 68112 return block_idx *
65 22704 kai_get_rhs_packed_stride_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(k_chunk_count, k_chunk_length);
66 22704 }
67
68 11352 size_t kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
69 size_t n, size_t k_chunk_count, size_t k_chunk_length) {
70 11352 const size_t n_nr_blocks = kai_roundup(n, kai_get_n_step_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme());
71 22704 return kai_get_rhs_packed_offset_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
72 11352 n_nr_blocks, k_chunk_count, k_chunk_length);
73 11352 }
74
75 11352 void kai_run_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
76 size_t n, size_t k_chunk_count, size_t k_chunk_length, size_t rhs_stride_row, const void* rhs, const void* bias,
77 void* rhs_packed) {
78 KAI_ASSUME(rhs != NULL);
79 KAI_ASSUME(bias != NULL);
80 KAI_ASSUME(rhs_packed != NULL);
81
82 11352 KernelArgs args;
83 11352 args.bias_ptr = bias;
84 11352 args.height = k_chunk_length;
85 11352 args.width = n;
86 11352 args.in = rhs;
87 11352 args.out = rhs_packed;
88 11352 args.k_chunk_count = k_chunk_count;
89 11352 args.in_stride = rhs_stride_row;
90 11352 args.out_stride =
91 11352 kai_get_rhs_packed_stride_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(k_chunk_count, k_chunk_length);
92
93 11352 kai_kernel_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(&args);
94 11352 }
95
96 #endif // Architectural features check.
97