KleidiAI Coverage Report


Directory: ./
Coverage thresholds: low ≥ 0%, medium ≥ 75.0%, high ≥ 90.0%

            Coverage   Exec / Excl / Total
Lines:        100.0%     29 /   11 /    40
Functions:    100.0%      7 /    0 /     7
Branches:         -%      0 /   22 /    22

kai/ukernels/matmul/pack/kai_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve.c
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #if (!defined(__aarch64__) || !defined(__ARM_FEATURE_SVE)) && !defined(_M_ARM64)
8 #error This file must be compiled for AArch64, FEAT_SVE.
9 #else // Architectural features check.
10
11 #include "kai_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve.h"
12
13 #include <stddef.h>
14 #include <stdint.h>
15
16 #include "kai/kai_common.h"
17
18 enum {
19 NR = 4,
20 KR = 1,
21 };
22
23 typedef struct {
24 const void* bias_ptr;
25 size_t width;
26 size_t height;
27 size_t in_stride;
28 size_t out_stride;
29 const void* in;
30 void* out;
31 } KernelArgs;
32
33 static const size_t kai_num_bytes_input = sizeof(uint32_t);
34 static const size_t kai_num_bytes_output = sizeof(uint32_t);
35 static const size_t kai_num_bytes_bias = sizeof(float);
36
37 void kai_kernel_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(const KernelArgs* args_ptr);
38
39 1188 size_t kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(void) {
40 1188 return NR * kai_get_sve_vector_length_u32() / KR;
41 }
42
43 108 size_t kai_get_rhs_offset_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(size_t n_idx) {
44 KAI_ASSUME(n_idx % kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve() == 0);
45
46 108 return n_idx * kai_num_bytes_input;
47 }
48
49 108 size_t kai_get_bias_offset_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(size_t n_idx) {
50 108 return n_idx * kai_num_bytes_bias;
51 }
52
53 324 size_t kai_get_rhs_packed_stride_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(size_t k) {
54 648 return kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve() *
55 324 (kai_num_bytes_bias + kai_roundup(k, KR) * kai_num_bytes_output);
56 }
57
58 216 size_t kai_get_rhs_packed_offset_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(size_t n_idx, size_t k) {
59 KAI_ASSUME(n_idx % kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve() == 0);
60
61 216 const size_t block_idx = n_idx / kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve();
62 432 return block_idx * kai_get_rhs_packed_stride_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(k);
63 216 }
64
65 108 size_t kai_get_rhs_packed_size_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(size_t n, size_t k) {
66 108 const size_t n_nr_blocks = kai_roundup(n, kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve());
67 216 return kai_get_rhs_packed_offset_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(n_nr_blocks, k);
68 108 }
69
70 108 void kai_run_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(
71 size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride_row, const void* rhs,
72 const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params) {
73 KAI_ASSUME(num_groups == 1);
74 KAI_ASSUME(nr == kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve());
75 KAI_ASSUME(kr == KR);
76 KAI_ASSUME(rhs != NULL);
77 KAI_ASSUME(bias != NULL);
78 KAI_ASSUME(scale == NULL);
79 KAI_ASSUME(rhs_packed != NULL);
80 KAI_ASSUME(extra_bytes == 0);
81 KAI_ASSUME(params == NULL);
82 108 KAI_UNUSED(sr);
83
84 108 KernelArgs args;
85 108 args.bias_ptr = bias;
86 108 args.height = k;
87 108 args.width = n;
88 108 args.in = rhs;
89 108 args.out = rhs_packed;
90 108 args.in_stride = rhs_stride_row;
91 108 args.out_stride = kai_get_rhs_packed_stride_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(args.height);
92
93 108 kai_kernel_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(&args);
94 108 }
95
96 #endif // Architectural features check.
97
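
For reference, a minimal usage sketch of the packing API listed above. The helper name pack_rhs_example, the dimensions, and the buffer handling are illustrative assumptions rather than part of KleidiAI or of this report; the argument values simply follow the KAI_ASSUME preconditions shown in kai_run_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve.

#include <stddef.h>
#include <stdlib.h>

#include "kai_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve.h"

// Hypothetical driver: pack a KxN FP32 RHS matrix together with a per-column bias.
// Returns a heap buffer holding the packed RHS, or NULL on allocation failure.
static void* pack_rhs_example(size_t n, size_t k, const float* rhs, const float* bias) {
    // n_step depends on the SVE vector length, so it is queried at run time.
    const size_t nr = kai_get_n_step_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve();
    const size_t kr = 1;  // Must match KR in the listing above.
    const size_t sr = 1;  // Unused by this micro-kernel.

    // Packed size covers n rounded up to a multiple of n_step, plus the bias per block.
    const size_t packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(n, k);
    void* rhs_packed = malloc(packed_size);
    if (rhs_packed == NULL) {
        return NULL;
    }

    // The RHS is stored KxN, so each of the k rows is n 4-byte elements wide.
    const size_t rhs_stride_row = n * sizeof(float);

    kai_run_rhs_pack_kxn_x32p4vlx1b_x32_x32_sve(
        /* num_groups */ 1, n, k, nr, kr, sr, rhs_stride_row, rhs, bias,
        /* scale */ NULL, rhs_packed, /* extra_bytes */ 0, /* params */ NULL);

    return rhs_packed;
}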