KleidiAI Coverage Report


Directory: ./
Coverage: low: ≥ 0% medium: ≥ 75.0% high: ≥ 90.0%
Coverage Exec / Excl / Total
Lines: 100.0% 37 / 5 / 42
Functions: 100.0% 10 / 0 / 10
Branches: -% 0 / 10 / 10

kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c
Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #if (!defined(__aarch64__) && !defined(_M_ARM64))
8 #error This file must be compiled for AArch64.
9 #else // Architectural features check.
10
11 #include "kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
12
13 #include <stddef.h>
14 #include <stdint.h>
15
16 #include "kai/kai_common.h"
17
18 typedef struct {
19 float maxval;
20 float minval;
21 unsigned int num_strings;
22 const unsigned int* string_lengths;
23 size_t N;
24 const void* B_ptr;
25 size_t output_offset;
26 size_t input_initial_col;
27 size_t input_offset;
28 void* output_ptr;
29 const void* bias;
30 } kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl_args_t;
31
32 extern void kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl(
33 const void* input_ptr, size_t m, kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl_args_t* args_ptr,
34 unsigned long flags);
35
36 static const size_t kai_mr = 6;
37 static const size_t kai_nr = 8;
38 static const size_t kai_kr = 1;
39 static const size_t kai_sr = 1;
40
41 108 size_t kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(void) {
42 108 return kai_mr;
43 }
44
45 108 size_t kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(void) {
46 108 return kai_nr;
47 }
48
49 108 size_t kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(void) {
50 108 return kai_nr;
51 }
52
53 108 size_t kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(void) {
54 108 return kai_kr;
55 }
56
57 108 size_t kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(void) {
58 108 return kai_sr;
59 }
60
61 108 size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(size_t m_idx, size_t stride) {
62 KAI_ASSUME(m_idx % kai_mr == 0);
63
64 108 return m_idx * stride;
65 }
66
67 108 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(size_t n_idx, size_t k) {
68 KAI_ASSUME(n_idx % kai_nr == 0);
69
70 108 return n_idx / kai_nr * (kai_nr * sizeof(float) + kai_nr * k * sizeof(float));
71 }
72
73 108 size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(
74 size_t m_idx, size_t n_idx, size_t stride) {
75 KAI_ASSUME(m_idx % kai_mr == 0);
76 KAI_ASSUME(n_idx % kai_nr == 0);
77
78 108 return m_idx * stride + n_idx * sizeof(float);
79 }
80
81 108 size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(size_t m, size_t n) {
82 108 return m * n * sizeof(float);
83 }
84
85 114 void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla(
86 size_t m, size_t n, size_t k, //
87 const void* lhs, size_t lhs_stride, //
88 const void* rhs_packed, //
89 void* dst, size_t dst_stride_row, size_t dst_stride_col, //
90 float clamp_min, float clamp_max) {
91 KAI_ASSERT(dst_stride_col == sizeof(float));
92
93 114 kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl_args_t ka;
94
95 114 unsigned long flags = 0;
96
97 114 unsigned int string_length = k;
98 114 ka.num_strings = 1;
99 114 ka.string_lengths = &string_length;
100 114 ka.N = n;
101 114 ka.B_ptr = rhs_packed;
102 114 ka.bias = NULL;
103
104 // Direct input.
105 114 const void* input_ptr = lhs;
106 114 ka.input_offset = lhs_stride / sizeof(float);
107 114 ka.input_initial_col = 0;
108
109 // Direct output.
110 114 ka.output_ptr = dst;
111 114 ka.output_offset = dst_stride_row / sizeof(float);
112
113 // Clamping output.
114 114 flags |= 0x2;
115 114 ka.maxval = clamp_max;
116 114 ka.minval = clamp_min;
117
118 114 kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl(input_ptr, m, &ka, flags);
119 114 }
120
121 #endif // Architectural features check.
122