KleidiAI Coverage Report


Directory: ./
File: kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla.c
Date: 2025-10-20 13:18:31
Coverage Exec Excl Total
Lines: 100.0% 38 4 42
Functions: 100.0% 10 0 10
Branches: -% 0 8 8

Line Branch Exec Source
1 //
2 // SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 //
4 // SPDX-License-Identifier: Apache-2.0
5 //
6
7 #if ( \
8 !defined(__aarch64__) && !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && \
9 !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) && \
10 !defined(_M_ARM64)
11 #error This file must be compiled for AArch64, FEAT_FP16.
12 #else // Architectural features check.
13
14 #include "kai_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla.h"
15
16 #include <stddef.h>
17 #include <stdint.h>
18
19 #include "kai/kai_common.h"
20
// Argument block handed by pointer to kai_kernel_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla.
// The per-field notes describe how kai_run_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla fills the block.
// NOTE(review): the kernel body is not visible in this file; presumably it depends on this exact
// field order and these exact types, so do not reorder or resize fields without checking it.
typedef struct {
    uint16_t maxval;                     // clamp_max after kai_f16_from_float_* conversion
    uint16_t minval;                     // clamp_min after kai_f16_from_float_* conversion
    unsigned int num_strings;            // set to 1 by kai_run_*
    const unsigned int* string_lengths;  // points at a single unsigned int holding k
    size_t N;                            // n
    const void* B_ptr;                   // rhs_packed
    size_t output_offset;                // dst_stride_row / sizeof(uint16_t)
    size_t input_initial_col;            // set to 0 by kai_run_*
    size_t input_offset;                 // lhs_stride / sizeof(uint16_t)
    void* output_ptr;                    // dst
    const void* bias;                    // set to NULL by kai_run_*
} KernelArgs;
34
35 void kai_kernel_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(
36 const void* input_ptr, size_t m, KernelArgs* args_ptr, unsigned long flags);
37 uint16_t kai_f16_from_float_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(float value);
38
// Fixed parameters of this micro-kernel, exposed through the kai_get_* accessors in this file.
static const size_t kai_mr = 6;   // returned by kai_get_m_step_*
static const size_t kai_nr = 32;  // returned by kai_get_n_step_* and kai_get_nr_*
static const size_t kai_kr = 1;   // returned by kai_get_kr_*
static const size_t kai_sr = 1;   // returned by kai_get_sr_*
43
44 50 size_t kai_get_m_step_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(void) {
45 50 return kai_mr;
46 }
47
48 50 size_t kai_get_n_step_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(void) {
49 50 return kai_nr;
50 }
51
52 17 size_t kai_get_nr_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(void) {
53 17 return kai_nr;
54 }
55
56 17 size_t kai_get_kr_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(void) {
57 17 return kai_kr;
58 }
59
60 17 size_t kai_get_sr_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(void) {
61 17 return kai_sr;
62 }
63
// Returns the LHS offset for row index m_idx, computed as m_idx * stride.
// m_idx is assumed (KAI_ASSUME) to be a multiple of the m step.
size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(size_t m_idx, size_t stride) {
    KAI_ASSUME((m_idx % kai_get_m_step_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla()) == 0);

    const size_t lhs_offset = m_idx * stride;
    return lhs_offset;
}
69
70 16 size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(size_t n_idx, size_t k) {
71 KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla() == 0);
72
73 16 return n_idx / kai_nr * (kai_nr * sizeof(uint16_t) + kai_nr * k * sizeof(uint16_t));
74 }
75
// Returns the destination offset for position (m_idx, n_idx):
// m_idx * stride plus n_idx elements of sizeof(uint16_t) each.
// m_idx and n_idx are assumed (KAI_ASSUME) to be multiples of the m step and n step respectively.
size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(size_t m_idx, size_t n_idx, size_t stride) {
    KAI_ASSUME((m_idx % kai_get_m_step_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla()) == 0);
    KAI_ASSUME((n_idx % kai_get_n_step_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla()) == 0);

    const size_t row_part = m_idx * stride;
    const size_t col_part = n_idx * sizeof(uint16_t);

    return row_part + col_part;
}
82
// Returns the size of an m x n destination: m * n elements of sizeof(uint16_t) each.
size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(size_t m, size_t n) {
    const size_t num_elements = m * n;
    return num_elements * sizeof(uint16_t);
}
86
87 17 void kai_run_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(
88 size_t m, size_t n, size_t k, //
89 const void* lhs, size_t lhs_stride, //
90 const void* rhs_packed, //
91 void* dst, size_t dst_stride_row, size_t dst_stride_col, //
92 float clamp_min, float clamp_max) {
93 17 KAI_UNUSED(dst_stride_col);
94
95 17 KernelArgs ka;
96
97 17 unsigned long flags = 0;
98
99 17 unsigned int string_length = k;
100 17 ka.num_strings = 1;
101 17 ka.string_lengths = &string_length;
102 17 ka.N = n;
103 17 ka.B_ptr = rhs_packed;
104 17 ka.bias = NULL;
105
106 // Direct input.
107 17 const void* input_ptr = lhs;
108 17 ka.input_offset = lhs_stride / sizeof(uint16_t);
109 17 ka.input_initial_col = 0;
110
111 // Direct output.
112 17 ka.output_ptr = dst;
113 17 ka.output_offset = dst_stride_row / sizeof(uint16_t);
114
115 // Clamping output.
116 17 flags |= 0x2;
117 17 ka.maxval = kai_f16_from_float_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(clamp_max);
118 17 ka.minval = kai_f16_from_float_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(clamp_min);
119
120 17 kai_kernel_matmul_clamp_f16_f16_f16p32x1b_6x32_neon_mla(input_ptr, m, &ka, flags);
121 17 }
122
123 #endif // Architectural features check.
124