kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // | ||
| 2 | // SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 3 | // | ||
| 4 | // SPDX-License-Identifier: Apache-2.0 | ||
| 5 | // | ||
| 6 | #include "kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h" | ||
| 7 | |||
| 8 | #include <stddef.h> | ||
| 9 | #include <stdint.h> | ||
| 10 | #include <string.h> | ||
| 11 | |||
| 12 | #include "kai/kai_common.h" | ||
| 13 | |||
// Number of bytes used by each per-row reduction sum stored in the packed RHS (one float)
static const size_t kai_num_bytes_sum_rhs = sizeof(float);
// Number of bytes used by each per-row bias value stored in the packed RHS (one float)
static const size_t kai_num_bytes_bias = sizeof(float);
// nr is asserted to be a multiple of this value
static const size_t kai_nr_multiple_of = 4;
// bl (the quantization block length) is asserted to be a multiple of this value
static const size_t kai_bl_multiple_of = 32;
| 18 | |||
| 19 | 6568 | inline static size_t kai_get_num_blocks_per_row(size_t k, size_t bl) { | |
| 20 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 21 | 6568 | return kai_roundup(k, bl) / bl; | |
| 22 | } | ||
| 23 | |||
| 24 | 5460 | inline static size_t kai_get_num_bytes_per_block(size_t bl, size_t num_bytes_multiplier_rhs) { | |
| 25 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 26 | 5460 | return (bl / 2) + num_bytes_multiplier_rhs; | |
| 27 | } | ||
| 28 | |||
| 29 | 1108 | inline static size_t kai_get_rhs_packed_offset_end_of_all_blocks( | |
| 30 | size_t k, size_t nr, size_t kr, size_t bl, size_t num_bytes_multiplier_rhs) { | ||
| 31 | − | KAI_ASSERT((bl % kr) == 0); | |
| 32 | − | KAI_ASSERT((nr % kai_nr_multiple_of) == 0); | |
| 33 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 34 | |||
| 35 | 1108 | const size_t num_blocks_per_row = kai_get_num_blocks_per_row(k, bl); | |
| 36 | 1108 | const size_t num_bytes_per_block = kai_get_num_bytes_per_block(bl, num_bytes_multiplier_rhs); | |
| 37 | |||
| 38 | 2216 | return (nr * num_bytes_per_block * num_blocks_per_row); | |
| 39 | 1108 | } | |
| 40 | |||
// Returns the N step of this packing routine, which is nr itself.
size_t kai_get_n_step_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(size_t nr) {
    const size_t n_step = nr;
    return n_step;
}
| 44 | |||
// Returns the offset of row n_idx in the unpacked RHS matrix,
// i.e. n_idx rows of rhs_stride each.
size_t kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
    size_t n_idx,  //
    size_t rhs_stride) {
    const size_t row_offset = n_idx * rhs_stride;
    return row_offset;
}
| 50 | |||
| 51 | 4352 | size_t kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( | |
| 52 | size_t k, // | ||
| 53 | size_t nr, // | ||
| 54 | size_t kr, // | ||
| 55 | size_t sr, // | ||
| 56 | size_t bl, // | ||
| 57 | enum kai_datatype scale_dt) { | ||
| 58 | − | KAI_ASSERT((k % bl) == 0); | |
| 59 | − | KAI_ASSERT((bl % kr) == 0); | |
| 60 | − | KAI_ASSERT((nr % kai_nr_multiple_of) == 0); | |
| 61 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 62 | − | KAI_ASSERT(scale_dt == kai_dt_bf16); | |
| 63 | |||
| 64 | 4352 | KAI_UNUSED(kr); | |
| 65 | 4352 | KAI_UNUSED(sr); | |
| 66 | |||
| 67 | 4352 | const size_t num_bytes_multiplier_rhs = kai_get_datatype_size_in_bytes(scale_dt); | |
| 68 | 4352 | const size_t num_blocks_per_row = kai_get_num_blocks_per_row(k, bl); | |
| 69 | 4352 | const size_t num_bytes_per_block = kai_get_num_bytes_per_block(bl, num_bytes_multiplier_rhs); | |
| 70 | |||
| 71 | 8704 | return nr * ((num_bytes_per_block * num_blocks_per_row) + kai_num_bytes_sum_rhs + kai_num_bytes_bias); | |
| 72 | 4352 | } | |
| 73 | |||
| 74 | 3244 | size_t kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( | |
| 75 | size_t n_idx, // | ||
| 76 | size_t k, // | ||
| 77 | size_t nr, // | ||
| 78 | size_t kr, // | ||
| 79 | size_t sr, // | ||
| 80 | size_t bl, // | ||
| 81 | enum kai_datatype scale_dt) { | ||
| 82 | − | KAI_ASSERT((n_idx % nr) == 0); | |
| 83 | − | KAI_ASSERT((k % bl) == 0); | |
| 84 | − | KAI_ASSERT((bl % kr) == 0); | |
| 85 | − | KAI_ASSERT((nr % kai_nr_multiple_of) == 0); | |
| 86 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 87 | − | KAI_ASSERT(scale_dt == kai_dt_bf16); | |
| 88 | |||
| 89 | 3244 | return (n_idx / nr) * kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(k, nr, kr, sr, bl, scale_dt); | |
| 90 | } | ||
| 91 | |||
| 92 | 1108 | size_t kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( | |
| 93 | size_t n, // | ||
| 94 | size_t k, // | ||
| 95 | size_t nr, // | ||
| 96 | size_t kr, // | ||
| 97 | size_t sr, // | ||
| 98 | size_t bl, // | ||
| 99 | enum kai_datatype scale_dt) { | ||
| 100 | − | KAI_ASSERT((k % bl) == 0); | |
| 101 | − | KAI_ASSERT((bl % kr) == 0); | |
| 102 | − | KAI_ASSERT((nr % kai_nr_multiple_of) == 0); | |
| 103 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 104 | − | KAI_ASSERT(scale_dt == kai_dt_bf16); | |
| 105 | |||
| 106 | 1108 | const size_t num_rows = kai_roundup(n, nr) / nr; | |
| 107 | |||
| 108 | 2216 | return num_rows * kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(k, nr, kr, sr, bl, scale_dt); | |
| 109 | 1108 | } | |
| 110 | |||
| 111 | 1108 | void kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( | |
| 112 | size_t num_groups, // | ||
| 113 | size_t n, // | ||
| 114 | size_t k, // | ||
| 115 | size_t nr, // | ||
| 116 | size_t kr, // | ||
| 117 | size_t sr, // | ||
| 118 | size_t bl, // | ||
| 119 | const uint8_t* rhs, // | ||
| 120 | size_t rhs_stride, // | ||
| 121 | const float* bias, // | ||
| 122 | const void* scale, // | ||
| 123 | size_t scale_stride, // | ||
| 124 | void* rhs_packed, // | ||
| 125 | size_t extra_bytes, // | ||
| 126 | const struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params* params) { | ||
| 127 | − | KAI_ASSERT(num_groups == 1); | |
| 128 | − | KAI_ASSERT(extra_bytes == 0); | |
| 129 | − | KAI_ASSERT(rhs != NULL); | |
| 130 | − | KAI_ASSERT(scale != NULL); | |
| 131 | − | KAI_ASSERT(rhs_packed != NULL); | |
| 132 | − | KAI_ASSERT(params != NULL); | |
| 133 | − | KAI_ASSERT(params->rhs_zero_point == 8); | |
| 134 | − | KAI_ASSERT(params->lhs_zero_point == 1); | |
| 135 | |||
| 136 | − | KAI_ASSERT((k % bl) == 0); | |
| 137 | − | KAI_ASSERT((bl % kr) == 0); | |
| 138 | − | KAI_ASSERT((kr % sr) == 0); | |
| 139 | − | KAI_ASSERT((nr % kai_nr_multiple_of) == 0); | |
| 140 | − | KAI_ASSERT((bl % kai_bl_multiple_of) == 0); | |
| 141 | − | KAI_ASSERT(params->scale_dt == kai_dt_bf16); | |
| 142 | |||
| 143 | // Note: The input matrix (rhs) is expected with: | ||
| 144 | // "k" columns and "n" rows (NxK) | ||
| 145 | 1108 | const enum kai_datatype scale_dt = params->scale_dt; | |
| 146 | 1108 | const size_t num_bytes_multiplier_rhs = kai_get_datatype_size_in_bytes(scale_dt); | |
| 147 | 2216 | const size_t rhs_packed_offset_end_of_all_blocks = | |
| 148 | 1108 | kai_get_rhs_packed_offset_end_of_all_blocks(k, nr, kr, bl, num_bytes_multiplier_rhs); | |
| 149 | 1108 | const size_t num_qblocks_per_row = kai_get_num_blocks_per_row(k, bl); | |
| 150 | 1108 | const size_t num_bytes_per_block_k = bl / 2; | |
| 151 | 1108 | const size_t dst_num_rows = kai_roundup(n, nr); | |
| 152 | 1108 | const size_t block_length_in_bytes = kr / sr; | |
| 153 | |||
| 154 | 1108 | uint8_t* dst_row = (uint8_t*)rhs_packed; | |
| 155 | |||
| 156 |
2/2✓ Branch 0 taken 1108 times.
✓ Branch 1 taken 17166 times.
|
18274 | for (size_t dst_row_idx = 0; dst_row_idx < dst_num_rows; dst_row_idx += nr) { |
| 157 | 17166 | float* sums = (float*)(dst_row + rhs_packed_offset_end_of_all_blocks); | |
| 158 | |||
| 159 | // Initialize the RHS reduction sums to zero | ||
| 160 | 17166 | memset(sums, 0, nr * kai_num_bytes_sum_rhs); | |
| 161 | |||
| 162 | // Iterate over the quantized blocks | ||
| 163 |
2/2✓ Branch 0 taken 136876 times.
✓ Branch 1 taken 17166 times.
|
154042 | for (size_t dst_qblock_idx = 0; dst_qblock_idx < num_qblocks_per_row; ++dst_qblock_idx) { |
| 164 | // Store the scales after packing all K values in the block | ||
| 165 | 136876 | uint8_t* rhs_packed_scale = dst_row + num_bytes_per_block_k * nr; | |
| 166 | 136876 | const uint8_t* scale_ptr = (const uint8_t*)scale + dst_qblock_idx * num_bytes_multiplier_rhs; | |
| 167 | |||
| 168 |
2/2✓ Branch 0 taken 738160 times.
✓ Branch 1 taken 136876 times.
|
875036 | for (size_t i = 0; i < nr; ++i) { |
| 169 |
2/2✓ Branch 0 taken 727132 times.
✓ Branch 1 taken 11028 times.
|
738160 | const size_t src_row_idx = KAI_MIN(dst_row_idx + i, n - 1); |
| 170 | 738160 | const void* src_scales_ptr = scale_ptr + src_row_idx * scale_stride; | |
| 171 | 738160 | void* dst_scales_ptr = rhs_packed_scale + i * num_bytes_multiplier_rhs; | |
| 172 | |||
| 173 | 738160 | memcpy( | |
| 174 | 369080 | dst_scales_ptr, // | |
| 175 | 369080 | src_scales_ptr, // | |
| 176 | 369080 | num_bytes_multiplier_rhs); // | |
| 177 | 738160 | } | |
| 178 | |||
| 179 | 136876 | size_t k0_idx_i = dst_qblock_idx * bl; | |
| 180 | |||
| 181 |
2/2✓ Branch 0 taken 151276 times.
✓ Branch 1 taken 136876 times.
|
288152 | for (size_t dst_byte_idx = 0; dst_byte_idx < num_bytes_per_block_k; dst_byte_idx += 16) { |
| 182 |
2/2✓ Branch 0 taken 408376 times.
✓ Branch 1 taken 151276 times.
|
559652 | for (size_t segment_idx = 0; segment_idx < 16 / block_length_in_bytes; ++segment_idx) { |
| 183 |
2/2✓ Branch 0 taken 2199520 times.
✓ Branch 1 taken 408376 times.
|
2607896 | for (size_t nr_idx = 0; nr_idx < nr; ++nr_idx) { |
| 184 | 2199520 | const size_t n0_idx = dst_row_idx + nr_idx; | |
| 185 | |||
| 186 | // Two int4 values are stored in one byte. | ||
| 187 | // The lower order part of the byte (low) holds the first nibble (K-index + 0). | ||
| 188 | // The higher order of the byte holds the second nibble (K-index + 16). | ||
| 189 | 2199520 | size_t k0_idx = k0_idx_i; | |
| 190 | 2199520 | size_t k1_idx = k0_idx_i + 16; | |
| 191 | |||
| 192 | // Clamp the index to avoid out-of-bound reads | ||
| 193 |
2/2✓ Branch 0 taken 2167792 times.
✓ Branch 1 taken 31728 times.
|
2199520 | const size_t n0_valid_idx = KAI_MIN(n0_idx, n - 1); |
| 194 | 2199520 | float d = kai_cast_f32_bf16(((uint16_t*)rhs_packed_scale)[nr_idx]); | |
| 195 | |||
| 196 | 2199520 | int32_t partial_sum = 0; | |
| 197 | |||
| 198 | 2199520 | size_t src_addr_byte0 = (k0_idx / 2) + n0_valid_idx * rhs_stride; | |
| 199 | |||
| 200 |
2/2✓ Branch 0 taken 6538880 times.
✓ Branch 1 taken 2199520 times.
|
8738400 | for (size_t block_byte_idx = 0; block_byte_idx < block_length_in_bytes; block_byte_idx += 2) { |
| 201 | // Initialize the byte with the zero-point (8) | ||
| 202 | // e.g. uint8_t byte0 = 8 | 8 << 4 | ||
| 203 | 6538880 | uint8_t byte0 = 136; | |
| 204 | 6538880 | uint8_t byte1 = 136; | |
| 205 | 6538880 | uint8_t byte2 = 136; | |
| 206 | 6538880 | uint8_t byte3 = 136; | |
| 207 | |||
| 208 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
|
6538880 | if (k0_idx < k) { |
| 209 | 6538880 | byte0 = rhs[src_addr_byte0]; | |
| 210 | 6538880 | } | |
| 211 | |||
| 212 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
|
6538880 | if (k1_idx < k) { |
| 213 | 6538880 | byte1 = rhs[src_addr_byte0 + 8]; | |
| 214 | 6538880 | } | |
| 215 | |||
| 216 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
|
6538880 | if (k0_idx + 1 < k) { |
| 217 | 6538880 | byte2 = byte0; | |
| 218 | 6538880 | } | |
| 219 | |||
| 220 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 6538880 times.
|
6538880 | if (k1_idx + 1 < k) { |
| 221 | 6538880 | byte3 = byte1; | |
| 222 | 6538880 | } | |
| 223 | |||
| 224 | 6538880 | k0_idx += 2; | |
| 225 | 6538880 | k1_idx += 2; | |
| 226 | |||
| 227 | 6538880 | const uint8_t src_x0_lo = byte0 & 0x0F; | |
| 228 | 6538880 | const uint8_t src_x0_hi = byte1 & 0x0F; | |
| 229 | 6538880 | const uint8_t src_x1_lo = (byte2 >> 4) & 0x0F; | |
| 230 | 6538880 | const uint8_t src_x1_hi = (byte3 >> 4) & 0x0F; | |
| 231 | |||
| 232 | 6538880 | partial_sum += (int32_t)src_x0_lo; | |
| 233 | 6538880 | partial_sum += (int32_t)src_x0_hi; | |
| 234 | 6538880 | partial_sum += (int32_t)src_x1_lo; | |
| 235 | 6538880 | partial_sum += (int32_t)src_x1_hi; | |
| 236 | 6538880 | partial_sum -= 32; // 4 * zero_point (8) | |
| 237 | |||
| 238 | 13077760 | const uint16_t dst_q = | |
| 239 | 6538880 | ((src_x0_lo)) | ((src_x0_hi) << 4) | ((src_x1_lo) << 8) | ((src_x1_hi) << 12); | |
| 240 | |||
| 241 | 6538880 | *((uint16_t*)dst_row) = dst_q ^ 0x8888; | |
| 242 | |||
| 243 | 6538880 | dst_row += 2; | |
| 244 | 6538880 | src_addr_byte0 += 1; | |
| 245 | 6538880 | } | |
| 246 | // NOLINTBEGIN(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) | ||
| 247 | 2199520 | sums[nr_idx] += (float)partial_sum * d; | |
| 248 | // NOLINTEND(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) | ||
| 249 | 2199520 | } | |
| 250 | |||
| 251 | 408376 | k0_idx_i += block_length_in_bytes; | |
| 252 | 408376 | } | |
| 253 | 151276 | k0_idx_i += 16; | |
| 254 | 151276 | } | |
| 255 | // Move the pointer after scales | ||
| 256 | 136876 | dst_row += num_bytes_multiplier_rhs * nr; | |
| 257 | 136876 | } | |
| 258 | |||
| 259 | // Move the pointer after the row sum | ||
| 260 | 17166 | dst_row += kai_num_bytes_sum_rhs * nr; | |
| 261 | |||
| 262 | // Set the bias | ||
| 263 |
1/2✓ Branch 0 taken 17166 times.
✗ Branch 1 not taken.
|
17166 | if (bias == NULL) { |
| 264 | ✗ | memset(dst_row, 0, nr * kai_num_bytes_bias); | |
| 265 | ✗ | } else { | |
| 266 |
2/2✓ Branch 0 taken 92088 times.
✓ Branch 1 taken 17166 times.
|
109254 | for (size_t i = 0; i < nr; ++i) { |
| 267 | // Clamp the row index to avoid out-of-bound reads | ||
| 268 |
2/2✓ Branch 0 taken 90364 times.
✓ Branch 1 taken 1724 times.
|
92088 | const size_t src_row_idx = KAI_MIN(dst_row_idx + i, n - 1); |
| 269 | 92088 | ((float*)dst_row)[i] = bias[src_row_idx]; | |
| 270 | 92088 | } | |
| 271 | } | ||
| 272 | // Move the pointer after the row sum | ||
| 273 | 17166 | dst_row += kai_num_bytes_bias * nr; | |
| 274 | 17166 | } | |
| 275 | 1108 | } | |
| 276 |