diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp b/ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp new file mode 100644 index 00000000000..ce7e4651676 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-shaders.cpp @@ -0,0 +1,2 @@ +#include "ggml-vulkan-shaders.hpp" + diff --git a/ggml/src/ggml-vulkan/ggml-vulkan-shaders.hpp b/ggml/src/ggml-vulkan/ggml-vulkan-shaders.hpp new file mode 100644 index 00000000000..c09cd5aabe2 --- /dev/null +++ b/ggml/src/ggml-vulkan/ggml-vulkan-shaders.hpp @@ -0,0 +1,8185 @@ +#include + +extern const uint64_t abs_f16_len; +extern const unsigned char abs_f16_data[]; + +extern const uint64_t abs_f32_len; +extern const unsigned char abs_f32_data[]; + +extern const uint64_t acc_f32_len; +extern const unsigned char acc_f32_data[]; + +extern const uint64_t add1_f16_f16_len; +extern const unsigned char add1_f16_f16_data[]; + +extern const uint64_t add1_f16_f32_len; +extern const unsigned char add1_f16_f32_data[]; + +extern const uint64_t add1_f32_f32_len; +extern const unsigned char add1_f32_f32_data[]; + +extern const uint64_t add_f16_f16_f16_len; +extern const unsigned char add_f16_f16_f16_data[]; + +extern const uint64_t add_f16_f16_f32_len; +extern const unsigned char add_f16_f16_f32_data[]; + +extern const uint64_t add_f16_f32_f16_len; +extern const unsigned char add_f16_f32_f16_data[]; + +extern const uint64_t add_f16_f32_f32_len; +extern const unsigned char add_f16_f32_f32_data[]; + +extern const uint64_t add_f32_f16_f16_len; +extern const unsigned char add_f32_f16_f16_data[]; + +extern const uint64_t add_f32_f16_f32_len; +extern const unsigned char add_f32_f16_f32_data[]; + +extern const uint64_t add_f32_f32_f16_len; +extern const unsigned char add_f32_f32_f16_data[]; + +extern const uint64_t add_f32_f32_f32_len; +extern const unsigned char add_f32_f32_f32_data[]; + +extern const uint64_t add_id_f32_len; +extern const unsigned char add_id_f32_data[]; + +extern const uint64_t add_rms_f16_f16_f16_len; +extern const unsigned char 
add_rms_f16_f16_f16_data[]; + +extern const uint64_t add_rms_f16_f16_f32_len; +extern const unsigned char add_rms_f16_f16_f32_data[]; + +extern const uint64_t add_rms_f16_f32_f16_len; +extern const unsigned char add_rms_f16_f32_f16_data[]; + +extern const uint64_t add_rms_f16_f32_f32_len; +extern const unsigned char add_rms_f16_f32_f32_data[]; + +extern const uint64_t add_rms_f32_f16_f16_len; +extern const unsigned char add_rms_f32_f16_f16_data[]; + +extern const uint64_t add_rms_f32_f16_f32_len; +extern const unsigned char add_rms_f32_f16_f32_data[]; + +extern const uint64_t add_rms_f32_f32_f16_len; +extern const unsigned char add_rms_f32_f32_f16_data[]; + +extern const uint64_t add_rms_f32_f32_f32_len; +extern const unsigned char add_rms_f32_f32_f32_data[]; + +extern const uint64_t arange_f32_len; +extern const unsigned char arange_f32_data[]; + +extern const uint64_t argmax_f32_len; +extern const unsigned char argmax_f32_data[]; + +extern const uint64_t argsort_f32_len; +extern const unsigned char argsort_f32_data[]; + +extern const uint64_t argsort_large_f32_len; +extern const unsigned char argsort_large_f32_data[]; + +extern const uint64_t ceil_f16_len; +extern const unsigned char ceil_f16_data[]; + +extern const uint64_t ceil_f32_len; +extern const unsigned char ceil_f32_data[]; + +extern const uint64_t clamp_f32_len; +extern const unsigned char clamp_f32_data[]; + +extern const uint64_t concat_f16_len; +extern const unsigned char concat_f16_data[]; + +extern const uint64_t concat_f32_len; +extern const unsigned char concat_f32_data[]; + +extern const uint64_t concat_i32_len; +extern const unsigned char concat_i32_data[]; + +extern const uint64_t contig_cpy_f16_f16_len; +extern const unsigned char contig_cpy_f16_f16_data[]; + +extern const uint64_t contig_cpy_f16_f32_len; +extern const unsigned char contig_cpy_f16_f32_data[]; + +extern const uint64_t contig_cpy_f32_bf16_len; +extern const unsigned char contig_cpy_f32_bf16_data[]; + +extern const uint64_t 
contig_cpy_f32_f16_len; +extern const unsigned char contig_cpy_f32_f16_data[]; + +extern const uint64_t contig_cpy_f32_f32_len; +extern const unsigned char contig_cpy_f32_f32_data[]; + +extern const uint64_t contig_cpy_f32_i32_len; +extern const unsigned char contig_cpy_f32_i32_data[]; + +extern const uint64_t contig_cpy_i32_f32_len; +extern const unsigned char contig_cpy_i32_f32_data[]; + +extern const uint64_t conv2d_dw_cwhn_f16_f32_len; +extern const unsigned char conv2d_dw_cwhn_f16_f32_data[]; + +extern const uint64_t conv2d_dw_cwhn_f32_len; +extern const unsigned char conv2d_dw_cwhn_f32_data[]; + +extern const uint64_t conv2d_dw_whcn_f16_f32_len; +extern const unsigned char conv2d_dw_whcn_f16_f32_data[]; + +extern const uint64_t conv2d_dw_whcn_f32_len; +extern const unsigned char conv2d_dw_whcn_f32_data[]; + +extern const uint64_t conv2d_f16_f32_len; +extern const unsigned char conv2d_f16_f32_data[]; + +extern const uint64_t conv2d_f16_f32_cm2_len; +extern const unsigned char conv2d_f16_f32_cm2_data[]; + +extern const uint64_t conv2d_f16_f32_unroll_len; +extern const unsigned char conv2d_f16_f32_unroll_data[]; + +extern const uint64_t conv2d_f32_len; +extern const unsigned char conv2d_f32_data[]; + +extern const uint64_t conv2d_f32_cm2_len; +extern const unsigned char conv2d_f32_cm2_data[]; + +extern const uint64_t conv2d_f32_unroll_len; +extern const unsigned char conv2d_f32_unroll_data[]; + +extern const uint64_t conv_transpose_1d_f32_len; +extern const unsigned char conv_transpose_1d_f32_data[]; + +extern const uint64_t conv_transpose_2d_f16_f32_len; +extern const unsigned char conv_transpose_2d_f16_f32_data[]; + +extern const uint64_t conv_transpose_2d_f16_f32_cm2_len; +extern const unsigned char conv_transpose_2d_f16_f32_cm2_data[]; + +extern const uint64_t conv_transpose_2d_f16_f32_unroll_len; +extern const unsigned char conv_transpose_2d_f16_f32_unroll_data[]; + +extern const uint64_t conv_transpose_2d_f32_len; +extern const unsigned char 
conv_transpose_2d_f32_data[]; + +extern const uint64_t conv_transpose_2d_f32_cm2_len; +extern const unsigned char conv_transpose_2d_f32_cm2_data[]; + +extern const uint64_t conv_transpose_2d_f32_unroll_len; +extern const unsigned char conv_transpose_2d_f32_unroll_data[]; + +extern const uint64_t cos_f32_len; +extern const unsigned char cos_f32_data[]; + +extern const uint64_t count_equal_i32_len; +extern const unsigned char count_equal_i32_data[]; + +extern const uint64_t count_experts_len; +extern const unsigned char count_experts_data[]; + +extern const uint64_t cpy_f16_f16_len; +extern const unsigned char cpy_f16_f16_data[]; + +extern const uint64_t cpy_f16_f32_len; +extern const unsigned char cpy_f16_f32_data[]; + +extern const uint64_t cpy_f32_bf16_len; +extern const unsigned char cpy_f32_bf16_data[]; + +extern const uint64_t cpy_f32_f16_len; +extern const unsigned char cpy_f32_f16_data[]; + +extern const uint64_t cpy_f32_f32_len; +extern const unsigned char cpy_f32_f32_data[]; + +extern const uint64_t cpy_f32_i32_len; +extern const unsigned char cpy_f32_i32_data[]; + +extern const uint64_t cpy_f32_iq4_nl_len; +extern const unsigned char cpy_f32_iq4_nl_data[]; + +extern const uint64_t cpy_f32_q1_0_len; +extern const unsigned char cpy_f32_q1_0_data[]; + +extern const uint64_t cpy_f32_q4_0_len; +extern const unsigned char cpy_f32_q4_0_data[]; + +extern const uint64_t cpy_f32_q4_1_len; +extern const unsigned char cpy_f32_q4_1_data[]; + +extern const uint64_t cpy_f32_q5_0_len; +extern const unsigned char cpy_f32_q5_0_data[]; + +extern const uint64_t cpy_f32_q5_1_len; +extern const unsigned char cpy_f32_q5_1_data[]; + +extern const uint64_t cpy_f32_q8_0_len; +extern const unsigned char cpy_f32_q8_0_data[]; + +extern const uint64_t cpy_i32_f32_len; +extern const unsigned char cpy_i32_f32_data[]; + +extern const uint64_t cpy_iq4_nl_f32_len; +extern const unsigned char cpy_iq4_nl_f32_data[]; + +extern const uint64_t cpy_q1_0_f32_len; +extern const unsigned char 
cpy_q1_0_f32_data[]; + +extern const uint64_t cpy_q4_0_f32_len; +extern const unsigned char cpy_q4_0_f32_data[]; + +extern const uint64_t cpy_q4_1_f32_len; +extern const unsigned char cpy_q4_1_f32_data[]; + +extern const uint64_t cpy_q5_0_f32_len; +extern const unsigned char cpy_q5_0_f32_data[]; + +extern const uint64_t cpy_q5_1_f32_len; +extern const unsigned char cpy_q5_1_f32_data[]; + +extern const uint64_t cpy_q8_0_f32_len; +extern const unsigned char cpy_q8_0_f32_data[]; + +extern const uint64_t cpy_tq3_1s_f32_len; +extern const unsigned char cpy_tq3_1s_f32_data[]; + +extern const uint64_t cpy_tq4_1s_f32_len; +extern const unsigned char cpy_tq4_1s_f32_data[]; + +extern const uint64_t cpy_transpose_16_len; +extern const unsigned char cpy_transpose_16_data[]; + +extern const uint64_t cpy_transpose_32_len; +extern const unsigned char cpy_transpose_32_data[]; + +extern const uint64_t cpy_turbo2_0_f32_len; +extern const unsigned char cpy_turbo2_0_f32_data[]; + +extern const uint64_t cpy_turbo3_0_f32_len; +extern const unsigned char cpy_turbo3_0_f32_data[]; + +extern const uint64_t cpy_turbo4_0_f32_len; +extern const unsigned char cpy_turbo4_0_f32_data[]; + +extern const uint64_t cumsum_f32_len; +extern const unsigned char cumsum_f32_data[]; + +extern const uint64_t cumsum_multipass1_f32_len; +extern const unsigned char cumsum_multipass1_f32_data[]; + +extern const uint64_t cumsum_multipass2_f32_len; +extern const unsigned char cumsum_multipass2_f32_data[]; + +extern const uint64_t dequant_f32_len; +extern const unsigned char dequant_f32_data[]; + +extern const uint64_t dequant_iq1_m_len; +extern const unsigned char dequant_iq1_m_data[]; + +extern const uint64_t dequant_iq1_s_len; +extern const unsigned char dequant_iq1_s_data[]; + +extern const uint64_t dequant_iq2_s_len; +extern const unsigned char dequant_iq2_s_data[]; + +extern const uint64_t dequant_iq2_xs_len; +extern const unsigned char dequant_iq2_xs_data[]; + +extern const uint64_t dequant_iq2_xxs_len; 
+extern const unsigned char dequant_iq2_xxs_data[]; + +extern const uint64_t dequant_iq3_s_len; +extern const unsigned char dequant_iq3_s_data[]; + +extern const uint64_t dequant_iq3_xxs_len; +extern const unsigned char dequant_iq3_xxs_data[]; + +extern const uint64_t dequant_iq4_nl_len; +extern const unsigned char dequant_iq4_nl_data[]; + +extern const uint64_t dequant_iq4_xs_len; +extern const unsigned char dequant_iq4_xs_data[]; + +extern const uint64_t dequant_mxfp4_len; +extern const unsigned char dequant_mxfp4_data[]; + +extern const uint64_t dequant_nvfp4_len; +extern const unsigned char dequant_nvfp4_data[]; + +extern const uint64_t dequant_q1_0_len; +extern const unsigned char dequant_q1_0_data[]; + +extern const uint64_t dequant_q2_k_len; +extern const unsigned char dequant_q2_k_data[]; + +extern const uint64_t dequant_q3_k_len; +extern const unsigned char dequant_q3_k_data[]; + +extern const uint64_t dequant_q4_0_len; +extern const unsigned char dequant_q4_0_data[]; + +extern const uint64_t dequant_q4_1_len; +extern const unsigned char dequant_q4_1_data[]; + +extern const uint64_t dequant_q4_k_len; +extern const unsigned char dequant_q4_k_data[]; + +extern const uint64_t dequant_q5_0_len; +extern const unsigned char dequant_q5_0_data[]; + +extern const uint64_t dequant_q5_1_len; +extern const unsigned char dequant_q5_1_data[]; + +extern const uint64_t dequant_q5_k_len; +extern const unsigned char dequant_q5_k_data[]; + +extern const uint64_t dequant_q6_k_len; +extern const unsigned char dequant_q6_k_data[]; + +extern const uint64_t dequant_q8_0_len; +extern const unsigned char dequant_q8_0_data[]; + +extern const uint64_t dequant_tq3_1s_len; +extern const unsigned char dequant_tq3_1s_data[]; + +extern const uint64_t dequant_tq4_1s_len; +extern const unsigned char dequant_tq4_1s_data[]; + +extern const uint64_t dequant_turbo2_0_len; +extern const unsigned char dequant_turbo2_0_data[]; + +extern const uint64_t dequant_turbo3_0_len; +extern const unsigned 
char dequant_turbo3_0_data[]; + +extern const uint64_t dequant_turbo4_0_len; +extern const unsigned char dequant_turbo4_0_data[]; + +extern const uint64_t diag_f16_len; +extern const unsigned char diag_f16_data[]; + +extern const uint64_t diag_f32_len; +extern const unsigned char diag_f32_data[]; + +extern const uint64_t diag_mask_inf_f32_len; +extern const unsigned char diag_mask_inf_f32_data[]; + +extern const uint64_t div_f16_f16_f16_len; +extern const unsigned char div_f16_f16_f16_data[]; + +extern const uint64_t div_f16_f16_f32_len; +extern const unsigned char div_f16_f16_f32_data[]; + +extern const uint64_t div_f16_f32_f16_len; +extern const unsigned char div_f16_f32_f16_data[]; + +extern const uint64_t div_f16_f32_f32_len; +extern const unsigned char div_f16_f32_f32_data[]; + +extern const uint64_t div_f32_len; +extern const unsigned char div_f32_data[]; + +extern const uint64_t div_f32_f16_f16_len; +extern const unsigned char div_f32_f16_f16_data[]; + +extern const uint64_t div_f32_f16_f32_len; +extern const unsigned char div_f32_f16_f32_data[]; + +extern const uint64_t div_f32_f32_f16_len; +extern const unsigned char div_f32_f32_f16_data[]; + +extern const uint64_t div_f32_f32_f32_len; +extern const unsigned char div_f32_f32_f32_data[]; + +extern const uint64_t elu_f16_len; +extern const unsigned char elu_f16_data[]; + +extern const uint64_t elu_f32_len; +extern const unsigned char elu_f32_data[]; + +extern const uint64_t exp_f16_len; +extern const unsigned char exp_f16_data[]; + +extern const uint64_t exp_f32_len; +extern const unsigned char exp_f32_data[]; + +extern const uint64_t fa_mask_opt_len; +extern const unsigned char fa_mask_opt_data[]; + +extern const uint64_t fa_split_k_reduce_len; +extern const unsigned char fa_split_k_reduce_data[]; + +extern const uint64_t fill_f16_len; +extern const unsigned char fill_f16_data[]; + +extern const uint64_t fill_f32_len; +extern const unsigned char fill_f32_data[]; + +extern const uint64_t 
flash_attn_f32_f16_f16_len; +extern const unsigned char flash_attn_f32_f16_f16_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_cm1_len; +extern const unsigned char flash_attn_f32_f16_f16_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_cm2_len; +extern const unsigned char flash_attn_f32_f16_f16_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_f16acc_len; +extern const unsigned char flash_attn_f32_f16_f16_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_f16_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_f16_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_f16_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_f16_fp32_len; +extern const unsigned char flash_attn_f32_f16_f16_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_len; +extern const unsigned char flash_attn_f32_f16_f32_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_cm1_len; +extern const unsigned char flash_attn_f32_f16_f32_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_cm2_len; +extern const unsigned char flash_attn_f32_f16_f32_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_f16acc_len; +extern const unsigned char flash_attn_f32_f16_f32_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_f32_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_f32_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_f32_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_f32_fp32_len; +extern const unsigned char flash_attn_f32_f16_f32_fp32_data[]; + +extern const 
uint64_t flash_attn_f32_f16_iq1_m_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq1_m_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq1_m_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq1_m_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq1_s_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq1_s_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq1_s_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq1_s_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq2_s_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq2_s_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq2_s_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq2_s_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq2_xs_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq2_xs_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq2_xs_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq2_xs_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq2_xxs_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq2_xxs_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq2_xxs_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq2_xxs_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq3_s_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq3_s_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq3_s_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq3_s_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq3_xxs_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq3_xxs_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq3_xxs_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq3_xxs_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_data[]; + +extern const uint64_t 
flash_attn_f32_f16_iq4_nl_cm1_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_f16acc_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_f16acc_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_f16acc_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_f16acc_int8_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_f16acc_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_fp32_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_nl_int8_len; +extern const unsigned char flash_attn_f32_f16_iq4_nl_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_xs_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq4_xs_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_iq4_xs_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_iq4_xs_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_mxfp4_cm2_len; +extern const unsigned char flash_attn_f32_f16_mxfp4_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_mxfp4_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_mxfp4_f16acc_cm2_data[]; + 
+extern const uint64_t flash_attn_f32_f16_nvfp4_cm2_len; +extern const unsigned char flash_attn_f32_f16_nvfp4_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_nvfp4_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_nvfp4_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q1_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_q1_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q1_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q1_0_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q2_k_cm2_len; +extern const unsigned char flash_attn_f32_f16_q2_k_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q2_k_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q2_k_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q3_k_cm2_len; +extern const unsigned char flash_attn_f32_f16_q3_k_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q3_k_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q3_k_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_len; +extern const unsigned char flash_attn_f32_f16_q4_0_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_cm1_len; +extern const unsigned char flash_attn_f32_f16_q4_0_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_q4_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_f16acc_len; +extern const unsigned char flash_attn_f32_f16_q4_0_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_q4_0_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q4_0_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_q4_0_f16acc_fp32_data[]; + +extern const uint64_t 
flash_attn_f32_f16_q4_0_f16acc_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_0_f16acc_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_f16acc_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_0_f16acc_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_fp32_len; +extern const unsigned char flash_attn_f32_f16_q4_0_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_0_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_0_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_0_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_len; +extern const unsigned char flash_attn_f32_f16_q4_1_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_cm1_len; +extern const unsigned char flash_attn_f32_f16_q4_1_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_cm2_len; +extern const unsigned char flash_attn_f32_f16_q4_1_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_f16acc_len; +extern const unsigned char flash_attn_f32_f16_q4_1_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_q4_1_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q4_1_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_q4_1_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_f16acc_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_1_f16acc_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_f16acc_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_1_f16acc_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_fp32_len; +extern const unsigned char flash_attn_f32_f16_q4_1_fp32_data[]; + +extern const uint64_t 
flash_attn_f32_f16_q4_1_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_1_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_1_int8_len; +extern const unsigned char flash_attn_f32_f16_q4_1_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_k_cm2_len; +extern const unsigned char flash_attn_f32_f16_q4_k_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q4_k_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q4_k_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_len; +extern const unsigned char flash_attn_f32_f16_q5_0_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_cm1_len; +extern const unsigned char flash_attn_f32_f16_q5_0_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_q5_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_f16acc_len; +extern const unsigned char flash_attn_f32_f16_q5_0_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_q5_0_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q5_0_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_q5_0_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_f16acc_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_0_f16acc_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_f16acc_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_0_f16acc_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_fp32_len; +extern const unsigned char flash_attn_f32_f16_q5_0_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_0_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_0_int8_len; +extern 
const unsigned char flash_attn_f32_f16_q5_0_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_len; +extern const unsigned char flash_attn_f32_f16_q5_1_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_cm1_len; +extern const unsigned char flash_attn_f32_f16_q5_1_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_cm2_len; +extern const unsigned char flash_attn_f32_f16_q5_1_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_f16acc_len; +extern const unsigned char flash_attn_f32_f16_q5_1_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_q5_1_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q5_1_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_q5_1_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_f16acc_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_1_f16acc_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_f16acc_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_1_f16acc_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_fp32_len; +extern const unsigned char flash_attn_f32_f16_q5_1_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_1_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_1_int8_len; +extern const unsigned char flash_attn_f32_f16_q5_1_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_k_cm2_len; +extern const unsigned char flash_attn_f32_f16_q5_k_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q5_k_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q5_k_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q6_k_cm2_len; +extern const unsigned char 
flash_attn_f32_f16_q6_k_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q6_k_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q6_k_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_len; +extern const unsigned char flash_attn_f32_f16_q8_0_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_cm1_len; +extern const unsigned char flash_attn_f32_f16_q8_0_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_q8_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_f16acc_len; +extern const unsigned char flash_attn_f32_f16_q8_0_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_q8_0_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_q8_0_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_q8_0_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_f16acc_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q8_0_f16acc_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_f16acc_int8_len; +extern const unsigned char flash_attn_f32_f16_q8_0_f16acc_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_fp32_len; +extern const unsigned char flash_attn_f32_f16_q8_0_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_fp32_int8_len; +extern const unsigned char flash_attn_f32_f16_q8_0_fp32_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_q8_0_int8_len; +extern const unsigned char flash_attn_f32_f16_q8_0_int8_data[]; + +extern const uint64_t flash_attn_f32_f16_tq3_1s_cm2_len; +extern const unsigned char flash_attn_f32_f16_tq3_1s_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_tq3_1s_f16acc_cm2_len; +extern const unsigned char 
flash_attn_f32_f16_tq3_1s_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_cm1_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_f16acc_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo2_0_fp32_len; +extern const unsigned char flash_attn_f32_f16_turbo2_0_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_cm1_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_f16acc_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_f16acc_cm2_data[]; + +extern const uint64_t 
flash_attn_f32_f16_turbo3_0_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo3_0_fp32_len; +extern const unsigned char flash_attn_f32_f16_turbo3_0_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_cm1_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_cm2_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_f16acc_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_f16acc_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_f16acc_cm1_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_f16acc_cm1_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_f16acc_cm2_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_f16acc_cm2_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_f16acc_fp32_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_f16acc_fp32_data[]; + +extern const uint64_t flash_attn_f32_f16_turbo4_0_fp32_len; +extern const unsigned char flash_attn_f32_f16_turbo4_0_fp32_data[]; + +extern const uint64_t floor_f16_len; +extern const unsigned char floor_f16_data[]; + +extern const uint64_t floor_f32_len; +extern const unsigned char floor_f32_data[]; + +extern const uint64_t gated_delta_net_f32_len; +extern const unsigned char gated_delta_net_f32_data[]; + +extern const uint64_t gated_delta_net_f32_nocluster_len; +extern const unsigned char gated_delta_net_f32_nocluster_data[]; + +extern const uint64_t gated_delta_net_f32_shmem_len; +extern const unsigned char gated_delta_net_f32_shmem_data[]; + +extern const uint64_t geglu_erf_f16_len; +extern const unsigned char geglu_erf_f16_data[]; + +extern const uint64_t geglu_erf_f32_len; 
+extern const unsigned char geglu_erf_f32_data[]; + +extern const uint64_t geglu_f16_len; +extern const unsigned char geglu_f16_data[]; + +extern const uint64_t geglu_f32_len; +extern const unsigned char geglu_f32_data[]; + +extern const uint64_t geglu_quick_f16_len; +extern const unsigned char geglu_quick_f16_data[]; + +extern const uint64_t geglu_quick_f32_len; +extern const unsigned char geglu_quick_f32_data[]; + +extern const uint64_t gelu_erf_f16_len; +extern const unsigned char gelu_erf_f16_data[]; + +extern const uint64_t gelu_erf_f32_len; +extern const unsigned char gelu_erf_f32_data[]; + +extern const uint64_t gelu_f16_len; +extern const unsigned char gelu_f16_data[]; + +extern const uint64_t gelu_f32_len; +extern const unsigned char gelu_f32_data[]; + +extern const uint64_t gelu_quick_f16_len; +extern const unsigned char gelu_quick_f16_data[]; + +extern const uint64_t gelu_quick_f32_len; +extern const unsigned char gelu_quick_f32_data[]; + +extern const uint64_t get_rows_bf16_len; +extern const unsigned char get_rows_bf16_data[]; + +extern const uint64_t get_rows_bf16_f32_len; +extern const unsigned char get_rows_bf16_f32_data[]; + +extern const uint64_t get_rows_f16_len; +extern const unsigned char get_rows_f16_data[]; + +extern const uint64_t get_rows_f16_f32_len; +extern const unsigned char get_rows_f16_f32_data[]; + +extern const uint64_t get_rows_f32_len; +extern const unsigned char get_rows_f32_data[]; + +extern const uint64_t get_rows_f32_f32_len; +extern const unsigned char get_rows_f32_f32_data[]; + +extern const uint64_t get_rows_i32_len; +extern const unsigned char get_rows_i32_data[]; + +extern const uint64_t get_rows_iq1_m_len; +extern const unsigned char get_rows_iq1_m_data[]; + +extern const uint64_t get_rows_iq1_m_f32_len; +extern const unsigned char get_rows_iq1_m_f32_data[]; + +extern const uint64_t get_rows_iq1_s_len; +extern const unsigned char get_rows_iq1_s_data[]; + +extern const uint64_t get_rows_iq1_s_f32_len; +extern const 
unsigned char get_rows_iq1_s_f32_data[]; + +extern const uint64_t get_rows_iq2_s_len; +extern const unsigned char get_rows_iq2_s_data[]; + +extern const uint64_t get_rows_iq2_s_f32_len; +extern const unsigned char get_rows_iq2_s_f32_data[]; + +extern const uint64_t get_rows_iq2_xs_len; +extern const unsigned char get_rows_iq2_xs_data[]; + +extern const uint64_t get_rows_iq2_xs_f32_len; +extern const unsigned char get_rows_iq2_xs_f32_data[]; + +extern const uint64_t get_rows_iq2_xxs_len; +extern const unsigned char get_rows_iq2_xxs_data[]; + +extern const uint64_t get_rows_iq2_xxs_f32_len; +extern const unsigned char get_rows_iq2_xxs_f32_data[]; + +extern const uint64_t get_rows_iq3_s_len; +extern const unsigned char get_rows_iq3_s_data[]; + +extern const uint64_t get_rows_iq3_s_f32_len; +extern const unsigned char get_rows_iq3_s_f32_data[]; + +extern const uint64_t get_rows_iq3_xxs_len; +extern const unsigned char get_rows_iq3_xxs_data[]; + +extern const uint64_t get_rows_iq3_xxs_f32_len; +extern const unsigned char get_rows_iq3_xxs_f32_data[]; + +extern const uint64_t get_rows_iq4_nl_len; +extern const unsigned char get_rows_iq4_nl_data[]; + +extern const uint64_t get_rows_iq4_nl_f32_len; +extern const unsigned char get_rows_iq4_nl_f32_data[]; + +extern const uint64_t get_rows_iq4_xs_len; +extern const unsigned char get_rows_iq4_xs_data[]; + +extern const uint64_t get_rows_iq4_xs_f32_len; +extern const unsigned char get_rows_iq4_xs_f32_data[]; + +extern const uint64_t get_rows_mxfp4_len; +extern const unsigned char get_rows_mxfp4_data[]; + +extern const uint64_t get_rows_mxfp4_f32_len; +extern const unsigned char get_rows_mxfp4_f32_data[]; + +extern const uint64_t get_rows_nvfp4_len; +extern const unsigned char get_rows_nvfp4_data[]; + +extern const uint64_t get_rows_nvfp4_f32_len; +extern const unsigned char get_rows_nvfp4_f32_data[]; + +extern const uint64_t get_rows_q1_0_len; +extern const unsigned char get_rows_q1_0_data[]; + +extern const uint64_t 
get_rows_q1_0_f32_len; +extern const unsigned char get_rows_q1_0_f32_data[]; + +extern const uint64_t get_rows_q2_k_len; +extern const unsigned char get_rows_q2_k_data[]; + +extern const uint64_t get_rows_q2_k_f32_len; +extern const unsigned char get_rows_q2_k_f32_data[]; + +extern const uint64_t get_rows_q3_k_len; +extern const unsigned char get_rows_q3_k_data[]; + +extern const uint64_t get_rows_q3_k_f32_len; +extern const unsigned char get_rows_q3_k_f32_data[]; + +extern const uint64_t get_rows_q4_0_len; +extern const unsigned char get_rows_q4_0_data[]; + +extern const uint64_t get_rows_q4_0_f32_len; +extern const unsigned char get_rows_q4_0_f32_data[]; + +extern const uint64_t get_rows_q4_1_len; +extern const unsigned char get_rows_q4_1_data[]; + +extern const uint64_t get_rows_q4_1_f32_len; +extern const unsigned char get_rows_q4_1_f32_data[]; + +extern const uint64_t get_rows_q4_k_len; +extern const unsigned char get_rows_q4_k_data[]; + +extern const uint64_t get_rows_q4_k_f32_len; +extern const unsigned char get_rows_q4_k_f32_data[]; + +extern const uint64_t get_rows_q5_0_len; +extern const unsigned char get_rows_q5_0_data[]; + +extern const uint64_t get_rows_q5_0_f32_len; +extern const unsigned char get_rows_q5_0_f32_data[]; + +extern const uint64_t get_rows_q5_1_len; +extern const unsigned char get_rows_q5_1_data[]; + +extern const uint64_t get_rows_q5_1_f32_len; +extern const unsigned char get_rows_q5_1_f32_data[]; + +extern const uint64_t get_rows_q5_k_len; +extern const unsigned char get_rows_q5_k_data[]; + +extern const uint64_t get_rows_q5_k_f32_len; +extern const unsigned char get_rows_q5_k_f32_data[]; + +extern const uint64_t get_rows_q6_k_len; +extern const unsigned char get_rows_q6_k_data[]; + +extern const uint64_t get_rows_q6_k_f32_len; +extern const unsigned char get_rows_q6_k_f32_data[]; + +extern const uint64_t get_rows_q8_0_len; +extern const unsigned char get_rows_q8_0_data[]; + +extern const uint64_t get_rows_q8_0_f32_len; +extern const 
unsigned char get_rows_q8_0_f32_data[]; + +extern const uint64_t get_rows_tq3_1s_len; +extern const unsigned char get_rows_tq3_1s_data[]; + +extern const uint64_t get_rows_tq3_1s_f32_len; +extern const unsigned char get_rows_tq3_1s_f32_data[]; + +extern const uint64_t get_rows_tq4_1s_len; +extern const unsigned char get_rows_tq4_1s_data[]; + +extern const uint64_t get_rows_tq4_1s_f32_len; +extern const unsigned char get_rows_tq4_1s_f32_data[]; + +extern const uint64_t get_rows_turbo2_0_len; +extern const unsigned char get_rows_turbo2_0_data[]; + +extern const uint64_t get_rows_turbo2_0_f32_len; +extern const unsigned char get_rows_turbo2_0_f32_data[]; + +extern const uint64_t get_rows_turbo3_0_len; +extern const unsigned char get_rows_turbo3_0_data[]; + +extern const uint64_t get_rows_turbo3_0_f32_len; +extern const unsigned char get_rows_turbo3_0_f32_data[]; + +extern const uint64_t get_rows_turbo4_0_len; +extern const unsigned char get_rows_turbo4_0_data[]; + +extern const uint64_t get_rows_turbo4_0_f32_len; +extern const unsigned char get_rows_turbo4_0_f32_data[]; + +extern const uint64_t group_norm_f32_len; +extern const unsigned char group_norm_f32_data[]; + +extern const uint64_t hardsigmoid_f16_len; +extern const unsigned char hardsigmoid_f16_data[]; + +extern const uint64_t hardsigmoid_f32_len; +extern const unsigned char hardsigmoid_f32_data[]; + +extern const uint64_t hardswish_f16_len; +extern const unsigned char hardswish_f16_data[]; + +extern const uint64_t hardswish_f32_len; +extern const unsigned char hardswish_f32_data[]; + +extern const uint64_t im2col_3d_f32_len; +extern const unsigned char im2col_3d_f32_data[]; + +extern const uint64_t im2col_3d_f32_bda_len; +extern const unsigned char im2col_3d_f32_bda_data[]; + +extern const uint64_t im2col_3d_f32_f16_len; +extern const unsigned char im2col_3d_f32_f16_data[]; + +extern const uint64_t im2col_3d_f32_f16_bda_len; +extern const unsigned char im2col_3d_f32_f16_bda_data[]; + +extern const uint64_t 
im2col_f32_len; +extern const unsigned char im2col_f32_data[]; + +extern const uint64_t im2col_f32_bda_len; +extern const unsigned char im2col_f32_bda_data[]; + +extern const uint64_t im2col_f32_f16_len; +extern const unsigned char im2col_f32_f16_data[]; + +extern const uint64_t im2col_f32_f16_bda_len; +extern const unsigned char im2col_f32_f16_bda_data[]; + +extern const uint64_t l2_norm_f32_len; +extern const unsigned char l2_norm_f32_data[]; + +extern const uint64_t leaky_relu_f32_len; +extern const unsigned char leaky_relu_f32_data[]; + +extern const uint64_t log_f16_len; +extern const unsigned char log_f16_data[]; + +extern const uint64_t log_f32_len; +extern const unsigned char log_f32_data[]; + +extern const uint64_t matmul_bf16_len; +extern const unsigned char matmul_bf16_data[]; + +extern const uint64_t matmul_bf16_aligned_len; +extern const unsigned char matmul_bf16_aligned_data[]; + +extern const uint64_t matmul_bf16_aligned_cm1_len; +extern const unsigned char matmul_bf16_aligned_cm1_data[]; + +extern const uint64_t matmul_bf16_aligned_cm2_len; +extern const unsigned char matmul_bf16_aligned_cm2_data[]; + +extern const uint64_t matmul_bf16_aligned_f16acc_len; +extern const unsigned char matmul_bf16_aligned_f16acc_data[]; + +extern const uint64_t matmul_bf16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_bf16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_bf16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_bf16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_bf16_aligned_fp32_len; +extern const unsigned char matmul_bf16_aligned_fp32_data[]; + +extern const uint64_t matmul_bf16_cm1_len; +extern const unsigned char matmul_bf16_cm1_data[]; + +extern const uint64_t matmul_bf16_cm2_len; +extern const unsigned char matmul_bf16_cm2_data[]; + +extern const uint64_t matmul_bf16_f16acc_len; +extern const unsigned char matmul_bf16_f16acc_data[]; + +extern const uint64_t matmul_bf16_f16acc_cm1_len; +extern const 
unsigned char matmul_bf16_f16acc_cm1_data[]; + +extern const uint64_t matmul_bf16_f16acc_cm2_len; +extern const unsigned char matmul_bf16_f16acc_cm2_data[]; + +extern const uint64_t matmul_bf16_fp32_len; +extern const unsigned char matmul_bf16_fp32_data[]; + +extern const uint64_t matmul_f16_len; +extern const unsigned char matmul_f16_data[]; + +extern const uint64_t matmul_f16_aligned_len; +extern const unsigned char matmul_f16_aligned_data[]; + +extern const uint64_t matmul_f16_aligned_cm1_len; +extern const unsigned char matmul_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_f16_aligned_cm2_len; +extern const unsigned char matmul_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_f16_aligned_f16acc_len; +extern const unsigned char matmul_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_f16_aligned_fp32_len; +extern const unsigned char matmul_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_f16_cm1_len; +extern const unsigned char matmul_f16_cm1_data[]; + +extern const uint64_t matmul_f16_cm2_len; +extern const unsigned char matmul_f16_cm2_data[]; + +extern const uint64_t matmul_f16_f16acc_len; +extern const unsigned char matmul_f16_f16acc_data[]; + +extern const uint64_t matmul_f16_f16acc_cm1_len; +extern const unsigned char matmul_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_f16_f16acc_cm2_len; +extern const unsigned char matmul_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_f16_f32_len; +extern const unsigned char matmul_f16_f32_data[]; + +extern const uint64_t matmul_f16_f32_aligned_len; +extern const unsigned char matmul_f16_f32_aligned_data[]; + +extern const uint64_t matmul_f16_f32_aligned_cm1_len; +extern const unsigned char matmul_f16_f32_aligned_cm1_data[]; 
+ +extern const uint64_t matmul_f16_f32_aligned_f16acc_len; +extern const unsigned char matmul_f16_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_f16_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_f16_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_f16_f32_aligned_fp32_len; +extern const unsigned char matmul_f16_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_f16_f32_cm1_len; +extern const unsigned char matmul_f16_f32_cm1_data[]; + +extern const uint64_t matmul_f16_f32_f16acc_len; +extern const unsigned char matmul_f16_f32_f16acc_data[]; + +extern const uint64_t matmul_f16_f32_f16acc_cm1_len; +extern const unsigned char matmul_f16_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_f16_f32_fp32_len; +extern const unsigned char matmul_f16_f32_fp32_data[]; + +extern const uint64_t matmul_f16_fp32_len; +extern const unsigned char matmul_f16_fp32_data[]; + +extern const uint64_t matmul_f32_f16_len; +extern const unsigned char matmul_f32_f16_data[]; + +extern const uint64_t matmul_f32_f16_aligned_len; +extern const unsigned char matmul_f32_f16_aligned_data[]; + +extern const uint64_t matmul_f32_f16_aligned_cm1_len; +extern const unsigned char matmul_f32_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_f32_f16_aligned_cm2_len; +extern const unsigned char matmul_f32_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_f32_f16_aligned_f16acc_len; +extern const unsigned char matmul_f32_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_f32_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_f32_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_f32_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_f32_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_f32_f16_aligned_fp32_len; +extern const unsigned char matmul_f32_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_f32_f16_cm1_len; +extern const unsigned char matmul_f32_f16_cm1_data[]; + +extern 
const uint64_t matmul_f32_f16_cm2_len; +extern const unsigned char matmul_f32_f16_cm2_data[]; + +extern const uint64_t matmul_f32_f16_f16acc_len; +extern const unsigned char matmul_f32_f16_f16acc_data[]; + +extern const uint64_t matmul_f32_f16_f16acc_cm1_len; +extern const unsigned char matmul_f32_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_f32_f16_f16acc_cm2_len; +extern const unsigned char matmul_f32_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_f32_f16_fp32_len; +extern const unsigned char matmul_f32_f16_fp32_data[]; + +extern const uint64_t matmul_f32_f32_len; +extern const unsigned char matmul_f32_f32_data[]; + +extern const uint64_t matmul_f32_f32_aligned_len; +extern const unsigned char matmul_f32_f32_aligned_data[]; + +extern const uint64_t matmul_f32_f32_aligned_cm1_len; +extern const unsigned char matmul_f32_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_f32_f32_aligned_f16acc_len; +extern const unsigned char matmul_f32_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_f32_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_f32_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_f32_f32_aligned_fp32_len; +extern const unsigned char matmul_f32_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_f32_f32_cm1_len; +extern const unsigned char matmul_f32_f32_cm1_data[]; + +extern const uint64_t matmul_f32_f32_f16acc_len; +extern const unsigned char matmul_f32_f32_f16acc_data[]; + +extern const uint64_t matmul_f32_f32_f16acc_cm1_len; +extern const unsigned char matmul_f32_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_f32_f32_fp32_len; +extern const unsigned char matmul_f32_f32_fp32_data[]; + +extern const uint64_t matmul_id_bf16_len; +extern const unsigned char matmul_id_bf16_data[]; + +extern const uint64_t matmul_id_bf16_aligned_len; +extern const unsigned char matmul_id_bf16_aligned_data[]; + +extern const uint64_t matmul_id_bf16_aligned_f16acc_len; +extern const unsigned char 
matmul_id_bf16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_bf16_aligned_fp32_len; +extern const unsigned char matmul_id_bf16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_bf16_f16acc_len; +extern const unsigned char matmul_id_bf16_f16acc_data[]; + +extern const uint64_t matmul_id_bf16_fp32_len; +extern const unsigned char matmul_id_bf16_fp32_data[]; + +extern const uint64_t matmul_id_f16_len; +extern const unsigned char matmul_id_f16_data[]; + +extern const uint64_t matmul_id_f16_aligned_len; +extern const unsigned char matmul_id_f16_aligned_data[]; + +extern const uint64_t matmul_id_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_f16_aligned_fp32_len; +extern const unsigned char matmul_id_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_f16_f16acc_len; +extern const unsigned char matmul_id_f16_f16acc_data[]; + +extern const uint64_t matmul_id_f16_f32_len; +extern const unsigned char matmul_id_f16_f32_data[]; + +extern const uint64_t matmul_id_f16_f32_aligned_len; +extern const unsigned char matmul_id_f16_f32_aligned_data[]; + +extern const uint64_t matmul_id_f16_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_f16_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_f16_f32_aligned_fp32_len; +extern const unsigned char matmul_id_f16_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_f16_f32_f16acc_len; +extern const unsigned char matmul_id_f16_f32_f16acc_data[]; + +extern const uint64_t matmul_id_f16_f32_fp32_len; +extern const unsigned char matmul_id_f16_f32_fp32_data[]; + +extern const uint64_t matmul_id_f16_fp32_len; +extern const unsigned char matmul_id_f16_fp32_data[]; + +extern const uint64_t matmul_id_f32_f16_len; +extern const unsigned char matmul_id_f32_f16_data[]; + +extern const uint64_t matmul_id_f32_f16_aligned_len; +extern const unsigned char matmul_id_f32_f16_aligned_data[]; + +extern const uint64_t 
matmul_id_f32_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_f32_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_f32_f16_aligned_fp32_len; +extern const unsigned char matmul_id_f32_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_f32_f16_f16acc_len; +extern const unsigned char matmul_id_f32_f16_f16acc_data[]; + +extern const uint64_t matmul_id_f32_f16_fp32_len; +extern const unsigned char matmul_id_f32_f16_fp32_data[]; + +extern const uint64_t matmul_id_f32_f32_len; +extern const unsigned char matmul_id_f32_f32_data[]; + +extern const uint64_t matmul_id_f32_f32_aligned_len; +extern const unsigned char matmul_id_f32_f32_aligned_data[]; + +extern const uint64_t matmul_id_f32_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_f32_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_f32_f32_aligned_fp32_len; +extern const unsigned char matmul_id_f32_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_f32_f32_f16acc_len; +extern const unsigned char matmul_id_f32_f32_f16acc_data[]; + +extern const uint64_t matmul_id_f32_f32_fp32_len; +extern const unsigned char matmul_id_f32_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq1_m_f16_len; +extern const unsigned char matmul_id_iq1_m_f16_data[]; + +extern const uint64_t matmul_id_iq1_m_f16_aligned_len; +extern const unsigned char matmul_id_iq1_m_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq1_m_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq1_m_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_m_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq1_m_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq1_m_f16_f16acc_len; +extern const unsigned char matmul_id_iq1_m_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_m_f16_fp32_len; +extern const unsigned char matmul_id_iq1_m_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq1_m_f32_len; +extern const unsigned char 
matmul_id_iq1_m_f32_data[]; + +extern const uint64_t matmul_id_iq1_m_f32_aligned_len; +extern const unsigned char matmul_id_iq1_m_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq1_m_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq1_m_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_m_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq1_m_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq1_m_f32_f16acc_len; +extern const unsigned char matmul_id_iq1_m_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_m_f32_fp32_len; +extern const unsigned char matmul_id_iq1_m_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq1_s_f16_len; +extern const unsigned char matmul_id_iq1_s_f16_data[]; + +extern const uint64_t matmul_id_iq1_s_f16_aligned_len; +extern const unsigned char matmul_id_iq1_s_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq1_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq1_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq1_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq1_s_f16_f16acc_len; +extern const unsigned char matmul_id_iq1_s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_s_f16_fp32_len; +extern const unsigned char matmul_id_iq1_s_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq1_s_f32_len; +extern const unsigned char matmul_id_iq1_s_f32_data[]; + +extern const uint64_t matmul_id_iq1_s_f32_aligned_len; +extern const unsigned char matmul_id_iq1_s_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq1_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq1_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq1_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq1_s_f32_f16acc_len; +extern const unsigned char 
matmul_id_iq1_s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq1_s_f32_fp32_len; +extern const unsigned char matmul_id_iq1_s_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq2_s_f16_len; +extern const unsigned char matmul_id_iq2_s_f16_data[]; + +extern const uint64_t matmul_id_iq2_s_f16_aligned_len; +extern const unsigned char matmul_id_iq2_s_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq2_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq2_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq2_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq2_s_f16_f16acc_len; +extern const unsigned char matmul_id_iq2_s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_s_f16_fp32_len; +extern const unsigned char matmul_id_iq2_s_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq2_s_f32_len; +extern const unsigned char matmul_id_iq2_s_f32_data[]; + +extern const uint64_t matmul_id_iq2_s_f32_aligned_len; +extern const unsigned char matmul_id_iq2_s_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq2_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq2_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq2_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq2_s_f32_f16acc_len; +extern const unsigned char matmul_id_iq2_s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_s_f32_fp32_len; +extern const unsigned char matmul_id_iq2_s_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xs_f16_len; +extern const unsigned char matmul_id_iq2_xs_f16_data[]; + +extern const uint64_t matmul_id_iq2_xs_f16_aligned_len; +extern const unsigned char matmul_id_iq2_xs_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq2_xs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq2_xs_f16_aligned_f16acc_data[]; + +extern 
const uint64_t matmul_id_iq2_xs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq2_xs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xs_f16_f16acc_len; +extern const unsigned char matmul_id_iq2_xs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xs_f16_fp32_len; +extern const unsigned char matmul_id_iq2_xs_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xs_f32_len; +extern const unsigned char matmul_id_iq2_xs_f32_data[]; + +extern const uint64_t matmul_id_iq2_xs_f32_aligned_len; +extern const unsigned char matmul_id_iq2_xs_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq2_xs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq2_xs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq2_xs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xs_f32_f16acc_len; +extern const unsigned char matmul_id_iq2_xs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xs_f32_fp32_len; +extern const unsigned char matmul_id_iq2_xs_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f16_len; +extern const unsigned char matmul_id_iq2_xxs_f16_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f16_aligned_len; +extern const unsigned char matmul_id_iq2_xxs_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq2_xxs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq2_xxs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f16_f16acc_len; +extern const unsigned char matmul_id_iq2_xxs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f16_fp32_len; +extern const unsigned char matmul_id_iq2_xxs_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f32_len; +extern const unsigned char matmul_id_iq2_xxs_f32_data[]; + +extern const uint64_t 
matmul_id_iq2_xxs_f32_aligned_len; +extern const unsigned char matmul_id_iq2_xxs_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq2_xxs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq2_xxs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f32_f16acc_len; +extern const unsigned char matmul_id_iq2_xxs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq2_xxs_f32_fp32_len; +extern const unsigned char matmul_id_iq2_xxs_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq3_s_f16_len; +extern const unsigned char matmul_id_iq3_s_f16_data[]; + +extern const uint64_t matmul_id_iq3_s_f16_aligned_len; +extern const unsigned char matmul_id_iq3_s_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq3_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq3_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq3_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq3_s_f16_f16acc_len; +extern const unsigned char matmul_id_iq3_s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_s_f16_fp32_len; +extern const unsigned char matmul_id_iq3_s_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq3_s_f32_len; +extern const unsigned char matmul_id_iq3_s_f32_data[]; + +extern const uint64_t matmul_id_iq3_s_f32_aligned_len; +extern const unsigned char matmul_id_iq3_s_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq3_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq3_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq3_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq3_s_f32_f16acc_len; +extern const unsigned char matmul_id_iq3_s_f32_f16acc_data[]; + +extern const uint64_t 
matmul_id_iq3_s_f32_fp32_len; +extern const unsigned char matmul_id_iq3_s_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f16_len; +extern const unsigned char matmul_id_iq3_xxs_f16_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f16_aligned_len; +extern const unsigned char matmul_id_iq3_xxs_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq3_xxs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq3_xxs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f16_f16acc_len; +extern const unsigned char matmul_id_iq3_xxs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f16_fp32_len; +extern const unsigned char matmul_id_iq3_xxs_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f32_len; +extern const unsigned char matmul_id_iq3_xxs_f32_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f32_aligned_len; +extern const unsigned char matmul_id_iq3_xxs_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq3_xxs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq3_xxs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f32_f16acc_len; +extern const unsigned char matmul_id_iq3_xxs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq3_xxs_f32_fp32_len; +extern const unsigned char matmul_id_iq3_xxs_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq4_nl_f16_len; +extern const unsigned char matmul_id_iq4_nl_f16_data[]; + +extern const uint64_t matmul_id_iq4_nl_f16_aligned_len; +extern const unsigned char matmul_id_iq4_nl_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq4_nl_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq4_nl_f16_aligned_f16acc_data[]; + +extern const uint64_t 
matmul_id_iq4_nl_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq4_nl_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq4_nl_f16_f16acc_len; +extern const unsigned char matmul_id_iq4_nl_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_nl_f16_fp32_len; +extern const unsigned char matmul_id_iq4_nl_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq4_nl_f32_len; +extern const unsigned char matmul_id_iq4_nl_f32_data[]; + +extern const uint64_t matmul_id_iq4_nl_f32_aligned_len; +extern const unsigned char matmul_id_iq4_nl_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq4_nl_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq4_nl_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_nl_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq4_nl_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq4_nl_f32_f16acc_len; +extern const unsigned char matmul_id_iq4_nl_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_nl_f32_fp32_len; +extern const unsigned char matmul_id_iq4_nl_f32_fp32_data[]; + +extern const uint64_t matmul_id_iq4_xs_f16_len; +extern const unsigned char matmul_id_iq4_xs_f16_data[]; + +extern const uint64_t matmul_id_iq4_xs_f16_aligned_len; +extern const unsigned char matmul_id_iq4_xs_f16_aligned_data[]; + +extern const uint64_t matmul_id_iq4_xs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_iq4_xs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_xs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_iq4_xs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq4_xs_f16_f16acc_len; +extern const unsigned char matmul_id_iq4_xs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_xs_f16_fp32_len; +extern const unsigned char matmul_id_iq4_xs_f16_fp32_data[]; + +extern const uint64_t matmul_id_iq4_xs_f32_len; +extern const unsigned char matmul_id_iq4_xs_f32_data[]; + +extern const uint64_t 
matmul_id_iq4_xs_f32_aligned_len; +extern const unsigned char matmul_id_iq4_xs_f32_aligned_data[]; + +extern const uint64_t matmul_id_iq4_xs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_iq4_xs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_xs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_iq4_xs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_iq4_xs_f32_f16acc_len; +extern const unsigned char matmul_id_iq4_xs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_iq4_xs_f32_fp32_len; +extern const unsigned char matmul_id_iq4_xs_f32_fp32_data[]; + +extern const uint64_t matmul_id_mxfp4_f16_len; +extern const unsigned char matmul_id_mxfp4_f16_data[]; + +extern const uint64_t matmul_id_mxfp4_f16_aligned_len; +extern const unsigned char matmul_id_mxfp4_f16_aligned_data[]; + +extern const uint64_t matmul_id_mxfp4_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_mxfp4_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_mxfp4_f16_aligned_fp32_len; +extern const unsigned char matmul_id_mxfp4_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_mxfp4_f16_f16acc_len; +extern const unsigned char matmul_id_mxfp4_f16_f16acc_data[]; + +extern const uint64_t matmul_id_mxfp4_f16_fp32_len; +extern const unsigned char matmul_id_mxfp4_f16_fp32_data[]; + +extern const uint64_t matmul_id_mxfp4_f32_len; +extern const unsigned char matmul_id_mxfp4_f32_data[]; + +extern const uint64_t matmul_id_mxfp4_f32_aligned_len; +extern const unsigned char matmul_id_mxfp4_f32_aligned_data[]; + +extern const uint64_t matmul_id_mxfp4_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_mxfp4_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_mxfp4_f32_aligned_fp32_len; +extern const unsigned char matmul_id_mxfp4_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_mxfp4_f32_f16acc_len; +extern const unsigned char matmul_id_mxfp4_f32_f16acc_data[]; + +extern const uint64_t 
matmul_id_mxfp4_f32_fp32_len; +extern const unsigned char matmul_id_mxfp4_f32_fp32_data[]; + +extern const uint64_t matmul_id_mxfp4_q8_1_len; +extern const unsigned char matmul_id_mxfp4_q8_1_data[]; + +extern const uint64_t matmul_id_mxfp4_q8_1_fp32_len; +extern const unsigned char matmul_id_mxfp4_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_nvfp4_f16_len; +extern const unsigned char matmul_id_nvfp4_f16_data[]; + +extern const uint64_t matmul_id_nvfp4_f16_aligned_len; +extern const unsigned char matmul_id_nvfp4_f16_aligned_data[]; + +extern const uint64_t matmul_id_nvfp4_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_nvfp4_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_nvfp4_f16_aligned_fp32_len; +extern const unsigned char matmul_id_nvfp4_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_nvfp4_f16_f16acc_len; +extern const unsigned char matmul_id_nvfp4_f16_f16acc_data[]; + +extern const uint64_t matmul_id_nvfp4_f16_fp32_len; +extern const unsigned char matmul_id_nvfp4_f16_fp32_data[]; + +extern const uint64_t matmul_id_nvfp4_f32_len; +extern const unsigned char matmul_id_nvfp4_f32_data[]; + +extern const uint64_t matmul_id_nvfp4_f32_aligned_len; +extern const unsigned char matmul_id_nvfp4_f32_aligned_data[]; + +extern const uint64_t matmul_id_nvfp4_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_nvfp4_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_nvfp4_f32_aligned_fp32_len; +extern const unsigned char matmul_id_nvfp4_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_nvfp4_f32_f16acc_len; +extern const unsigned char matmul_id_nvfp4_f32_f16acc_data[]; + +extern const uint64_t matmul_id_nvfp4_f32_fp32_len; +extern const unsigned char matmul_id_nvfp4_f32_fp32_data[]; + +extern const uint64_t matmul_id_q1_0_f16_len; +extern const unsigned char matmul_id_q1_0_f16_data[]; + +extern const uint64_t matmul_id_q1_0_f16_aligned_len; +extern const unsigned char 
matmul_id_q1_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_q1_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q1_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q1_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q1_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q1_0_f16_f16acc_len; +extern const unsigned char matmul_id_q1_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q1_0_f16_fp32_len; +extern const unsigned char matmul_id_q1_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_q1_0_f32_len; +extern const unsigned char matmul_id_q1_0_f32_data[]; + +extern const uint64_t matmul_id_q1_0_f32_aligned_len; +extern const unsigned char matmul_id_q1_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_q1_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q1_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q1_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q1_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q1_0_f32_f16acc_len; +extern const unsigned char matmul_id_q1_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q1_0_f32_fp32_len; +extern const unsigned char matmul_id_q1_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_q2_k_f16_len; +extern const unsigned char matmul_id_q2_k_f16_data[]; + +extern const uint64_t matmul_id_q2_k_f16_aligned_len; +extern const unsigned char matmul_id_q2_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_q2_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q2_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q2_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q2_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q2_k_f16_f16acc_len; +extern const unsigned char matmul_id_q2_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q2_k_f16_fp32_len; +extern const unsigned char matmul_id_q2_k_f16_fp32_data[]; + +extern const uint64_t 
matmul_id_q2_k_f32_len; +extern const unsigned char matmul_id_q2_k_f32_data[]; + +extern const uint64_t matmul_id_q2_k_f32_aligned_len; +extern const unsigned char matmul_id_q2_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_q2_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q2_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q2_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q2_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q2_k_f32_f16acc_len; +extern const unsigned char matmul_id_q2_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q2_k_f32_fp32_len; +extern const unsigned char matmul_id_q2_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_q2_k_q8_1_len; +extern const unsigned char matmul_id_q2_k_q8_1_data[]; + +extern const uint64_t matmul_id_q2_k_q8_1_fp32_len; +extern const unsigned char matmul_id_q2_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q3_k_f16_len; +extern const unsigned char matmul_id_q3_k_f16_data[]; + +extern const uint64_t matmul_id_q3_k_f16_aligned_len; +extern const unsigned char matmul_id_q3_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_q3_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q3_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q3_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q3_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q3_k_f16_f16acc_len; +extern const unsigned char matmul_id_q3_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q3_k_f16_fp32_len; +extern const unsigned char matmul_id_q3_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_q3_k_f32_len; +extern const unsigned char matmul_id_q3_k_f32_data[]; + +extern const uint64_t matmul_id_q3_k_f32_aligned_len; +extern const unsigned char matmul_id_q3_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_q3_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q3_k_f32_aligned_f16acc_data[]; + 
+extern const uint64_t matmul_id_q3_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q3_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q3_k_f32_f16acc_len; +extern const unsigned char matmul_id_q3_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q3_k_f32_fp32_len; +extern const unsigned char matmul_id_q3_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_q3_k_q8_1_len; +extern const unsigned char matmul_id_q3_k_q8_1_data[]; + +extern const uint64_t matmul_id_q3_k_q8_1_fp32_len; +extern const unsigned char matmul_id_q3_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q4_0_f16_len; +extern const unsigned char matmul_id_q4_0_f16_data[]; + +extern const uint64_t matmul_id_q4_0_f16_aligned_len; +extern const unsigned char matmul_id_q4_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_q4_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q4_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q4_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q4_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q4_0_f16_f16acc_len; +extern const unsigned char matmul_id_q4_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q4_0_f16_fp32_len; +extern const unsigned char matmul_id_q4_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_q4_0_f32_len; +extern const unsigned char matmul_id_q4_0_f32_data[]; + +extern const uint64_t matmul_id_q4_0_f32_aligned_len; +extern const unsigned char matmul_id_q4_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_q4_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q4_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q4_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q4_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q4_0_f32_f16acc_len; +extern const unsigned char matmul_id_q4_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q4_0_f32_fp32_len; +extern const unsigned char 
matmul_id_q4_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_q4_0_q8_1_len; +extern const unsigned char matmul_id_q4_0_q8_1_data[]; + +extern const uint64_t matmul_id_q4_0_q8_1_fp32_len; +extern const unsigned char matmul_id_q4_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q4_1_f16_len; +extern const unsigned char matmul_id_q4_1_f16_data[]; + +extern const uint64_t matmul_id_q4_1_f16_aligned_len; +extern const unsigned char matmul_id_q4_1_f16_aligned_data[]; + +extern const uint64_t matmul_id_q4_1_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q4_1_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q4_1_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q4_1_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q4_1_f16_f16acc_len; +extern const unsigned char matmul_id_q4_1_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q4_1_f16_fp32_len; +extern const unsigned char matmul_id_q4_1_f16_fp32_data[]; + +extern const uint64_t matmul_id_q4_1_f32_len; +extern const unsigned char matmul_id_q4_1_f32_data[]; + +extern const uint64_t matmul_id_q4_1_f32_aligned_len; +extern const unsigned char matmul_id_q4_1_f32_aligned_data[]; + +extern const uint64_t matmul_id_q4_1_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q4_1_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q4_1_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q4_1_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q4_1_f32_f16acc_len; +extern const unsigned char matmul_id_q4_1_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q4_1_f32_fp32_len; +extern const unsigned char matmul_id_q4_1_f32_fp32_data[]; + +extern const uint64_t matmul_id_q4_1_q8_1_len; +extern const unsigned char matmul_id_q4_1_q8_1_data[]; + +extern const uint64_t matmul_id_q4_1_q8_1_fp32_len; +extern const unsigned char matmul_id_q4_1_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q4_k_f16_len; +extern const unsigned char 
matmul_id_q4_k_f16_data[]; + +extern const uint64_t matmul_id_q4_k_f16_aligned_len; +extern const unsigned char matmul_id_q4_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_q4_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q4_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q4_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q4_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q4_k_f16_f16acc_len; +extern const unsigned char matmul_id_q4_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q4_k_f16_fp32_len; +extern const unsigned char matmul_id_q4_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_q4_k_f32_len; +extern const unsigned char matmul_id_q4_k_f32_data[]; + +extern const uint64_t matmul_id_q4_k_f32_aligned_len; +extern const unsigned char matmul_id_q4_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_q4_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q4_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q4_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q4_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q4_k_f32_f16acc_len; +extern const unsigned char matmul_id_q4_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q4_k_f32_fp32_len; +extern const unsigned char matmul_id_q4_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_q4_k_q8_1_len; +extern const unsigned char matmul_id_q4_k_q8_1_data[]; + +extern const uint64_t matmul_id_q4_k_q8_1_fp32_len; +extern const unsigned char matmul_id_q4_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q5_0_f16_len; +extern const unsigned char matmul_id_q5_0_f16_data[]; + +extern const uint64_t matmul_id_q5_0_f16_aligned_len; +extern const unsigned char matmul_id_q5_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_q5_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q5_0_f16_aligned_f16acc_data[]; + +extern const uint64_t 
matmul_id_q5_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q5_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q5_0_f16_f16acc_len; +extern const unsigned char matmul_id_q5_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q5_0_f16_fp32_len; +extern const unsigned char matmul_id_q5_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_q5_0_f32_len; +extern const unsigned char matmul_id_q5_0_f32_data[]; + +extern const uint64_t matmul_id_q5_0_f32_aligned_len; +extern const unsigned char matmul_id_q5_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_q5_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q5_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q5_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q5_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q5_0_f32_f16acc_len; +extern const unsigned char matmul_id_q5_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q5_0_f32_fp32_len; +extern const unsigned char matmul_id_q5_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_q5_0_q8_1_len; +extern const unsigned char matmul_id_q5_0_q8_1_data[]; + +extern const uint64_t matmul_id_q5_0_q8_1_fp32_len; +extern const unsigned char matmul_id_q5_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q5_1_f16_len; +extern const unsigned char matmul_id_q5_1_f16_data[]; + +extern const uint64_t matmul_id_q5_1_f16_aligned_len; +extern const unsigned char matmul_id_q5_1_f16_aligned_data[]; + +extern const uint64_t matmul_id_q5_1_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q5_1_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q5_1_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q5_1_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q5_1_f16_f16acc_len; +extern const unsigned char matmul_id_q5_1_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q5_1_f16_fp32_len; +extern const unsigned char matmul_id_q5_1_f16_fp32_data[]; 
+ +extern const uint64_t matmul_id_q5_1_f32_len; +extern const unsigned char matmul_id_q5_1_f32_data[]; + +extern const uint64_t matmul_id_q5_1_f32_aligned_len; +extern const unsigned char matmul_id_q5_1_f32_aligned_data[]; + +extern const uint64_t matmul_id_q5_1_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q5_1_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q5_1_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q5_1_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q5_1_f32_f16acc_len; +extern const unsigned char matmul_id_q5_1_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q5_1_f32_fp32_len; +extern const unsigned char matmul_id_q5_1_f32_fp32_data[]; + +extern const uint64_t matmul_id_q5_1_q8_1_len; +extern const unsigned char matmul_id_q5_1_q8_1_data[]; + +extern const uint64_t matmul_id_q5_1_q8_1_fp32_len; +extern const unsigned char matmul_id_q5_1_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q5_k_f16_len; +extern const unsigned char matmul_id_q5_k_f16_data[]; + +extern const uint64_t matmul_id_q5_k_f16_aligned_len; +extern const unsigned char matmul_id_q5_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_q5_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q5_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q5_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q5_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q5_k_f16_f16acc_len; +extern const unsigned char matmul_id_q5_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q5_k_f16_fp32_len; +extern const unsigned char matmul_id_q5_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_q5_k_f32_len; +extern const unsigned char matmul_id_q5_k_f32_data[]; + +extern const uint64_t matmul_id_q5_k_f32_aligned_len; +extern const unsigned char matmul_id_q5_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_q5_k_f32_aligned_f16acc_len; +extern const unsigned char 
matmul_id_q5_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q5_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q5_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q5_k_f32_f16acc_len; +extern const unsigned char matmul_id_q5_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q5_k_f32_fp32_len; +extern const unsigned char matmul_id_q5_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_q5_k_q8_1_len; +extern const unsigned char matmul_id_q5_k_q8_1_data[]; + +extern const uint64_t matmul_id_q5_k_q8_1_fp32_len; +extern const unsigned char matmul_id_q5_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q6_k_f16_len; +extern const unsigned char matmul_id_q6_k_f16_data[]; + +extern const uint64_t matmul_id_q6_k_f16_aligned_len; +extern const unsigned char matmul_id_q6_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_q6_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q6_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q6_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q6_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q6_k_f16_f16acc_len; +extern const unsigned char matmul_id_q6_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q6_k_f16_fp32_len; +extern const unsigned char matmul_id_q6_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_q6_k_f32_len; +extern const unsigned char matmul_id_q6_k_f32_data[]; + +extern const uint64_t matmul_id_q6_k_f32_aligned_len; +extern const unsigned char matmul_id_q6_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_q6_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q6_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q6_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q6_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q6_k_f32_f16acc_len; +extern const unsigned char matmul_id_q6_k_f32_f16acc_data[]; + +extern const uint64_t 
matmul_id_q6_k_f32_fp32_len; +extern const unsigned char matmul_id_q6_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_q6_k_q8_1_len; +extern const unsigned char matmul_id_q6_k_q8_1_data[]; + +extern const uint64_t matmul_id_q6_k_q8_1_fp32_len; +extern const unsigned char matmul_id_q6_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_q8_0_f16_len; +extern const unsigned char matmul_id_q8_0_f16_data[]; + +extern const uint64_t matmul_id_q8_0_f16_aligned_len; +extern const unsigned char matmul_id_q8_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_q8_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_q8_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q8_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_q8_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q8_0_f16_f16acc_len; +extern const unsigned char matmul_id_q8_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_q8_0_f16_fp32_len; +extern const unsigned char matmul_id_q8_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_q8_0_f32_len; +extern const unsigned char matmul_id_q8_0_f32_data[]; + +extern const uint64_t matmul_id_q8_0_f32_aligned_len; +extern const unsigned char matmul_id_q8_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_q8_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_q8_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_q8_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_q8_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_q8_0_f32_f16acc_len; +extern const unsigned char matmul_id_q8_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_q8_0_f32_fp32_len; +extern const unsigned char matmul_id_q8_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_q8_0_q8_1_len; +extern const unsigned char matmul_id_q8_0_q8_1_data[]; + +extern const uint64_t matmul_id_q8_0_q8_1_fp32_len; +extern const unsigned char matmul_id_q8_0_q8_1_fp32_data[]; + +extern const uint64_t 
matmul_id_subgroup_bf16_len; +extern const unsigned char matmul_id_subgroup_bf16_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_bf16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_cm1_len; +extern const unsigned char matmul_id_subgroup_bf16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_cm2_len; +extern const unsigned char matmul_id_subgroup_bf16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_f16acc_len; +extern const unsigned char matmul_id_subgroup_bf16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_bf16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_bf16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_bf16_fp32_len; +extern const unsigned char matmul_id_subgroup_bf16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f16_len; +extern const unsigned char matmul_id_subgroup_f16_data[]; + +extern const uint64_t 
matmul_id_subgroup_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_len; +extern const unsigned char matmul_id_subgroup_f16_f32_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_f16_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_f32_aligned_cm1_data[]; + +extern 
const uint64_t matmul_id_subgroup_f16_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_f16_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_f16_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_f16_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f16_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f16_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_f16_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_len; +extern const unsigned char matmul_id_subgroup_f32_f16_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t 
matmul_id_subgroup_f32_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_f32_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_f32_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_f32_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_f32_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_f32_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_len; +extern const unsigned char matmul_id_subgroup_f32_f32_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_f32_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_f32_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_f32_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_cm1_len; 
+extern const unsigned char matmul_id_subgroup_f32_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_f32_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_f32_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_f32_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_f32_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_f16acc_len; +extern 
const unsigned char matmul_id_subgroup_iq1_m_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_m_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_m_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_len; +extern const unsigned 
char matmul_id_subgroup_iq1_s_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_len; 
+extern const unsigned char matmul_id_subgroup_iq1_s_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq1_s_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq1_s_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_cm2_data[]; + +extern const uint64_t 
matmul_id_subgroup_iq2_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_aligned_f16acc_data[]; + 
+extern const uint64_t matmul_id_subgroup_iq2_s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_s_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_s_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_aligned_fp32_len; +extern const 
unsigned char matmul_id_subgroup_iq2_xs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_f16acc_len; 
+extern const unsigned char matmul_id_subgroup_iq2_xs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xs_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xs_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_f16acc_data[]; + 
+extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq2_xxs_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq2_xxs_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_len; +extern const unsigned char 
matmul_id_subgroup_iq3_s_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_len; +extern 
const unsigned char matmul_id_subgroup_iq3_s_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_s_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_s_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_cm2_data[]; + +extern const uint64_t 
matmul_id_subgroup_iq3_xxs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_aligned_f16acc_len; +extern const unsigned char 
matmul_id_subgroup_iq3_xxs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq3_xxs_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq3_xxs_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_f16acc_cm2_data[]; + 
+extern const uint64_t matmul_id_subgroup_iq4_nl_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_cm1_len; +extern const unsigned char 
matmul_id_subgroup_iq4_nl_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_nl_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_nl_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_cm2_data[]; + +extern const uint64_t 
matmul_id_subgroup_iq4_xs_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_iq4_xs_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_iq4_xs_f32_fp32_data[]; + 
+extern const uint64_t matmul_id_subgroup_mxfp4_f16_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f16_fp32_len; +extern const unsigned char 
matmul_id_subgroup_mxfp4_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_mxfp4_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_q8_1_len; +extern const unsigned char matmul_id_subgroup_mxfp4_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_mxfp4_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_mxfp4_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_data[]; + 
+extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_aligned_len; +extern const unsigned char 
matmul_id_subgroup_nvfp4_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_nvfp4_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_nvfp4_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_f16acc_cm1_len; +extern 
const unsigned char matmul_id_subgroup_q1_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q1_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_aligned_fp32_len; +extern 
const unsigned char matmul_id_subgroup_q1_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q1_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q1_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_cm2_len; +extern const unsigned 
char matmul_id_subgroup_q2_k_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q2_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q2_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_f32_fp32_len; +extern const unsigned char 
matmul_id_subgroup_q2_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_q8_1_len; +extern const unsigned char matmul_id_subgroup_q2_k_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q2_k_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q2_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_f16acc_cm1_len; +extern const unsigned char 
matmul_id_subgroup_q3_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q3_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q3_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_q8_1_len; +extern const unsigned char matmul_id_subgroup_q3_k_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q3_k_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q3_k_q8_1_fp32_data[]; + +extern 
const uint64_t matmul_id_subgroup_q4_0_f16_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_0_f16_fp32_data[]; + +extern const 
uint64_t matmul_id_subgroup_q4_0_f32_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_q8_1_len; +extern const unsigned char matmul_id_subgroup_q4_0_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_0_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_cm1_len; +extern const 
unsigned char matmul_id_subgroup_q4_1_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_1_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_aligned_cm1_len; +extern const unsigned 
char matmul_id_subgroup_q4_1_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_1_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_q8_1_len; +extern const unsigned char matmul_id_subgroup_q4_1_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_1_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_1_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_f16acc_len; +extern const unsigned char 
matmul_id_subgroup_q4_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char 
matmul_id_subgroup_q4_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_q8_1_len; +extern const unsigned char matmul_id_subgroup_q4_k_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q4_k_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q4_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char 
matmul_id_subgroup_q5_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_cm1_len; +extern const unsigned char 
matmul_id_subgroup_q5_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_q8_1_len; +extern const unsigned char matmul_id_subgroup_q5_0_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_0_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_cm1_len; +extern const unsigned char 
matmul_id_subgroup_q5_1_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_1_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_f16acc_cm1_len; +extern const unsigned char 
matmul_id_subgroup_q5_1_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_1_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_q8_1_len; +extern const unsigned char matmul_id_subgroup_q5_1_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_1_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_1_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_f16acc_len; +extern const unsigned char 
matmul_id_subgroup_q5_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q5_k_q8_1_len; +extern const unsigned char matmul_id_subgroup_q5_k_q8_1_data[]; + 
+extern const uint64_t matmul_id_subgroup_q5_k_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q5_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_f16acc_cm2_data[]; + +extern 
const uint64_t matmul_id_subgroup_q6_k_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q6_k_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q6_k_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_q8_1_len; +extern const unsigned char matmul_id_subgroup_q6_k_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q6_k_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q6_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_len; +extern const unsigned 
char matmul_id_subgroup_q8_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_q8_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_aligned_len; +extern const unsigned char 
matmul_id_subgroup_q8_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_q8_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_q8_1_len; +extern const unsigned char matmul_id_subgroup_q8_0_q8_1_data[]; + +extern const uint64_t matmul_id_subgroup_q8_0_q8_1_fp32_len; +extern const unsigned char matmul_id_subgroup_q8_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_cm2_len; +extern const unsigned char 
matmul_id_subgroup_tq3_1s_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_aligned_cm1_data[]; + +extern const uint64_t 
matmul_id_subgroup_tq3_1s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_tq3_1s_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_tq3_1s_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_f16acc_cm2_len; 
+extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_aligned_fp32_len; +extern const unsigned char 
matmul_id_subgroup_turbo2_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo2_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo2_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_cm1_len; +extern const unsigned char 
matmul_id_subgroup_turbo3_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_f16acc_data[]; + +extern const uint64_t 
matmul_id_subgroup_turbo3_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo3_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo3_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_f16acc_cm1_len; 
+extern const unsigned char matmul_id_subgroup_turbo4_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f16_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_aligned_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_aligned_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_f16acc_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_id_subgroup_turbo4_0_f32_fp32_len; +extern const unsigned char matmul_id_subgroup_turbo4_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_tq3_1s_f16_len; +extern const unsigned char matmul_id_tq3_1s_f16_data[]; + +extern const uint64_t 
matmul_id_tq3_1s_f16_aligned_len; +extern const unsigned char matmul_id_tq3_1s_f16_aligned_data[]; + +extern const uint64_t matmul_id_tq3_1s_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_tq3_1s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_tq3_1s_f16_aligned_fp32_len; +extern const unsigned char matmul_id_tq3_1s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_tq3_1s_f16_f16acc_len; +extern const unsigned char matmul_id_tq3_1s_f16_f16acc_data[]; + +extern const uint64_t matmul_id_tq3_1s_f16_fp32_len; +extern const unsigned char matmul_id_tq3_1s_f16_fp32_data[]; + +extern const uint64_t matmul_id_tq3_1s_f32_len; +extern const unsigned char matmul_id_tq3_1s_f32_data[]; + +extern const uint64_t matmul_id_tq3_1s_f32_aligned_len; +extern const unsigned char matmul_id_tq3_1s_f32_aligned_data[]; + +extern const uint64_t matmul_id_tq3_1s_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_tq3_1s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_tq3_1s_f32_aligned_fp32_len; +extern const unsigned char matmul_id_tq3_1s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_tq3_1s_f32_f16acc_len; +extern const unsigned char matmul_id_tq3_1s_f32_f16acc_data[]; + +extern const uint64_t matmul_id_tq3_1s_f32_fp32_len; +extern const unsigned char matmul_id_tq3_1s_f32_fp32_data[]; + +extern const uint64_t matmul_id_turbo2_0_f16_len; +extern const unsigned char matmul_id_turbo2_0_f16_data[]; + +extern const uint64_t matmul_id_turbo2_0_f16_aligned_len; +extern const unsigned char matmul_id_turbo2_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_turbo2_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_turbo2_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_turbo2_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_turbo2_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_turbo2_0_f16_f16acc_len; +extern const unsigned char 
matmul_id_turbo2_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_turbo2_0_f16_fp32_len; +extern const unsigned char matmul_id_turbo2_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_turbo2_0_f32_len; +extern const unsigned char matmul_id_turbo2_0_f32_data[]; + +extern const uint64_t matmul_id_turbo2_0_f32_aligned_len; +extern const unsigned char matmul_id_turbo2_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_turbo2_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_turbo2_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_turbo2_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_turbo2_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_turbo2_0_f32_f16acc_len; +extern const unsigned char matmul_id_turbo2_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_turbo2_0_f32_fp32_len; +extern const unsigned char matmul_id_turbo2_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_turbo3_0_f16_len; +extern const unsigned char matmul_id_turbo3_0_f16_data[]; + +extern const uint64_t matmul_id_turbo3_0_f16_aligned_len; +extern const unsigned char matmul_id_turbo3_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_turbo3_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_turbo3_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_turbo3_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_turbo3_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_turbo3_0_f16_f16acc_len; +extern const unsigned char matmul_id_turbo3_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_turbo3_0_f16_fp32_len; +extern const unsigned char matmul_id_turbo3_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_turbo3_0_f32_len; +extern const unsigned char matmul_id_turbo3_0_f32_data[]; + +extern const uint64_t matmul_id_turbo3_0_f32_aligned_len; +extern const unsigned char matmul_id_turbo3_0_f32_aligned_data[]; + +extern const uint64_t 
matmul_id_turbo3_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_turbo3_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_turbo3_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_turbo3_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_turbo3_0_f32_f16acc_len; +extern const unsigned char matmul_id_turbo3_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_turbo3_0_f32_fp32_len; +extern const unsigned char matmul_id_turbo3_0_f32_fp32_data[]; + +extern const uint64_t matmul_id_turbo4_0_f16_len; +extern const unsigned char matmul_id_turbo4_0_f16_data[]; + +extern const uint64_t matmul_id_turbo4_0_f16_aligned_len; +extern const unsigned char matmul_id_turbo4_0_f16_aligned_data[]; + +extern const uint64_t matmul_id_turbo4_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_id_turbo4_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_turbo4_0_f16_aligned_fp32_len; +extern const unsigned char matmul_id_turbo4_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_id_turbo4_0_f16_f16acc_len; +extern const unsigned char matmul_id_turbo4_0_f16_f16acc_data[]; + +extern const uint64_t matmul_id_turbo4_0_f16_fp32_len; +extern const unsigned char matmul_id_turbo4_0_f16_fp32_data[]; + +extern const uint64_t matmul_id_turbo4_0_f32_len; +extern const unsigned char matmul_id_turbo4_0_f32_data[]; + +extern const uint64_t matmul_id_turbo4_0_f32_aligned_len; +extern const unsigned char matmul_id_turbo4_0_f32_aligned_data[]; + +extern const uint64_t matmul_id_turbo4_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_id_turbo4_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_id_turbo4_0_f32_aligned_fp32_len; +extern const unsigned char matmul_id_turbo4_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_id_turbo4_0_f32_f16acc_len; +extern const unsigned char matmul_id_turbo4_0_f32_f16acc_data[]; + +extern const uint64_t matmul_id_turbo4_0_f32_fp32_len; +extern const unsigned 
char matmul_id_turbo4_0_f32_fp32_data[]; + +extern const uint64_t matmul_iq1_m_f16_len; +extern const unsigned char matmul_iq1_m_f16_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_len; +extern const unsigned char matmul_iq1_m_f16_aligned_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_cm1_len; +extern const unsigned char matmul_iq1_m_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_cm2_len; +extern const unsigned char matmul_iq1_m_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq1_m_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq1_m_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq1_m_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq1_m_f16_aligned_fp32_len; +extern const unsigned char matmul_iq1_m_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq1_m_f16_cm1_len; +extern const unsigned char matmul_iq1_m_f16_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f16_cm2_len; +extern const unsigned char matmul_iq1_m_f16_cm2_data[]; + +extern const uint64_t matmul_iq1_m_f16_f16acc_len; +extern const unsigned char matmul_iq1_m_f16_f16acc_data[]; + +extern const uint64_t matmul_iq1_m_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq1_m_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq1_m_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq1_m_f16_fp32_len; +extern const unsigned char matmul_iq1_m_f16_fp32_data[]; + +extern const uint64_t matmul_iq1_m_f32_len; +extern const unsigned char matmul_iq1_m_f32_data[]; + +extern const uint64_t matmul_iq1_m_f32_aligned_len; +extern const unsigned char matmul_iq1_m_f32_aligned_data[]; + +extern const uint64_t matmul_iq1_m_f32_aligned_cm1_len; 
+extern const unsigned char matmul_iq1_m_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq1_m_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq1_m_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq1_m_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f32_aligned_fp32_len; +extern const unsigned char matmul_iq1_m_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq1_m_f32_cm1_len; +extern const unsigned char matmul_iq1_m_f32_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f32_f16acc_len; +extern const unsigned char matmul_iq1_m_f32_f16acc_data[]; + +extern const uint64_t matmul_iq1_m_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq1_m_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_m_f32_fp32_len; +extern const unsigned char matmul_iq1_m_f32_fp32_data[]; + +extern const uint64_t matmul_iq1_s_f16_len; +extern const unsigned char matmul_iq1_s_f16_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_len; +extern const unsigned char matmul_iq1_s_f16_aligned_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_cm1_len; +extern const unsigned char matmul_iq1_s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_cm2_len; +extern const unsigned char matmul_iq1_s_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq1_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq1_s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq1_s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq1_s_f16_aligned_fp32_len; +extern const unsigned char matmul_iq1_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq1_s_f16_cm1_len; +extern const unsigned char 
matmul_iq1_s_f16_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f16_cm2_len; +extern const unsigned char matmul_iq1_s_f16_cm2_data[]; + +extern const uint64_t matmul_iq1_s_f16_f16acc_len; +extern const unsigned char matmul_iq1_s_f16_f16acc_data[]; + +extern const uint64_t matmul_iq1_s_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq1_s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq1_s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq1_s_f16_fp32_len; +extern const unsigned char matmul_iq1_s_f16_fp32_data[]; + +extern const uint64_t matmul_iq1_s_f32_len; +extern const unsigned char matmul_iq1_s_f32_data[]; + +extern const uint64_t matmul_iq1_s_f32_aligned_len; +extern const unsigned char matmul_iq1_s_f32_aligned_data[]; + +extern const uint64_t matmul_iq1_s_f32_aligned_cm1_len; +extern const unsigned char matmul_iq1_s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq1_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq1_s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq1_s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f32_aligned_fp32_len; +extern const unsigned char matmul_iq1_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq1_s_f32_cm1_len; +extern const unsigned char matmul_iq1_s_f32_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f32_f16acc_len; +extern const unsigned char matmul_iq1_s_f32_f16acc_data[]; + +extern const uint64_t matmul_iq1_s_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq1_s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq1_s_f32_fp32_len; +extern const unsigned char matmul_iq1_s_f32_fp32_data[]; + +extern const uint64_t matmul_iq2_s_f16_len; +extern const unsigned char matmul_iq2_s_f16_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_len; +extern const unsigned char 
matmul_iq2_s_f16_aligned_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_cm1_len; +extern const unsigned char matmul_iq2_s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_cm2_len; +extern const unsigned char matmul_iq2_s_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq2_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq2_s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq2_s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq2_s_f16_aligned_fp32_len; +extern const unsigned char matmul_iq2_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq2_s_f16_cm1_len; +extern const unsigned char matmul_iq2_s_f16_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f16_cm2_len; +extern const unsigned char matmul_iq2_s_f16_cm2_data[]; + +extern const uint64_t matmul_iq2_s_f16_f16acc_len; +extern const unsigned char matmul_iq2_s_f16_f16acc_data[]; + +extern const uint64_t matmul_iq2_s_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq2_s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq2_s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq2_s_f16_fp32_len; +extern const unsigned char matmul_iq2_s_f16_fp32_data[]; + +extern const uint64_t matmul_iq2_s_f32_len; +extern const unsigned char matmul_iq2_s_f32_data[]; + +extern const uint64_t matmul_iq2_s_f32_aligned_len; +extern const unsigned char matmul_iq2_s_f32_aligned_data[]; + +extern const uint64_t matmul_iq2_s_f32_aligned_cm1_len; +extern const unsigned char matmul_iq2_s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq2_s_f32_aligned_f16acc_data[]; + +extern const uint64_t 
matmul_iq2_s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq2_s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f32_aligned_fp32_len; +extern const unsigned char matmul_iq2_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq2_s_f32_cm1_len; +extern const unsigned char matmul_iq2_s_f32_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f32_f16acc_len; +extern const unsigned char matmul_iq2_s_f32_f16acc_data[]; + +extern const uint64_t matmul_iq2_s_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq2_s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_s_f32_fp32_len; +extern const unsigned char matmul_iq2_s_f32_fp32_data[]; + +extern const uint64_t matmul_iq2_xs_f16_len; +extern const unsigned char matmul_iq2_xs_f16_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_cm1_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_cm2_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq2_xs_f16_aligned_fp32_len; +extern const unsigned char matmul_iq2_xs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq2_xs_f16_cm1_len; +extern const unsigned char matmul_iq2_xs_f16_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f16_cm2_len; +extern const unsigned char matmul_iq2_xs_f16_cm2_data[]; + +extern const uint64_t matmul_iq2_xs_f16_f16acc_len; +extern const 
unsigned char matmul_iq2_xs_f16_f16acc_data[]; + +extern const uint64_t matmul_iq2_xs_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq2_xs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq2_xs_f16_fp32_len; +extern const unsigned char matmul_iq2_xs_f16_fp32_data[]; + +extern const uint64_t matmul_iq2_xs_f32_len; +extern const unsigned char matmul_iq2_xs_f32_data[]; + +extern const uint64_t matmul_iq2_xs_f32_aligned_len; +extern const unsigned char matmul_iq2_xs_f32_aligned_data[]; + +extern const uint64_t matmul_iq2_xs_f32_aligned_cm1_len; +extern const unsigned char matmul_iq2_xs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq2_xs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq2_xs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f32_aligned_fp32_len; +extern const unsigned char matmul_iq2_xs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq2_xs_f32_cm1_len; +extern const unsigned char matmul_iq2_xs_f32_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f32_f16acc_len; +extern const unsigned char matmul_iq2_xs_f32_f16acc_data[]; + +extern const uint64_t matmul_iq2_xs_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xs_f32_fp32_len; +extern const unsigned char matmul_iq2_xs_f32_fp32_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_len; +extern const unsigned char matmul_iq2_xxs_f16_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_aligned_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_aligned_cm1_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_cm1_data[]; + +extern const uint64_t 
matmul_iq2_xxs_f16_aligned_cm2_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_aligned_fp32_len; +extern const unsigned char matmul_iq2_xxs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_cm1_len; +extern const unsigned char matmul_iq2_xxs_f16_cm1_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_cm2_len; +extern const unsigned char matmul_iq2_xxs_f16_cm2_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_f16acc_len; +extern const unsigned char matmul_iq2_xxs_f16_f16acc_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xxs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq2_xxs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq2_xxs_f16_fp32_len; +extern const unsigned char matmul_iq2_xxs_f16_fp32_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_len; +extern const unsigned char matmul_iq2_xxs_f32_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_aligned_len; +extern const unsigned char matmul_iq2_xxs_f32_aligned_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_aligned_cm1_len; +extern const unsigned char matmul_iq2_xxs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq2_xxs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xxs_f32_aligned_f16acc_cm1_data[]; + +extern 
const uint64_t matmul_iq2_xxs_f32_aligned_fp32_len; +extern const unsigned char matmul_iq2_xxs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_cm1_len; +extern const unsigned char matmul_iq2_xxs_f32_cm1_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_f16acc_len; +extern const unsigned char matmul_iq2_xxs_f32_f16acc_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq2_xxs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq2_xxs_f32_fp32_len; +extern const unsigned char matmul_iq2_xxs_f32_fp32_data[]; + +extern const uint64_t matmul_iq3_s_f16_len; +extern const unsigned char matmul_iq3_s_f16_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_len; +extern const unsigned char matmul_iq3_s_f16_aligned_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_cm1_len; +extern const unsigned char matmul_iq3_s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_cm2_len; +extern const unsigned char matmul_iq3_s_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq3_s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq3_s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq3_s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq3_s_f16_aligned_fp32_len; +extern const unsigned char matmul_iq3_s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq3_s_f16_cm1_len; +extern const unsigned char matmul_iq3_s_f16_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f16_cm2_len; +extern const unsigned char matmul_iq3_s_f16_cm2_data[]; + +extern const uint64_t matmul_iq3_s_f16_f16acc_len; +extern const unsigned char matmul_iq3_s_f16_f16acc_data[]; + +extern const uint64_t matmul_iq3_s_f16_f16acc_cm1_len; +extern const unsigned 
char matmul_iq3_s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq3_s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq3_s_f16_fp32_len; +extern const unsigned char matmul_iq3_s_f16_fp32_data[]; + +extern const uint64_t matmul_iq3_s_f32_len; +extern const unsigned char matmul_iq3_s_f32_data[]; + +extern const uint64_t matmul_iq3_s_f32_aligned_len; +extern const unsigned char matmul_iq3_s_f32_aligned_data[]; + +extern const uint64_t matmul_iq3_s_f32_aligned_cm1_len; +extern const unsigned char matmul_iq3_s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq3_s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq3_s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq3_s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f32_aligned_fp32_len; +extern const unsigned char matmul_iq3_s_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq3_s_f32_cm1_len; +extern const unsigned char matmul_iq3_s_f32_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f32_f16acc_len; +extern const unsigned char matmul_iq3_s_f32_f16acc_data[]; + +extern const uint64_t matmul_iq3_s_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq3_s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_s_f32_fp32_len; +extern const unsigned char matmul_iq3_s_f32_fp32_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_len; +extern const unsigned char matmul_iq3_xxs_f16_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_aligned_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_aligned_cm1_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_aligned_cm2_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_cm2_data[]; + +extern const uint64_t 
matmul_iq3_xxs_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_aligned_fp32_len; +extern const unsigned char matmul_iq3_xxs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_cm1_len; +extern const unsigned char matmul_iq3_xxs_f16_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_cm2_len; +extern const unsigned char matmul_iq3_xxs_f16_cm2_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_f16acc_len; +extern const unsigned char matmul_iq3_xxs_f16_f16acc_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq3_xxs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq3_xxs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq3_xxs_f16_fp32_len; +extern const unsigned char matmul_iq3_xxs_f16_fp32_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_len; +extern const unsigned char matmul_iq3_xxs_f32_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_aligned_len; +extern const unsigned char matmul_iq3_xxs_f32_aligned_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_aligned_cm1_len; +extern const unsigned char matmul_iq3_xxs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq3_xxs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq3_xxs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_aligned_fp32_len; +extern const unsigned char matmul_iq3_xxs_f32_aligned_fp32_data[]; + +extern 
const uint64_t matmul_iq3_xxs_f32_cm1_len; +extern const unsigned char matmul_iq3_xxs_f32_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_f16acc_len; +extern const unsigned char matmul_iq3_xxs_f32_f16acc_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq3_xxs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq3_xxs_f32_fp32_len; +extern const unsigned char matmul_iq3_xxs_f32_fp32_data[]; + +extern const uint64_t matmul_iq4_nl_f16_len; +extern const unsigned char matmul_iq4_nl_f16_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_cm1_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_cm2_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq4_nl_f16_aligned_fp32_len; +extern const unsigned char matmul_iq4_nl_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq4_nl_f16_cm1_len; +extern const unsigned char matmul_iq4_nl_f16_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f16_cm2_len; +extern const unsigned char matmul_iq4_nl_f16_cm2_data[]; + +extern const uint64_t matmul_iq4_nl_f16_f16acc_len; +extern const unsigned char matmul_iq4_nl_f16_f16acc_data[]; + +extern const uint64_t matmul_iq4_nl_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq4_nl_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f16_f16acc_cm2_len; +extern 
const unsigned char matmul_iq4_nl_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq4_nl_f16_fp32_len; +extern const unsigned char matmul_iq4_nl_f16_fp32_data[]; + +extern const uint64_t matmul_iq4_nl_f32_len; +extern const unsigned char matmul_iq4_nl_f32_data[]; + +extern const uint64_t matmul_iq4_nl_f32_aligned_len; +extern const unsigned char matmul_iq4_nl_f32_aligned_data[]; + +extern const uint64_t matmul_iq4_nl_f32_aligned_cm1_len; +extern const unsigned char matmul_iq4_nl_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq4_nl_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq4_nl_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq4_nl_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f32_aligned_fp32_len; +extern const unsigned char matmul_iq4_nl_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq4_nl_f32_cm1_len; +extern const unsigned char matmul_iq4_nl_f32_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f32_f16acc_len; +extern const unsigned char matmul_iq4_nl_f32_f16acc_data[]; + +extern const uint64_t matmul_iq4_nl_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq4_nl_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_nl_f32_fp32_len; +extern const unsigned char matmul_iq4_nl_f32_fp32_data[]; + +extern const uint64_t matmul_iq4_xs_f16_len; +extern const unsigned char matmul_iq4_xs_f16_data[]; + +extern const uint64_t matmul_iq4_xs_f16_aligned_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_data[]; + +extern const uint64_t matmul_iq4_xs_f16_aligned_cm1_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f16_aligned_cm2_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_iq4_xs_f16_aligned_f16acc_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_f16acc_data[]; + +extern const 
uint64_t matmul_iq4_xs_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq4_xs_f16_aligned_fp32_len; +extern const unsigned char matmul_iq4_xs_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_iq4_xs_f16_cm1_len; +extern const unsigned char matmul_iq4_xs_f16_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f16_cm2_len; +extern const unsigned char matmul_iq4_xs_f16_cm2_data[]; + +extern const uint64_t matmul_iq4_xs_f16_f16acc_len; +extern const unsigned char matmul_iq4_xs_f16_f16acc_data[]; + +extern const uint64_t matmul_iq4_xs_f16_f16acc_cm1_len; +extern const unsigned char matmul_iq4_xs_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f16_f16acc_cm2_len; +extern const unsigned char matmul_iq4_xs_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_iq4_xs_f16_fp32_len; +extern const unsigned char matmul_iq4_xs_f16_fp32_data[]; + +extern const uint64_t matmul_iq4_xs_f32_len; +extern const unsigned char matmul_iq4_xs_f32_data[]; + +extern const uint64_t matmul_iq4_xs_f32_aligned_len; +extern const unsigned char matmul_iq4_xs_f32_aligned_data[]; + +extern const uint64_t matmul_iq4_xs_f32_aligned_cm1_len; +extern const unsigned char matmul_iq4_xs_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f32_aligned_f16acc_len; +extern const unsigned char matmul_iq4_xs_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_iq4_xs_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_iq4_xs_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f32_aligned_fp32_len; +extern const unsigned char matmul_iq4_xs_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_iq4_xs_f32_cm1_len; +extern const unsigned char matmul_iq4_xs_f32_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f32_f16acc_len; 
+extern const unsigned char matmul_iq4_xs_f32_f16acc_data[]; + +extern const uint64_t matmul_iq4_xs_f32_f16acc_cm1_len; +extern const unsigned char matmul_iq4_xs_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_iq4_xs_f32_fp32_len; +extern const unsigned char matmul_iq4_xs_f32_fp32_data[]; + +extern const uint64_t matmul_mxfp4_f16_len; +extern const unsigned char matmul_mxfp4_f16_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_len; +extern const unsigned char matmul_mxfp4_f16_aligned_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_cm1_len; +extern const unsigned char matmul_mxfp4_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_cm2_len; +extern const unsigned char matmul_mxfp4_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_f16acc_len; +extern const unsigned char matmul_mxfp4_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_mxfp4_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_mxfp4_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_mxfp4_f16_aligned_fp32_len; +extern const unsigned char matmul_mxfp4_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_mxfp4_f16_cm1_len; +extern const unsigned char matmul_mxfp4_f16_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f16_cm2_len; +extern const unsigned char matmul_mxfp4_f16_cm2_data[]; + +extern const uint64_t matmul_mxfp4_f16_f16acc_len; +extern const unsigned char matmul_mxfp4_f16_f16acc_data[]; + +extern const uint64_t matmul_mxfp4_f16_f16acc_cm1_len; +extern const unsigned char matmul_mxfp4_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f16_f16acc_cm2_len; +extern const unsigned char matmul_mxfp4_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_mxfp4_f16_fp32_len; +extern const unsigned char matmul_mxfp4_f16_fp32_data[]; + +extern const 
uint64_t matmul_mxfp4_f32_len; +extern const unsigned char matmul_mxfp4_f32_data[]; + +extern const uint64_t matmul_mxfp4_f32_aligned_len; +extern const unsigned char matmul_mxfp4_f32_aligned_data[]; + +extern const uint64_t matmul_mxfp4_f32_aligned_cm1_len; +extern const unsigned char matmul_mxfp4_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f32_aligned_f16acc_len; +extern const unsigned char matmul_mxfp4_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_mxfp4_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_mxfp4_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f32_aligned_fp32_len; +extern const unsigned char matmul_mxfp4_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_mxfp4_f32_cm1_len; +extern const unsigned char matmul_mxfp4_f32_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f32_f16acc_len; +extern const unsigned char matmul_mxfp4_f32_f16acc_data[]; + +extern const uint64_t matmul_mxfp4_f32_f16acc_cm1_len; +extern const unsigned char matmul_mxfp4_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_mxfp4_f32_fp32_len; +extern const unsigned char matmul_mxfp4_f32_fp32_data[]; + +extern const uint64_t matmul_mxfp4_q8_1_len; +extern const unsigned char matmul_mxfp4_q8_1_data[]; + +extern const uint64_t matmul_mxfp4_q8_1_fp32_len; +extern const unsigned char matmul_mxfp4_q8_1_fp32_data[]; + +extern const uint64_t matmul_nvfp4_f16_len; +extern const unsigned char matmul_nvfp4_f16_data[]; + +extern const uint64_t matmul_nvfp4_f16_aligned_len; +extern const unsigned char matmul_nvfp4_f16_aligned_data[]; + +extern const uint64_t matmul_nvfp4_f16_aligned_cm1_len; +extern const unsigned char matmul_nvfp4_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f16_aligned_cm2_len; +extern const unsigned char matmul_nvfp4_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_nvfp4_f16_aligned_f16acc_len; +extern const unsigned char matmul_nvfp4_f16_aligned_f16acc_data[]; + +extern const 
uint64_t matmul_nvfp4_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_nvfp4_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_nvfp4_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_nvfp4_f16_aligned_fp32_len; +extern const unsigned char matmul_nvfp4_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_nvfp4_f16_cm1_len; +extern const unsigned char matmul_nvfp4_f16_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f16_cm2_len; +extern const unsigned char matmul_nvfp4_f16_cm2_data[]; + +extern const uint64_t matmul_nvfp4_f16_f16acc_len; +extern const unsigned char matmul_nvfp4_f16_f16acc_data[]; + +extern const uint64_t matmul_nvfp4_f16_f16acc_cm1_len; +extern const unsigned char matmul_nvfp4_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f16_f16acc_cm2_len; +extern const unsigned char matmul_nvfp4_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_nvfp4_f16_fp32_len; +extern const unsigned char matmul_nvfp4_f16_fp32_data[]; + +extern const uint64_t matmul_nvfp4_f32_len; +extern const unsigned char matmul_nvfp4_f32_data[]; + +extern const uint64_t matmul_nvfp4_f32_aligned_len; +extern const unsigned char matmul_nvfp4_f32_aligned_data[]; + +extern const uint64_t matmul_nvfp4_f32_aligned_cm1_len; +extern const unsigned char matmul_nvfp4_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f32_aligned_f16acc_len; +extern const unsigned char matmul_nvfp4_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_nvfp4_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_nvfp4_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f32_aligned_fp32_len; +extern const unsigned char matmul_nvfp4_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_nvfp4_f32_cm1_len; +extern const unsigned char matmul_nvfp4_f32_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f32_f16acc_len; +extern const unsigned char 
matmul_nvfp4_f32_f16acc_data[]; + +extern const uint64_t matmul_nvfp4_f32_f16acc_cm1_len; +extern const unsigned char matmul_nvfp4_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_nvfp4_f32_fp32_len; +extern const unsigned char matmul_nvfp4_f32_fp32_data[]; + +extern const uint64_t matmul_q1_0_f16_len; +extern const unsigned char matmul_q1_0_f16_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_len; +extern const unsigned char matmul_q1_0_f16_aligned_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_cm1_len; +extern const unsigned char matmul_q1_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_cm2_len; +extern const unsigned char matmul_q1_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_q1_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q1_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q1_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q1_0_f16_aligned_fp32_len; +extern const unsigned char matmul_q1_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q1_0_f16_cm1_len; +extern const unsigned char matmul_q1_0_f16_cm1_data[]; + +extern const uint64_t matmul_q1_0_f16_cm2_len; +extern const unsigned char matmul_q1_0_f16_cm2_data[]; + +extern const uint64_t matmul_q1_0_f16_f16acc_len; +extern const unsigned char matmul_q1_0_f16_f16acc_data[]; + +extern const uint64_t matmul_q1_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_q1_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q1_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_q1_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q1_0_f16_fp32_len; +extern const unsigned char matmul_q1_0_f16_fp32_data[]; + +extern const uint64_t matmul_q1_0_f32_len; +extern const unsigned char 
matmul_q1_0_f32_data[]; + +extern const uint64_t matmul_q1_0_f32_aligned_len; +extern const unsigned char matmul_q1_0_f32_aligned_data[]; + +extern const uint64_t matmul_q1_0_f32_aligned_cm1_len; +extern const unsigned char matmul_q1_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q1_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_q1_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q1_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q1_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q1_0_f32_aligned_fp32_len; +extern const unsigned char matmul_q1_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q1_0_f32_cm1_len; +extern const unsigned char matmul_q1_0_f32_cm1_data[]; + +extern const uint64_t matmul_q1_0_f32_f16acc_len; +extern const unsigned char matmul_q1_0_f32_f16acc_data[]; + +extern const uint64_t matmul_q1_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_q1_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q1_0_f32_fp32_len; +extern const unsigned char matmul_q1_0_f32_fp32_data[]; + +extern const uint64_t matmul_q2_k_f16_len; +extern const unsigned char matmul_q2_k_f16_data[]; + +extern const uint64_t matmul_q2_k_f16_aligned_len; +extern const unsigned char matmul_q2_k_f16_aligned_data[]; + +extern const uint64_t matmul_q2_k_f16_aligned_cm1_len; +extern const unsigned char matmul_q2_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q2_k_f16_aligned_cm2_len; +extern const unsigned char matmul_q2_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q2_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_q2_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q2_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q2_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q2_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q2_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t 
matmul_q2_k_f16_aligned_fp32_len; +extern const unsigned char matmul_q2_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q2_k_f16_cm1_len; +extern const unsigned char matmul_q2_k_f16_cm1_data[]; + +extern const uint64_t matmul_q2_k_f16_cm2_len; +extern const unsigned char matmul_q2_k_f16_cm2_data[]; + +extern const uint64_t matmul_q2_k_f16_f16acc_len; +extern const unsigned char matmul_q2_k_f16_f16acc_data[]; + +extern const uint64_t matmul_q2_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_q2_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q2_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_q2_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q2_k_f16_fp32_len; +extern const unsigned char matmul_q2_k_f16_fp32_data[]; + +extern const uint64_t matmul_q2_k_f32_len; +extern const unsigned char matmul_q2_k_f32_data[]; + +extern const uint64_t matmul_q2_k_f32_aligned_len; +extern const unsigned char matmul_q2_k_f32_aligned_data[]; + +extern const uint64_t matmul_q2_k_f32_aligned_cm1_len; +extern const unsigned char matmul_q2_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q2_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_q2_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q2_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q2_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q2_k_f32_aligned_fp32_len; +extern const unsigned char matmul_q2_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q2_k_f32_cm1_len; +extern const unsigned char matmul_q2_k_f32_cm1_data[]; + +extern const uint64_t matmul_q2_k_f32_f16acc_len; +extern const unsigned char matmul_q2_k_f32_f16acc_data[]; + +extern const uint64_t matmul_q2_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_q2_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q2_k_f32_fp32_len; +extern const unsigned char matmul_q2_k_f32_fp32_data[]; + +extern const uint64_t matmul_q2_k_q8_1_len; +extern const 
unsigned char matmul_q2_k_q8_1_data[]; + +extern const uint64_t matmul_q2_k_q8_1_fp32_len; +extern const unsigned char matmul_q2_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_q3_k_f16_len; +extern const unsigned char matmul_q3_k_f16_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_len; +extern const unsigned char matmul_q3_k_f16_aligned_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_cm1_len; +extern const unsigned char matmul_q3_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_cm2_len; +extern const unsigned char matmul_q3_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_q3_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q3_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q3_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q3_k_f16_aligned_fp32_len; +extern const unsigned char matmul_q3_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q3_k_f16_cm1_len; +extern const unsigned char matmul_q3_k_f16_cm1_data[]; + +extern const uint64_t matmul_q3_k_f16_cm2_len; +extern const unsigned char matmul_q3_k_f16_cm2_data[]; + +extern const uint64_t matmul_q3_k_f16_f16acc_len; +extern const unsigned char matmul_q3_k_f16_f16acc_data[]; + +extern const uint64_t matmul_q3_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_q3_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q3_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_q3_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q3_k_f16_fp32_len; +extern const unsigned char matmul_q3_k_f16_fp32_data[]; + +extern const uint64_t matmul_q3_k_f32_len; +extern const unsigned char matmul_q3_k_f32_data[]; + +extern const uint64_t matmul_q3_k_f32_aligned_len; +extern const unsigned char 
matmul_q3_k_f32_aligned_data[]; + +extern const uint64_t matmul_q3_k_f32_aligned_cm1_len; +extern const unsigned char matmul_q3_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q3_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_q3_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q3_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q3_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q3_k_f32_aligned_fp32_len; +extern const unsigned char matmul_q3_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q3_k_f32_cm1_len; +extern const unsigned char matmul_q3_k_f32_cm1_data[]; + +extern const uint64_t matmul_q3_k_f32_f16acc_len; +extern const unsigned char matmul_q3_k_f32_f16acc_data[]; + +extern const uint64_t matmul_q3_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_q3_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q3_k_f32_fp32_len; +extern const unsigned char matmul_q3_k_f32_fp32_data[]; + +extern const uint64_t matmul_q3_k_q8_1_len; +extern const unsigned char matmul_q3_k_q8_1_data[]; + +extern const uint64_t matmul_q3_k_q8_1_fp32_len; +extern const unsigned char matmul_q3_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_q4_0_f16_len; +extern const unsigned char matmul_q4_0_f16_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_len; +extern const unsigned char matmul_q4_0_f16_aligned_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_cm1_len; +extern const unsigned char matmul_q4_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_cm2_len; +extern const unsigned char matmul_q4_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_q4_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q4_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_f16acc_cm2_len; +extern const 
unsigned char matmul_q4_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q4_0_f16_aligned_fp32_len; +extern const unsigned char matmul_q4_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q4_0_f16_cm1_len; +extern const unsigned char matmul_q4_0_f16_cm1_data[]; + +extern const uint64_t matmul_q4_0_f16_cm2_len; +extern const unsigned char matmul_q4_0_f16_cm2_data[]; + +extern const uint64_t matmul_q4_0_f16_f16acc_len; +extern const unsigned char matmul_q4_0_f16_f16acc_data[]; + +extern const uint64_t matmul_q4_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_q4_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_q4_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q4_0_f16_fp32_len; +extern const unsigned char matmul_q4_0_f16_fp32_data[]; + +extern const uint64_t matmul_q4_0_f32_len; +extern const unsigned char matmul_q4_0_f32_data[]; + +extern const uint64_t matmul_q4_0_f32_aligned_len; +extern const unsigned char matmul_q4_0_f32_aligned_data[]; + +extern const uint64_t matmul_q4_0_f32_aligned_cm1_len; +extern const unsigned char matmul_q4_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q4_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_q4_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q4_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q4_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_0_f32_aligned_fp32_len; +extern const unsigned char matmul_q4_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q4_0_f32_cm1_len; +extern const unsigned char matmul_q4_0_f32_cm1_data[]; + +extern const uint64_t matmul_q4_0_f32_f16acc_len; +extern const unsigned char matmul_q4_0_f32_f16acc_data[]; + +extern const uint64_t matmul_q4_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_q4_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_0_f32_fp32_len; +extern const unsigned char 
matmul_q4_0_f32_fp32_data[]; + +extern const uint64_t matmul_q4_0_q8_1_len; +extern const unsigned char matmul_q4_0_q8_1_data[]; + +extern const uint64_t matmul_q4_0_q8_1_fp32_len; +extern const unsigned char matmul_q4_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_q4_1_f16_len; +extern const unsigned char matmul_q4_1_f16_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_len; +extern const unsigned char matmul_q4_1_f16_aligned_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_cm1_len; +extern const unsigned char matmul_q4_1_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_cm2_len; +extern const unsigned char matmul_q4_1_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_f16acc_len; +extern const unsigned char matmul_q4_1_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q4_1_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q4_1_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q4_1_f16_aligned_fp32_len; +extern const unsigned char matmul_q4_1_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q4_1_f16_cm1_len; +extern const unsigned char matmul_q4_1_f16_cm1_data[]; + +extern const uint64_t matmul_q4_1_f16_cm2_len; +extern const unsigned char matmul_q4_1_f16_cm2_data[]; + +extern const uint64_t matmul_q4_1_f16_f16acc_len; +extern const unsigned char matmul_q4_1_f16_f16acc_data[]; + +extern const uint64_t matmul_q4_1_f16_f16acc_cm1_len; +extern const unsigned char matmul_q4_1_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_1_f16_f16acc_cm2_len; +extern const unsigned char matmul_q4_1_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q4_1_f16_fp32_len; +extern const unsigned char matmul_q4_1_f16_fp32_data[]; + +extern const uint64_t matmul_q4_1_f32_len; +extern const unsigned char matmul_q4_1_f32_data[]; + +extern 
const uint64_t matmul_q4_1_f32_aligned_len; +extern const unsigned char matmul_q4_1_f32_aligned_data[]; + +extern const uint64_t matmul_q4_1_f32_aligned_cm1_len; +extern const unsigned char matmul_q4_1_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q4_1_f32_aligned_f16acc_len; +extern const unsigned char matmul_q4_1_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q4_1_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q4_1_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_1_f32_aligned_fp32_len; +extern const unsigned char matmul_q4_1_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q4_1_f32_cm1_len; +extern const unsigned char matmul_q4_1_f32_cm1_data[]; + +extern const uint64_t matmul_q4_1_f32_f16acc_len; +extern const unsigned char matmul_q4_1_f32_f16acc_data[]; + +extern const uint64_t matmul_q4_1_f32_f16acc_cm1_len; +extern const unsigned char matmul_q4_1_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_1_f32_fp32_len; +extern const unsigned char matmul_q4_1_f32_fp32_data[]; + +extern const uint64_t matmul_q4_1_q8_1_len; +extern const unsigned char matmul_q4_1_q8_1_data[]; + +extern const uint64_t matmul_q4_1_q8_1_fp32_len; +extern const unsigned char matmul_q4_1_q8_1_fp32_data[]; + +extern const uint64_t matmul_q4_k_f16_len; +extern const unsigned char matmul_q4_k_f16_data[]; + +extern const uint64_t matmul_q4_k_f16_aligned_len; +extern const unsigned char matmul_q4_k_f16_aligned_data[]; + +extern const uint64_t matmul_q4_k_f16_aligned_cm1_len; +extern const unsigned char matmul_q4_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q4_k_f16_aligned_cm2_len; +extern const unsigned char matmul_q4_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q4_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_q4_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q4_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q4_k_f16_aligned_f16acc_cm1_data[]; + +extern 
const uint64_t matmul_q4_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q4_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q4_k_f16_aligned_fp32_len; +extern const unsigned char matmul_q4_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q4_k_f16_cm1_len; +extern const unsigned char matmul_q4_k_f16_cm1_data[]; + +extern const uint64_t matmul_q4_k_f16_cm2_len; +extern const unsigned char matmul_q4_k_f16_cm2_data[]; + +extern const uint64_t matmul_q4_k_f16_f16acc_len; +extern const unsigned char matmul_q4_k_f16_f16acc_data[]; + +extern const uint64_t matmul_q4_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_q4_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_q4_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q4_k_f16_fp32_len; +extern const unsigned char matmul_q4_k_f16_fp32_data[]; + +extern const uint64_t matmul_q4_k_f32_len; +extern const unsigned char matmul_q4_k_f32_data[]; + +extern const uint64_t matmul_q4_k_f32_aligned_len; +extern const unsigned char matmul_q4_k_f32_aligned_data[]; + +extern const uint64_t matmul_q4_k_f32_aligned_cm1_len; +extern const unsigned char matmul_q4_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q4_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_q4_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q4_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q4_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q4_k_f32_aligned_fp32_len; +extern const unsigned char matmul_q4_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q4_k_f32_cm1_len; +extern const unsigned char matmul_q4_k_f32_cm1_data[]; + +extern const uint64_t matmul_q4_k_f32_f16acc_len; +extern const unsigned char matmul_q4_k_f32_f16acc_data[]; + +extern const uint64_t matmul_q4_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_q4_k_f32_f16acc_cm1_data[]; + +extern const 
uint64_t matmul_q4_k_f32_fp32_len; +extern const unsigned char matmul_q4_k_f32_fp32_data[]; + +extern const uint64_t matmul_q4_k_q8_1_len; +extern const unsigned char matmul_q4_k_q8_1_data[]; + +extern const uint64_t matmul_q4_k_q8_1_fp32_len; +extern const unsigned char matmul_q4_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_q5_0_f16_len; +extern const unsigned char matmul_q5_0_f16_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_len; +extern const unsigned char matmul_q5_0_f16_aligned_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_cm1_len; +extern const unsigned char matmul_q5_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_cm2_len; +extern const unsigned char matmul_q5_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_q5_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q5_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q5_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q5_0_f16_aligned_fp32_len; +extern const unsigned char matmul_q5_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q5_0_f16_cm1_len; +extern const unsigned char matmul_q5_0_f16_cm1_data[]; + +extern const uint64_t matmul_q5_0_f16_cm2_len; +extern const unsigned char matmul_q5_0_f16_cm2_data[]; + +extern const uint64_t matmul_q5_0_f16_f16acc_len; +extern const unsigned char matmul_q5_0_f16_f16acc_data[]; + +extern const uint64_t matmul_q5_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_q5_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_q5_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q5_0_f16_fp32_len; +extern const unsigned char matmul_q5_0_f16_fp32_data[]; + +extern const uint64_t matmul_q5_0_f32_len; 
+extern const unsigned char matmul_q5_0_f32_data[]; + +extern const uint64_t matmul_q5_0_f32_aligned_len; +extern const unsigned char matmul_q5_0_f32_aligned_data[]; + +extern const uint64_t matmul_q5_0_f32_aligned_cm1_len; +extern const unsigned char matmul_q5_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q5_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_q5_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q5_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q5_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_0_f32_aligned_fp32_len; +extern const unsigned char matmul_q5_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q5_0_f32_cm1_len; +extern const unsigned char matmul_q5_0_f32_cm1_data[]; + +extern const uint64_t matmul_q5_0_f32_f16acc_len; +extern const unsigned char matmul_q5_0_f32_f16acc_data[]; + +extern const uint64_t matmul_q5_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_q5_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_0_f32_fp32_len; +extern const unsigned char matmul_q5_0_f32_fp32_data[]; + +extern const uint64_t matmul_q5_0_q8_1_len; +extern const unsigned char matmul_q5_0_q8_1_data[]; + +extern const uint64_t matmul_q5_0_q8_1_fp32_len; +extern const unsigned char matmul_q5_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_q5_1_f16_len; +extern const unsigned char matmul_q5_1_f16_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_len; +extern const unsigned char matmul_q5_1_f16_aligned_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_cm1_len; +extern const unsigned char matmul_q5_1_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_cm2_len; +extern const unsigned char matmul_q5_1_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_f16acc_len; +extern const unsigned char matmul_q5_1_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_f16acc_cm1_len; +extern const 
unsigned char matmul_q5_1_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q5_1_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q5_1_f16_aligned_fp32_len; +extern const unsigned char matmul_q5_1_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q5_1_f16_cm1_len; +extern const unsigned char matmul_q5_1_f16_cm1_data[]; + +extern const uint64_t matmul_q5_1_f16_cm2_len; +extern const unsigned char matmul_q5_1_f16_cm2_data[]; + +extern const uint64_t matmul_q5_1_f16_f16acc_len; +extern const unsigned char matmul_q5_1_f16_f16acc_data[]; + +extern const uint64_t matmul_q5_1_f16_f16acc_cm1_len; +extern const unsigned char matmul_q5_1_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_1_f16_f16acc_cm2_len; +extern const unsigned char matmul_q5_1_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q5_1_f16_fp32_len; +extern const unsigned char matmul_q5_1_f16_fp32_data[]; + +extern const uint64_t matmul_q5_1_f32_len; +extern const unsigned char matmul_q5_1_f32_data[]; + +extern const uint64_t matmul_q5_1_f32_aligned_len; +extern const unsigned char matmul_q5_1_f32_aligned_data[]; + +extern const uint64_t matmul_q5_1_f32_aligned_cm1_len; +extern const unsigned char matmul_q5_1_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q5_1_f32_aligned_f16acc_len; +extern const unsigned char matmul_q5_1_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q5_1_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q5_1_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_1_f32_aligned_fp32_len; +extern const unsigned char matmul_q5_1_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q5_1_f32_cm1_len; +extern const unsigned char matmul_q5_1_f32_cm1_data[]; + +extern const uint64_t matmul_q5_1_f32_f16acc_len; +extern const unsigned char matmul_q5_1_f32_f16acc_data[]; + +extern const uint64_t matmul_q5_1_f32_f16acc_cm1_len; +extern const 
unsigned char matmul_q5_1_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_1_f32_fp32_len; +extern const unsigned char matmul_q5_1_f32_fp32_data[]; + +extern const uint64_t matmul_q5_1_q8_1_len; +extern const unsigned char matmul_q5_1_q8_1_data[]; + +extern const uint64_t matmul_q5_1_q8_1_fp32_len; +extern const unsigned char matmul_q5_1_q8_1_fp32_data[]; + +extern const uint64_t matmul_q5_k_f16_len; +extern const unsigned char matmul_q5_k_f16_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_len; +extern const unsigned char matmul_q5_k_f16_aligned_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_cm1_len; +extern const unsigned char matmul_q5_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_cm2_len; +extern const unsigned char matmul_q5_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_q5_k_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q5_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q5_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q5_k_f16_aligned_fp32_len; +extern const unsigned char matmul_q5_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q5_k_f16_cm1_len; +extern const unsigned char matmul_q5_k_f16_cm1_data[]; + +extern const uint64_t matmul_q5_k_f16_cm2_len; +extern const unsigned char matmul_q5_k_f16_cm2_data[]; + +extern const uint64_t matmul_q5_k_f16_f16acc_len; +extern const unsigned char matmul_q5_k_f16_f16acc_data[]; + +extern const uint64_t matmul_q5_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_q5_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_q5_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q5_k_f16_fp32_len; +extern const unsigned char 
matmul_q5_k_f16_fp32_data[]; + +extern const uint64_t matmul_q5_k_f32_len; +extern const unsigned char matmul_q5_k_f32_data[]; + +extern const uint64_t matmul_q5_k_f32_aligned_len; +extern const unsigned char matmul_q5_k_f32_aligned_data[]; + +extern const uint64_t matmul_q5_k_f32_aligned_cm1_len; +extern const unsigned char matmul_q5_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q5_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_q5_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q5_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q5_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_k_f32_aligned_fp32_len; +extern const unsigned char matmul_q5_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q5_k_f32_cm1_len; +extern const unsigned char matmul_q5_k_f32_cm1_data[]; + +extern const uint64_t matmul_q5_k_f32_f16acc_len; +extern const unsigned char matmul_q5_k_f32_f16acc_data[]; + +extern const uint64_t matmul_q5_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_q5_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q5_k_f32_fp32_len; +extern const unsigned char matmul_q5_k_f32_fp32_data[]; + +extern const uint64_t matmul_q5_k_q8_1_len; +extern const unsigned char matmul_q5_k_q8_1_data[]; + +extern const uint64_t matmul_q5_k_q8_1_fp32_len; +extern const unsigned char matmul_q5_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_q6_k_f16_len; +extern const unsigned char matmul_q6_k_f16_data[]; + +extern const uint64_t matmul_q6_k_f16_aligned_len; +extern const unsigned char matmul_q6_k_f16_aligned_data[]; + +extern const uint64_t matmul_q6_k_f16_aligned_cm1_len; +extern const unsigned char matmul_q6_k_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q6_k_f16_aligned_cm2_len; +extern const unsigned char matmul_q6_k_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q6_k_f16_aligned_f16acc_len; +extern const unsigned char matmul_q6_k_f16_aligned_f16acc_data[]; + 
+extern const uint64_t matmul_q6_k_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q6_k_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q6_k_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q6_k_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q6_k_f16_aligned_fp32_len; +extern const unsigned char matmul_q6_k_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q6_k_f16_cm1_len; +extern const unsigned char matmul_q6_k_f16_cm1_data[]; + +extern const uint64_t matmul_q6_k_f16_cm2_len; +extern const unsigned char matmul_q6_k_f16_cm2_data[]; + +extern const uint64_t matmul_q6_k_f16_f16acc_len; +extern const unsigned char matmul_q6_k_f16_f16acc_data[]; + +extern const uint64_t matmul_q6_k_f16_f16acc_cm1_len; +extern const unsigned char matmul_q6_k_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q6_k_f16_f16acc_cm2_len; +extern const unsigned char matmul_q6_k_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q6_k_f16_fp32_len; +extern const unsigned char matmul_q6_k_f16_fp32_data[]; + +extern const uint64_t matmul_q6_k_f32_len; +extern const unsigned char matmul_q6_k_f32_data[]; + +extern const uint64_t matmul_q6_k_f32_aligned_len; +extern const unsigned char matmul_q6_k_f32_aligned_data[]; + +extern const uint64_t matmul_q6_k_f32_aligned_cm1_len; +extern const unsigned char matmul_q6_k_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q6_k_f32_aligned_f16acc_len; +extern const unsigned char matmul_q6_k_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q6_k_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q6_k_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q6_k_f32_aligned_fp32_len; +extern const unsigned char matmul_q6_k_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q6_k_f32_cm1_len; +extern const unsigned char matmul_q6_k_f32_cm1_data[]; + +extern const uint64_t matmul_q6_k_f32_f16acc_len; +extern const unsigned char 
matmul_q6_k_f32_f16acc_data[]; + +extern const uint64_t matmul_q6_k_f32_f16acc_cm1_len; +extern const unsigned char matmul_q6_k_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q6_k_f32_fp32_len; +extern const unsigned char matmul_q6_k_f32_fp32_data[]; + +extern const uint64_t matmul_q6_k_q8_1_len; +extern const unsigned char matmul_q6_k_q8_1_data[]; + +extern const uint64_t matmul_q6_k_q8_1_fp32_len; +extern const unsigned char matmul_q6_k_q8_1_fp32_data[]; + +extern const uint64_t matmul_q8_0_f16_len; +extern const unsigned char matmul_q8_0_f16_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_len; +extern const unsigned char matmul_q8_0_f16_aligned_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_cm1_len; +extern const unsigned char matmul_q8_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_cm2_len; +extern const unsigned char matmul_q8_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_q8_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q8_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_q8_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_q8_0_f16_aligned_fp32_len; +extern const unsigned char matmul_q8_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_q8_0_f16_cm1_len; +extern const unsigned char matmul_q8_0_f16_cm1_data[]; + +extern const uint64_t matmul_q8_0_f16_cm2_len; +extern const unsigned char matmul_q8_0_f16_cm2_data[]; + +extern const uint64_t matmul_q8_0_f16_f16acc_len; +extern const unsigned char matmul_q8_0_f16_f16acc_data[]; + +extern const uint64_t matmul_q8_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_q8_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_q8_0_f16_f16acc_cm2_len; +extern const unsigned char 
matmul_q8_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_q8_0_f16_fp32_len; +extern const unsigned char matmul_q8_0_f16_fp32_data[]; + +extern const uint64_t matmul_q8_0_f32_len; +extern const unsigned char matmul_q8_0_f32_data[]; + +extern const uint64_t matmul_q8_0_f32_aligned_len; +extern const unsigned char matmul_q8_0_f32_aligned_data[]; + +extern const uint64_t matmul_q8_0_f32_aligned_cm1_len; +extern const unsigned char matmul_q8_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_q8_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_q8_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_q8_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_q8_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_q8_0_f32_aligned_fp32_len; +extern const unsigned char matmul_q8_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_q8_0_f32_cm1_len; +extern const unsigned char matmul_q8_0_f32_cm1_data[]; + +extern const uint64_t matmul_q8_0_f32_f16acc_len; +extern const unsigned char matmul_q8_0_f32_f16acc_data[]; + +extern const uint64_t matmul_q8_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_q8_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_q8_0_f32_fp32_len; +extern const unsigned char matmul_q8_0_f32_fp32_data[]; + +extern const uint64_t matmul_q8_0_q8_1_len; +extern const unsigned char matmul_q8_0_q8_1_data[]; + +extern const uint64_t matmul_q8_0_q8_1_fp32_len; +extern const unsigned char matmul_q8_0_q8_1_fp32_data[]; + +extern const uint64_t matmul_tq3_1s_f16_len; +extern const unsigned char matmul_tq3_1s_f16_data[]; + +extern const uint64_t matmul_tq3_1s_f16_aligned_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_data[]; + +extern const uint64_t matmul_tq3_1s_f16_aligned_cm1_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f16_aligned_cm2_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_cm2_data[]; + 
+extern const uint64_t matmul_tq3_1s_f16_aligned_f16acc_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_tq3_1s_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_tq3_1s_f16_aligned_fp32_len; +extern const unsigned char matmul_tq3_1s_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_tq3_1s_f16_cm1_len; +extern const unsigned char matmul_tq3_1s_f16_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f16_cm2_len; +extern const unsigned char matmul_tq3_1s_f16_cm2_data[]; + +extern const uint64_t matmul_tq3_1s_f16_f16acc_len; +extern const unsigned char matmul_tq3_1s_f16_f16acc_data[]; + +extern const uint64_t matmul_tq3_1s_f16_f16acc_cm1_len; +extern const unsigned char matmul_tq3_1s_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f16_f16acc_cm2_len; +extern const unsigned char matmul_tq3_1s_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_tq3_1s_f16_fp32_len; +extern const unsigned char matmul_tq3_1s_f16_fp32_data[]; + +extern const uint64_t matmul_tq3_1s_f32_len; +extern const unsigned char matmul_tq3_1s_f32_data[]; + +extern const uint64_t matmul_tq3_1s_f32_aligned_len; +extern const unsigned char matmul_tq3_1s_f32_aligned_data[]; + +extern const uint64_t matmul_tq3_1s_f32_aligned_cm1_len; +extern const unsigned char matmul_tq3_1s_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f32_aligned_f16acc_len; +extern const unsigned char matmul_tq3_1s_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_tq3_1s_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_tq3_1s_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f32_aligned_fp32_len; +extern const unsigned char matmul_tq3_1s_f32_aligned_fp32_data[]; + +extern const 
uint64_t matmul_tq3_1s_f32_cm1_len; +extern const unsigned char matmul_tq3_1s_f32_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f32_f16acc_len; +extern const unsigned char matmul_tq3_1s_f32_f16acc_data[]; + +extern const uint64_t matmul_tq3_1s_f32_f16acc_cm1_len; +extern const unsigned char matmul_tq3_1s_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_tq3_1s_f32_fp32_len; +extern const unsigned char matmul_tq3_1s_f32_fp32_data[]; + +extern const uint64_t matmul_turbo2_0_f16_len; +extern const unsigned char matmul_turbo2_0_f16_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_cm1_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_cm2_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_turbo2_0_f16_aligned_fp32_len; +extern const unsigned char matmul_turbo2_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_turbo2_0_f16_cm1_len; +extern const unsigned char matmul_turbo2_0_f16_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f16_cm2_len; +extern const unsigned char matmul_turbo2_0_f16_cm2_data[]; + +extern const uint64_t matmul_turbo2_0_f16_f16acc_len; +extern const unsigned char matmul_turbo2_0_f16_f16acc_data[]; + +extern const uint64_t matmul_turbo2_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_turbo2_0_f16_f16acc_cm1_data[]; + +extern const uint64_t 
matmul_turbo2_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_turbo2_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_turbo2_0_f16_fp32_len; +extern const unsigned char matmul_turbo2_0_f16_fp32_data[]; + +extern const uint64_t matmul_turbo2_0_f32_len; +extern const unsigned char matmul_turbo2_0_f32_data[]; + +extern const uint64_t matmul_turbo2_0_f32_aligned_len; +extern const unsigned char matmul_turbo2_0_f32_aligned_data[]; + +extern const uint64_t matmul_turbo2_0_f32_aligned_cm1_len; +extern const unsigned char matmul_turbo2_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_turbo2_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_turbo2_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_turbo2_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f32_aligned_fp32_len; +extern const unsigned char matmul_turbo2_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_turbo2_0_f32_cm1_len; +extern const unsigned char matmul_turbo2_0_f32_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f32_f16acc_len; +extern const unsigned char matmul_turbo2_0_f32_f16acc_data[]; + +extern const uint64_t matmul_turbo2_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_turbo2_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo2_0_f32_fp32_len; +extern const unsigned char matmul_turbo2_0_f32_fp32_data[]; + +extern const uint64_t matmul_turbo3_0_f16_len; +extern const unsigned char matmul_turbo3_0_f16_data[]; + +extern const uint64_t matmul_turbo3_0_f16_aligned_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_data[]; + +extern const uint64_t matmul_turbo3_0_f16_aligned_cm1_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f16_aligned_cm2_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_cm2_data[]; + +extern const uint64_t 
matmul_turbo3_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_turbo3_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_turbo3_0_f16_aligned_fp32_len; +extern const unsigned char matmul_turbo3_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_turbo3_0_f16_cm1_len; +extern const unsigned char matmul_turbo3_0_f16_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f16_cm2_len; +extern const unsigned char matmul_turbo3_0_f16_cm2_data[]; + +extern const uint64_t matmul_turbo3_0_f16_f16acc_len; +extern const unsigned char matmul_turbo3_0_f16_f16acc_data[]; + +extern const uint64_t matmul_turbo3_0_f16_f16acc_cm1_len; +extern const unsigned char matmul_turbo3_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_turbo3_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_turbo3_0_f16_fp32_len; +extern const unsigned char matmul_turbo3_0_f16_fp32_data[]; + +extern const uint64_t matmul_turbo3_0_f32_len; +extern const unsigned char matmul_turbo3_0_f32_data[]; + +extern const uint64_t matmul_turbo3_0_f32_aligned_len; +extern const unsigned char matmul_turbo3_0_f32_aligned_data[]; + +extern const uint64_t matmul_turbo3_0_f32_aligned_cm1_len; +extern const unsigned char matmul_turbo3_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_turbo3_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_turbo3_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_turbo3_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f32_aligned_fp32_len; +extern const unsigned char 
matmul_turbo3_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_turbo3_0_f32_cm1_len; +extern const unsigned char matmul_turbo3_0_f32_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f32_f16acc_len; +extern const unsigned char matmul_turbo3_0_f32_f16acc_data[]; + +extern const uint64_t matmul_turbo3_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_turbo3_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo3_0_f32_fp32_len; +extern const unsigned char matmul_turbo3_0_f32_fp32_data[]; + +extern const uint64_t matmul_turbo4_0_f16_len; +extern const unsigned char matmul_turbo4_0_f16_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_cm1_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_cm2_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_cm2_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_f16acc_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_f16acc_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_f16acc_cm1_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_f16acc_cm2_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_f16acc_cm2_data[]; + +extern const uint64_t matmul_turbo4_0_f16_aligned_fp32_len; +extern const unsigned char matmul_turbo4_0_f16_aligned_fp32_data[]; + +extern const uint64_t matmul_turbo4_0_f16_cm1_len; +extern const unsigned char matmul_turbo4_0_f16_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f16_cm2_len; +extern const unsigned char matmul_turbo4_0_f16_cm2_data[]; + +extern const uint64_t matmul_turbo4_0_f16_f16acc_len; +extern const unsigned char matmul_turbo4_0_f16_f16acc_data[]; + +extern const uint64_t matmul_turbo4_0_f16_f16acc_cm1_len; +extern const unsigned char 
matmul_turbo4_0_f16_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f16_f16acc_cm2_len; +extern const unsigned char matmul_turbo4_0_f16_f16acc_cm2_data[]; + +extern const uint64_t matmul_turbo4_0_f16_fp32_len; +extern const unsigned char matmul_turbo4_0_f16_fp32_data[]; + +extern const uint64_t matmul_turbo4_0_f32_len; +extern const unsigned char matmul_turbo4_0_f32_data[]; + +extern const uint64_t matmul_turbo4_0_f32_aligned_len; +extern const unsigned char matmul_turbo4_0_f32_aligned_data[]; + +extern const uint64_t matmul_turbo4_0_f32_aligned_cm1_len; +extern const unsigned char matmul_turbo4_0_f32_aligned_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f32_aligned_f16acc_len; +extern const unsigned char matmul_turbo4_0_f32_aligned_f16acc_data[]; + +extern const uint64_t matmul_turbo4_0_f32_aligned_f16acc_cm1_len; +extern const unsigned char matmul_turbo4_0_f32_aligned_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f32_aligned_fp32_len; +extern const unsigned char matmul_turbo4_0_f32_aligned_fp32_data[]; + +extern const uint64_t matmul_turbo4_0_f32_cm1_len; +extern const unsigned char matmul_turbo4_0_f32_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f32_f16acc_len; +extern const unsigned char matmul_turbo4_0_f32_f16acc_data[]; + +extern const uint64_t matmul_turbo4_0_f32_f16acc_cm1_len; +extern const unsigned char matmul_turbo4_0_f32_f16acc_cm1_data[]; + +extern const uint64_t matmul_turbo4_0_f32_fp32_len; +extern const unsigned char matmul_turbo4_0_f32_fp32_data[]; + +extern const uint64_t mul_f16_f16_f16_len; +extern const unsigned char mul_f16_f16_f16_data[]; + +extern const uint64_t mul_f16_f16_f32_len; +extern const unsigned char mul_f16_f16_f32_data[]; + +extern const uint64_t mul_f16_f32_f16_len; +extern const unsigned char mul_f16_f32_f16_data[]; + +extern const uint64_t mul_f16_f32_f32_len; +extern const unsigned char mul_f16_f32_f32_data[]; + +extern const uint64_t mul_f32_len; +extern const unsigned char 
mul_f32_data[]; + +extern const uint64_t mul_f32_f16_f16_len; +extern const unsigned char mul_f32_f16_f16_data[]; + +extern const uint64_t mul_f32_f16_f32_len; +extern const unsigned char mul_f32_f16_f32_data[]; + +extern const uint64_t mul_f32_f32_f16_len; +extern const unsigned char mul_f32_f32_f16_data[]; + +extern const uint64_t mul_f32_f32_f32_len; +extern const unsigned char mul_f32_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_bf16_f16_f32_len; +extern const unsigned char mul_mat_vec_bf16_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_bf16_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_bf16_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_bf16_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_bf16_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_bf16_f32_f32_len; +extern const unsigned char mul_mat_vec_bf16_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_bf16_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_bf16_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_bf16_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_bf16_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_f16_f16_f32_len; +extern const unsigned char mul_mat_vec_f16_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_f16_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_f16_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_f16_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_f16_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_f16_f32_f32_len; +extern const unsigned char mul_mat_vec_f16_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_f16_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_f16_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_f16_f32_f32_subgroup_no_shmem_len; +extern const unsigned char 
mul_mat_vec_f16_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_f32_f16_f32_len; +extern const unsigned char mul_mat_vec_f32_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_f32_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_f32_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_f32_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_f32_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_f32_f32_f32_len; +extern const unsigned char mul_mat_vec_f32_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_f32_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_f32_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_f32_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_f32_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_bf16_f32_f32_len; +extern const unsigned char mul_mat_vec_id_bf16_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_bf16_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_bf16_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_bf16_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_bf16_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_f16_f32_f32_len; +extern const unsigned char mul_mat_vec_id_f16_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_f16_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_f16_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_f16_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_f16_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_f32_f32_f32_len; +extern const unsigned char mul_mat_vec_id_f32_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_f32_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_f32_f32_f32_subgroup_data[]; + +extern const uint64_t 
mul_mat_vec_id_f32_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_f32_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_m_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq1_m_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_m_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq1_m_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_m_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq1_m_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_m_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_iq1_m_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_m_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq1_m_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_m_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq1_m_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_s_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq1_s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq1_s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq1_s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_s_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_iq1_s_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_s_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq1_s_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq1_s_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq1_s_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_s_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq2_s_f32_f32_data[]; + +extern const uint64_t 
mul_mat_vec_id_iq2_s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq2_s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq2_s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_xs_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq2_xs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_xs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq2_xs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_xs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq2_xs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_xxs_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq2_xxs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_xxs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq2_xxs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq2_xxs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq2_xxs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq3_s_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq3_s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq3_s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq3_s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq3_s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq3_s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq3_xxs_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq3_xxs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq3_xxs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq3_xxs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq3_xxs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char 
mul_mat_vec_id_iq3_xxs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq4_nl_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq4_nl_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq4_nl_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq4_nl_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq4_nl_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq4_nl_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_iq4_xs_f32_f32_len; +extern const unsigned char mul_mat_vec_id_iq4_xs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_iq4_xs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_iq4_xs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_iq4_xs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_iq4_xs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_mxfp4_f32_f32_len; +extern const unsigned char mul_mat_vec_id_mxfp4_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_mxfp4_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_mxfp4_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_mxfp4_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_mxfp4_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_mxfp4_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_mxfp4_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_mxfp4_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_mxfp4_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_mxfp4_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_mxfp4_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_nvfp4_f32_f32_len; +extern const unsigned char mul_mat_vec_id_nvfp4_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_nvfp4_f32_f32_subgroup_len; +extern const unsigned char 
mul_mat_vec_id_nvfp4_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_nvfp4_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_nvfp4_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q1_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q1_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q1_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q1_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q1_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q1_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q2_k_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q2_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q2_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q2_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q2_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q2_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q2_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q2_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q2_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q2_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q2_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q2_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q3_k_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q3_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q3_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q3_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q3_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q3_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q3_k_q8_1_f32_len; +extern const unsigned char 
mul_mat_vec_id_q3_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q3_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q3_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q3_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q3_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q4_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q4_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q4_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q4_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q4_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q4_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q4_0_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q4_0_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q4_0_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q4_0_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q4_0_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q4_0_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q4_1_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q4_1_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q4_1_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q4_1_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q4_1_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q4_1_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q4_1_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q4_1_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q4_1_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q4_1_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q4_1_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char 
mul_mat_vec_id_q4_1_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q4_k_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q4_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q4_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q4_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q4_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q4_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q4_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q4_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q4_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q4_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q4_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q4_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q5_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q5_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q5_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q5_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q5_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q5_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q5_0_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q5_0_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q5_0_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q5_0_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q5_0_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q5_0_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q5_1_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q5_1_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q5_1_f32_f32_subgroup_len; +extern const unsigned char 
mul_mat_vec_id_q5_1_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q5_1_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q5_1_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q5_1_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q5_1_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q5_1_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q5_1_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q5_1_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q5_1_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q5_k_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q5_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q5_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q5_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q5_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q5_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q5_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q5_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q5_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q5_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q5_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q5_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q6_k_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q6_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q6_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q6_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q6_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q6_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q6_k_q8_1_f32_len; +extern const unsigned char 
mul_mat_vec_id_q6_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q6_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q6_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q6_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q6_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q8_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_q8_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q8_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q8_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q8_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q8_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_q8_0_q8_1_f32_len; +extern const unsigned char mul_mat_vec_id_q8_0_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_id_q8_0_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_q8_0_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_q8_0_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_q8_0_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_tq3_1s_f32_f32_len; +extern const unsigned char mul_mat_vec_id_tq3_1s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_tq3_1s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_tq3_1s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_tq3_1s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_tq3_1s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_tq4_1s_f32_f32_len; +extern const unsigned char mul_mat_vec_id_tq4_1s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_tq4_1s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_tq4_1s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_tq4_1s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char 
mul_mat_vec_id_tq4_1s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_turbo2_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_turbo2_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_turbo2_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_turbo2_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_turbo2_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_turbo2_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_turbo3_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_turbo3_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_turbo3_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_turbo3_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_turbo3_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_turbo3_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_id_turbo4_0_f32_f32_len; +extern const unsigned char mul_mat_vec_id_turbo4_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_id_turbo4_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_id_turbo4_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_id_turbo4_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_id_turbo4_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_f16_f32_len; +extern const unsigned char mul_mat_vec_iq1_m_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq1_m_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq1_m_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_f32_f32_len; +extern const unsigned char mul_mat_vec_iq1_m_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_f32_f32_subgroup_len; +extern const 
unsigned char mul_mat_vec_iq1_m_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq1_m_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_q8_1_f32_len; +extern const unsigned char mul_mat_vec_iq1_m_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq1_m_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq1_m_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq1_m_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_f16_f32_len; +extern const unsigned char mul_mat_vec_iq1_s_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq1_s_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq1_s_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_f32_f32_len; +extern const unsigned char mul_mat_vec_iq1_s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq1_s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq1_s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_q8_1_f32_len; +extern const unsigned char mul_mat_vec_iq1_s_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq1_s_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq1_s_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq1_s_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq2_s_f16_f32_len; +extern const unsigned char mul_mat_vec_iq2_s_f16_f32_data[]; + +extern const uint64_t 
mul_mat_vec_iq2_s_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq2_s_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq2_s_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq2_s_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq2_s_f32_f32_len; +extern const unsigned char mul_mat_vec_iq2_s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq2_s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq2_s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq2_s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq2_s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq2_xs_f16_f32_len; +extern const unsigned char mul_mat_vec_iq2_xs_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq2_xs_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq2_xs_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq2_xs_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq2_xs_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq2_xs_f32_f32_len; +extern const unsigned char mul_mat_vec_iq2_xs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq2_xs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq2_xs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq2_xs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq2_xs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq2_xxs_f16_f32_len; +extern const unsigned char mul_mat_vec_iq2_xxs_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq2_xxs_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq2_xxs_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq2_xxs_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq2_xxs_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq2_xxs_f32_f32_len; +extern const 
unsigned char mul_mat_vec_iq2_xxs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq2_xxs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq2_xxs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq2_xxs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq2_xxs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq3_s_f16_f32_len; +extern const unsigned char mul_mat_vec_iq3_s_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq3_s_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq3_s_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq3_s_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq3_s_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq3_s_f32_f32_len; +extern const unsigned char mul_mat_vec_iq3_s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq3_s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq3_s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq3_s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq3_s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq3_xxs_f16_f32_len; +extern const unsigned char mul_mat_vec_iq3_xxs_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq3_xxs_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq3_xxs_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq3_xxs_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq3_xxs_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq3_xxs_f32_f32_len; +extern const unsigned char mul_mat_vec_iq3_xxs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq3_xxs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq3_xxs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq3_xxs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char 
mul_mat_vec_iq3_xxs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq4_nl_f16_f32_len; +extern const unsigned char mul_mat_vec_iq4_nl_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq4_nl_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq4_nl_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq4_nl_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq4_nl_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq4_nl_f32_f32_len; +extern const unsigned char mul_mat_vec_iq4_nl_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq4_nl_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq4_nl_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq4_nl_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq4_nl_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq4_xs_f16_f32_len; +extern const unsigned char mul_mat_vec_iq4_xs_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_iq4_xs_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq4_xs_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq4_xs_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq4_xs_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_iq4_xs_f32_f32_len; +extern const unsigned char mul_mat_vec_iq4_xs_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_iq4_xs_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_iq4_xs_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_iq4_xs_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_iq4_xs_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_f16_f32_len; +extern const unsigned char mul_mat_vec_mxfp4_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_mxfp4_f16_f32_subgroup_data[]; + +extern const uint64_t 
mul_mat_vec_mxfp4_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_mxfp4_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_f32_f32_len; +extern const unsigned char mul_mat_vec_mxfp4_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_mxfp4_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_mxfp4_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_q8_1_f32_len; +extern const unsigned char mul_mat_vec_mxfp4_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_mxfp4_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_mxfp4_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_mxfp4_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_nc_f16_f32_len; +extern const unsigned char mul_mat_vec_nc_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_nvfp4_f16_f32_len; +extern const unsigned char mul_mat_vec_nvfp4_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_nvfp4_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_nvfp4_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_nvfp4_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_nvfp4_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_nvfp4_f32_f32_len; +extern const unsigned char mul_mat_vec_nvfp4_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_nvfp4_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_nvfp4_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_nvfp4_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_nvfp4_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_p021_f16_f32_len; +extern const unsigned char mul_mat_vec_p021_f16_f32_data[]; + 
+extern const uint64_t mul_mat_vec_p021_f16_f32_subgroup_add_len; +extern const unsigned char mul_mat_vec_p021_f16_f32_subgroup_add_data[]; + +extern const uint64_t mul_mat_vec_q1_0_f16_f32_len; +extern const unsigned char mul_mat_vec_q1_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q1_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q1_0_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q1_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q1_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q1_0_f32_f32_len; +extern const unsigned char mul_mat_vec_q1_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q1_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q1_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q1_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q1_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q2_k_f16_f32_len; +extern const unsigned char mul_mat_vec_q2_k_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q2_k_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q2_k_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q2_k_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q2_k_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q2_k_f32_f32_len; +extern const unsigned char mul_mat_vec_q2_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q2_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q2_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q2_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q2_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q2_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q2_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q2_k_q8_1_f32_subgroup_len; +extern const unsigned char 
mul_mat_vec_q2_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q2_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q2_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q3_k_f16_f32_len; +extern const unsigned char mul_mat_vec_q3_k_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q3_k_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q3_k_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q3_k_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q3_k_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q3_k_f32_f32_len; +extern const unsigned char mul_mat_vec_q3_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q3_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q3_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q3_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q3_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q3_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q3_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q3_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q3_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q3_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q3_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_0_f16_f32_len; +extern const unsigned char mul_mat_vec_q4_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_0_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_0_f32_f32_len; +extern const unsigned char mul_mat_vec_q4_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_0_f32_f32_subgroup_len; +extern 
const unsigned char mul_mat_vec_q4_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_0_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q4_0_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_0_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_0_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_0_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_0_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_1_f16_f32_len; +extern const unsigned char mul_mat_vec_q4_1_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_1_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_1_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_1_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_1_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_1_f32_f32_len; +extern const unsigned char mul_mat_vec_q4_1_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_1_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_1_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_1_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_1_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_1_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q4_1_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_1_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_1_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_1_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_1_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_k_f16_f32_len; +extern const unsigned char mul_mat_vec_q4_k_f16_f32_data[]; + +extern const uint64_t 
mul_mat_vec_q4_k_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_k_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_k_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_k_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_k_f32_f32_len; +extern const unsigned char mul_mat_vec_q4_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q4_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q4_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q4_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q4_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q4_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q4_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_0_f16_f32_len; +extern const unsigned char mul_mat_vec_q5_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_0_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_0_f32_f32_len; +extern const unsigned char mul_mat_vec_q5_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_0_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q5_0_q8_1_f32_data[]; + 
+extern const uint64_t mul_mat_vec_q5_0_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_0_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_0_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_0_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_1_f16_f32_len; +extern const unsigned char mul_mat_vec_q5_1_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_1_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_1_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_1_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_1_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_1_f32_f32_len; +extern const unsigned char mul_mat_vec_q5_1_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_1_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_1_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_1_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_1_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_1_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q5_1_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_1_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_1_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_1_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_1_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_k_f16_f32_len; +extern const unsigned char mul_mat_vec_q5_k_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_k_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_k_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_k_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_k_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_k_f32_f32_len; +extern const unsigned char 
mul_mat_vec_q5_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q5_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q5_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q5_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q5_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q5_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q5_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q6_k_f16_f32_len; +extern const unsigned char mul_mat_vec_q6_k_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q6_k_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q6_k_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q6_k_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q6_k_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q6_k_f32_f32_len; +extern const unsigned char mul_mat_vec_q6_k_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q6_k_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q6_k_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q6_k_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q6_k_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q6_k_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q6_k_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q6_k_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q6_k_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q6_k_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q6_k_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q8_0_f16_f32_len; 
+extern const unsigned char mul_mat_vec_q8_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_q8_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q8_0_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q8_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q8_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q8_0_f32_f32_len; +extern const unsigned char mul_mat_vec_q8_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_q8_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q8_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q8_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q8_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_q8_0_q8_1_f32_len; +extern const unsigned char mul_mat_vec_q8_0_q8_1_f32_data[]; + +extern const uint64_t mul_mat_vec_q8_0_q8_1_f32_subgroup_len; +extern const unsigned char mul_mat_vec_q8_0_q8_1_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_q8_0_q8_1_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_q8_0_q8_1_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_tq3_1s_f16_f32_len; +extern const unsigned char mul_mat_vec_tq3_1s_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_tq3_1s_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_tq3_1s_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_tq3_1s_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_tq3_1s_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_tq3_1s_f32_f32_len; +extern const unsigned char mul_mat_vec_tq3_1s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_tq3_1s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_tq3_1s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_tq3_1s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_tq3_1s_f32_f32_subgroup_no_shmem_data[]; + +extern 
const uint64_t mul_mat_vec_tq4_1s_f16_f32_len; +extern const unsigned char mul_mat_vec_tq4_1s_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_tq4_1s_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_tq4_1s_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_tq4_1s_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_tq4_1s_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_tq4_1s_f32_f32_len; +extern const unsigned char mul_mat_vec_tq4_1s_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_tq4_1s_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_tq4_1s_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_tq4_1s_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_tq4_1s_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_turbo2_0_f16_f32_len; +extern const unsigned char mul_mat_vec_turbo2_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_turbo2_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_turbo2_0_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_turbo2_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_turbo2_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_turbo2_0_f32_f32_len; +extern const unsigned char mul_mat_vec_turbo2_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_turbo2_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_turbo2_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_turbo2_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_turbo2_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_turbo3_0_f16_f32_len; +extern const unsigned char mul_mat_vec_turbo3_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_turbo3_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_turbo3_0_f16_f32_subgroup_data[]; + +extern const uint64_t 
mul_mat_vec_turbo3_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_turbo3_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_turbo3_0_f32_f32_len; +extern const unsigned char mul_mat_vec_turbo3_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_turbo3_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_turbo3_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_turbo3_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_turbo3_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_turbo4_0_f16_f32_len; +extern const unsigned char mul_mat_vec_turbo4_0_f16_f32_data[]; + +extern const uint64_t mul_mat_vec_turbo4_0_f16_f32_subgroup_len; +extern const unsigned char mul_mat_vec_turbo4_0_f16_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_turbo4_0_f16_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_turbo4_0_f16_f32_subgroup_no_shmem_data[]; + +extern const uint64_t mul_mat_vec_turbo4_0_f32_f32_len; +extern const unsigned char mul_mat_vec_turbo4_0_f32_f32_data[]; + +extern const uint64_t mul_mat_vec_turbo4_0_f32_f32_subgroup_len; +extern const unsigned char mul_mat_vec_turbo4_0_f32_f32_subgroup_data[]; + +extern const uint64_t mul_mat_vec_turbo4_0_f32_f32_subgroup_no_shmem_len; +extern const unsigned char mul_mat_vec_turbo4_0_f32_f32_subgroup_no_shmem_data[]; + +extern const uint64_t multi_add_f32_len; +extern const unsigned char multi_add_f32_data[]; + +extern const uint64_t multi_add_rms_f32_len; +extern const unsigned char multi_add_rms_f32_data[]; + +extern const uint64_t neg_f16_len; +extern const unsigned char neg_f16_data[]; + +extern const uint64_t neg_f32_len; +extern const unsigned char neg_f32_data[]; + +extern const uint64_t norm_f32_len; +extern const unsigned char norm_f32_data[]; + +extern const uint64_t opt_step_adamw_f32_len; +extern const unsigned char opt_step_adamw_f32_data[]; + +extern const uint64_t 
opt_step_sgd_f32_len; +extern const unsigned char opt_step_sgd_f32_data[]; + +extern const uint64_t pad_f32_len; +extern const unsigned char pad_f32_data[]; + +extern const uint64_t pool2d_f32_len; +extern const unsigned char pool2d_f32_data[]; + +extern const uint64_t quantize_q8_1_len; +extern const unsigned char quantize_q8_1_data[]; + +extern const uint64_t quantize_q8_1_subgroup_len; +extern const unsigned char quantize_q8_1_subgroup_data[]; + +extern const uint64_t quantize_q8_1_x4_len; +extern const unsigned char quantize_q8_1_x4_data[]; + +extern const uint64_t quantize_q8_1_x4_subgroup_len; +extern const unsigned char quantize_q8_1_x4_subgroup_data[]; + +extern const uint64_t reglu_f16_len; +extern const unsigned char reglu_f16_data[]; + +extern const uint64_t reglu_f32_len; +extern const unsigned char reglu_f32_data[]; + +extern const uint64_t relu_f16_len; +extern const unsigned char relu_f16_data[]; + +extern const uint64_t relu_f32_len; +extern const unsigned char relu_f32_data[]; + +extern const uint64_t repeat_back_f32_len; +extern const unsigned char repeat_back_f32_data[]; + +extern const uint64_t repeat_f32_len; +extern const unsigned char repeat_f32_data[]; + +extern const uint64_t rms_norm_back_f32_len; +extern const unsigned char rms_norm_back_f32_data[]; + +extern const uint64_t rms_norm_f32_len; +extern const unsigned char rms_norm_f32_data[]; + +extern const uint64_t rms_norm_mul_rope_f32_f16_len; +extern const unsigned char rms_norm_mul_rope_f32_f16_data[]; + +extern const uint64_t rms_norm_mul_rope_f32_f32_len; +extern const unsigned char rms_norm_mul_rope_f32_f32_data[]; + +extern const uint64_t rms_norm_partials_f32_len; +extern const unsigned char rms_norm_partials_f32_data[]; + +extern const uint64_t roll_f32_len; +extern const unsigned char roll_f32_data[]; + +extern const uint64_t rope_multi_f16_len; +extern const unsigned char rope_multi_f16_data[]; + +extern const uint64_t rope_multi_f32_len; +extern const unsigned char 
rope_multi_f32_data[]; + +extern const uint64_t rope_multi_f32_f16_len; +extern const unsigned char rope_multi_f32_f16_data[]; + +extern const uint64_t rope_neox_f16_len; +extern const unsigned char rope_neox_f16_data[]; + +extern const uint64_t rope_neox_f32_len; +extern const unsigned char rope_neox_f32_data[]; + +extern const uint64_t rope_neox_f32_f16_len; +extern const unsigned char rope_neox_f32_f16_data[]; + +extern const uint64_t rope_norm_f16_len; +extern const unsigned char rope_norm_f16_data[]; + +extern const uint64_t rope_norm_f32_len; +extern const unsigned char rope_norm_f32_data[]; + +extern const uint64_t rope_norm_f32_f16_len; +extern const unsigned char rope_norm_f32_f16_data[]; + +extern const uint64_t rope_vision_f16_len; +extern const unsigned char rope_vision_f16_data[]; + +extern const uint64_t rope_vision_f32_len; +extern const unsigned char rope_vision_f32_data[]; + +extern const uint64_t round_f16_len; +extern const unsigned char round_f16_data[]; + +extern const uint64_t round_f32_len; +extern const unsigned char round_f32_data[]; + +extern const uint64_t rwkv_wkv6_f32_len; +extern const unsigned char rwkv_wkv6_f32_data[]; + +extern const uint64_t rwkv_wkv7_f32_len; +extern const unsigned char rwkv_wkv7_f32_data[]; + +extern const uint64_t scale_f32_len; +extern const unsigned char scale_f32_data[]; + +extern const uint64_t set_rows_bf16_i32_len; +extern const unsigned char set_rows_bf16_i32_data[]; + +extern const uint64_t set_rows_bf16_i64_len; +extern const unsigned char set_rows_bf16_i64_data[]; + +extern const uint64_t set_rows_f16_i32_len; +extern const unsigned char set_rows_f16_i32_data[]; + +extern const uint64_t set_rows_f16_i64_len; +extern const unsigned char set_rows_f16_i64_data[]; + +extern const uint64_t set_rows_f32_i32_len; +extern const unsigned char set_rows_f32_i32_data[]; + +extern const uint64_t set_rows_f32_i64_len; +extern const unsigned char set_rows_f32_i64_data[]; + +extern const uint64_t 
set_rows_iq4_nl_i32_len; +extern const unsigned char set_rows_iq4_nl_i32_data[]; + +extern const uint64_t set_rows_iq4_nl_i64_len; +extern const unsigned char set_rows_iq4_nl_i64_data[]; + +extern const uint64_t set_rows_q1_0_i32_len; +extern const unsigned char set_rows_q1_0_i32_data[]; + +extern const uint64_t set_rows_q1_0_i64_len; +extern const unsigned char set_rows_q1_0_i64_data[]; + +extern const uint64_t set_rows_q4_0_i32_len; +extern const unsigned char set_rows_q4_0_i32_data[]; + +extern const uint64_t set_rows_q4_0_i64_len; +extern const unsigned char set_rows_q4_0_i64_data[]; + +extern const uint64_t set_rows_q4_1_i32_len; +extern const unsigned char set_rows_q4_1_i32_data[]; + +extern const uint64_t set_rows_q4_1_i64_len; +extern const unsigned char set_rows_q4_1_i64_data[]; + +extern const uint64_t set_rows_q5_0_i32_len; +extern const unsigned char set_rows_q5_0_i32_data[]; + +extern const uint64_t set_rows_q5_0_i64_len; +extern const unsigned char set_rows_q5_0_i64_data[]; + +extern const uint64_t set_rows_q5_1_i32_len; +extern const unsigned char set_rows_q5_1_i32_data[]; + +extern const uint64_t set_rows_q5_1_i64_len; +extern const unsigned char set_rows_q5_1_i64_data[]; + +extern const uint64_t set_rows_q8_0_i32_len; +extern const unsigned char set_rows_q8_0_i32_data[]; + +extern const uint64_t set_rows_q8_0_i64_len; +extern const unsigned char set_rows_q8_0_i64_data[]; + +extern const uint64_t set_rows_tq3_1s_i32_len; +extern const unsigned char set_rows_tq3_1s_i32_data[]; + +extern const uint64_t set_rows_tq3_1s_i64_len; +extern const unsigned char set_rows_tq3_1s_i64_data[]; + +extern const uint64_t set_rows_tq4_1s_i32_len; +extern const unsigned char set_rows_tq4_1s_i32_data[]; + +extern const uint64_t set_rows_tq4_1s_i64_len; +extern const unsigned char set_rows_tq4_1s_i64_data[]; + +extern const uint64_t set_rows_turbo2_0_i32_len; +extern const unsigned char set_rows_turbo2_0_i32_data[]; + +extern const uint64_t set_rows_turbo2_0_i64_len; 
+extern const unsigned char set_rows_turbo2_0_i64_data[]; + +extern const uint64_t set_rows_turbo3_0_i32_len; +extern const unsigned char set_rows_turbo3_0_i32_data[]; + +extern const uint64_t set_rows_turbo3_0_i64_len; +extern const unsigned char set_rows_turbo3_0_i64_data[]; + +extern const uint64_t set_rows_turbo4_0_i32_len; +extern const unsigned char set_rows_turbo4_0_i32_data[]; + +extern const uint64_t set_rows_turbo4_0_i64_len; +extern const unsigned char set_rows_turbo4_0_i64_data[]; + +extern const uint64_t sgn_f16_len; +extern const unsigned char sgn_f16_data[]; + +extern const uint64_t sgn_f32_len; +extern const unsigned char sgn_f32_data[]; + +extern const uint64_t sigmoid_f16_len; +extern const unsigned char sigmoid_f16_data[]; + +extern const uint64_t sigmoid_f32_len; +extern const unsigned char sigmoid_f32_data[]; + +extern const uint64_t silu_back_f32_len; +extern const unsigned char silu_back_f32_data[]; + +extern const uint64_t silu_f16_len; +extern const unsigned char silu_f16_data[]; + +extern const uint64_t silu_f32_len; +extern const unsigned char silu_f32_data[]; + +extern const uint64_t sin_f32_len; +extern const unsigned char sin_f32_data[]; + +extern const uint64_t soft_max_back_f32_len; +extern const unsigned char soft_max_back_f32_data[]; + +extern const uint64_t soft_max_f32_len; +extern const unsigned char soft_max_f32_data[]; + +extern const uint64_t soft_max_f32_f16_len; +extern const unsigned char soft_max_f32_f16_data[]; + +extern const uint64_t soft_max_large1_f32_len; +extern const unsigned char soft_max_large1_f32_data[]; + +extern const uint64_t soft_max_large1_f32_f16_len; +extern const unsigned char soft_max_large1_f32_f16_data[]; + +extern const uint64_t soft_max_large2_f32_len; +extern const unsigned char soft_max_large2_f32_data[]; + +extern const uint64_t soft_max_large2_f32_f16_len; +extern const unsigned char soft_max_large2_f32_f16_data[]; + +extern const uint64_t soft_max_large3_f32_len; +extern const unsigned char 
soft_max_large3_f32_data[]; + +extern const uint64_t soft_max_large3_f32_f16_len; +extern const unsigned char soft_max_large3_f32_f16_data[]; + +extern const uint64_t softplus_f16_len; +extern const unsigned char softplus_f16_data[]; + +extern const uint64_t softplus_f32_len; +extern const unsigned char softplus_f32_data[]; + +extern const uint64_t solve_tri_f32_len; +extern const unsigned char solve_tri_f32_data[]; + +extern const uint64_t split_k_reduce_len; +extern const unsigned char split_k_reduce_data[]; + +extern const uint64_t sqr_f32_len; +extern const unsigned char sqr_f32_data[]; + +extern const uint64_t sqrt_f32_len; +extern const unsigned char sqrt_f32_data[]; + +extern const uint64_t ssm_conv_f32_len; +extern const unsigned char ssm_conv_f32_data[]; + +extern const uint64_t ssm_scan_f32_len; +extern const unsigned char ssm_scan_f32_data[]; + +extern const uint64_t ssm_scan_subgroup_f32_len; +extern const unsigned char ssm_scan_subgroup_f32_data[]; + +extern const uint64_t step_f16_len; +extern const unsigned char step_f16_data[]; + +extern const uint64_t step_f32_len; +extern const unsigned char step_f32_data[]; + +extern const uint64_t sub_f16_f16_f16_len; +extern const unsigned char sub_f16_f16_f16_data[]; + +extern const uint64_t sub_f16_f16_f32_len; +extern const unsigned char sub_f16_f16_f32_data[]; + +extern const uint64_t sub_f16_f32_f16_len; +extern const unsigned char sub_f16_f32_f16_data[]; + +extern const uint64_t sub_f16_f32_f32_len; +extern const unsigned char sub_f16_f32_f32_data[]; + +extern const uint64_t sub_f32_len; +extern const unsigned char sub_f32_data[]; + +extern const uint64_t sub_f32_f16_f16_len; +extern const unsigned char sub_f32_f16_f16_data[]; + +extern const uint64_t sub_f32_f16_f32_len; +extern const unsigned char sub_f32_f16_f32_data[]; + +extern const uint64_t sub_f32_f32_f16_len; +extern const unsigned char sub_f32_f32_f16_data[]; + +extern const uint64_t sub_f32_f32_f32_len; +extern const unsigned char 
sub_f32_f32_f32_data[]; + +extern const uint64_t sum_rows_f32_len; +extern const unsigned char sum_rows_f32_data[]; + +extern const uint64_t swiglu_f16_len; +extern const unsigned char swiglu_f16_data[]; + +extern const uint64_t swiglu_f32_len; +extern const unsigned char swiglu_f32_data[]; + +extern const uint64_t swiglu_oai_f16_len; +extern const unsigned char swiglu_oai_f16_data[]; + +extern const uint64_t swiglu_oai_f32_len; +extern const unsigned char swiglu_oai_f32_data[]; + +extern const uint64_t tanh_f16_len; +extern const unsigned char tanh_f16_data[]; + +extern const uint64_t tanh_f32_len; +extern const unsigned char tanh_f32_data[]; + +extern const uint64_t timestep_embedding_f32_len; +extern const unsigned char timestep_embedding_f32_data[]; + +extern const uint64_t topk_argsort_f32_len; +extern const unsigned char topk_argsort_f32_data[]; + +extern const uint64_t topk_moe_f32_len; +extern const unsigned char topk_moe_f32_data[]; + +extern const uint64_t topk_nary_search_f32_len; +extern const unsigned char topk_nary_search_f32_data[]; + +extern const uint64_t tri_f16_len; +extern const unsigned char tri_f16_data[]; + +extern const uint64_t tri_f32_len; +extern const unsigned char tri_f32_data[]; + +extern const uint64_t trunc_f16_len; +extern const unsigned char trunc_f16_data[]; + +extern const uint64_t trunc_f32_len; +extern const unsigned char trunc_f32_data[]; + +extern const uint64_t turbo_wht_len; +extern const unsigned char turbo_wht_data[]; + +extern const uint64_t upscale_f32_len; +extern const unsigned char upscale_f32_data[]; + +extern const uint64_t xielu_f16_len; +extern const unsigned char xielu_f16_data[]; + +extern const uint64_t xielu_f32_len; +extern const unsigned char xielu_f32_data[]; + +extern const void * add_data[2][2][2]; +extern const uint64_t add_len[2][2][2]; +extern const void * sub_data[2][2][2]; +extern const uint64_t sub_len[2][2][2]; +extern const void * mul_data[2][2][2]; +extern const uint64_t mul_len[2][2][2]; 
+extern const void * div_data[2][2][2]; +extern const uint64_t div_len[2][2][2]; +extern const void * add_rms_data[2][2][2]; +extern const uint64_t add_rms_len[2][2][2]; +extern const void * arr_dmmv_f32_f16_f32_data[3]; +extern const uint64_t arr_dmmv_f32_f16_f32_len[3]; +extern const void * arr_dmmv_f16_f16_f32_data[3]; +extern const uint64_t arr_dmmv_f16_f16_f32_len[3]; +extern const void * arr_dmmv_q1_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q1_0_f16_f32_len[3]; +extern const void * arr_dmmv_q4_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q4_0_f16_f32_len[3]; +extern const void * arr_dmmv_q4_1_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q4_1_f16_f32_len[3]; +extern const void * arr_dmmv_q5_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q5_0_f16_f32_len[3]; +extern const void * arr_dmmv_q5_1_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q5_1_f16_f32_len[3]; +extern const void * arr_dmmv_q8_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q8_0_f16_f32_len[3]; +extern const void * arr_dmmv_q2_k_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q2_k_f16_f32_len[3]; +extern const void * arr_dmmv_q3_k_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q3_k_f16_f32_len[3]; +extern const void * arr_dmmv_q4_k_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q4_k_f16_f32_len[3]; +extern const void * arr_dmmv_q5_k_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q5_k_f16_f32_len[3]; +extern const void * arr_dmmv_q6_k_f16_f32_data[3]; +extern const uint64_t arr_dmmv_q6_k_f16_f32_len[3]; +extern const void * arr_dmmv_iq1_s_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq1_s_f16_f32_len[3]; +extern const void * arr_dmmv_iq1_m_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq1_m_f16_f32_len[3]; +extern const void * arr_dmmv_iq2_xxs_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq2_xxs_f16_f32_len[3]; +extern const void * arr_dmmv_iq2_xs_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq2_xs_f16_f32_len[3]; +extern const void * 
arr_dmmv_iq2_s_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq2_s_f16_f32_len[3]; +extern const void * arr_dmmv_iq3_xxs_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq3_xxs_f16_f32_len[3]; +extern const void * arr_dmmv_iq3_s_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq3_s_f16_f32_len[3]; +extern const void * arr_dmmv_iq4_xs_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq4_xs_f16_f32_len[3]; +extern const void * arr_dmmv_iq4_nl_f16_f32_data[3]; +extern const uint64_t arr_dmmv_iq4_nl_f16_f32_len[3]; +extern const void * arr_dmmv_mxfp4_f16_f32_data[3]; +extern const uint64_t arr_dmmv_mxfp4_f16_f32_len[3]; +extern const void * arr_dmmv_nvfp4_f16_f32_data[3]; +extern const uint64_t arr_dmmv_nvfp4_f16_f32_len[3]; +extern const void * arr_dmmv_bf16_f16_f32_data[3]; +extern const uint64_t arr_dmmv_bf16_f16_f32_len[3]; +extern const void * arr_dmmv_turbo2_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_turbo2_0_f16_f32_len[3]; +extern const void * arr_dmmv_turbo3_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_turbo3_0_f16_f32_len[3]; +extern const void * arr_dmmv_turbo4_0_f16_f32_data[3]; +extern const uint64_t arr_dmmv_turbo4_0_f16_f32_len[3]; +extern const void * arr_dmmv_tq3_1s_f16_f32_data[3]; +extern const uint64_t arr_dmmv_tq3_1s_f16_f32_len[3]; +extern const void * arr_dmmv_tq4_1s_f16_f32_data[3]; +extern const uint64_t arr_dmmv_tq4_1s_f16_f32_len[3]; +extern const void * arr_dmmv_f32_f32_f32_data[3]; +extern const uint64_t arr_dmmv_f32_f32_f32_len[3]; +extern const void * arr_dmmv_id_f32_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_f32_f32_f32_len[3]; +extern const void * arr_dmmv_f16_f32_f32_data[3]; +extern const uint64_t arr_dmmv_f16_f32_f32_len[3]; +extern const void * arr_dmmv_id_f16_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_f16_f32_f32_len[3]; +extern const void * arr_dmmv_q1_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q1_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_q1_0_f32_f32_data[3]; +extern const 
uint64_t arr_dmmv_id_q1_0_f32_f32_len[3]; +extern const void * arr_dmmv_q4_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q4_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_q4_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q4_0_f32_f32_len[3]; +extern const void * arr_dmmv_q4_1_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q4_1_f32_f32_len[3]; +extern const void * arr_dmmv_id_q4_1_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q4_1_f32_f32_len[3]; +extern const void * arr_dmmv_q5_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q5_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_q5_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q5_0_f32_f32_len[3]; +extern const void * arr_dmmv_q5_1_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q5_1_f32_f32_len[3]; +extern const void * arr_dmmv_id_q5_1_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q5_1_f32_f32_len[3]; +extern const void * arr_dmmv_q8_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q8_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_q8_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q8_0_f32_f32_len[3]; +extern const void * arr_dmmv_q2_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q2_k_f32_f32_len[3]; +extern const void * arr_dmmv_id_q2_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q2_k_f32_f32_len[3]; +extern const void * arr_dmmv_q3_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q3_k_f32_f32_len[3]; +extern const void * arr_dmmv_id_q3_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q3_k_f32_f32_len[3]; +extern const void * arr_dmmv_q4_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q4_k_f32_f32_len[3]; +extern const void * arr_dmmv_id_q4_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q4_k_f32_f32_len[3]; +extern const void * arr_dmmv_q5_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q5_k_f32_f32_len[3]; +extern const void * arr_dmmv_id_q5_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q5_k_f32_f32_len[3]; +extern const 
void * arr_dmmv_q6_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_q6_k_f32_f32_len[3]; +extern const void * arr_dmmv_id_q6_k_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_q6_k_f32_f32_len[3]; +extern const void * arr_dmmv_iq1_s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq1_s_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq1_s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq1_s_f32_f32_len[3]; +extern const void * arr_dmmv_iq1_m_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq1_m_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq1_m_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq1_m_f32_f32_len[3]; +extern const void * arr_dmmv_iq2_xxs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq2_xxs_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq2_xxs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq2_xxs_f32_f32_len[3]; +extern const void * arr_dmmv_iq2_xs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq2_xs_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq2_xs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq2_xs_f32_f32_len[3]; +extern const void * arr_dmmv_iq2_s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq2_s_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq2_s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq2_s_f32_f32_len[3]; +extern const void * arr_dmmv_iq3_xxs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq3_xxs_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq3_xxs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq3_xxs_f32_f32_len[3]; +extern const void * arr_dmmv_iq3_s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq3_s_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq3_s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq3_s_f32_f32_len[3]; +extern const void * arr_dmmv_iq4_xs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq4_xs_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq4_xs_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq4_xs_f32_f32_len[3]; +extern const 
void * arr_dmmv_iq4_nl_f32_f32_data[3]; +extern const uint64_t arr_dmmv_iq4_nl_f32_f32_len[3]; +extern const void * arr_dmmv_id_iq4_nl_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq4_nl_f32_f32_len[3]; +extern const void * arr_dmmv_mxfp4_f32_f32_data[3]; +extern const uint64_t arr_dmmv_mxfp4_f32_f32_len[3]; +extern const void * arr_dmmv_id_mxfp4_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_mxfp4_f32_f32_len[3]; +extern const void * arr_dmmv_nvfp4_f32_f32_data[3]; +extern const uint64_t arr_dmmv_nvfp4_f32_f32_len[3]; +extern const void * arr_dmmv_id_nvfp4_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_nvfp4_f32_f32_len[3]; +extern const void * arr_dmmv_bf16_f32_f32_data[3]; +extern const uint64_t arr_dmmv_bf16_f32_f32_len[3]; +extern const void * arr_dmmv_id_bf16_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_bf16_f32_f32_len[3]; +extern const void * arr_dmmv_turbo2_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_turbo2_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_turbo2_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_turbo2_0_f32_f32_len[3]; +extern const void * arr_dmmv_turbo3_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_turbo3_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_turbo3_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_turbo3_0_f32_f32_len[3]; +extern const void * arr_dmmv_turbo4_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_turbo4_0_f32_f32_len[3]; +extern const void * arr_dmmv_id_turbo4_0_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_turbo4_0_f32_f32_len[3]; +extern const void * arr_dmmv_tq3_1s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_tq3_1s_f32_f32_len[3]; +extern const void * arr_dmmv_id_tq3_1s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_id_tq3_1s_f32_f32_len[3]; +extern const void * arr_dmmv_tq4_1s_f32_f32_data[3]; +extern const uint64_t arr_dmmv_tq4_1s_f32_f32_len[3]; +extern const void * arr_dmmv_id_tq4_1s_f32_f32_data[3]; +extern const uint64_t 
arr_dmmv_id_tq4_1s_f32_f32_len[3]; +extern const void * arr_dmmv_q4_0_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q4_0_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q4_0_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q4_0_q8_1_f32_len[3]; +extern const void * arr_dmmv_q4_1_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q4_1_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q4_1_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q4_1_q8_1_f32_len[3]; +extern const void * arr_dmmv_q5_0_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q5_0_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q5_0_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q5_0_q8_1_f32_len[3]; +extern const void * arr_dmmv_q5_1_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q5_1_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q5_1_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q5_1_q8_1_f32_len[3]; +extern const void * arr_dmmv_q8_0_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q8_0_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q8_0_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q8_0_q8_1_f32_len[3]; +extern const void * arr_dmmv_q2_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q2_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q2_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q2_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_q3_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q3_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q3_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q3_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_q4_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q4_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q4_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q4_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_q5_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q5_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q5_k_q8_1_f32_data[3]; +extern const uint64_t 
arr_dmmv_id_q5_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_q6_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_q6_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_q6_k_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_q6_k_q8_1_f32_len[3]; +extern const void * arr_dmmv_iq1_s_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_iq1_s_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_iq1_s_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq1_s_q8_1_f32_len[3]; +extern const void * arr_dmmv_iq1_m_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_iq1_m_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_iq1_m_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_iq1_m_q8_1_f32_len[3]; +extern const void * arr_dmmv_mxfp4_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_mxfp4_q8_1_f32_len[3]; +extern const void * arr_dmmv_id_mxfp4_q8_1_f32_data[3]; +extern const uint64_t arr_dmmv_id_mxfp4_q8_1_f32_len[3]; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index cedd7132cfa..0388412c71c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -3536,7 +3536,9 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, ) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, ) } + CREATE_FA(GGML_TYPE_TURBO2_0, turbo2_0, FA_SCALAR, ) CREATE_FA(GGML_TYPE_TURBO3_0, turbo3_0, FA_SCALAR, ) + CREATE_FA(GGML_TYPE_TURBO4_0, turbo4_0, FA_SCALAR, ) } else { CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, _fp32) @@ -3559,7 +3561,9 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _fp32) } + CREATE_FA(GGML_TYPE_TURBO2_0, turbo2_0, FA_SCALAR, _fp32) CREATE_FA(GGML_TYPE_TURBO3_0, turbo3_0, FA_SCALAR, _fp32) + CREATE_FA(GGML_TYPE_TURBO4_0, turbo4_0, FA_SCALAR, _fp32) } #if defined(VK_KHR_cooperative_matrix) && 
defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->coopmat1_fa_support) { @@ -3571,7 +3575,9 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT1, _cm1) + CREATE_FA(GGML_TYPE_TURBO2_0, turbo2_0, FA_COOPMAT1, _cm1) CREATE_FA(GGML_TYPE_TURBO3_0, turbo3_0, FA_COOPMAT1, _cm1) + CREATE_FA(GGML_TYPE_TURBO4_0, turbo4_0, FA_COOPMAT1, _cm1) } #endif #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) @@ -3584,7 +3590,9 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT2, _cm2) CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT2, _cm2) CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT2, _cm2) + CREATE_FA(GGML_TYPE_TURBO2_0, turbo2_0, FA_COOPMAT2, _cm2) CREATE_FA(GGML_TYPE_TURBO3_0, turbo3_0, FA_COOPMAT2, _cm2) + CREATE_FA(GGML_TYPE_TURBO4_0, turbo4_0, FA_COOPMAT2, _cm2) } #endif #undef CREATE_FA @@ -4225,6 +4233,9 @@ static void ggml_vk_load_shaders(vk_device& device) { // shared-memory reduction. NUM_ROWS=8 amortises the butterfly cost // across 8 output rows per workgroup. 
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_TQ4_1S][i], "mul_mat_vec_tq4_1s_f32_f32", arr_dmmv_tq4_1s_f32_f32_len[tq4_1s_reduc], arr_dmmv_tq4_1s_f32_f32_data[tq4_1s_reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {8, 1, 1}, {tq4_1s_wg_size, 8, i+1}, 1, true, tq4_1s_use_subgroups, tq4_1s_force_sg_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_TURBO2_0][i], "mul_mat_vec_turbo2_0_f32_f32", arr_dmmv_turbo2_0_f32_f32_len[reduc], arr_dmmv_turbo2_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_TURBO3_0][i], "mul_mat_vec_turbo3_0_f32_f32", arr_dmmv_turbo3_0_f32_f32_len[reduc], arr_dmmv_turbo3_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_TURBO4_0][i], "mul_mat_vec_turbo4_0_f32_f32", arr_dmmv_turbo4_0_f32_f32_len[reduc], arr_dmmv_turbo4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], 
"mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); @@ -4252,6 +4263,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f16_f32", arr_dmmv_mxfp4_f16_f32_len[reduc16], arr_dmmv_mxfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_NVFP4][i], "mul_mat_vec_nvfp4_f16_f32", arr_dmmv_nvfp4_f16_f32_len[reduc16], arr_dmmv_nvfp4_f16_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_TQ4_1S][i], "mul_mat_vec_tq4_1s_f16_f32", arr_dmmv_tq4_1s_f16_f32_len[tq4_1s_reduc], arr_dmmv_tq4_1s_f16_f32_data[tq4_1s_reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {8, 1, 1}, {tq4_1s_wg_size, 8, i+1}, 1, true, tq4_1s_use_subgroups, tq4_1s_force_sg_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_TURBO2_0][i], "mul_mat_vec_turbo2_0_f16_f32", arr_dmmv_turbo2_0_f16_f32_len[reduc], arr_dmmv_turbo2_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_TURBO3_0][i], "mul_mat_vec_turbo3_0_f16_f32", 
arr_dmmv_turbo3_0_f16_f32_len[reduc], arr_dmmv_turbo3_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_TURBO4_0][i], "mul_mat_vec_turbo4_0_f16_f32", arr_dmmv_turbo4_0_f16_f32_len[reduc], arr_dmmv_turbo4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { @@ -4304,6 +4318,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", arr_dmmv_id_iq4_nl_f32_f32_len[reduc16], arr_dmmv_id_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_MXFP4], "mul_mat_vec_id_mxfp4_f32", arr_dmmv_id_mxfp4_f32_f32_len[reduc16], arr_dmmv_id_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_NVFP4], "mul_mat_vec_id_nvfp4_f32", arr_dmmv_id_nvfp4_f32_f32_len[reduc16], arr_dmmv_id_nvfp4_f32_f32_data[reduc16], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq}, 1, true, use_subgroups16, force_subgroup_size16); + ggml_vk_create_pipeline(device, 
device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_TURBO2_0], "mul_mat_vec_id_turbo2_0_f32", arr_dmmv_id_turbo2_0_f32_f32_len[reduc], arr_dmmv_id_turbo2_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_TURBO3_0], "mul_mat_vec_id_turbo3_0_f32", arr_dmmv_id_turbo3_0_f32_f32_len[reduc], arr_dmmv_id_turbo3_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_TURBO4_0], "mul_mat_vec_id_turbo4_0_f32", arr_dmmv_id_turbo4_0_f32_f32_len[reduc], arr_dmmv_id_turbo4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->integer_dot_product) { @@ -4360,7 +4377,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_MXFP4], "dequant_mxfp4", dequant_mxfp4_len, dequant_mxfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_NVFP4], "dequant_nvfp4", dequant_nvfp4_len, dequant_nvfp4_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TURBO2_0], "dequant_turbo2_0", dequant_turbo2_0_len, 
dequant_turbo2_0_data, "main", 2, 5 * sizeof(uint32_t), {128, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TURBO3_0], "dequant_turbo3_0", dequant_turbo3_0_len, dequant_turbo3_0_data, "main", 2, 5 * sizeof(uint32_t), {128, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TURBO4_0], "dequant_turbo4_0", dequant_turbo4_0_len, dequant_turbo4_0_data, "main", 2, 5 * sizeof(uint32_t), {128, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TQ3_1S], "dequant_tq3_1s", dequant_tq3_1s_len, dequant_tq3_1s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_TQ4_1S], "dequant_tq4_1s", dequant_tq4_1s_len, dequant_tq4_1s_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); // TurboQuant WHT @@ -4392,7 +4412,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_NVFP4], "get_rows_nvfp4", get_rows_nvfp4_len, get_rows_nvfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_TURBO2_0], "get_rows_turbo2_0", get_rows_turbo2_0_len, get_rows_turbo2_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_TURBO3_0], "get_rows_turbo3_0", get_rows_turbo3_0_len, get_rows_turbo3_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 
1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_TURBO4_0], "get_rows_turbo4_0", get_rows_turbo4_0_len, get_rows_turbo4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_TQ3_1S], "get_rows_tq3_1s", get_rows_tq3_1s_len, get_rows_tq3_1s_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); @@ -4420,7 +4443,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_TURBO2_0], "get_rows_turbo2_0_f32", get_rows_turbo2_0_f32_len, get_rows_turbo2_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_TURBO3_0], "get_rows_turbo3_0_f32", get_rows_turbo3_0_f32_len, get_rows_turbo3_0_f32_data, 
"main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_TURBO4_0], "get_rows_turbo4_0_f32", get_rows_turbo4_0_f32_len, get_rows_turbo4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_TQ3_1S], "get_rows_tq3_1s_f32", get_rows_tq3_1s_f32_len, get_rows_tq3_1s_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); @@ -4519,7 +4545,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TURBO2_0], "cpy_turbo2_0_f32", cpy_turbo2_0_f32_len, cpy_turbo2_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), 
{(uint32_t)ggml_blck_size(GGML_TYPE_TURBO2_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TURBO3_0], "cpy_turbo3_0_f32", cpy_turbo3_0_f32_len, cpy_turbo3_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TURBO3_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TURBO4_0], "cpy_turbo4_0_f32", cpy_turbo4_0_f32_len, cpy_turbo4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TURBO4_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TQ3_1S], "cpy_tq3_1s_f32", cpy_tq3_1s_f32_len, cpy_tq3_1s_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TQ3_1S), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_TQ4_1S], "cpy_tq4_1s_f32", cpy_tq4_1s_f32_len, cpy_tq4_1s_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_TQ4_1S), 1, 1}, {}, 1); auto get_suffix = [](bool src0_f16, bool src1_f16, bool dst_f16) { @@ -6319,6 +6348,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: case GGML_TYPE_TQ4_1S: + case GGML_TYPE_TURBO2_0: + case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: break; default: return nullptr; @@ -6485,6 +6517,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: + case GGML_TYPE_TURBO2_0: + case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: break; default: return nullptr; @@ -7434,7 +7469,10 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_TURBO2_0: case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: + case GGML_TYPE_TQ3_1S: case 
GGML_TYPE_TQ4_1S: return ctx->device->pipeline_cpy_quant_f32[src->type]; default: @@ -10264,7 +10302,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co dst->type == GGML_TYPE_TURBO3_0 || dst->type == GGML_TYPE_TURBO4_0) { ne = ne / 128; - } else if (dst->type == GGML_TYPE_TQ4_1S) { + } else if (dst->type == GGML_TYPE_TQ3_1S || dst->type == GGML_TYPE_TQ4_1S) { ne = ne / 32; } else if (ggml_is_quantized(dst->type)) { // quants run 32 threads each doing QUANT_K elements @@ -15576,7 +15614,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_F32: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: + case GGML_TYPE_TURBO2_0: case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: // supported in scalar and coopmat2 paths break; case GGML_TYPE_Q4_1: @@ -15637,7 +15677,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: + case GGML_TYPE_TURBO2_0: case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: + case GGML_TYPE_TQ3_1S: case GGML_TYPE_I32: return true; default: @@ -15660,6 +15703,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_TURBO2_0: case GGML_TYPE_TURBO3_0: case GGML_TYPE_TURBO4_0: + case GGML_TYPE_TQ3_1S: case GGML_TYPE_TQ4_1S: return true; default: @@ -15700,7 +15744,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_TURBO2_0: case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: + case GGML_TYPE_TQ3_1S: case GGML_TYPE_TQ4_1S: return true; default: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp index 812401ee6e3..cd4e76a3737 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +++ 
b/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp @@ -31,7 +31,36 @@ void main() { const uint a_offset = 0; const uint ib = src_idx; -#if defined(DATA_A_TQ4_1S) +#if defined(DATA_A_TQ3_1S) + const float tq3_signs[32] = float[32]( + +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0, + -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0, + -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0, + -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0 + ); + const float TQ3_INV_SQRT32 = 0.17677669529663688; + + float buf[32]; + for (int j = 0; j < 32; j += 2) { + vec2 v = dequantize(ib, j, a_offset); + buf[j] = v.x; + buf[j+1] = v.y; + } + + for (uint step = 1u; step < 32u; step <<= 1u) { + for (uint i = 0u; i < 32u; i += step * 2u) { + for (uint j2 = i; j2 < i + step; j2++) { + float a2 = buf[j2], b2 = buf[j2 + step]; + buf[j2] = a2 + b2; + buf[j2 + step] = a2 - b2; + } + } + } + + for (int j = 0; j < 32; j++) { + data_d[dst_idx + j] = buf[j] * TQ3_INV_SQRT32 * tq3_signs[j]; + } +#elif defined(DATA_A_TQ4_1S) // TQ4_1S requires full inverse WHT after centroid*scale dequant. // Dequant all 32 elements into a buffer, apply butterfly, then write. 
const float tq4_signs[32] = float[32]( diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp index 480de55fb85..982889e19e3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp @@ -214,7 +214,7 @@ void quantize(uint dst_idx, uint src_idx) } #endif -#if defined(DATA_A_TURBO3_0) +#if defined(DATA_A_TURBO3_0) || defined(DATA_A_TURBO2_0) || defined(DATA_A_TURBO4_0) const float TS1[128] = float[128]( -1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, @@ -239,6 +239,9 @@ const float TS2[128] = float[128]( const float TINV = 0.08838834764831845; // 1 / sqrt(128) +// TurboQuant 2-bit midpoints (quantization thresholds) +const float T2M[3] = float[3](-0.086728, 0.0, 0.086728); + const float TC[8] = float[8]( -0.190685, -0.117832, -0.065717, -0.021460, 0.021460, 0.065717, 0.117832, 0.190685 @@ -250,43 +253,49 @@ const float TM[7] = float[7]( 0.043589, 0.091775, 0.154259 ); +// TurboQuant 4-bit midpoints +const float T4M[15] = float[15]( + -0.145561, -0.103361, -0.079142, -0.060009, + -0.043430, -0.028293, -0.013964, 0.000000, + 0.013964, 0.028293, 0.043430, 0.060009, + 0.079142, 0.103361, 0.145561 +); + #if defined(SET_ROWS) shared float wht[128]; shared float sg_acc[16]; shared float gnrm; -void quantize_block(uint b, uint o) { - [[unroll]] for (int j = 0; j < 32; ++j) data_q[b].qs[j] = uint8_t(0); - [[unroll]] for (int j = 0; j < 16; ++j) data_q[b].signs[j] = uint8_t(0); - float rs = 0.0; - [[unroll]] for (int j = 0; j < 128; ++j) { - float v = wht[o + j]; - uint i = v < TM[0] ? 0 : v < TM[1] ? 1 : v < TM[2] ? 2 : v < TM[3] ? 3 : - v < TM[4] ? 4 : v < TM[5] ? 5 : v < TM[6] ? 
6 : 7; - rs += TC[i] * TC[i]; - uint low2 = i & 0x3; - uint hi1 = (i >> 2) & 0x1; - data_q[b].qs[j / 4] |= uint8_t(low2 << ((j % 4) * 2)); - data_q[b].signs[j / 8] |= uint8_t(hi1 << (j % 8)); - } - float rn = sqrt(rs); - data_q[b].norm = float16_t((rn > 1e-10) ? (gnrm / rn) : gnrm); -} - #endif // defined(SET_ROWS) -#endif // defined(DATA_A_TURBO3_0) +#endif // defined(DATA_A_TURBO3_0) || defined(DATA_A_TURBO2_0) || defined(DATA_A_TURBO4_0) -#if defined(DATA_A_TQ4_1S) +#if defined(DATA_A_TQ3_1S) || defined(DATA_A_TQ4_1S) -const float TQ4_SIGNS[32] = float[32]( +const float TQ_SIGNS[32] = float[32]( +1, -1, +1, -1, +1, +1, -1, +1, -1, -1, +1, -1, +1, +1, -1, +1, -1, -1, +1, -1, +1, -1, -1, +1, -1, +1, +1, -1, +1, -1, -1, +1 ); -const float TQ4_INV_SQRT32 = 0.17677669529663688; // 1 / sqrt(32) +const float TQ_INV_SQRT32 = 0.17677669529663688; // 1 / sqrt(32) + +const float TQ3_CENTROIDS[8] = float[8]( + -1.996684, -1.291398, -0.740341, -0.247508, + 0.230106, 0.725222, 1.277503, 1.988943 +); + +uint tq3_choose_index(float val) { + if (val < -1.644041) return 0u; + if (val < -1.015869) return 1u; + if (val < -0.493925) return 2u; + if (val < -0.008701) return 3u; + if (val < 0.477664) return 4u; + if (val < 1.001362) return 5u; + if (val < 1.633223) return 6u; + return 7u; +} const float TQ4_CENTROIDS[16] = float[16]( -2.732590, -2.069017, -1.618046, -1.256231, @@ -315,7 +324,7 @@ uint tq4_choose_index(float val) { return 15u; } -#endif // defined(DATA_A_TQ4_1S) +#endif // defined(DATA_A_TQ3_1S) || defined(DATA_A_TQ4_1S) 
#if defined(DATA_A_IQ4_NL) uint best_index(float x) { @@ -379,7 +388,7 @@ void quantize(uint dst_idx, uint src_idx) } #endif -#if defined(SET_ROWS) && defined(DATA_A_TURBO3_0) +#if defined(SET_ROWS) && (defined(DATA_A_TURBO3_0) || defined(DATA_A_TURBO2_0) || defined(DATA_A_TURBO4_0)) void main() { const uint t = gl_LocalInvocationID.x; const uint g = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; @@ -398,12 +407,10 @@ void main() { const uint i1 = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE; const uint db = dst_idx(ig, i1, i02, i03) + get_doffset(); - // Step 1: load into shared memory - wht[t] = data_s[sb + t]; - barrier(); + float val = data_s[sb + t]; // Step 2: L2 norm via subgroup reduction - float v2 = wht[t] * wht[t]; + float v2 = val * val; v2 = subgroupAdd(v2); if (gl_SubgroupInvocationID == 0) sg_acc[gl_SubgroupID] = v2; barrier(); @@ -414,301 +421,111 @@ void main() { } barrier(); - // Step 3: normalize, then apply forward WHT: signs1 -> butterfly -> signs2 - wht[t] *= (gnrm > 1e-10) ? (1.0 / gnrm) : 0.0; - barrier(); + // Step 3: normalize, then apply forward WHT + val *= (gnrm > 1e-10) ? (1.0 / gnrm) : 0.0; + val *= TS1[t]; - wht[t] *= TS1[t]; - barrier(); + // Optimized WHT using subgroup shuffle + uint sg_size = gl_SubgroupSize; + for (uint h = 1; h < sg_size && h < 128; h *= 2) { + float other = subgroupShuffleXor(val, h); + val = ((t & h) == 0) ? 
(val + other) : (other - val); + } - [[unroll]] for (uint h = 1; h < 128; h *= 2) { - if ((t % (2 * h)) < h) { - float a = wht[t]; - float b = wht[t + h]; - wht[t] = a + b; - wht[t + h] = a - b; - } + if (sg_size < 128) { + wht[t] = val; barrier(); + for (uint h = sg_size; h < 128; h *= 2) { + if ((t % (2 * h)) < h) { + float a = wht[t]; + float b = wht[t + h]; + wht[t] = a + b; + wht[t + h] = a - b; + } + barrier(); + } + val = wht[t]; } // Step 5: apply signs2 + scaling - float rv = wht[t] * TINV * TS2[t]; - - // Step 6: quantize -- all 128 threads participate - uint idx = rv < TM[0] ? 0u : rv < TM[1] ? 1u : rv < TM[2] ? 2u : rv < TM[3] ? 3u : - rv < TM[4] ? 4u : rv < TM[5] ? 5u : rv < TM[6] ? 6u : 7u; + val *= TINV * TS2[t]; - // Pack qs: 4 elements per byte via subgroup shuffle + // Step 6: Quantize +#if defined(DATA_A_TURBO2_0) + uint idx = val < T2M[0] ? 0u : val < T2M[1] ? 1u : val < T2M[2] ? 2u : 3u; uint sg_lane = gl_SubgroupInvocationID; - uint my_low2 = idx & 0x3u; + uint my_2bit = idx & 0x3u; uint qs_byte = 0u; [[unroll]] for (uint k = 0; k < 4; k++) { - uint contrib = subgroupShuffle(my_low2, (sg_lane & ~3u) + k); + uint contrib = subgroupShuffle(my_2bit, (sg_lane & ~3u) + k); qs_byte |= contrib << (k * 2u); } if (sg_lane % 4u == 0u) { data_q[db].qs[t / 4u] = uint8_t(qs_byte); } - - // Pack signs: 8 elements per byte via subgroup ballot - uvec4 ballot = subgroupBallot(((idx >> 2u) & 1u) != 0u); - if (sg_lane % 8u == 0u) { - uint local_byte = sg_lane / 8u; - data_q[db].signs[t / 8u] = uint8_t((ballot.x >> (local_byte * 8u)) & 0xFFu); - } - - // Step 7: reconstruction norm via subgroup reduction - float rc = TC[idx] * TC[idx]; - rc = subgroupAdd(rc); - if (sg_lane == 0u) sg_acc[gl_SubgroupID] = rc; - barrier(); - if (t == 0u) { - float total = 0.0; - for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc[w]; - float rn = sqrt(total); - data_q[db].norm = float16_t((rn > 1e-10) ? 
(gnrm / rn) : gnrm); - } -} -#elif defined(SET_ROWS) && defined(DATA_A_TURBO2_0) -// Mirror of the TURBO3_0 block above, adapted for turbo2 (4 centroids, -// 2-bit pack, no signs byte). WHT tables and reduction structure are -// identical (QK = 128 for both). -const float TS1_T2[128] = float[128]( - -1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, - 1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, - -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, - 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1, - -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1, - 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, - -1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1, - 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1 -); -const float TS2_T2[128] = float[128]( - 1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, - 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, - 1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, - 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, - 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1, - -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, - 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, - -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1 -); -const float TINV_T2 = 0.08838834764831845; // 1 / sqrt(128) -// Lloyd-Max centroids for N(0, 1/128), 4 levels (matches CENTROIDS_2BIT in C ref) -const float TC2[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462); -// Midpoints between adjacent centroids -const float TM2[3] = float[3](-0.086728, 0.0, 0.086728); - -shared float wht_t2[128]; -shared float sg_acc_t2[16]; -shared float gnrm_t2; - -void main() { - const uint t = gl_LocalInvocationID.x; - const uint g = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; - const uint gpr = p.ne00 / 128; - - if (gpr == 0) return; - if (g >= p.ne / 128) return; - - uint tmp = g; - const uint ig = tmp % gpr; tmp /= gpr; - const uint i01 = tmp % 
p.ne01; tmp /= p.ne01; - const uint i02 = tmp % p.ne12; - const uint i03 = tmp / p.ne12; - - const uint sb = src0_idx(ig * 128, i01, i02, i03) + get_aoffset(); - const uint i1 = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE; - const uint db = dst_idx(ig, i1, i02, i03) + get_doffset(); - - wht_t2[t] = data_s[sb + t]; - barrier(); - - float v2 = wht_t2[t] * wht_t2[t]; - v2 = subgroupAdd(v2); - if (gl_SubgroupInvocationID == 0) sg_acc_t2[gl_SubgroupID] = v2; - barrier(); - if (t == 0) { - float total = 0.0; - for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t2[w]; - gnrm_t2 = sqrt(total); - } - barrier(); - - wht_t2[t] *= (gnrm_t2 > 1e-10) ? (1.0 / gnrm_t2) : 0.0; - barrier(); - - wht_t2[t] *= TS1_T2[t]; - barrier(); - - [[unroll]] for (uint h = 1; h < 128; h *= 2) { - if ((t % (2 * h)) < h) { - float a = wht_t2[t]; - float b = wht_t2[t + h]; - wht_t2[t] = a + b; - wht_t2[t + h] = a - b; - } - barrier(); - } - - float rv = wht_t2[t] * TINV_T2 * TS2_T2[t]; - - // Quantize to nearest of 4 centroids (2-bit index, no signs byte) - uint idx = rv < TM2[0] ? 0u : rv < TM2[1] ? 1u : rv < TM2[2] ? 2u : 3u; - - // Pack qs: 4 elements per byte (full 2-bit each, no high bit) +#elif defined(DATA_A_TURBO3_0) + uint idx = val < TM[0] ? 0u : val < TM[1] ? 1u : val < TM[2] ? 2u : val < TM[3] ? 3u : + val < TM[4] ? 4u : val < TM[5] ? 5u : val < TM[6] ? 
6u : 7u; uint sg_lane = gl_SubgroupInvocationID; + uint my_low2 = idx & 0x3u; uint qs_byte = 0u; [[unroll]] for (uint k = 0; k < 4; k++) { - uint contrib = subgroupShuffle(idx & 0x3u, (sg_lane & ~3u) + k); + uint contrib = subgroupShuffle(my_low2, (sg_lane & ~3u) + k); qs_byte |= contrib << (k * 2u); } if (sg_lane % 4u == 0u) { data_q[db].qs[t / 4u] = uint8_t(qs_byte); } - - // Reconstruction norm via subgroup reduction - float rc = TC2[idx] * TC2[idx]; - rc = subgroupAdd(rc); - if (sg_lane == 0u) sg_acc_t2[gl_SubgroupID] = rc; - barrier(); - if (t == 0u) { - float total = 0.0; - for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t2[w]; - float rn = sqrt(total); - data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm_t2 / rn) : gnrm_t2); - } -} - -#elif defined(SET_ROWS) && defined(DATA_A_TURBO4_0) -// Mirror of the TURBO3_0 block above, adapted for turbo4 (16 centroids, -// 4-bit nibble pack, no signs byte). WHT tables and reduction structure -// are identical (QK = 128 for both). The block struct keeps a reserved -// rnorm field for ABI parity with the legacy 3-bit + QJL layout. 
-const float TS1_T4[128] = float[128]( - -1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, - 1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, - -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, - 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1, - -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1, - 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, - -1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1, - 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1 -); -const float TS2_T4[128] = float[128]( - 1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, - 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, - 1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, - 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, - 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, 1, - -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, - 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, - -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1 -); -const float TINV_T4 = 0.08838834764831845; // 1 / sqrt(128) -// Lloyd-Max centroids for N(0, 1/128), 16 levels (matches CENTROIDS_4BIT in C ref) -const float TC4[16] = float[16]( - -0.173926, -0.117195, -0.089527, -0.068756, - -0.051262, -0.035597, -0.020989, -0.006938, - 0.006938, 0.020989, 0.035597, 0.051262, - 0.068756, 0.089527, 0.117195, 0.173926 -); -// 15 midpoints between adjacent centroids -const float TM4[15] = float[15]( - -0.145561, -0.103361, -0.079142, -0.060009, - -0.043430, -0.028293, -0.013964, 0.0, - 0.013964, 0.028293, 0.043430, 0.060009, - 0.079142, 0.103361, 0.145561 -); - -shared float wht_t4[128]; -shared float sg_acc_t4[16]; -shared float gnrm_t4; - -void main() { - const uint t = gl_LocalInvocationID.x; - const uint g = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; - const uint gpr = p.ne00 / 128; - - if (gpr == 0) return; - if (g >= p.ne / 128) return; - - uint tmp = g; - const uint ig = tmp % gpr; tmp /= gpr; - const 
uint i01 = tmp % p.ne01; tmp /= p.ne01; - const uint i02 = tmp % p.ne12; - const uint i03 = tmp / p.ne12; - - const uint sb = src0_idx(ig * 128, i01, i02, i03) + get_aoffset(); - const uint i1 = data_i[src1_idx(i01, fastmod(i02, p.ne11), fastmod(i03, p.ne12), 0) + get_boffset()] DATA_I_SWIZZLE; - const uint db = dst_idx(ig, i1, i02, i03) + get_doffset(); - - wht_t4[t] = data_s[sb + t]; - barrier(); - - float v2 = wht_t4[t] * wht_t4[t]; - v2 = subgroupAdd(v2); - if (gl_SubgroupInvocationID == 0) sg_acc_t4[gl_SubgroupID] = v2; - barrier(); - if (t == 0) { - float total = 0.0; - for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t4[w]; - gnrm_t4 = sqrt(total); - } - barrier(); - - wht_t4[t] *= (gnrm_t4 > 1e-10) ? (1.0 / gnrm_t4) : 0.0; - barrier(); - - wht_t4[t] *= TS1_T4[t]; - barrier(); - - [[unroll]] for (uint h = 1; h < 128; h *= 2) { - if ((t % (2 * h)) < h) { - float a = wht_t4[t]; - float b = wht_t4[t + h]; - wht_t4[t] = a + b; - wht_t4[t + h] = a - b; - } - barrier(); + uvec4 ballot = subgroupBallot(((idx >> 2u) & 1u) != 0u); + if (sg_lane % 8u == 0u) { + uint local_byte = sg_lane / 8u; + data_q[db].signs[t / 8u] = uint8_t((ballot.x >> (local_byte * 8u)) & 0xFFu); } - - float rv = wht_t4[t] * TINV_T4 * TS2_T4[t]; - - // Quantize to nearest of 16 centroids (4-bit index, no signs byte) +#elif defined(DATA_A_TURBO4_0) uint idx = 0u; - [[unroll]] for (uint i = 0; i < 15; i++) { - if (rv >= TM4[i]) idx = i + 1u; + if (val < T4M[7]) { + if (val < T4M[3]) { + if (val < T4M[1]) idx = (val < T4M[0]) ? 0u : 1u; + else idx = (val < T4M[2]) ? 2u : 3u; + } else { + if (val < T4M[5]) idx = (val < T4M[4]) ? 4u : 5u; + else idx = (val < T4M[6]) ? 6u : 7u; + } + } else { + if (val < T4M[11]) { + if (val < T4M[9]) idx = (val < T4M[8]) ? 8u : 9u; + else idx = (val < T4M[10]) ? 10u : 11u; + } else { + if (val < T4M[13]) idx = (val < T4M[12]) ? 12u : 13u; + else idx = (val < T4M[14]) ? 
14u : 15u; + } } - - // Pack qs: 2 elements per byte (4-bit nibble each) uint sg_lane = gl_SubgroupInvocationID; - uint pair_low = subgroupShuffle(idx & 0xFu, sg_lane & ~1u); - uint pair_high = subgroupShuffle(idx & 0xFu, (sg_lane & ~1u) + 1u); - uint qs_byte = pair_low | (pair_high << 4u); - if (sg_lane % 2u == 0u) { - data_q[db].qs[t / 2u] = uint8_t(qs_byte); - } - - // Reset rnorm field (reserved in 4-bit mode) - if (t == 0u) { - data_q[db].rnorm = float16_t(0.0); + uint partner_idx = subgroupShuffle(idx, sg_lane ^ 1u); + uint byte_val = ((sg_lane & 1u) == 0u) ? (idx | (partner_idx << 4u)) : (partner_idx | (idx << 4u)); + if ((sg_lane & 1u) == 0u) { + data_q[db].qs[t / 2u] = uint8_t(byte_val); } +#endif - // Reconstruction norm via subgroup reduction - float rc = TC4[idx] * TC4[idx]; +#if defined(DATA_A_TURBO3_0) + float rc = TC[idx] * TC[idx]; rc = subgroupAdd(rc); - if (sg_lane == 0u) sg_acc_t4[gl_SubgroupID] = rc; + if (gl_SubgroupInvocationID == 0) sg_acc[gl_SubgroupID] = rc; barrier(); if (t == 0u) { float total = 0.0; - for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc_t4[w]; + for (uint w = 0; w < gl_NumSubgroups; w++) total += sg_acc[w]; float rn = sqrt(total); - data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm_t4 / rn) : gnrm_t4); + data_q[db].norm = float16_t((rn > 1e-10) ? (gnrm / rn) : gnrm); } +#else + if (t == 0u) { + data_q[db].norm = float16_t(gnrm); + } +#endif } - -#elif defined(SET_ROWS) && defined(DATA_A_TQ4_1S) +#elif defined(SET_ROWS) && (defined(DATA_A_TQ4_1S) || defined(DATA_A_TQ3_1S)) void main() { const uint t = gl_LocalInvocationID.x; // 0..31, one per block element @@ -733,16 +550,16 @@ void main() { // Step 2: Forward RHT via subgroup ops // Sign flip - val *= TQ4_SIGNS[t]; + val *= TQ_SIGNS[t]; // WHT butterfly via subgroupShuffleXor - [[unroll]] for (uint h = 1u; h < 32u; h <<= 1u) { + for (uint h = 1u; h < 32u; h <<= 1u) { const float other = subgroupShuffleXor(val, h); val = ((t & h) == 0u) ? 
(val + other) : (other - val); } // Normalize - val *= TQ4_INV_SQRT32; + val *= TQ_INV_SQRT32; // Step 3: Dual half-block RMS scale computation float sq = val * val; @@ -751,7 +568,7 @@ void main() { float rms_lo = sqrt(sum_sq_lo / 16.0); float rms_hi = sqrt(sum_sq_hi / 16.0); - // Step 4: Scale search (9 points) — matches CPU quantize_row_tq4_1s_ref + // Step 4: Scale search (9 points) const float SCALES[9] = float[9](0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.35, 1.5); float best_d0 = rms_lo; float best_d1 = rms_hi; @@ -764,8 +581,13 @@ void main() { ? ((d0 > 1e-10) ? 1.0 / d0 : 0.0) : ((d1 > 1e-10) ? 1.0 / d1 : 0.0); +#if defined(DATA_A_TQ3_1S) + uint idx = tq3_choose_index(val * inv); + float c = TQ3_CENTROIDS[idx]; +#else uint idx = tq4_choose_index(val * inv); float c = TQ4_CENTROIDS[idx]; +#endif float d = (t < 16u) ? d0 : d1; float diff2 = val - c * d; float local_err = diff2 * diff2; @@ -779,13 +601,18 @@ void main() { } // Step 5: Iterative refinement (6 iterations) - [[unroll]] for (uint iter = 0u; iter < 6u; iter++) { + for (uint iter = 0u; iter < 6u; iter++) { float inv = (t < 16u) ? ((best_d0 > 1e-10) ? 1.0 / best_d0 : 0.0) : ((best_d1 > 1e-10) ? 1.0 / best_d1 : 0.0); +#if defined(DATA_A_TQ3_1S) + uint idx = tq3_choose_index(val * inv); + float c = TQ3_CENTROIDS[idx]; +#else uint idx = tq4_choose_index(val * inv); float c = TQ4_CENTROIDS[idx]; +#endif float num_lo = subgroupAdd((t < 16u) ? val * c : 0.0); float den_lo = subgroupAdd((t < 16u) ? c * c : 0.0); @@ -800,22 +627,82 @@ void main() { float inv_final = (t < 16u) ? ((best_d0 > 1e-10) ? 1.0 / best_d0 : 0.0) : ((best_d1 > 1e-10) ? 
1.0 / best_d1 : 0.0); +#if defined(DATA_A_TQ3_1S) + uint cidx = tq3_choose_index(val * inv_final); +#else uint cidx = tq4_choose_index(val * inv_final); +#endif - // Step 7: Nibble packing — two 4-bit indices per byte - // Even element j: low nibble; odd element j: high nibble - // Thread t pairs with thread t^1 + // Step 7: Packing +#if defined(DATA_A_TQ4_1S) uint partner_idx = subgroupShuffle(cidx, t ^ 1u); - uint byte_val; - if ((t & 1u) == 0u) { - byte_val = cidx | (partner_idx << 4u); - } else { - byte_val = partner_idx | (cidx << 4u); - } - // Only even threads write (one byte per pair) + uint byte_val = ((t & 1u) == 0u) ? (cidx | (partner_idx << 4u)) : (partner_idx | (cidx << 4u)); if ((t & 1u) == 0u) { data_q[db].qs[t >> 1u] = uint8_t(byte_val); } +#else + // TQ3_1S packing: 8 values -> 3 bytes + // Thread 0-7: byte 0, 1, 2 + uint v0 = subgroupShuffle(cidx, 0); + uint v1 = subgroupShuffle(cidx, 1); + uint v2 = subgroupShuffle(cidx, 2); + uint v3 = subgroupShuffle(cidx, 3); + uint v4 = subgroupShuffle(cidx, 4); + uint v5 = subgroupShuffle(cidx, 5); + uint v6 = subgroupShuffle(cidx, 6); + uint v7 = subgroupShuffle(cidx, 7); + + if (t == 0) { + data_q[db].qs[0] = uint8_t(v0 | (v1 << 3) | ((v2 & 3) << 6)); + data_q[db].qs[1] = uint8_t((v2 >> 2) | (v3 << 1) | (v4 << 4) | ((v5 & 1) << 7)); + data_q[db].qs[2] = uint8_t((v5 >> 1) | (v6 << 2) | (v7 << 5)); + } + + v0 = subgroupShuffle(cidx, 8); + v1 = subgroupShuffle(cidx, 9); + v2 = subgroupShuffle(cidx, 10); + v3 = subgroupShuffle(cidx, 11); + v4 = subgroupShuffle(cidx, 12); + v5 = subgroupShuffle(cidx, 13); + v6 = subgroupShuffle(cidx, 14); + v7 = subgroupShuffle(cidx, 15); + + if (t == 0) { + data_q[db].qs[3] = uint8_t(v0 | (v1 << 3) | ((v2 & 3) << 6)); + data_q[db].qs[4] = uint8_t((v2 >> 2) | (v3 << 1) | (v4 << 4) | ((v5 & 1) << 7)); + data_q[db].qs[5] = uint8_t((v5 >> 1) | (v6 << 2) | (v7 << 5)); + } + + v0 = subgroupShuffle(cidx, 16); + v1 = subgroupShuffle(cidx, 17); + v2 = subgroupShuffle(cidx, 18); + v3 = 
subgroupShuffle(cidx, 19); + v4 = subgroupShuffle(cidx, 20); + v5 = subgroupShuffle(cidx, 21); + v6 = subgroupShuffle(cidx, 22); + v7 = subgroupShuffle(cidx, 23); + + if (t == 0) { + data_q[db].qs[6] = uint8_t(v0 | (v1 << 3) | ((v2 & 3) << 6)); + data_q[db].qs[7] = uint8_t((v2 >> 2) | (v3 << 1) | (v4 << 4) | ((v5 & 1) << 7)); + data_q[db].qs[8] = uint8_t((v5 >> 1) | (v6 << 2) | (v7 << 5)); + } + + v0 = subgroupShuffle(cidx, 24); + v1 = subgroupShuffle(cidx, 25); + v2 = subgroupShuffle(cidx, 26); + v3 = subgroupShuffle(cidx, 27); + v4 = subgroupShuffle(cidx, 28); + v5 = subgroupShuffle(cidx, 29); + v6 = subgroupShuffle(cidx, 30); + v7 = subgroupShuffle(cidx, 31); + + if (t == 0) { + data_q[db].qs[9] = uint8_t(v0 | (v1 << 3) | ((v2 & 3) << 6)); + data_q[db].qs[10] = uint8_t((v2 >> 2) | (v3 << 1) | (v4 << 4) | ((v5 & 1) << 7)); + data_q[db].qs[11] = uint8_t((v5 >> 1) | (v6 << 2) | (v7 << 5)); + } +#endif // Step 8: Store scales (thread 0 writes both) if (t == 0u) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl index 845a8bd27ab..d4facb90c12 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl @@ -652,30 +652,71 @@ vec2 get_dm(uint ib, uint a_offset) { } #endif -#if defined(DATA_A_TURBO3_0) +#if defined(DATA_A_TQ3_1S) vec2 dequantize(uint ib, uint iqs, uint a_offset) { - // PolarQuant 3-bit centroids (Lloyd-Max for Gaussian) const float centroids[8] = float[8]( - -0.190685, -0.117832, -0.065717, -0.021460, - 0.021460, 0.065717, 0.117832, 0.190685 + -1.996684, -1.291398, -0.740341, -0.247508, + 0.230106, 0.725222, 1.277503, 1.988943 ); + const uint group = iqs / 8u; + const uint i8 = iqs % 8u; + const uint b0 = uint(data_a[a_offset + ib].qs[group * 3 + 0]); + const uint b1 = uint(data_a[a_offset + ib].qs[group * 3 + 1]); + const uint b2 = uint(data_a[a_offset + ib].qs[group * 3 + 2]); + uint idx0, idx1; + switch(i8) { 
+ case 0: idx0 = b0 & 7u; idx1 = (b0 >> 3u) & 7u; break; + case 2: idx0 = ((b0 >> 6u) | (b1 << 2u)) & 7u; idx1 = (b1 >> 1u) & 7u; break; + case 4: idx0 = (b1 >> 4u) & 7u; idx1 = ((b1 >> 7u) | (b2 << 1u)) & 7u; break; + case 6: idx0 = (b2 >> 2u) & 7u; idx1 = (b2 >> 5u) & 7u; break; + } + const float d0 = (iqs < 16) ? float(data_a[a_offset + ib].d0) : float(data_a[a_offset + ib].d1); + const float d1 = ((iqs+1) < 16) ? float(data_a[a_offset + ib].d0) : float(data_a[a_offset + ib].d1); + return vec2(centroids[idx0] * d0, centroids[idx1] * d1); +} +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + vec2 v0 = dequantize(ib, iqs, a_offset); + vec2 v1 = dequantize(ib, iqs + 2, a_offset); + return vec4(v0.x, v0.y, v1.x, v1.y); +} +vec2 get_dm(uint ib, uint a_offset) { + return vec2(1, 0); +} +#endif - // iqs is the element index within the block (0..31), we decode 2 consecutive elements +#if defined(DATA_A_TQ4_1S) +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float centroids[16] = float[16]( + -2.732590, -2.069017, -1.618046, -1.256231, + -0.942340, -0.656759, -0.388048, -0.128395, + 0.128395, 0.388048, 0.656759, 0.942340, + 1.256231, 1.618046, 2.069017, 2.732590 + ); const uint j0 = iqs; const uint j1 = iqs + 1; + const uint idx0 = (uint(data_a[a_offset + ib].qs[j0 / 2]) >> ((j0 % 2) * 4)) & 0xF; + const uint idx1 = (uint(data_a[a_offset + ib].qs[j1 / 2]) >> ((j1 % 2) * 4)) & 0xF; + const float d0 = (j0 < 16) ? float(data_a[a_offset + ib].d0) : float(data_a[a_offset + ib].d1); + const float d1 = (j1 < 16) ? 
float(data_a[a_offset + ib].d0) : float(data_a[a_offset + ib].d1); + return vec2(centroids[idx0] * d0, centroids[idx1] * d1); +} +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + vec2 v0 = dequantize(ib, iqs, a_offset); + vec2 v1 = dequantize(ib, iqs + 2, a_offset); + return vec4(v0.x, v0.y, v1.x, v1.y); +} +vec2 get_dm(uint ib, uint a_offset) { + return vec2(1, 0); +} +#endif - // Extract 2-bit low indices from qs (4 per byte) - const uint low2_0 = (uint(data_a[a_offset + ib].qs[j0 / 4]) >> ((j0 % 4) * 2)) & 0x3; - const uint low2_1 = (uint(data_a[a_offset + ib].qs[j1 / 4]) >> ((j1 % 4) * 2)) & 0x3; - - // Extract 1-bit high from signs (8 per byte) - const uint hi1_0 = (uint(data_a[a_offset + ib].signs[j0 / 8]) >> (j0 % 8)) & 0x1; - const uint hi1_1 = (uint(data_a[a_offset + ib].signs[j1 / 8]) >> (j1 % 8)) & 0x1; - - // Combine to 3-bit index - const uint idx0 = low2_0 | (hi1_0 << 2); - const uint idx1 = low2_1 | (hi1_1 << 2); - +#if defined(DATA_A_TURBO2_0) +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float centroids[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462); + const uint j0 = iqs; + const uint j1 = iqs + 1; + const uint idx0 = (uint(data_a[a_offset + ib].qs[j0 / 4]) >> ((j0 % 4) * 2)) & 0x3; + const uint idx1 = (uint(data_a[a_offset + ib].qs[j1 / 4]) >> ((j1 % 4) * 2)) & 0x3; return vec2(centroids[idx0], centroids[idx1]); } vec4 dequantize4(uint ib, uint iqs, uint a_offset) { @@ -688,33 +729,55 @@ vec2 get_dm(uint ib, uint a_offset) { } #endif -#if defined(DATA_A_TQ4_1S) +#if defined(DATA_A_TURBO4_0) vec2 dequantize(uint ib, uint iqs, uint a_offset) { - // TQ4_1S: 16-level Lloyd-Max centroids for N(0,1) const float centroids[16] = float[16]( - -2.732590, -2.069017, -1.618046, -1.256231, - -0.942340, -0.656759, -0.388048, -0.128395, - 0.128395, 0.388048, 0.656759, 0.942340, - 1.256231, 1.618046, 2.069017, 2.732590 + -0.173926, -0.117195, -0.089527, -0.068756, + -0.051262, -0.035597, -0.020989, -0.006938, + 0.006938, 0.020989, 
0.035597, 0.051262, + 0.068756, 0.089527, 0.117195, 0.173926 + ); + const uint j0 = iqs; + const uint j1 = iqs + 1; + const uint idx0 = (uint(data_a[a_offset + ib].qs[j0 / 2]) >> ((j0 % 2) * 4)) & 0xF; + const uint idx1 = (uint(data_a[a_offset + ib].qs[j1 / 2]) >> ((j1 % 2) * 4)) & 0xF; + return vec2(centroids[idx0], centroids[idx1]); +} +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + vec2 v0 = dequantize(ib, iqs, a_offset); + vec2 v1 = dequantize(ib, iqs + 2, a_offset); + return vec4(v0.x, v0.y, v1.x, v1.y); +} +vec2 get_dm(uint ib, uint a_offset) { + return vec2(float(data_a[a_offset + ib].norm), 0); +} +#endif + +#if defined(DATA_A_TURBO3_0) +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + // PolarQuant 3-bit centroids (Lloyd-Max for Gaussian) + const float centroids[8] = float[8]( + -0.190685, -0.117832, -0.065717, -0.021460, + 0.021460, 0.065717, 0.117832, 0.190685 ); - // iqs is the element pair index within the block (0..15) + // iqs is the element index within the block (0..31), we decode 2 consecutive elements const uint j0 = iqs; const uint j1 = iqs + 1; - // Extract 4-bit nibble indices from qs (2 per byte) - const uint idx0 = (uint(data_a[a_offset + ib].qs[j0 / 2]) >> ((j0 & 1) * 4)) & 0xF; - const uint idx1 = (uint(data_a[a_offset + ib].qs[j1 / 2]) >> ((j1 & 1) * 4)) & 0xF; + // Extract 2-bit low indices from qs (4 per byte) + const uint low2_0 = (uint(data_a[a_offset + ib].qs[j0 / 4]) >> ((j0 % 4) * 2)) & 0x3; + const uint low2_1 = (uint(data_a[a_offset + ib].qs[j1 / 4]) >> ((j1 % 4) * 2)) & 0x3; + + // Extract 1-bit high from signs (8 per byte) + const uint hi1_0 = (uint(data_a[a_offset + ib].signs[j0 / 8]) >> (j0 % 8)) & 0x1; + const uint hi1_1 = (uint(data_a[a_offset + ib].signs[j1 / 8]) >> (j1 % 8)) & 0x1; - // Scale by d0 (elements 0-15) or d1 (elements 16-31) - const float d0 = float(data_a[a_offset + ib].d0); - const float d1 = float(data_a[a_offset + ib].d1); - const float s0 = (j0 < 16) ? d0 : d1; - const float s1 = (j1 < 16) ? 
d0 : d1; + // Combine to 3-bit index + const uint idx0 = low2_0 | (hi1_0 << 2); + const uint idx1 = low2_1 | (hi1_1 << 2); - // Returns centroid * scale WITHOUT RHT inverse - // (caller must handle pre-rotation for correctness) - return vec2(centroids[idx0] * s0, centroids[idx1] * s1); + return vec2(centroids[idx0], centroids[idx1]); } vec4 dequantize4(uint ib, uint iqs, uint a_offset) { vec2 v0 = dequantize(ib, iqs, a_offset); @@ -722,7 +785,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) { return vec4(v0.x, v0.y, v1.x, v1.y); } vec2 get_dm(uint ib, uint a_offset) { - // No global scale/min — scales are applied per-element in dequantize() - return vec2(1, 0); + return vec2(float(data_a[a_offset + ib].norm), 0); } #endif + diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl index f2f0e06bf36..0850a101218 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl @@ -715,29 +715,59 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords } #endif +#if defined(DATA_A_TQ3_1S) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTQ3_1S { + block_tq3_1s block; +}; +float16_t dequantFuncTQ3_1S(const in decodeBufTQ3_1S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { return float16_t(0); } +#endif + +#if defined(DATA_A_TQ4_1S) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTQ4_1S { + block_tq4_1s block; +}; +float16_t dequantFuncTQ4_1S(const in decodeBufTQ4_1S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { return float16_t(0); } +#endif + +#if defined(DATA_A_TURBO2_0) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTURBO2_0 { + block_turbo2_0 block; +}; +float16_t dequantFuncTURBO2_0(const in decodeBufTURBO2_0 bl, const in uint blockCoords[2], const in uint 
coordInBlock[2]) +{ + const float centroids[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462); + const float norm = float(bl.block.norm); + const uint j = coordInBlock[1]; + const uint idx = (uint(bl.block.qs[j / 4]) >> ((j % 4) * 2)) & 0x3; + return float16_t(centroids[idx] * norm); +} +#endif + #if defined(DATA_A_TURBO3_0) layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTURBO3_0 { block_turbo3_0 block; }; - float16_t dequantFuncTURBO3_0(const in decodeBufTURBO3_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { - const float centroids[8] = float[8]( - -0.190685, -0.117832, -0.065717, -0.021460, - 0.021460, 0.065717, 0.117832, 0.190685 - ); + const float centroids[8] = float[8](-0.190685, -0.117832, -0.065717, -0.021460, 0.021460, 0.065717, 0.117832, 0.190685); const float norm = float(bl.block.norm); const uint j = coordInBlock[1]; - - // Extract 2-bit low index from qs (4 per byte) const uint low2 = (uint(bl.block.qs[j / 4]) >> ((j % 4) * 2)) & 0x3; - - // Extract 1-bit high from signs (8 per byte) const uint hi1 = (uint(bl.block.signs[j / 8]) >> (j % 8)) & 0x1; + return float16_t(centroids[low2 | (hi1 << 2)] * norm); +} +#endif - // Combine to 3-bit index - const uint idx = low2 | (hi1 << 2); - +#if defined(DATA_A_TURBO4_0) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTURBO4_0 { + block_turbo4_0 block; +}; +float16_t dequantFuncTURBO4_0(const in decodeBufTURBO4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float centroids[16] = float[16](-0.173926, -0.117195, -0.089527, -0.068756, -0.051262, -0.035597, -0.020989, -0.006938, 0.006938, 0.020989, 0.035597, 0.051262, 0.068756, 0.089527, 0.117195, 0.173926); + const float norm = float(bl.block.norm); + const uint j = coordInBlock[1]; + const uint idx = (uint(bl.block.qs[j / 2]) >> ((j % 2) * 4)) & 0xF; return float16_t(centroids[idx] * norm); } #endif @@ -790,8 +820,16 @@ float16_t 
dequantFuncTURBO3_0(const in decodeBufTURBO3_0 bl, const in uint block #define dequantFuncA dequantFuncMXFP4 #elif defined(DATA_A_NVFP4) #define dequantFuncA dequantFuncNVFP4 +#elif defined(DATA_A_TQ3_1S) +#define dequantFuncA dequantFuncTQ3_1S +#elif defined(DATA_A_TQ4_1S) +#define dequantFuncA dequantFuncTQ4_1S +#elif defined(DATA_A_TURBO2_0) +#define dequantFuncA dequantFuncTURBO2_0 #elif defined(DATA_A_TURBO3_0) #define dequantFuncA dequantFuncTURBO3_0 +#elif defined(DATA_A_TURBO4_0) +#define dequantFuncA dequantFuncTURBO4_0 #elif defined(DATA_A_F32) #define dequantFuncA dequantFuncF32 #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_tq3_1s.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_tq3_1s.comp new file mode 100644 index 00000000000..d836de1d4e2 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_tq3_1s.comp @@ -0,0 +1,66 @@ +#version 450 + +#include "dequant_head.glsl" + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {block_tq3_1s data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; + +void main() { + const float centroids[8] = float[8]( + -1.996684, -1.291398, -0.740341, -0.247508, + 0.230106, 0.725222, 1.277503, 1.988943 + ); + + const float signs[32] = float[32]( + +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0, + -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0, + -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0, + -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0 + ); + + const float INV_SQRT32 = 0.17677669529663688; + + const uint ib = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x; + if (ib >= p.nel / 32) return; + + const float d0 = float(data_a[ib].d0); + const float d1 = float(data_a[ib].d1); + + float buf[32]; + for (int group = 0; group < 4; group++) { + const uint b0 = uint(data_a[ib].qs[group * 3 + 0]); + const uint b1 = uint(data_a[ib].qs[group * 3 + 1]); + const uint b2 = uint(data_a[ib].qs[group * 3 + 2]); + + 
buf[group * 8 + 0] = centroids[b0 & 7u]; + buf[group * 8 + 1] = centroids[(b0 >> 3u) & 7u]; + buf[group * 8 + 2] = centroids[((b0 >> 6u) | (b1 << 2u)) & 7u]; + buf[group * 8 + 3] = centroids[(b1 >> 1u) & 7u]; + buf[group * 8 + 4] = centroids[(b1 >> 4u) & 7u]; + buf[group * 8 + 5] = centroids[((b1 >> 7u) | (b2 << 1u)) & 7u]; + buf[group * 8 + 6] = centroids[(b2 >> 2u) & 7u]; + buf[group * 8 + 7] = centroids[(b2 >> 5u) & 7u]; + } + + for (int j = 0; j < 32; j++) { + buf[j] *= (j < 16) ? d0 : d1; + } + + for (uint step = 1u; step < 32u; step <<= 1u) { + for (uint i = 0u; i < 32u; i += step * 2u) { + for (uint j = i; j < i + step; j++) { + const float a = buf[j]; + const float b = buf[j + step]; + buf[j] = a + b; + buf[j + step] = a - b; + } + } + } + + const uint out_base = ib * 32u; + for (int j = 0; j < 32; j++) { + data_b[out_base + uint(j)] = D_TYPE(buf[j] * INV_SQRT32 * signs[j]); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_turbo2_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_turbo2_0.comp new file mode 100644 index 00000000000..e2c8fad542a --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_turbo2_0.comp @@ -0,0 +1,23 @@ +#version 450 + +#include "dequant_head.glsl" + +// 128 elements per block (QK_TURBO2 = 128) +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {block_turbo2_0 data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; + +void main() { + const float centroids[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462); + + const uint ib = gl_WorkGroupID.x; + const uint j = gl_LocalInvocationID.x; + + if (ib >= p.nel / 128) return; + + const float norm = float(data_a[ib].norm); + const uint idx = (uint(data_a[ib].qs[j / 4]) >> ((j % 4) * 2)) & 0x3; + + data_b[ib * 128 + j] = D_TYPE(centroids[idx] * norm); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_turbo4_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_turbo4_0.comp new 
file mode 100644 index 00000000000..532eb03c302 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_turbo4_0.comp @@ -0,0 +1,28 @@ +#version 450 + +#include "dequant_head.glsl" + +// 128 elements per block (QK_TURBO4 = 128) +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {block_turbo4_0 data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; + +void main() { + const float centroids[16] = float[16]( + -0.173926, -0.117195, -0.089527, -0.068756, + -0.051262, -0.035597, -0.020989, -0.006938, + 0.006938, 0.020989, 0.035597, 0.051262, + 0.068756, 0.089527, 0.117195, 0.173926 + ); + + const uint ib = gl_WorkGroupID.x; + const uint j = gl_LocalInvocationID.x; + + if (ib >= p.nel / 128) return; + + const float norm = float(data_a[ib].norm); + const uint idx = (uint(data_a[ib].qs[j / 2]) >> ((j % 2) * 4)) & 0xF; + + data_b[ib * 128 + j] = D_TYPE(centroids[idx] * norm); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index 1878eb7e1aa..b81d1059d45 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -84,9 +84,15 @@ layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];}; #if defined(DATA_A_F32) layout (binding = 1) readonly buffer K_PACKED {vec4 k_data_packed[];} k_packed; layout (binding = 2) readonly buffer V_PACKED {vec4 v_data_packed[];} v_packed; +#elif defined(DATA_A_TURBO2_0) +layout (binding = 1) readonly buffer K_T2 {block_turbo2_0 data_k_t2[];}; +layout (binding = 2) readonly buffer V_T2 {block_turbo2_0 data_v_t2[];}; #elif defined(DATA_A_TURBO3_0) layout (binding = 1) readonly buffer K_T3 {block_turbo3_0 data_k_t3[];}; layout (binding = 2) readonly buffer V_T3 {block_turbo3_0 data_v_t3[];}; +#elif defined(DATA_A_TURBO4_0) +layout (binding = 1) readonly buffer K_T4 {block_turbo4_0 data_k_t4[];}; 
+layout (binding = 2) readonly buffer V_T4 {block_turbo4_0 data_v_t4[];}; #elif defined(A_TYPE_PACKED16) layout (binding = 1) readonly buffer K_PACKED16 {A_TYPE_PACKED16 k_data_packed16[];} k_packed; layout (binding = 2) readonly buffer V_PACKED16 {A_TYPE_PACKED16 v_data_packed16[];} v_packed; @@ -102,8 +108,12 @@ layout (binding = 2) readonly buffer V_PACKED32 {A_TYPE_PACKED32 v_data_packed32 #endif // turbo3: define BLOCK_BYTE_SIZE early (before first use in FA offset computation) -#if defined(DATA_A_TURBO3_0) && !defined(BLOCK_BYTE_SIZE) +#if defined(DATA_A_TURBO2_0) && !defined(BLOCK_BYTE_SIZE) +#define BLOCK_BYTE_SIZE 34 +#elif defined(DATA_A_TURBO3_0) && !defined(BLOCK_BYTE_SIZE) #define BLOCK_BYTE_SIZE 50 // block_turbo3_0: 2 (norm) + 32 (qs) + 16 (signs) = 50 bytes +#elif defined(DATA_A_TURBO4_0) && !defined(BLOCK_BYTE_SIZE) +#define BLOCK_BYTE_SIZE 68 #endif #if defined(DATA_A_F32) @@ -260,32 +270,84 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { } #endif +#if defined(DATA_A_TURBO2_0) +const float T2C[4] = float[4](-0.133462, -0.039994, 0.039994, 0.133462); +// iqs is always a multiple of 4 (4 consecutive elements within the same qs byte group) +FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { + float nm; + uint qb; + if (binding_idx == BINDING_IDX_K) { + nm = float(data_k_t2[a_offset + ib].norm); + qb = uint(data_k_t2[a_offset + ib].qs[iqs / 4]); + } else { + nm = float(data_v_t2[a_offset + ib].norm); + qb = uint(data_v_t2[a_offset + ib].qs[iqs / 4]); + } + return FLOAT_TYPEV4( + T2C[(qb ) & 0x3] * nm, + T2C[(qb >> 2) & 0x3] * nm, + T2C[(qb >> 4) & 0x3] * nm, + T2C[(qb >> 6) & 0x3] * nm + ); +} +#endif + #if defined(DATA_A_TURBO3_0) const float T3C[8] = float[8]( -0.190685, -0.117832, -0.065717, -0.021460, 0.021460, 0.065717, 0.117832, 0.190685 ); +// iqs is always a multiple of 4: all 4 elements share the same qs byte and same signs byte. 
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { - FLOAT_TYPEV4 r; - for (int k = 0; k < 4; k++) { - uint j = iqs + uint(k); - float nm; - uint qb; - uint sb; - if (binding_idx == BINDING_IDX_K) { - nm = float(data_k_t3[a_offset + ib].norm); - qb = uint(data_k_t3[a_offset + ib].qs[j / 4]); - sb = uint(data_k_t3[a_offset + ib].signs[j / 8]); - } else { - nm = float(data_v_t3[a_offset + ib].norm); - qb = uint(data_v_t3[a_offset + ib].qs[j / 4]); - sb = uint(data_v_t3[a_offset + ib].signs[j / 8]); - } - uint lo = (qb >> ((j % 4) * 2)) & 0x3; - uint hi = (sb >> (j % 8)) & 0x1; - r[k] = FLOAT_TYPE(T3C[lo | (hi << 2)] * nm); + float nm; + uint qb; + uint sb; + if (binding_idx == BINDING_IDX_K) { + nm = float(data_k_t3[a_offset + ib].norm); + qb = uint(data_k_t3[a_offset + ib].qs[iqs / 4]); + sb = uint(data_k_t3[a_offset + ib].signs[iqs / 8]); + } else { + nm = float(data_v_t3[a_offset + ib].norm); + qb = uint(data_v_t3[a_offset + ib].qs[iqs / 4]); + sb = uint(data_v_t3[a_offset + ib].signs[iqs / 8]); + } + // iqs is a multiple of 4; within the signs byte, the 4 elements start at bit (iqs%8). + uint sshift = iqs & 4u; // 0 if iqs%8 < 4, else 4 + return FLOAT_TYPEV4( + T3C[((qb ) & 0x3) | (((sb >> (sshift + 0)) & 1u) << 2)] * nm, + T3C[((qb >> 2) & 0x3) | (((sb >> (sshift + 1)) & 1u) << 2)] * nm, + T3C[((qb >> 4) & 0x3) | (((sb >> (sshift + 2)) & 1u) << 2)] * nm, + T3C[((qb >> 6) & 0x3) | (((sb >> (sshift + 3)) & 1u) << 2)] * nm + ); +} +#endif + +#if defined(DATA_A_TURBO4_0) +const float T4C[16] = float[16]( + -0.173926, -0.117195, -0.089527, -0.068756, + -0.051262, -0.035597, -0.020989, -0.006938, + 0.006938, 0.020989, 0.035597, 0.051262, + 0.068756, 0.089527, 0.117195, 0.173926 +); +// iqs is always even: pairs of elements share a qs nibble byte. 
+FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { + float nm; + uint qb0, qb1; + if (binding_idx == BINDING_IDX_K) { + nm = float(data_k_t4[a_offset + ib].norm); + qb0 = uint(data_k_t4[a_offset + ib].qs[iqs / 2 ]); + qb1 = uint(data_k_t4[a_offset + ib].qs[iqs / 2 + 1]); + } else { + nm = float(data_v_t4[a_offset + ib].norm); + qb0 = uint(data_v_t4[a_offset + ib].qs[iqs / 2 ]); + qb1 = uint(data_v_t4[a_offset + ib].qs[iqs / 2 + 1]); } - return r; + return FLOAT_TYPEV4( + T4C[ qb0 & 0xF] * nm, + T4C[(qb0 >> 4) & 0xF] * nm, + T4C[ qb1 & 0xF] * nm, + T4C[(qb1 >> 4) & 0xF] * nm + ); } #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq3_1s.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq3_1s.comp new file mode 100644 index 00000000000..897484d7dd6 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq3_1s.comp @@ -0,0 +1,120 @@ +#version 450 + +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require + +#include "mul_mat_vec_base.glsl" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +// Lloyd-Max centroids for TQ3_1S (3-bit, 8 levels) — N(0, 1) optimal +const float TQ3_CENTROIDS[8] = float[8]( + -1.996684, -1.291398, -0.740341, -0.247508, + 0.230106, 0.725222, 1.277503, 1.988943 +); + +// WHT sign pattern for 32-element blocks (shared by TQ3 and TQ4) +const float TQ_SIGNS[32] = float[32]( + +1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0, + -1.0, -1.0, +1.0, -1.0, +1.0, +1.0, -1.0, +1.0, + -1.0, -1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0, + -1.0, +1.0, +1.0, -1.0, +1.0, -1.0, -1.0, +1.0 +); + +const float TQ_INV_SQRT32 = 0.17677669529663688; + +shared float tq3_smem[8 * 32]; + +void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { + const uint tid = gl_LocalInvocationID.x; + + uint a_offset, b_offset, d_offset; + get_offsets(a_offset, b_offset, d_offset); + + FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { 
+ [[unroll]] for (uint n = 0; n < NUM_ROWS; ++n) { + temp[j][n] = FLOAT_TYPE(0); + } + } + + const uint num_blocks_per_row = p.ncols / 32u; + const float sign_tid = TQ_SIGNS[tid]; + + for (uint blk = 0; blk < num_blocks_per_row; blk++) { + // --- Stage 1: load activation, sign-flip, write to shared memory --- + [[unroll]] for (uint c = 0; c < NUM_COLS; ++c) { + const uint b_base = c * p.batch_stride_b + b_offset + blk * 32u; + tq3_smem[c * 32u + tid] = float(data_b[b_base + tid]) * sign_tid; + } + barrier(); + + // --- Stage 2: forward WHT butterfly in shared memory (5 stages) --- + [[unroll]] for (uint step = 1u; step < 32u; step <<= 1u) { + if ((tid & step) == 0u) { + const uint partner = tid + step; + [[unroll]] for (uint c = 0; c < NUM_COLS; ++c) { + const uint base = c * 32u; + const float a = tq3_smem[base + tid]; + const float b = tq3_smem[base + partner]; + tq3_smem[base + tid] = a + b; + tq3_smem[base + partner] = a - b; + } + } + barrier(); + } + + // --- Stage 3: dequant all rows' weights for this block position --- + float w_vals[NUM_ROWS]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib = (first_row + n) * num_blocks_per_row + blk; + const uint group = tid / 8u; + const uint i8 = tid % 8u; + + const uint b0 = uint(data_a[a_offset + ib].qs[group * 3 + 0]); + const uint b1 = uint(data_a[a_offset + ib].qs[group * 3 + 1]); + const uint b2 = uint(data_a[a_offset + ib].qs[group * 3 + 2]); + + uint idx; + switch(i8) { + case 0: idx = b0 & 7u; break; + case 1: idx = (b0 >> 3u) & 7u; break; + case 2: idx = ((b0 >> 6u) | (b1 << 2u)) & 7u; break; + case 3: idx = (b1 >> 1u) & 7u; break; + case 4: idx = (b1 >> 4u) & 7u; break; + case 5: idx = ((b1 >> 7u) | (b2 << 1u)) & 7u; break; + case 6: idx = (b2 >> 2u) & 7u; break; + case 7: idx = (b2 >> 5u) & 7u; break; + } + + const float d = (tid < 16u) + ? 
float(data_a[a_offset + ib].d0) + : float(data_a[a_offset + ib].d1); + w_vals[n] = TQ3_CENTROIDS[idx] * d * TQ_INV_SQRT32; + } + + // --- Stage 4: accumulate dot products --- + [[unroll]] for (uint c = 0; c < NUM_COLS; ++c) { + const float b_rotated = tq3_smem[c * 32u + tid]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + temp[c][n] += FLOAT_TYPE(w_vals[n] * b_rotated); + } + } + + barrier(); + } + + reduce_result(temp, d_offset, first_row, num_rows, tid); +} + +void main() { + const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); + + if (first_row + NUM_ROWS <= p.stride_d) { + compute_outputs(first_row, NUM_ROWS); + } else { + if (first_row >= p.stride_d) { + return; + } + compute_outputs(first_row, p.stride_d - first_row); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/turbo_wht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/turbo_wht.comp index 914875eba7a..1cf87dedfa9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/turbo_wht.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/turbo_wht.comp @@ -3,6 +3,8 @@ #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_control_flow_attributes : require +#extension GL_KHR_shader_subgroup_shuffle : require + layout (local_size_x = 128, local_size_y = 1, local_size_z = 1) in; layout (push_constant) uniform parameter { uint ne; uint direction; uint group_size; } p; @@ -47,21 +49,29 @@ void main() { const float fs = (p.direction == 0) ? S1[tid] : S2[tid]; const float ss = (p.direction == 0) ? S2[tid] : S1[tid]; - x[tid] = data_a[base + tid]; - barrier(); + float val = data_a[base + tid] * fs; - x[tid] *= fs; - barrier(); + // Optimized WHT using subgroup shuffle + uint sg_size = gl_SubgroupSize; + for (uint h = 1; h < sg_size && h < 128; h *= 2) { + float other = subgroupShuffleXor(val, h); + val = ((tid & h) == 0) ? 
(val + other) : (other - val); + } - [[unroll]] for (uint h = 1; h < 128; h *= 2) { - if ((tid % (2 * h)) < h) { - float a = x[tid]; - float b = x[tid + h]; - x[tid] = a + b; - x[tid + h] = a - b; - } + if (sg_size < 128) { + x[tid] = val; barrier(); + for (uint h = sg_size; h < 128; h *= 2) { + if ((tid % (2 * h)) < h) { + float a = x[tid]; + float b = x[tid + h]; + x[tid] = a + b; + x[tid + h] = a - b; + } + barrier(); + } + val = x[tid]; } - data_d[base + tid] = x[tid] * INV_SQRT_128 * ss; + data_d[base + tid] = val * INV_SQRT_128 * ss; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl index c386d300841..20c7cbf5333 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl @@ -1749,11 +1749,13 @@ struct block_turbo3_0 #define QUANT_K_TURBO2_0 128 #define QUANT_R_TURBO2_0 1 + struct block_turbo2_0 { float16_t norm; uint8_t qs[32]; // 2-bit centroid indices (4 per byte), 128/4 = 32 bytes }; + #if defined(DATA_A_TURBO2_0) #define QUANT_K QUANT_K_TURBO2_0 #define QUANT_R QUANT_R_TURBO2_0 @@ -1776,6 +1778,22 @@ struct block_turbo4_0 #define A_TYPE block_turbo4_0 #endif +#define QUANT_K_TQ3_1S 32 +#define QUANT_R_TQ3_1S 1 + +struct block_tq3_1s +{ + float16_t d0; + float16_t d1; + uint8_t qs[12]; // 3-bit indices packed (32 * 3 / 8 = 12 bytes) +}; + +#if defined(DATA_A_TQ3_1S) +#define QUANT_K QUANT_K_TQ3_1S +#define QUANT_R QUANT_R_TQ3_1S +#define QUANT_AUXF 1 +#define A_TYPE block_tq3_1s +#endif #define QUANT_K_TQ4_1S 32 #define QUANT_R_TQ4_1S 1 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen new file mode 100755 index 00000000000..4b4711a9713 Binary files /dev/null and b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen differ diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 
e3e7952b93a..e904c4d796a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -68,7 +68,10 @@ const std::vector type_names = { "mxfp4", "nvfp4", "bf16", + "turbo2_0", "turbo3_0", + "turbo4_0", + "tq3_1s", "tq4_1s", }; @@ -327,7 +330,11 @@ compile_count_guard acquire_compile_slot() { } void string_to_spv_func(std::string name, std::string in_path, std::string out_path, std::map defines, bool coopmat, bool dep_file, compile_count_guard slot) { - std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; + bool needs_vulkan13 = name.find("_cm2") != std::string::npos || + name.find("_cm1") != std::string::npos || + name.find("_int8") != std::string::npos || + name.find("q8_1") != std::string::npos; + std::string target_env = needs_vulkan13 ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; #ifdef _WIN32 std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "\"" + in_path + "\"", "-o", "\"" + out_path + "\""}; @@ -668,7 +675,7 @@ void process_shaders() { if (tname == "f16") { string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp", merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}), fp16, true, false, f16acc); - } else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32" || tname == "turbo3_0") { + } else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32" || tname == "turbo3_0" || tname == "turbo2_0" || tname == "turbo4_0") { std::string data_a_key = "DATA_A_" + to_uppercase(tname); string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp", merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", 
"QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), fp16, true, false, f16acc); @@ -679,13 +686,13 @@ void process_shaders() { if (tname == "f16") { string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp", merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), fp16, false, false, f16acc); - } else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32" || tname == "turbo3_0") { + } else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32" || tname == "turbo3_0" || tname == "turbo2_0" || tname == "turbo4_0") { std::string data_a_key = "DATA_A_" + to_uppercase(tname); string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp", merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), fp16, false, false, f16acc); #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) - // MMQ path has no turbo3_0 code path; skip. - if (tname != "f32" && tname != "turbo3_0") { + // MMQ path has no turbo3_0/turbo2_0/turbo4_0 code path; skip. + if (tname != "f32" && tname != "turbo3_0" && tname != "turbo2_0" && tname != "turbo4_0") { string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp", merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }, {"MMQ", "1"}}), fp16, false, false, f16acc, "_int8"); } @@ -700,7 +707,7 @@ void process_shaders() { for (const auto& tname : type_names) { // mul mat vec std::string data_a_key = "DATA_A_" + to_uppercase(tname); - std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_") || tname == "tq4_1s") ? 
"mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp"; + std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_") || tname == "tq3_1s" || tname == "tq4_1s") ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp"; string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPEV2", "vec2"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}})); string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV2", "f16vec2"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}})); @@ -781,12 +788,12 @@ void process_shaders() { string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); } - // turbo3_0 copy-from-quant only; copy-to-quant (cpy_f32_turbo3_0) omitted because the non-SET_ROWS quantize() path lacks the WHT transform - string_to_spv("cpy_turbo3_0_f32", "copy_from_quant.comp", {{"DATA_A_TURBO3_0", "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); - // tq4_1s copy-from-quant only; copy-to-quant requires WHT forward (handled in SET_ROWS path) - string_to_spv("cpy_tq4_1s_f32", "copy_from_quant.comp", {{"DATA_A_TQ4_1S", "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + // turbo copy-from-quant; copy-to-quant (cpy_f32_turbo) omitted because the non-SET_ROWS quantize() path lacks the WHT transform + for (std::string t : {"turbo2_0", "turbo3_0", "turbo4_0", "tq3_1s", "tq4_1s"}) { + string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); + } - for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", 
"q8_0", "iq4_nl", "turbo2_0", "turbo3_0", "turbo4_0", "tq4_1s"}) { + for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl", "turbo2_0", "turbo3_0", "turbo4_0", "tq3_1s", "tq4_1s"}) { string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); }