From 6b74955f05df158af326df442e61bc30c39ca302 Mon Sep 17 00:00:00 2001
From: "ziyanyi.zyy"
Date: Wed, 24 Sep 2025 09:30:30 +0800
Subject: [PATCH 1/9] feat: rocm amd optimizations

---
 .bazelrc | 1 +
 3rdparty/aiter/0003-gemm_tune.patch | 1985 +++++++++++------
 3rdparty/aiter/BUILD | 225 +-
 3rdparty/aiter/aiter-flash_attn.patch | 13 +
 3rdparty/aiter/aiter-fmha.patch | 22 +
 3rdparty/aiter/refine-aiter-asm-dir.patch | 47 +
 3rdparty/aiter/rtp-llm.patch | 26 +-
 3rdparty/aiter/silu.patch | 151 ++
 .../bin/crosstool_wrapper_driver_rocm.tpl | 4 +-
 BUILD.aiter | 183 +-
 WORKSPACE | 8 +-
 patched_repo.bzl | 27 +
 rtp_llm/BUILD | 2 +-
 rtp_llm/config/gpt_init_model_parameters.py | 12 +-
 rtp_llm/config/py_config_modules.py | 3 +
 rtp_llm/cpp/cache/BUILD | 1 +
 rtp_llm/cpp/cache/KVCacheAllocator.cc | 15 +-
 rtp_llm/cpp/config/ConfigModules.cc | 6 +-
 rtp_llm/cpp/config/ConfigModules.h | 2 +
 rtp_llm/cpp/devices/DeviceBase.h | 4 +
 rtp_llm/cpp/devices/DeviceData.h | 1 +
 rtp_llm/cpp/devices/DeviceFactory.cc | 1 +
 rtp_llm/cpp/devices/OpData.h | 2 +
 .../cpp/devices/base_impl/AttentionLayer.cc | 9 +-
 .../devices/base_tests/AttentionOpTest.hpp | 78 +-
 rtp_llm/cpp/devices/base_tests/GemmOpTest.hpp | 75 +-
 rtp_llm/cpp/devices/rocm_impl/BUILD | 13 +-
 .../cpp/devices/rocm_impl/ROCmAttentionOp.cc | 64 +-
 rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc | 2 +-
 rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h | 6 +-
 rtp_llm/cpp/devices/rocm_impl/ROCmFfnLayer.cc | 334 +--
 rtp_llm/cpp/devices/rocm_impl/ROCmGemmOp.cc | 34 +-
 .../cpp/devices/rocm_impl/ROCmLayernorm.cc | 265 +--
 .../cpp/devices/rocm_impl/ROCmQuantizeOp.cc | 12 +-
 rtp_llm/cpp/devices/rocm_impl/aiterPA.cc | 48 +-
 rtp_llm/cpp/devices/rocm_impl/aiterPA.h | 3 +
 rtp_llm/cpp/devices/rocm_impl/test/BUILD | 6 +-
 .../rocm_impl/test/ops/LayernormTest.cc | 2 +-
 .../rocm_impl/test/ops/ROCmAttentionOpTest.cc | 33 +-
 .../rocm_impl/test/ops/ROCmGemmOpTest.cc | 12 +-
 rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h | 28 +-
 rtp_llm/cpp/kernels/rocm/fused_qk_rmsnorm.cu | 122 +-
 .../cpp/kernels/rotary_position_embedding.h | 173 +-
 .../cpp/kernels/unfused_attention_kernels.cu | 862 ++++++-
 .../cpp/kernels/unfused_attention_kernels.h | 110 +
 rtp_llm/cpp/models/GptModel.cc | 9 +-
 rtp_llm/cpp/models/GptModel.h | 1 +
 rtp_llm/cpp/pybind/ConfigInit.cc | 6 +-
 rtp_llm/cpp/rocm/BUILD | 15 +-
 rtp_llm/cpp/rocm/TensorDataManipulation.h | 412 ++++
 rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc | 47 +-
 rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h | 2 +-
 rtp_llm/cpp/rocm/datatype_interface.h | 95 +
 rtp_llm/cpp/rocm/hipblasMMWrapper.cc | 172 +-
 rtp_llm/cpp/rocm/hipblasMMWrapper.h | 22 +
 rtp_llm/cpp/rocm/rocmFmhaWrapper.cc | 66 +-
 rtp_llm/cpp/utils/utils.h | 2 +-
 rtp_llm/device/device_impl.py | 9 +-
 rtp_llm/libs/BUILD | 68 +-
 .../per_channel_fp8_quant_weight.py | 4 +-
 rtp_llm/models/base_model.py | 4 +
 rtp_llm/models/qwen_v2.py | 4 +
 .../bindings/rocm/FusedRopeKVCacheOp.cc | 15 +-
 rtp_llm/models_py/modules/rocm/fmha.py | 32 +-
 rtp_llm/models_py/test/rocm_fmha_test.py | 4 +-
 rtp_llm/ops/libth_transformer.pyi | 4 +
 .../server_args/hw_kernel_group_args.py | 16 +
 .../server_args/test/server_args_test.py | 7 +
 rtp_llm/utils/model_weight.py | 24 +-
 rtp_llm/utils/swizzle_utils.py | 53 +
 tests/BUILD | 30 +-
 tests/ffn/rocm_ffn_moe_fp8_ptpc_test.cc | 122 +
 tests/ffn/rocm_ffn_moe_fp8_ptpc_test.py | 353 +++
 tests/gemm/rocm_ptpc_gemm_op_test.py | 67 +-
 tests/layernorm/fusedQkRmsNorm.cpp | 14 +-
 75 files changed, 5282 insertions(+), 1429 deletions(-)
 create mode 100644 3rdparty/aiter/aiter-flash_attn.patch
 create mode 100644
3rdparty/aiter/aiter-fmha.patch create mode 100644 3rdparty/aiter/refine-aiter-asm-dir.patch create mode 100644 3rdparty/aiter/silu.patch create mode 100644 patched_repo.bzl create mode 100644 rtp_llm/cpp/rocm/TensorDataManipulation.h create mode 100644 rtp_llm/cpp/rocm/datatype_interface.h create mode 100644 rtp_llm/utils/swizzle_utils.py create mode 100644 tests/ffn/rocm_ffn_moe_fp8_ptpc_test.cc create mode 100644 tests/ffn/rocm_ffn_moe_fp8_ptpc_test.py diff --git a/.bazelrc b/.bazelrc index 68145567f..0ab78ab92 100644 --- a/.bazelrc +++ b/.bazelrc @@ -167,6 +167,7 @@ build:asan --linkopt -fsanitize=address test:rocm --test_env PATH="/opt/rocm/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/opt/conda310/bin:/opt/conda310/condabin:/usr/share/Modules/bin:/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/X11R6/bin:/opt/cmake/cmake-3.26.4/bin:$PATH" test:rocm --test_env HOME=/home/admin test:rocm --test_env LD_LIBRARY_PATH="/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rocm/lib:/opt/conda310/lib/:/usr/lib64:/opt/amdgpu/lib64:$LD_LIBRARY_PATH" +test:rocm --test_env LD_PRELOAD="/opt/conda310/lib/libstdc++.so" test --test_env LD_LIBRARY_PATH="/opt/rocm/lib:/opt/conda310/lib/:/usr/local/nvidia/lib64:/usr/lib64:/usr/local/cuda/lib64:/opt/amdgpu/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" test --test_env OMP_NUM_THREADS=8 test --test_env FT_SERVER_TEST="1" diff --git a/3rdparty/aiter/0003-gemm_tune.patch b/3rdparty/aiter/0003-gemm_tune.patch index 5e58f6824..9d79521ee 100644 --- a/3rdparty/aiter/0003-gemm_tune.patch +++ b/3rdparty/aiter/0003-gemm_tune.patch @@ -569,705 +569,1286 @@ --- aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv +++ aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv -@@ -28,3 +28,701 @@ - 4096,4608,4096,68,0,428.3401,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 - 16384,4608,4096,68,0,1645.6259,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 - 32768,4608,4096,68,0,3239.7462,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+1,1280,8192,10,0,13.203,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+32,1280,8192,10,0,12.4022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+64,1280,8192,11,0,14.0131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+128,1280,8192,6,0,20.2891,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+192,1280,8192,11,0,29.5243,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+256,1280,8192,7,0,34.5595,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+320,1280,8192,11,0,45.224,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+512,1280,8192,65,0,45.9428,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+1024,1280,8192,0,0,69.4885,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+2048,1280,8192,70,0,119.7763,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4096,1280,8192,70,0,233.5804,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+8192,1280,8192,0,0,461.6483,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+16384,1280,8192,0,0,916.2015,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+1,8192,1024,15,0,6.0138,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,8192,1024,16,0,9.159,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,8192,1024,16,0,12.0359,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+128,8192,1024,62,0,16.6079,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+192,8192,1024,51,0,23.3947,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+256,8192,1024,0,0,24.1947,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+320,8192,1024,54,0,32.8684,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+512,8192,1024,0,0,43.926,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+1024,8192,1024,70,0,73.5021,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+2048,8192,1024,70,0,131.5924,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4096,8192,1024,70,0,252.1069,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+8192,8192,1024,0,0,487.8416,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+16384,8192,1024,70,0,951.936,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+1,6144,4096,10,0,11.5922,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,4096,4096,11,0,9.1638,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,28672,4096,11,0,34.6164,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,4096,14336,5,0,23.7291,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,2048,1536,10,0,5.0934,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,1536,1536,10,0,4.737,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,17920,1536,16,0,10.6127,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+1,1536,8960,15,0,21.5419,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,896,5120,10,0,9.8038,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,5120,640,9,0,5.493,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,6912,5120,10,0,13.7867,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,5120,3456,23,0,16.7547,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+1,7168,8192,10,0,20.0815,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,8192,3584,19,0,11.9131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+2,6144,4096,10,0,11.6979,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,4096,4096,5,0,9.3886,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 
-+2,28672,4096,19,0,34.956,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+2,4096,14336,11,0,23.7463,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+2,2048,1536,10,0,5.0882,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,1536,1536,10,0,4.787,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,17920,1536,16,0,10.7966,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+2,1536,8960,15,0,21.7351,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+2,896,5120,10,0,9.2858,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,5120,640,9,0,5.4654,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,6912,5120,10,0,13.8355,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,5120,3456,23,0,16.7947,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+2,1280,8192,10,0,13.1587,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,8192,1024,15,0,6.347,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+2,7168,8192,10,0,20.3891,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,8192,3584,19,0,11.9619,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+4,6144,4096,10,0,11.7646,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,4096,4096,11,0,9.2154,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,28672,4096,19,0,36.0688,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+4,4096,14336,5,0,23.8147,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,2048,1536,10,0,5.2298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,1536,1536,10,0,4.8234,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,17920,1536,16,0,11.0062,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+4,1536,8960,15,0,21.7499,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,896,5120,10,0,9.381,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,5120,640,9,0,5.4974,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,6912,5120,10,0,13.8835,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,5120,3456,9,0,16.8155,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,1280,8192,10,0,13.2867,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,8192,1024,15,0,6.5342,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,7168,8192,10,0,20.4083,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,8192,3584,19,0,12.1495,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 
-+8,6144,4096,10,0,12.0019,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,4096,4096,5,0,9.4678,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,28672,4096,19,0,36.8312,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+8,4096,14336,11,0,23.9931,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,2048,1536,10,0,5.145,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,1536,1536,10,0,4.8658,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,17920,1536,9,0,11.4315,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,1536,8960,15,0,21.8559,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,896,5120,10,0,9.573,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,5120,640,9,0,5.5426,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,6912,5120,10,0,14.2095,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,5120,3456,9,0,16.8699,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,1280,8192,10,0,13.4707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,8192,1024,15,0,6.4974,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,7168,8192,10,0,20.7759,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,8192,3584,19,0,12.2554,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+16,6144,4096,10,0,12.9387,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,4096,4096,11,0,9.2718,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+16,28672,4096,6,0,38.6136,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+16,4096,14336,19,0,22.4267,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+16,2048,1536,10,0,5.1418,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,1536,1536,10,0,4.8234,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,17920,1536,9,0,11.8362,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,1536,8960,15,0,21.7479,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+16,896,5120,10,0,9.035,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,5120,640,9,0,5.5438,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,6912,5120,10,0,15.3519,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,5120,3456,9,0,16.6795,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,1280,8192,10,0,11.9242,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,8192,1024,15,0,6.5694,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 
-+16,7168,8192,10,0,22.5299,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,8192,3584,19,0,12.8282,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+32,6144,4096,5,0,18.5483,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,4096,4096,12,0,12.3935,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 -+32,28672,4096,12,0,55.4328,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 -+32,4096,14336,12,0,32.86,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 -+32,2048,1536,11,0,5.499,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,1536,1536,11,0,5.229,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,17920,1536,5,0,18.2047,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,1536,8960,15,0,21.7471,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,896,5120,10,0,9.209,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+32,5120,640,23,0,6.597,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+32,6912,5120,11,0,22.3103,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,5120,3456,9,0,17.9079,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+32,7168,8192,5,0,32.7363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,8192,3584,26,0,18.9699,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2 -+64,6144,4096,11,0,26.2975,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,4096,4096,6,0,20.0827,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,28672,4096,7,0,96.4463,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,4096,14336,20,0,57.0521,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2 -+64,2048,1536,15,0,7.7242,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,1536,1536,15,0,6.3778,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,17920,1536,16,0,27.6071,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,1536,8960,15,0,22.0751,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,896,5120,11,0,9.9511,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,5120,640,23,0,9.0654,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+64,6912,5120,6,0,34.2984,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,5120,3456,23,0,22.0835,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+64,7168,8192,20,0,51.3056,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2 -+64,8192,3584,16,0,30.8131,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 
-+128,6144,4096,0,0,38.8164,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,4096,4096,65,0,25.4511,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,28672,4096,70,0,98.7439,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,4096,14336,65,0,74.145,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,2048,1536,16,0,9.8331,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+128,1536,1536,15,0,9.0299,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+128,17920,1536,0,0,32.1355,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,1536,8960,15,0,31.9775,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+128,896,5120,10,0,13.6819,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+128,5120,640,62,0,9.3994,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,6912,5120,63,0,47.5628,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,5120,3456,62,0,23.4631,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,7168,8192,0,0,70.1977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,8192,3584,63,0,36.488,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,6144,4096,68,0,722.0707,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,4096,4096,0,0,517.3686,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,28672,4096,0,0,3511.1005,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,4096,14336,0,0,1696.9417,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,2048,1536,0,0,116.6467,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,1536,1536,68,0,84.865,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,17920,1536,70,0,933.909,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,1536,8960,68,0,384.0268,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,896,5120,70,0,163.1018,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,5120,640,0,0,150.6321,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,6912,5120,68,0,997.924,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,5120,3456,70,0,555.2252,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,1280,8192,0,0,319.8745,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,8192,1024,0,0,318.7649,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4880,7168,8192,70,0,1722.2358,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4880,8192,3584,0,0,911.4248,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,6144,4096,68,0,722.9339,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,4096,4096,70,0,517.9014,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,28672,4096,0,0,3511.1681,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,4096,14336,0,0,1697.2028,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,2048,1536,70,0,116.5507,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,1536,1536,68,0,84.8826,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,17920,1536,70,0,932.8977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,1536,8960,68,0,383.9284,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,896,5120,70,0,163.1133,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,5120,640,0,0,150.5901,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,6912,5120,68,0,996.5008,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,5120,3456,70,0,554.89,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,1280,8192,0,0,319.7205,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,8192,1024,0,0,318.7485,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,7168,8192,0,0,1722.3637,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4888,8192,3584,0,0,911.1884,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,6144,4096,68,0,721.7139,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,4096,4096,70,0,517.1954,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,28672,4096,0,0,3511.2723,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,4096,14336,70,0,1696.9035,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,2048,1536,70,0,116.5331,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,1536,1536,68,0,84.901,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,17920,1536,70,0,933.8512,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,1536,8960,68,0,383.9004,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,896,5120,0,0,163.0481,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,5120,640,0,0,150.5773,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,6912,5120,68,0,996.9315,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,5120,3456,0,0,555.0771,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4896,1280,8192,0,0,319.7096,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,8192,1024,0,0,318.7777,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,7168,8192,0,0,1723.1288,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4896,8192,3584,70,0,910.7127,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,6144,4096,68,0,721.6983,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,4096,4096,0,0,517.1285,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,28672,4096,70,0,3512.2488,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,4096,14336,0,0,1696.1086,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,2048,1536,70,0,116.5239,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,1536,1536,68,0,84.861,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,17920,1536,70,0,933.8664,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,1536,8960,68,0,383.8607,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,896,5120,70,0,163.0625,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,5120,640,70,0,150.5189,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,6912,5120,68,0,997.2019,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,5120,3456,70,0,554.7055,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,1280,8192,0,0,319.7424,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,8192,1024,70,0,318.7508,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,7168,8192,70,0,1722.1435,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4904,8192,3584,70,0,910.8547,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,6144,4096,68,0,721.711,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,4096,4096,0,0,517.0985,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,28672,4096,0,0,3510.2643,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,4096,14336,70,0,1696.3793,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,2048,1536,70,0,116.5367,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,1536,1536,68,0,84.8902,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,17920,1536,70,0,933.6015,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,1536,8960,68,0,383.7731,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,896,5120,0,0,163.0273,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4912,5120,640,0,0,150.5685,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,6912,5120,68,0,995.963,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,5120,3456,0,0,554.4538,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,1280,8192,0,0,319.424,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,8192,1024,0,0,318.6576,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,7168,8192,70,0,1721.9694,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4912,8192,3584,70,0,909.9178,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,6144,4096,68,0,721.4914,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,4096,4096,0,0,517.2021,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,28672,4096,70,0,3509.482,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,4096,14336,70,0,1696.1636,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,2048,1536,0,0,116.5079,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,1536,1536,68,0,84.9018,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,17920,1536,70,0,933.0211,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,1536,8960,68,0,383.7803,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,896,5120,0,0,163.0841,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,5120,640,0,0,150.6109,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,6912,5120,68,0,996.1222,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,5120,3456,70,0,554.8414,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,1280,8192,0,0,319.6028,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,8192,1024,70,0,318.8648,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,7168,8192,0,0,1721.5073,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4920,8192,3584,70,0,910.2478,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,6144,4096,68,0,721.8849,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,4096,4096,70,0,517.1461,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,28672,4096,0,0,3510.1711,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,4096,14336,0,0,1696.8984,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,2048,1536,0,0,116.5655,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,1536,1536,68,0,84.8774,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4928,17920,1536,70,0,933.1971,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,1536,8960,68,0,384.0655,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,896,5120,44,0,154.9317,a8w8_bpreshuffle_256x224x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,5120,640,0,0,150.6645,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,6912,5120,68,0,997.6582,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,5120,3456,70,0,555.0206,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,1280,8192,0,0,319.5956,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,8192,1024,0,0,318.9808,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,7168,8192,40,0,1681.9775,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4928,8192,3584,70,0,910.9266,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,6144,4096,68,0,721.7033,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,4096,4096,0,0,517.302,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,28672,4096,70,0,3510.0414,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,4096,14336,0,0,1696.4572,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,2048,1536,0,0,116.5043,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,1536,1536,68,0,84.8654,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,17920,1536,70,0,933.4298,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,1536,8960,68,0,383.9551,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,896,5120,0,0,163.1197,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,5120,640,70,0,150.6,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,6912,5120,68,0,997.3789,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,5120,3456,0,0,555.3294,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,1280,8192,70,0,319.6532,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,8192,1024,0,0,318.7616,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,7168,8192,70,0,1720.596,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4936,8192,3584,0,0,910.6961,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,6144,4096,68,0,721.8669,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,4096,4096,70,0,517.1708,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,28672,4096,0,0,3508.9225,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4944,4096,14336,0,0,1696.1199,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,2048,1536,0,0,116.5291,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,1536,1536,68,0,84.8618,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,17920,1536,0,0,932.771,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,1536,8960,68,0,383.9035,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,896,5120,0,0,163.1437,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,5120,640,70,0,150.6652,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,6912,5120,68,0,996.5673,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,5120,3456,70,0,554.9566,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,1280,8192,0,0,319.658,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,8192,1024,70,0,318.7504,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,7168,8192,70,0,1721.2988,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4944,8192,3584,0,0,911.2273,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,6144,4096,68,0,720.9129,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,4096,4096,70,0,516.75,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,28672,4096,0,0,3508.1532,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,4096,14336,0,0,1695.5023,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,2048,1536,70,0,116.5687,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,1536,1536,68,0,84.9322,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,17920,1536,70,0,933.0606,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,1536,8960,68,0,383.7523,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,896,5120,0,0,162.9593,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,5120,640,0,0,150.526,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,6912,5120,68,0,996.4381,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,5120,3456,70,0,554.5042,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,1280,8192,0,0,319.4612,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,8192,1024,0,0,318.8244,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,7168,8192,70,0,1722.84,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4952,8192,3584,0,0,910.5245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4960,6144,4096,68,0,721.2313,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,4096,4096,70,0,516.8868,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,28672,4096,0,0,3508.4568,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,4096,14336,0,0,1695.4158,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,2048,1536,0,0,116.5063,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,1536,1536,68,0,84.8362,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,17920,1536,0,0,932.7946,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,1536,8960,68,0,383.8078,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,896,5120,0,0,163.0629,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,5120,640,70,0,150.49,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,6912,5120,68,0,996.6372,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,5120,3456,0,0,554.8862,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,1280,8192,54,0,303.5491,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,8192,1024,70,0,318.6011,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,7168,8192,54,0,1655.1632,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4960,8192,3584,0,0,911.2157,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,6144,4096,68,0,721.0853,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,4096,4096,0,0,516.3888,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,28672,4096,0,0,3509.8311,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,4096,14336,0,0,1695.8882,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,2048,1536,0,0,116.5419,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,1536,1536,68,0,84.8662,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,17920,1536,0,0,933.6622,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,1536,8960,68,0,384.153,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,896,5120,0,0,163.0365,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,5120,640,0,0,150.6068,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,6912,5120,68,0,997.4728,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,5120,3456,0,0,555.5226,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,1280,8192,0,0,319.8772,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4968,8192,1024,0,0,319.0248,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,7168,8192,70,0,1722.1783,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4968,8192,3584,70,0,911.2245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,6144,4096,68,0,721.9552,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,4096,4096,70,0,516.9584,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,28672,4096,0,0,3508.7338,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,4096,14336,0,0,1695.7474,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,2048,1536,0,0,116.5875,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,1536,1536,68,0,84.8574,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,17920,1536,70,0,932.7865,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,1536,8960,68,0,383.911,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,896,5120,0,0,163.0225,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,5120,640,70,0,150.7224,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,6912,5120,68,0,996.0924,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,5120,3456,0,0,553.9673,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,1280,8192,0,0,319.4572,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,8192,1024,70,0,318.7235,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,7168,8192,70,0,1720.2767,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4976,8192,3584,0,0,910.5728,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,6144,4096,68,0,721.8145,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,4096,4096,70,0,517.3384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,28672,4096,0,0,3509.5018,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,4096,14336,70,0,1696.1993,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,2048,1536,0,0,116.5027,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,1536,1536,68,0,84.8414,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,17920,1536,0,0,933.6853,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,1536,8960,68,0,383.8522,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,896,5120,0,0,163.0629,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,5120,640,0,0,150.5328,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+4984,6912,5120,68,0,996.3624,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,5120,3456,0,0,554.7977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,1280,8192,0,0,319.7468,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,8192,1024,0,0,318.6704,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,7168,8192,0,0,1720.7806,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4984,8192,3584,70,0,910.8736,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,6144,4096,68,0,677.7474,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,4096,4096,70,0,474.925,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,28672,4096,0,0,3219.969,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,4096,14336,70,0,1553.3464,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,2048,1536,70,0,109.0531,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,1536,1536,68,0,80.9629,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,17920,1536,70,0,857.1658,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,1536,8960,68,0,362.5549,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,896,5120,0,0,151.234,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,5120,640,0,0,144.7384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,6912,5120,68,0,937.6257,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,5120,3456,0,0,507.3276,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,1280,8192,0,0,292.0606,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,8192,1024,70,0,300.8963,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,7168,8192,70,0,1578.4757,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+4992,8192,3584,0,0,836.7985,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,6144,4096,68,0,722.564,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,4096,4096,0,0,518.28,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,28672,4096,0,0,3573.2853,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,4096,14336,70,0,1698.7618,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,2048,1536,0,0,116.5347,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,1536,1536,68,0,84.7494,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,17920,1536,70,0,943.2838,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5000,1536,8960,68,0,384.089,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,896,5120,0,0,163.0909,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,5120,640,0,0,151.0808,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,6912,5120,68,0,998.246,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,5120,3456,0,0,556.451,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,1280,8192,0,0,319.9695,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,8192,1024,70,0,320.2159,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,7168,8192,0,0,1723.8551,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5000,8192,3584,70,0,912.0216,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,6144,4096,68,0,722.1309,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,4096,4096,0,0,517.5104,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,28672,4096,70,0,3572.3381,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,4096,14336,70,0,1697.3202,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,2048,1536,0,0,116.5083,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,1536,1536,68,0,84.8038,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,17920,1536,70,0,943.9934,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,1536,8960,68,0,384.419,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,896,5120,0,0,163.0989,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,5120,640,0,0,151.128,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,6912,5120,68,0,998.1032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,5120,3456,0,0,556.1637,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,1280,8192,0,0,319.8811,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,8192,1024,0,0,320.406,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,7168,8192,0,0,1723.8707,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5008,8192,3584,70,0,912.4452,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,6144,4096,68,0,722.6928,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,4096,4096,70,0,518.2384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,28672,4096,70,0,3573.7749,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,4096,14336,70,0,1696.9813,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5016,2048,1536,0,0,116.4795,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,1536,1536,68,0,84.8162,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,17920,1536,0,0,943.6022,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,1536,8960,68,0,384.4446,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,896,5120,0,0,163.0933,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,5120,640,0,0,151.1972,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,6912,5120,68,0,999.2964,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,5120,3456,70,0,556.8141,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,1280,8192,70,0,320.0599,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,8192,1024,70,0,320.598,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,7168,8192,70,0,1725.0263,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5016,8192,3584,70,0,911.7604,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,6144,4096,68,0,723.4404,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,4096,4096,0,0,518.52,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,28672,4096,0,0,3574.9692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,4096,14336,70,0,1698.7973,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,2048,1536,0,0,116.5783,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,1536,1536,68,0,84.8306,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,17920,1536,70,0,943.7666,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,1536,8960,68,0,384.437,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,896,5120,0,0,163.1245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,5120,640,0,0,151.164,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,6912,5120,68,0,998.884,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,5120,3456,0,0,556.5441,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,1280,8192,0,0,319.7835,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,8192,1024,70,0,320.4855,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,7168,8192,70,0,1722.6626,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5024,8192,3584,0,0,910.8936,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,6144,4096,68,0,722.66,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5032,4096,4096,0,0,518.1148,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,28672,4096,70,0,3571.9384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,4096,14336,70,0,1697.5977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,2048,1536,0,0,116.5019,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,1536,1536,68,0,84.8218,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,17920,1536,70,0,944.1358,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,1536,8960,68,0,384.6574,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,896,5120,0,0,163.1725,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,5120,640,0,0,151.2748,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,6912,5120,68,0,997.3216,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,5120,3456,70,0,556.5181,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,1280,8192,70,0,319.8112,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,8192,1024,70,0,320.2775,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,7168,8192,0,0,1725.0374,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5032,8192,3584,0,0,912.9028,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,6144,4096,68,0,722.9024,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,4096,4096,70,0,518.1968,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,28672,4096,0,0,3576.404,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,4096,14336,70,0,1698.0521,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,2048,1536,0,0,116.4119,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,1536,1536,68,0,84.8034,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,17920,1536,70,0,943.6378,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,1536,8960,68,0,384.4282,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,896,5120,0,0,163.0225,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,5120,640,0,0,151.082,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,6912,5120,68,0,999.2588,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,5120,3456,0,0,556.7777,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,1280,8192,70,0,319.9255,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,8192,1024,0,0,320.3251,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5040,7168,8192,0,0,1725.0698,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5040,8192,3584,0,0,912.532,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,6144,4096,68,0,722.6176,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,4096,4096,70,0,518.2176,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,28672,4096,70,0,3574.9924,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,4096,14336,70,0,1699.3301,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,2048,1536,0,0,116.5911,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,1536,1536,68,0,84.8314,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,17920,1536,70,0,944.667,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,1536,8960,68,0,384.3346,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,896,5120,0,0,163.1737,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,5120,640,0,0,151.1584,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,6912,5120,68,0,998.9228,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,5120,3456,0,0,556.8089,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,1280,8192,0,0,320.3219,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,8192,1024,70,0,320.6136,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,7168,8192,0,0,1725.7622,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5048,8192,3584,70,0,912.3652,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,6144,4096,68,0,723.0748,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,4096,4096,70,0,518.3472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,28672,4096,70,0,3573.1804,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,4096,14336,0,0,1698.8861,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,2048,1536,70,0,116.6247,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,1536,1536,68,0,84.8318,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,17920,1536,0,0,944.0254,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,1536,8960,68,0,385.0938,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,896,5120,0,0,163.2085,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,5120,640,0,0,151.1404,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,6912,5120,68,0,998.5256,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5056,5120,3456,70,0,556.5893,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,1280,8192,70,0,320.0095,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,8192,1024,70,0,320.5692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,7168,8192,70,0,1725.517,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5056,8192,3584,70,0,911.968,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,6144,4096,68,0,723.0832,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,4096,4096,0,0,518.1656,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,28672,4096,0,0,3575.312,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,4096,14336,0,0,1699.5549,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,2048,1536,0,0,116.5611,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,1536,1536,68,0,84.9002,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,17920,1536,70,0,944.5414,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,1536,8960,68,0,384.4386,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,896,5120,0,0,163.1697,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,5120,640,70,0,151.2672,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,6912,5120,68,0,997.8556,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,5120,3456,0,0,556.4153,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,1280,8192,70,0,319.888,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,8192,1024,0,0,320.4688,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,7168,8192,70,0,1724.7874,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5064,8192,3584,0,0,912.4144,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,6144,4096,68,0,723.1368,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,4096,4096,0,0,518.5264,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,28672,4096,70,0,3577.46,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,4096,14336,0,0,1698.8585,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,2048,1536,70,0,116.5131,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,1536,1536,68,0,84.8522,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,17920,1536,70,0,943.997,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,1536,8960,68,0,384.143,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5072,896,5120,0,0,163.1409,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,5120,640,0,0,151.0372,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,6912,5120,68,0,998.9192,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,5120,3456,70,0,556.8149,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,1280,8192,70,0,320.1304,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,8192,1024,70,0,320.4639,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,7168,8192,0,0,1724.2474,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5072,8192,3584,0,0,912.6188,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,6144,4096,68,0,723.1992,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,4096,4096,70,0,518.4672,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,28672,4096,70,0,3574.8151,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,4096,14336,70,0,1698.4557,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,2048,1536,0,0,116.4291,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,1536,1536,68,0,84.8058,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,17920,1536,0,0,943.3509,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,1536,8960,68,0,383.8942,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,896,5120,0,0,163.0429,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,5120,640,0,0,151.1744,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,6912,5120,68,0,998.1023,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,5120,3456,70,0,555.4617,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,1280,8192,0,0,319.5047,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,8192,1024,70,0,320.4791,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,7168,8192,0,0,1724.4214,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5080,8192,3584,0,0,912.0396,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,6144,4096,68,0,722.7032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,4096,4096,0,0,517.9599,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,28672,4096,0,0,3573.3426,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,4096,14336,70,0,1698.6472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,2048,1536,70,0,116.5243,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5088,1536,1536,68,0,84.8814,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,17920,1536,70,0,943.9581,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,1536,8960,68,0,384.1678,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,896,5120,0,0,163.0797,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,5120,640,0,0,151.1852,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,6912,5120,68,0,999.3004,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,5120,3456,70,0,556.6433,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,1280,8192,70,0,319.8523,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,8192,1024,70,0,320.5275,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,7168,8192,0,0,1724.6449,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5088,8192,3584,70,0,912.5908,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,6144,4096,68,0,723.2372,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,4096,4096,70,0,518.5907,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,28672,4096,0,0,3577.995,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,4096,14336,0,0,1699.39,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,2048,1536,0,0,116.5787,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,1536,1536,68,0,84.8981,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,17920,1536,70,0,944.2021,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,1536,8960,68,0,384.7514,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,896,5120,0,0,163.2357,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,5120,640,0,0,151.256,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,6912,5120,68,0,998.7911,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,5120,3456,0,0,556.5617,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,1280,8192,0,0,320.0247,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,8192,1024,70,0,320.7655,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,7168,8192,70,0,1725.1549,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5096,8192,3584,0,0,912.3992,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,6144,4096,68,0,723.676,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,4096,4096,70,0,518.3535,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5104,28672,4096,70,0,3575.5158,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,4096,14336,70,0,1698.0484,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,2048,1536,0,0,116.4571,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,1536,1536,68,0,84.865,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,17920,1536,70,0,944.0377,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,1536,8960,68,0,384.119,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,896,5120,70,0,163.0397,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,5120,640,0,0,150.976,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,6912,5120,68,0,998.4327,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,5120,3456,70,0,556.2829,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,1280,8192,0,0,319.8375,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,8192,1024,0,0,320.4843,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,7168,8192,0,0,1725.7897,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5104,8192,3584,70,0,911.8712,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,6144,4096,68,0,723.024,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,4096,4096,70,0,517.9339,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,28672,4096,0,0,3572.6805,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,4096,14336,0,0,1698.4928,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,2048,1536,70,0,116.5211,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,1536,1536,68,0,84.8258,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,17920,1536,70,0,943.9213,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,1536,8960,68,0,384.297,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,896,5120,0,0,163.1469,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,5120,640,0,0,151.182,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,6912,5120,68,0,998.5615,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,5120,3456,0,0,556.7313,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,1280,8192,70,0,319.8655,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,8192,1024,0,0,320.3695,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5112,7168,8192,0,0,1724.1557,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
-+5112,8192,3584,70,0,911.9167,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,6144,4096,68,0,679.4918,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,4096,4096,0,0,476.0774,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,28672,4096,0,0,3278.3917,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,4096,14336,70,0,1556.5966,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,2048,1536,70,0,108.853,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,1536,1536,68,0,81.0149,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,17920,1536,0,0,868.8098,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,1536,8960,68,0,362.9605,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,896,5120,70,0,151.1084,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,5120,640,0,0,145.5056,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,6912,5120,68,0,939.8357,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,5120,3456,70,0,508.5807,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,1280,8192,70,0,292.2446,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,8192,1024,70,0,302.3202,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,7168,8192,70,0,1580.9331,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+5120,8192,3584,0,0,838.4504,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+1,4096,8192,11,0,15.6167,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,24576,4096,11,0,29.5131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,4096,12288,11,0,20.7263,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,1280,5120,10,0,9.3202,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,5120,1024,15,0,5.1926,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+1,6400,5120,10,0,13.6715,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+1,5120,3200,23,0,15.5815,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+2,4096,8192,11,0,15.6803,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+2,24576,4096,11,0,30.2048,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+2,4096,12288,11,0,20.7555,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+2,1280,5120,10,0,9.473,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,5120,1024,15,0,5.2698,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 
-+2,6400,5120,10,0,13.7399,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+2,5120,3200,23,0,15.6175,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+4,4096,8192,5,0,15.7483,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,24576,4096,9,0,30.6624,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,4096,12288,5,0,20.7771,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,1280,5120,10,0,9.5426,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,5120,1024,15,0,5.4922,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+4,6400,5120,10,0,13.8239,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+4,5120,3200,23,0,15.6695,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+8,4096,8192,5,0,15.8119,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,24576,4096,11,0,31.22,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,4096,12288,11,0,20.9183,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,1280,5120,10,0,9.6383,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,5120,1024,15,0,5.2118,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+8,6400,5120,10,0,14.0415,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+8,5120,3200,9,0,15.7527,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,4096,8192,10,0,15.0923,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,24576,4096,5,0,33.536,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+16,4096,12288,19,0,19.7555,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+16,1280,5120,10,0,9.1006,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+16,5120,1024,15,0,5.2654,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+16,6400,5120,6,0,15.0351,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+16,5120,3200,9,0,15.5687,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+32,4096,8192,12,0,21.1827,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 -+32,24576,4096,12,0,47.0804,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 -+32,4096,12288,12,0,28.6523,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 -+32,1280,5120,10,0,9.1151,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 -+32,5120,1024,15,0,6.7839,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,6400,5120,11,0,22.0163,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+32,5120,3200,9,0,16.7551,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 
-+64,4096,8192,20,0,36.222,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2 -+64,24576,4096,7,0,81.9382,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,4096,12288,6,0,49.6453,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+64,1280,5120,11,0,10.043,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,5120,1024,29,0,9.2079,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2 -+64,6400,5120,11,0,32.0656,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 -+64,5120,3200,23,0,20.7411,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 -+128,4096,8192,65,0,44.9496,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,24576,4096,68,0,92.1811,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,4096,12288,65,0,64.1853,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,1280,5120,6,0,14.3507,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 -+128,5120,1024,65,0,11.1843,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,6400,5120,63,0,47.6153,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 -+128,5120,3200,62,0,22.1239,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +@@ -1,221 +1,1061 @@ +-cu_num,M,N,K,kernelId,splitK,us,kernelName +-80,1,512,7168,5,0,12.4802,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,1,1280,8192,11,0,14.3155,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,1,2112,7168,11,0,13.4147,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,1,4096,512,9,0,4.2138,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,1,4608,4096,11,0,10.5838,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,1,4608,7168,5,0,13.8515,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,1,7168,256,75,0,6.4606,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,1,7168,2304,29,0,10.1762,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,1,8192,1024,15,0,6.4482,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,1,9216,4096,19,0,14.4331,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,2,4608,4096,5,0,10.661,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,2,9216,4096,5,0,14.4331,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,4,4608,4096,5,0,10.4863,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,4,9216,4096,11,0,14.6151,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,8,4608,4096,5,0,10.8626,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 
+-80,8,9216,4096,5,0,15.0571,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,16,512,7168,24,0,10.7923,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-80,16,576,7168,10,0,11.091,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,16,1536,7168,10,0,11.2927,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,16,2112,7168,24,0,11.7534,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-80,16,3072,1536,11,0,5.8974,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,16,4096,512,9,0,4.939,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,16,4608,4096,24,0,10.8403,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-80,16,4608,7168,5,0,14.1863,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,16,7168,256,75,0,6.629,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,16,7168,2048,10,0,8.769,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,16,7168,2304,15,0,10.491,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,16,9216,4096,11,0,15.5867,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,32,512,7168,10,0,11.1022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,32,576,7168,10,0,11.199,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,32,1280,8192,24,0,12.4575,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-80,32,1536,7168,19,0,12.6967,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,32,2112,7168,5,0,13.1607,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,32,3072,1536,112,0,6.9922,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,32,4096,512,9,0,4.8822,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,32,4608,4096,6,0,13.4935,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,32,4608,7168,12,0,19.4019,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1 +-80,32,7168,256,75,0,6.8302,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,32,7168,2048,119,0,11.0711,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,32,7168,2304,119,0,11.6614,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,32,8192,1024,119,0,7.4863,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,32,9216,4096,119,0,18.4783,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,48,512,7168,10,0,11.2279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,48,2112,7168,10,0,18.4707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 
+-80,48,4096,512,9,0,5.3534,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,48,4608,7168,113,0,22.5027,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,48,7168,256,75,0,7.0126,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,48,7168,2304,113,0,13.6727,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,64,512,7168,10,0,11.1263,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,64,576,7168,24,0,11.163,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-80,64,1280,8192,19,0,13.7807,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,64,1536,7168,19,0,16.9363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,64,2112,7168,112,0,18.9179,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,64,3072,1536,114,0,9.0146,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,64,4096,512,77,0,5.8414,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,64,4608,4096,114,0,16.3715,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,64,4608,7168,114,0,24.7999,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,64,7168,256,75,0,7.6886,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,64,7168,2048,112,0,14.2819,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,64,7168,2304,112,0,15.5151,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,64,8192,1024,114,0,10.9842,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,64,9216,4096,114,0,25.0703,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,80,512,7168,24,0,11.2303,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-80,80,2112,7168,113,0,24.3931,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,80,4096,512,76,0,6.8102,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,80,4608,7168,115,0,28.9995,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,80,7168,256,75,0,8.2998,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,80,7168,2304,113,0,18.9055,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,96,512,7168,10,0,11.6635,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,96,2112,7168,113,0,22.3255,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,96,4096,512,76,0,6.6322,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,96,4608,7168,120,0,29.4707,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,96,7168,256,75,0,8.6194,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 
+-80,96,7168,2304,84,0,19.1495,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,112,512,7168,10,0,11.9903,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1 +-80,112,2112,7168,112,0,25.9619,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,112,4096,512,76,0,7.3154,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,112,4608,7168,117,0,37.2568,a8w8_bpreshuffle_256x112x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,112,7168,256,75,0,9.207,a8w8_bpreshuffle_128x16x256x64_16x16_16x16_4x16x1_4x32x1_1x16x1x8_8x8x1_1x2_intrawave_v1 +-80,112,7168,2304,119,0,21.7835,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,128,512,7168,19,0,12.3367,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,128,576,7168,25,0,12.3755,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2 +-80,128,1280,8192,6,0,20.4871,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,128,1536,7168,112,0,22.8955,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,128,2112,7168,114,0,24.5755,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,128,3072,1536,112,0,11.1838,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,128,4096,512,76,0,7.2018,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,128,4608,4096,121,0,24.6387,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3 +-80,128,4608,7168,114,0,38.7224,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,128,7168,256,73,0,10.1406,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,128,7168,2048,119,0,20.5111,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,128,7168,2304,84,0,21.9275,a8w8_bpreshuffle_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,128,8192,1024,121,0,14.9703,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3 +-80,128,9216,4096,121,0,39.1328,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3 +-80,192,1280,8192,113,0,24.3731,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1 +-80,192,8192,1024,85,0,18.7963,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,256,512,7168,6,0,17.5859,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,256,576,7168,112,0,18.3779,a8w8_bpreshuffle_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,256,1280,8192,114,0,26.7743,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,256,1536,7168,120,0,31.1851,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,256,2112,7168,114,0,37.3908,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
+-80,256,3072,1536,120,0,16.0983,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,256,4096,512,76,0,10.9814,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,256,4608,4096,85,0,37.982,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,256,4608,7168,121,0,62.2609,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3 +-80,256,7168,256,73,0,11.5078,a8w8_bpreshuffle_256x32x256x64_16x16_16x16_4x32x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v1 +-80,256,7168,2048,85,0,30.5235,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,256,7168,2304,85,0,32.0436,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,256,8192,1024,70,0,23.9267,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,256,9216,4096,70,0,64.6741,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,320,1280,8192,115,0,32.2959,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,320,8192,1024,85,0,24.7583,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,512,7168,119,0,23.6639,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,512,576,7168,114,0,23.9199,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,1280,8192,114,0,42.3216,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,1536,7168,128,0,47.65,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,2112,7168,129,0,60.4509,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,512,3072,1536,85,0,24.3751,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,4096,512,85,0,15.7231,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,4608,7168,70,0,105.5819,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,7168,256,72,0,16.1951,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,512,7168,2048,71,0,53.438,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,512,7168,2304,72,0,57.4769,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,512,8192,1024,85,0,37.882,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,512,7168,114,0,36.6616,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,576,7168,114,0,37.1872,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,1280,8192,85,0,68.1209,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,1536,7168,136,0,74.0965,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3 +-80,1024,2112,7168,93,0,115.8099,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
+-80,1024,3072,1536,85,0,38.8336,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,4096,512,85,0,24.9039,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,4608,4096,71,0,120.44,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,4608,7168,93,0,189.1551,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,7168,256,71,0,27.7779,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,7168,2048,71,0,98.0567,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,7168,2304,85,0,107.3839,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,8192,1024,71,0,67.0889,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1024,9216,4096,93,0,219.0276,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,512,7168,120,0,48.0728,a8w8_bpreshuffle_256x48x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,1536,576,7168,128,0,47.2304,a8w8_bpreshuffle_256x64x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,1536,7168,0,0,105.4423,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,2112,7168,68,0,148.9129,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,3072,1536,93,0,51.4668,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,4096,512,85,0,33.2015,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,4608,7168,85,0,279.9019,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,7168,256,71,0,37.3284,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,7168,2048,85,0,138.0452,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,1536,7168,2304,85,0,150.2725,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,512,7168,85,0,58.1113,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,576,7168,129,0,60.7837,a8w8_bpreshuffle_256x80x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,2048,1280,8192,0,0,119.2848,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,1536,7168,85,0,138.8692,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,2112,7168,93,0,186.5271,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,3072,1536,72,0,69.8541,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,2048,4096,512,71,0,43.6672,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,4608,4096,68,0,218.9032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,4608,7168,93,0,366.0791,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
+-80,2048,7168,256,71,0,48.0096,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,7168,2048,85,0,186.7051,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,7168,2304,85,0,199.0359,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,8192,1024,71,0,120.4715,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,2048,9216,4096,68,0,425.2314,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,512,7168,85,0,101.7067,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,576,7168,93,0,113.3711,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,1280,8192,71,0,234.5689,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,1536,7168,85,0,256.917,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,2112,7168,93,0,329.2957,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,3072,1536,93,0,123.6652,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,4096,512,71,0,75.7421,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,4608,4096,85,0,420.6946,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,4608,7168,85,0,715.7879,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,7168,256,71,0,85.7258,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,7168,2048,71,0,345.3094,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,7168,2304,71,0,379.4759,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,8192,1024,71,0,227.2969,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4096,9216,4096,71,0,812.5832,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,4240,9216,4096,93,0,903.6056,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,512,7168,85,0,195.7643,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,576,7168,93,0,191.4823,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,1280,8192,71,0,452.4299,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,1536,7168,72,0,501.1581,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1 +-80,8192,2112,7168,68,0,649.3748,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,3072,1536,71,0,235.4545,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,4096,512,71,0,140.1016,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,4608,7168,93,0,1405.0902,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
+-80,8192,7168,256,71,0,158.8437,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,7168,2048,71,0,665.8308,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,7168,2304,71,0,736.4592,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,8192,8192,1024,71,0,434.801,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,512,7168,85,0,354.1062,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,576,7168,68,0,372.1747,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,1280,8192,71,0,902.6619,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,1536,7168,68,0,940.0969,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,2112,7168,93,0,1284.8613,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,3072,1536,71,0,450.9699,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,4096,512,71,0,268.2002,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,4608,4096,93,0,1613.6388,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,4608,7168,93,0,2770.8265,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,7168,256,71,0,306.2344,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,7168,2048,71,0,1322.9659,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,7168,2304,71,0,1464.8949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,8192,1024,71,0,849.5169,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,16384,9216,4096,71,0,3218.0651,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,512,7168,70,0,411.9813,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,576,7168,68,0,439.759,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,1536,7168,93,0,1157.9543,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,3072,1536,71,0,557.954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,4096,512,71,0,331.5621,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,4608,7168,68,0,3421.2891,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,7168,256,71,0,381.5807,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,7168,2048,71,0,1639.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,20480,7168,2304,71,0,1805.4245,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,512,7168,70,0,664.7692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 
+-80,32768,2112,7168,93,0,2566.2639,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,4096,512,71,0,523.3298,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,4608,4096,93,0,3207.2182,a8w8_bpreshuffle_256x64x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,4608,7168,68,0,5527.6346,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,7168,256,71,0,600.9077,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,7168,2304,71,0,2907.7206,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3 +-80,32768,9216,4096,68,0,6437.3643,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3 ++cu_num,M,N,K,kernelId,splitK,us,kernelName,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11 ++80,1,9216,4096,5,0,14.0435,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,9216,4096,11,0,14.0887,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,9216,4096,5,0,14.2467,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,9216,4096,5,0,14.4655,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,9216,4096,5,0,15.4091,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,9216,4096,12,0,21.6727,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,9216,4096,7,0,36.0128,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,9216,4096,63,0,40.9552,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,9216,4096,70,0,65.0149,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,9216,4096,68,0,220.9164,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,9216,4096,68,0,428.7493,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,9216,4096,68,0,845.13,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4240,9216,4096,68,0,946.1484,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,9216,4096,68,0,3239.5299,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,32768,9216,4096,68,0,6471.5154,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,4608,4096,11,0,10.4271,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,4608,4096,11,0,10.2867,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,4608,4096,11,0,10.2539,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,4608,4096,5,0,10.6603,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,4608,4096,10,0,10.1426,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, 
++80,32,4608,4096,12,0,12.9946,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,4608,4096,6,0,20.8103,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,4608,4096,65,0,25.9515,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,4608,4096,70,0,39.2768,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,4608,4096,70,0,124.6319,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,4608,4096,68,0,220.3192,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,4608,4096,68,0,428.3401,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,4608,4096,68,0,1645.6259,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,32768,4608,4096,68,0,3239.7462,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,1280,8192,10,0,13.203,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,1280,8192,10,0,12.4022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,64,1280,8192,11,0,14.0131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,128,1280,8192,6,0,20.2891,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,192,1280,8192,11,0,29.5243,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,256,1280,8192,7,0,34.5595,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,320,1280,8192,11,0,45.224,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,512,1280,8192,65,0,45.9428,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,1280,8192,0,0,69.4885,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,1280,8192,70,0,119.7763,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,1280,8192,70,0,233.5804,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,1280,8192,0,0,461.6483,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,1280,8192,0,0,916.2015,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,8192,1024,15,0,6.0138,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,8192,1024,16,0,9.159,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,8192,1024,16,0,12.0359,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,8192,1024,62,0,16.6079,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,192,8192,1024,51,0,23.3947,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,256,8192,1024,0,0,24.1947,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,320,8192,1024,54,0,32.8684,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,512,8192,1024,0,0,43.926,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,8192,1024,70,0,73.5021,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,8192,1024,70,0,131.5924,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,8192,1024,70,0,252.1069,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,8192,1024,0,0,487.8416,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,8192,1024,70,0,951.936,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,6144,4096,10,0,11.5922,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,4096,4096,11,0,9.1638,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,28672,4096,11,0,34.6164,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,4096,14336,5,0,23.7291,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,2048,1536,10,0,5.0934,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,1536,1536,10,0,4.737,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,17920,1536,16,0,10.6127,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1,1536,8960,15,0,21.5419,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,896,5120,10,0,9.8038,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,5120,640,9,0,5.493,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,6912,5120,10,0,13.7867,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,5120,3456,23,0,16.7547,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,1,7168,8192,10,0,20.0815,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,8192,3584,19,0,11.9131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,2,6144,4096,10,0,11.6979,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,4096,4096,5,0,9.3886,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,28672,4096,19,0,34.956,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,2,4096,14336,11,0,23.7463,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,2048,1536,10,0,5.0882,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,1536,1536,10,0,4.787,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, 
++80,2,17920,1536,16,0,10.7966,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,2,1536,8960,15,0,21.7351,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,896,5120,10,0,9.2858,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,640,9,0,5.4654,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,6912,5120,10,0,13.8355,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,3456,23,0,16.7947,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,2,1280,8192,10,0,13.1587,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,8192,1024,15,0,6.347,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,7168,8192,10,0,20.3891,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,8192,3584,19,0,11.9619,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,4,6144,4096,10,0,11.7646,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,4096,4096,11,0,9.2154,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,28672,4096,19,0,36.0688,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,4,4096,14336,5,0,23.8147,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,2048,1536,10,0,5.2298,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,1536,1536,10,0,4.8234,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,17920,1536,16,0,11.0062,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,4,1536,8960,15,0,21.7499,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,896,5120,10,0,9.381,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,640,9,0,5.4974,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,6912,5120,10,0,13.8835,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,3456,9,0,16.8155,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,1280,8192,10,0,13.2867,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,8192,1024,15,0,6.5342,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,7168,8192,10,0,20.4083,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,8192,3584,19,0,12.1495,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,8,6144,4096,10,0,12.0019,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,4096,4096,5,0,9.4678,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,8,28672,4096,19,0,36.8312,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,8,4096,14336,11,0,23.9931,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,2048,1536,10,0,5.145,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,1536,1536,10,0,4.8658,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,17920,1536,9,0,11.4315,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,1536,8960,15,0,21.8559,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,896,5120,10,0,9.573,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,640,9,0,5.5426,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,6912,5120,10,0,14.2095,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,3456,9,0,16.8699,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,1280,8192,10,0,13.4707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,8192,1024,15,0,6.4974,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,7168,8192,10,0,20.7759,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,8192,3584,19,0,12.2554,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,16,6144,4096,10,0,12.9387,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,4096,4096,11,0,9.2718,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,28672,4096,6,0,38.6136,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,16,4096,14336,19,0,22.4267,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,16,2048,1536,10,0,5.1418,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,1536,1536,10,0,4.8234,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,17920,1536,9,0,11.8362,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,1536,8960,15,0,21.7479,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,896,5120,10,0,9.035,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,640,9,0,5.5438,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,6912,5120,10,0,15.3519,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,3456,9,0,16.6795,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,1280,8192,10,0,11.9242,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,8192,1024,15,0,6.5694,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,16,7168,8192,10,0,22.5299,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,8192,3584,19,0,12.8282,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,32,6144,4096,5,0,18.5483,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,4096,4096,12,0,12.3935,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,28672,4096,12,0,55.4328,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,4096,14336,12,0,32.86,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,2048,1536,11,0,5.499,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,1536,1536,11,0,5.229,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,17920,1536,5,0,18.2047,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,1536,8960,15,0,21.7471,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,896,5120,10,0,9.209,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,5120,640,23,0,6.597,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,32,6912,5120,11,0,22.3103,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,5120,3456,9,0,17.9079,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,7168,8192,5,0,32.7363,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,8192,3584,26,0,18.9699,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2,,,,, ++80,64,6144,4096,11,0,26.2975,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,4096,4096,6,0,20.0827,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,28672,4096,7,0,96.4463,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,4096,14336,20,0,57.0521,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,64,2048,1536,15,0,7.7242,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,1536,1536,15,0,6.3778,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,17920,1536,16,0,27.6071,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,1536,8960,15,0,22.0751,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,896,5120,11,0,9.9511,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,5120,640,23,0,9.0654,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,64,6912,5120,6,0,34.2984,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,3456,23,0,22.0835,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, 
++80,64,7168,8192,20,0,51.3056,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,64,8192,3584,16,0,30.8131,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,6144,4096,0,0,38.8164,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,4096,4096,65,0,25.4511,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,28672,4096,70,0,98.7439,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,4096,14336,65,0,74.145,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,2048,1536,16,0,9.8331,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,1536,1536,15,0,9.0299,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,128,17920,1536,0,0,32.1355,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,1536,8960,15,0,31.9775,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,128,896,5120,10,0,13.6819,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,128,5120,640,62,0,9.3994,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,6912,5120,63,0,47.5628,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,5120,3456,62,0,23.4631,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,7168,8192,0,0,70.1977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,8192,3584,63,0,36.488,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,6144,4096,68,0,722.0707,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,4096,4096,0,0,517.3686,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,28672,4096,0,0,3511.1005,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,4096,14336,0,0,1696.9417,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,2048,1536,0,0,116.6467,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,1536,1536,68,0,84.865,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,17920,1536,70,0,933.909,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,1536,8960,68,0,384.0268,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,896,5120,70,0,163.1018,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,5120,640,0,0,150.6321,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,6912,5120,68,0,997.924,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4880,5120,3456,70,0,555.2252,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,1280,8192,0,0,319.8745,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,8192,1024,0,0,318.7649,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,7168,8192,70,0,1722.2358,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4880,8192,3584,0,0,911.4248,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,6144,4096,68,0,722.9339,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,4096,4096,70,0,517.9014,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,28672,4096,0,0,3511.1681,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,4096,14336,0,0,1697.2028,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,2048,1536,70,0,116.5507,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,1536,1536,68,0,84.8826,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,17920,1536,70,0,932.8977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,1536,8960,68,0,383.9284,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,896,5120,70,0,163.1133,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,5120,640,0,0,150.5901,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,6912,5120,68,0,996.5008,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,5120,3456,70,0,554.89,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,1280,8192,0,0,319.7205,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,8192,1024,0,0,318.7485,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,7168,8192,0,0,1722.3637,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4888,8192,3584,0,0,911.1884,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,6144,4096,68,0,721.7139,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,4096,4096,70,0,517.1954,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,28672,4096,0,0,3511.2723,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,4096,14336,70,0,1696.9035,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,2048,1536,70,0,116.5331,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,1536,1536,68,0,84.901,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4896,17920,1536,70,0,933.8512,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,1536,8960,68,0,383.9004,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,896,5120,0,0,163.0481,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,5120,640,0,0,150.5773,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,6912,5120,68,0,996.9315,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,5120,3456,0,0,555.0771,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,1280,8192,0,0,319.7096,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,8192,1024,0,0,318.7777,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,7168,8192,0,0,1723.1288,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4896,8192,3584,70,0,910.7127,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,6144,4096,68,0,721.6983,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,4096,4096,0,0,517.1285,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,28672,4096,70,0,3512.2488,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,4096,14336,0,0,1696.1086,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,2048,1536,70,0,116.5239,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,1536,1536,68,0,84.861,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,17920,1536,70,0,933.8664,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,1536,8960,68,0,383.8607,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,896,5120,70,0,163.0625,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,5120,640,70,0,150.5189,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,6912,5120,68,0,997.2019,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,5120,3456,70,0,554.7055,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,1280,8192,0,0,319.7424,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,8192,1024,70,0,318.7508,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,7168,8192,70,0,1722.1435,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4904,8192,3584,70,0,910.8547,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,6144,4096,68,0,721.711,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4912,4096,4096,0,0,517.0985,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,28672,4096,0,0,3510.2643,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,4096,14336,70,0,1696.3793,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,2048,1536,70,0,116.5367,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,1536,1536,68,0,84.8902,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,17920,1536,70,0,933.6015,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,1536,8960,68,0,383.7731,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,896,5120,0,0,163.0273,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,5120,640,0,0,150.5685,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,6912,5120,68,0,995.963,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,5120,3456,0,0,554.4538,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,1280,8192,0,0,319.424,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,8192,1024,0,0,318.6576,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,7168,8192,70,0,1721.9694,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4912,8192,3584,70,0,909.9178,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,6144,4096,68,0,721.4914,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,4096,4096,0,0,517.2021,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,28672,4096,70,0,3509.482,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,4096,14336,70,0,1696.1636,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,2048,1536,0,0,116.5079,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,1536,1536,68,0,84.9018,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,17920,1536,70,0,933.0211,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,1536,8960,68,0,383.7803,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,896,5120,0,0,163.0841,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,5120,640,0,0,150.6109,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,6912,5120,68,0,996.1222,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,5120,3456,70,0,554.8414,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4920,1280,8192,0,0,319.6028,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,8192,1024,70,0,318.8648,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,7168,8192,0,0,1721.5073,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4920,8192,3584,70,0,910.2478,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,6144,4096,68,0,721.8849,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,4096,4096,70,0,517.1461,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,28672,4096,0,0,3510.1711,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,4096,14336,0,0,1696.8984,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,2048,1536,0,0,116.5655,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,1536,1536,68,0,84.8774,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,17920,1536,70,0,933.1971,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,1536,8960,68,0,384.0655,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,896,5120,44,0,154.9317,a8w8_bpreshuffle_256x224x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,5120,640,0,0,150.6645,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,6912,5120,68,0,997.6582,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,5120,3456,70,0,555.0206,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,1280,8192,0,0,319.5956,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,8192,1024,0,0,318.9808,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,7168,8192,40,0,1681.9775,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4928,8192,3584,70,0,910.9266,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,6144,4096,68,0,721.7033,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,4096,4096,0,0,517.302,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,28672,4096,70,0,3510.0414,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,4096,14336,0,0,1696.4572,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,2048,1536,0,0,116.5043,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,1536,1536,68,0,84.8654,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,17920,1536,70,0,933.4298,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4936,1536,8960,68,0,383.9551,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,896,5120,0,0,163.1197,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,5120,640,70,0,150.6,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,6912,5120,68,0,997.3789,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,5120,3456,0,0,555.3294,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,1280,8192,70,0,319.6532,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,8192,1024,0,0,318.7616,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,7168,8192,70,0,1720.596,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4936,8192,3584,0,0,910.6961,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,6144,4096,68,0,721.8669,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,4096,4096,70,0,517.1708,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,28672,4096,0,0,3508.9225,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,4096,14336,0,0,1696.1199,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,2048,1536,0,0,116.5291,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,1536,1536,68,0,84.8618,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,17920,1536,0,0,932.771,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,1536,8960,68,0,383.9035,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,896,5120,0,0,163.1437,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,5120,640,70,0,150.6652,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,6912,5120,68,0,996.5673,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,5120,3456,70,0,554.9566,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,1280,8192,0,0,319.658,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,8192,1024,70,0,318.7504,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,7168,8192,70,0,1721.2988,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4944,8192,3584,0,0,911.2273,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,6144,4096,68,0,720.9129,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,4096,4096,70,0,516.75,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4952,28672,4096,0,0,3508.1532,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,4096,14336,0,0,1695.5023,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,2048,1536,70,0,116.5687,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,1536,1536,68,0,84.9322,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,17920,1536,70,0,933.0606,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,1536,8960,68,0,383.7523,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,896,5120,0,0,162.9593,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,5120,640,0,0,150.526,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,6912,5120,68,0,996.4381,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,5120,3456,70,0,554.5042,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,1280,8192,0,0,319.4612,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,8192,1024,0,0,318.8244,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,7168,8192,70,0,1722.84,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4952,8192,3584,0,0,910.5245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,6144,4096,68,0,721.2313,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,4096,4096,70,0,516.8868,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,28672,4096,0,0,3508.4568,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,4096,14336,0,0,1695.4158,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,2048,1536,0,0,116.5063,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,1536,1536,68,0,84.8362,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,17920,1536,0,0,932.7946,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,1536,8960,68,0,383.8078,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,896,5120,0,0,163.0629,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,5120,640,70,0,150.49,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,6912,5120,68,0,996.6372,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,5120,3456,0,0,554.8862,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,1280,8192,54,0,303.5491,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4960,8192,1024,70,0,318.6011,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,7168,8192,54,0,1655.1632,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4960,8192,3584,0,0,911.2157,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,6144,4096,68,0,721.0853,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,4096,4096,0,0,516.3888,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,28672,4096,0,0,3509.8311,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,4096,14336,0,0,1695.8882,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,2048,1536,0,0,116.5419,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,1536,1536,68,0,84.8662,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,17920,1536,0,0,933.6622,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,1536,8960,68,0,384.153,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,896,5120,0,0,163.0365,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,5120,640,0,0,150.6068,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,6912,5120,68,0,997.4728,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,5120,3456,0,0,555.5226,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,1280,8192,0,0,319.8772,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,8192,1024,0,0,319.0248,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,7168,8192,70,0,1722.1783,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4968,8192,3584,70,0,911.2245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,6144,4096,68,0,721.9552,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,4096,4096,70,0,516.9584,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,28672,4096,0,0,3508.7338,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,4096,14336,0,0,1695.7474,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,2048,1536,0,0,116.5875,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,1536,1536,68,0,84.8574,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,17920,1536,70,0,932.7865,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,1536,8960,68,0,383.911,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4976,896,5120,0,0,163.0225,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,5120,640,70,0,150.7224,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,6912,5120,68,0,996.0924,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,5120,3456,0,0,553.9673,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,1280,8192,0,0,319.4572,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,8192,1024,70,0,318.7235,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,7168,8192,70,0,1720.2767,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4976,8192,3584,0,0,910.5728,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,6144,4096,68,0,721.8145,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,4096,4096,70,0,517.3384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,28672,4096,0,0,3509.5018,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,4096,14336,70,0,1696.1993,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,2048,1536,0,0,116.5027,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,1536,1536,68,0,84.8414,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,17920,1536,0,0,933.6853,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,1536,8960,68,0,383.8522,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,896,5120,0,0,163.0629,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,5120,640,0,0,150.5328,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,6912,5120,68,0,996.3624,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,5120,3456,0,0,554.7977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,1280,8192,0,0,319.7468,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,8192,1024,0,0,318.6704,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,7168,8192,0,0,1720.7806,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4984,8192,3584,70,0,910.8736,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,6144,4096,68,0,677.7474,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,4096,4096,70,0,474.925,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,28672,4096,0,0,3219.969,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,4992,4096,14336,70,0,1553.3464,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,2048,1536,70,0,109.0531,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,1536,1536,68,0,80.9629,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,17920,1536,70,0,857.1658,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,1536,8960,68,0,362.5549,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,896,5120,0,0,151.234,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,5120,640,0,0,144.7384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,6912,5120,68,0,937.6257,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,5120,3456,0,0,507.3276,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,1280,8192,0,0,292.0606,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,8192,1024,70,0,300.8963,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,7168,8192,70,0,1578.4757,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4992,8192,3584,0,0,836.7985,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,6144,4096,68,0,722.564,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,4096,4096,0,0,518.28,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,28672,4096,0,0,3573.2853,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,4096,14336,70,0,1698.7618,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,2048,1536,0,0,116.5347,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,1536,1536,68,0,84.7494,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,17920,1536,70,0,943.2838,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,1536,8960,68,0,384.089,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,896,5120,0,0,163.0909,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,5120,640,0,0,151.0808,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,6912,5120,68,0,998.246,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,5120,3456,0,0,556.451,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,1280,8192,0,0,319.9695,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,8192,1024,70,0,320.2159,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5000,7168,8192,0,0,1723.8551,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5000,8192,3584,70,0,912.0216,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,6144,4096,68,0,722.1309,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,4096,4096,0,0,517.5104,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,28672,4096,70,0,3572.3381,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,4096,14336,70,0,1697.3202,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,2048,1536,0,0,116.5083,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,1536,1536,68,0,84.8038,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,17920,1536,70,0,943.9934,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,1536,8960,68,0,384.419,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,896,5120,0,0,163.0989,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,5120,640,0,0,151.128,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,6912,5120,68,0,998.1032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,5120,3456,0,0,556.1637,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,1280,8192,0,0,319.8811,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,8192,1024,0,0,320.406,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,7168,8192,0,0,1723.8707,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5008,8192,3584,70,0,912.4452,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,6144,4096,68,0,722.6928,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,4096,4096,70,0,518.2384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,28672,4096,70,0,3573.7749,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,4096,14336,70,0,1696.9813,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,2048,1536,0,0,116.4795,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,1536,1536,68,0,84.8162,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,17920,1536,0,0,943.6022,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,1536,8960,68,0,384.4446,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,896,5120,0,0,163.0933,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5016,5120,640,0,0,151.1972,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,6912,5120,68,0,999.2964,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,5120,3456,70,0,556.8141,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,1280,8192,70,0,320.0599,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,8192,1024,70,0,320.598,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,7168,8192,70,0,1725.0263,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5016,8192,3584,70,0,911.7604,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,6144,4096,68,0,723.4404,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,4096,4096,0,0,518.52,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,28672,4096,0,0,3574.9692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,4096,14336,70,0,1698.7973,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,2048,1536,0,0,116.5783,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,1536,1536,68,0,84.8306,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,17920,1536,70,0,943.7666,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,1536,8960,68,0,384.437,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,896,5120,0,0,163.1245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,5120,640,0,0,151.164,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,6912,5120,68,0,998.884,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,5120,3456,0,0,556.5441,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,1280,8192,0,0,319.7835,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,8192,1024,70,0,320.4855,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,7168,8192,70,0,1722.6626,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5024,8192,3584,0,0,910.8936,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,6144,4096,68,0,722.66,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,4096,4096,0,0,518.1148,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,28672,4096,70,0,3571.9384,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,4096,14336,70,0,1697.5977,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5032,2048,1536,0,0,116.5019,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,1536,1536,68,0,84.8218,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,17920,1536,70,0,944.1358,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,1536,8960,68,0,384.6574,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,896,5120,0,0,163.1725,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,5120,640,0,0,151.2748,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,6912,5120,68,0,997.3216,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,5120,3456,70,0,556.5181,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,1280,8192,70,0,319.8112,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,8192,1024,70,0,320.2775,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,7168,8192,0,0,1725.0374,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5032,8192,3584,0,0,912.9028,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,6144,4096,68,0,722.9024,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,4096,4096,70,0,518.1968,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,28672,4096,0,0,3576.404,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,4096,14336,70,0,1698.0521,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,2048,1536,0,0,116.4119,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,1536,1536,68,0,84.8034,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,17920,1536,70,0,943.6378,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,1536,8960,68,0,384.4282,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,896,5120,0,0,163.0225,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,5120,640,0,0,151.082,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,6912,5120,68,0,999.2588,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,5120,3456,0,0,556.7777,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,1280,8192,70,0,319.9255,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,8192,1024,0,0,320.3251,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5040,7168,8192,0,0,1725.0698,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5040,8192,3584,0,0,912.532,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,6144,4096,68,0,722.6176,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,4096,4096,70,0,518.2176,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,28672,4096,70,0,3574.9924,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,4096,14336,70,0,1699.3301,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,2048,1536,0,0,116.5911,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,1536,1536,68,0,84.8314,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,17920,1536,70,0,944.667,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,1536,8960,68,0,384.3346,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,896,5120,0,0,163.1737,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,5120,640,0,0,151.1584,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,6912,5120,68,0,998.9228,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,5120,3456,0,0,556.8089,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,1280,8192,0,0,320.3219,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,8192,1024,70,0,320.6136,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,7168,8192,0,0,1725.7622,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5048,8192,3584,70,0,912.3652,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,6144,4096,68,0,723.0748,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,4096,4096,70,0,518.3472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,28672,4096,70,0,3573.1804,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,4096,14336,0,0,1698.8861,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,2048,1536,70,0,116.6247,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,1536,1536,68,0,84.8318,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,17920,1536,0,0,944.0254,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,1536,8960,68,0,385.0938,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,896,5120,0,0,163.2085,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,5120,640,0,0,151.1404,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5056,6912,5120,68,0,998.5256,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,5120,3456,70,0,556.5893,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,1280,8192,70,0,320.0095,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,8192,1024,70,0,320.5692,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,7168,8192,70,0,1725.517,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5056,8192,3584,70,0,911.968,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,6144,4096,68,0,723.0832,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,4096,4096,0,0,518.1656,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,28672,4096,0,0,3575.312,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,4096,14336,0,0,1699.5549,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,2048,1536,0,0,116.5611,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,1536,1536,68,0,84.9002,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,17920,1536,70,0,944.5414,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,1536,8960,68,0,384.4386,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,896,5120,0,0,163.1697,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,5120,640,70,0,151.2672,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,6912,5120,68,0,997.8556,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,5120,3456,0,0,556.4153,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,1280,8192,70,0,319.888,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,8192,1024,0,0,320.4688,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,7168,8192,70,0,1724.7874,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5064,8192,3584,0,0,912.4144,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,6144,4096,68,0,723.1368,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,4096,4096,0,0,518.5264,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,28672,4096,70,0,3577.46,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,4096,14336,0,0,1698.8585,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,2048,1536,70,0,116.5131,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5072,1536,1536,68,0,84.8522,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,17920,1536,70,0,943.997,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,1536,8960,68,0,384.143,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,896,5120,0,0,163.1409,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,5120,640,0,0,151.0372,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,6912,5120,68,0,998.9192,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,5120,3456,70,0,556.8149,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,1280,8192,70,0,320.1304,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,8192,1024,70,0,320.4639,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,7168,8192,0,0,1724.2474,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5072,8192,3584,0,0,912.6188,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,6144,4096,68,0,723.1992,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,4096,4096,70,0,518.4672,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,28672,4096,70,0,3574.8151,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,4096,14336,70,0,1698.4557,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,2048,1536,0,0,116.4291,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,1536,1536,68,0,84.8058,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,17920,1536,0,0,943.3509,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,1536,8960,68,0,383.8942,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,896,5120,0,0,163.0429,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,5120,640,0,0,151.1744,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,6912,5120,68,0,998.1023,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,5120,3456,70,0,555.4617,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,1280,8192,0,0,319.5047,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,8192,1024,70,0,320.4791,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,7168,8192,0,0,1724.4214,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5080,8192,3584,0,0,912.0396,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5088,6144,4096,68,0,722.7032,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,4096,4096,0,0,517.9599,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,28672,4096,0,0,3573.3426,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,4096,14336,70,0,1698.6472,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,2048,1536,70,0,116.5243,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,1536,1536,68,0,84.8814,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,17920,1536,70,0,943.9581,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,1536,8960,68,0,384.1678,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,896,5120,0,0,163.0797,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,5120,640,0,0,151.1852,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,6912,5120,68,0,999.3004,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,5120,3456,70,0,556.6433,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,1280,8192,70,0,319.8523,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,8192,1024,70,0,320.5275,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,7168,8192,0,0,1724.6449,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5088,8192,3584,70,0,912.5908,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,6144,4096,68,0,723.2372,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,4096,4096,70,0,518.5907,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,28672,4096,0,0,3577.995,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,4096,14336,0,0,1699.39,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,2048,1536,0,0,116.5787,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,1536,1536,68,0,84.8981,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,17920,1536,70,0,944.2021,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,1536,8960,68,0,384.7514,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,896,5120,0,0,163.2357,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,5120,640,0,0,151.256,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,6912,5120,68,0,998.7911,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5096,5120,3456,0,0,556.5617,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,1280,8192,0,0,320.0247,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,8192,1024,70,0,320.7655,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,7168,8192,70,0,1725.1549,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5096,8192,3584,0,0,912.3992,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,6144,4096,68,0,723.676,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,4096,4096,70,0,518.3535,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,28672,4096,70,0,3575.5158,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,4096,14336,70,0,1698.0484,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,2048,1536,0,0,116.4571,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,1536,1536,68,0,84.865,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,17920,1536,70,0,944.0377,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,1536,8960,68,0,384.119,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,896,5120,70,0,163.0397,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,5120,640,0,0,150.976,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,6912,5120,68,0,998.4327,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,5120,3456,70,0,556.2829,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,1280,8192,0,0,319.8375,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,8192,1024,0,0,320.4843,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,7168,8192,0,0,1725.7897,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5104,8192,3584,70,0,911.8712,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,6144,4096,68,0,723.024,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,4096,4096,70,0,517.9339,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,28672,4096,0,0,3572.6805,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,4096,14336,0,0,1698.4928,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,2048,1536,70,0,116.5211,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,1536,1536,68,0,84.8258,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,5112,17920,1536,70,0,943.9213,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,1536,8960,68,0,384.297,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,896,5120,0,0,163.1469,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,5120,640,0,0,151.182,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,6912,5120,68,0,998.5615,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,5120,3456,0,0,556.7313,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,1280,8192,70,0,319.8655,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,8192,1024,0,0,320.3695,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,7168,8192,0,0,1724.1557,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5112,8192,3584,70,0,911.9167,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,6144,4096,68,0,679.4918,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,4096,4096,0,0,476.0774,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,28672,4096,0,0,3278.3917,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,4096,14336,70,0,1556.5966,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,2048,1536,70,0,108.853,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,1536,1536,68,0,81.0149,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,17920,1536,0,0,868.8098,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,1536,8960,68,0,362.9605,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,896,5120,70,0,151.1084,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,5120,640,0,0,145.5056,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,6912,5120,68,0,939.8357,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,5120,3456,70,0,508.5807,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,1280,8192,70,0,292.2446,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,8192,1024,70,0,302.3202,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,7168,8192,70,0,1580.9331,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,5120,8192,3584,0,0,838.4504,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,4096,8192,11,0,15.6167,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,1,24576,4096,11,0,29.5131,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,4096,12288,11,0,20.7263,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,1280,5120,10,0,9.3202,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,5120,1024,15,0,5.1926,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,6400,5120,10,0,13.6715,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,5120,3200,23,0,15.5815,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,2,4096,8192,11,0,15.6803,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,24576,4096,11,0,30.2048,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,4096,12288,11,0,20.7555,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,1280,5120,10,0,9.473,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,1024,15,0,5.2698,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,6400,5120,10,0,13.7399,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,3200,23,0,15.6175,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,4,4096,8192,5,0,15.7483,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,24576,4096,9,0,30.6624,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,4096,12288,5,0,20.7771,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,1280,5120,10,0,9.5426,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,1024,15,0,5.4922,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,6400,5120,10,0,13.8239,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,3200,23,0,15.6695,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,8,4096,8192,5,0,15.8119,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,24576,4096,11,0,31.22,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,4096,12288,11,0,20.9183,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,1280,5120,10,0,9.6383,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,1024,15,0,5.2118,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,6400,5120,10,0,14.0415,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,3200,9,0,15.7527,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,4096,8192,10,0,15.0923,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, 
++80,16,24576,4096,5,0,33.536,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,4096,12288,19,0,19.7555,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,16,1280,5120,10,0,9.1006,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,1024,15,0,5.2654,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,6400,5120,6,0,15.0351,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,16,5120,3200,9,0,15.5687,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,4096,8192,12,0,21.1827,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,24576,4096,12,0,47.0804,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,4096,12288,12,0,28.6523,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,1280,5120,10,0,9.1151,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,5120,1024,15,0,6.7839,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,6400,5120,11,0,22.0163,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,5120,3200,9,0,16.7551,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,64,4096,8192,20,0,36.222,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,64,24576,4096,7,0,81.9382,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,4096,12288,6,0,49.6453,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,1280,5120,11,0,10.043,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,5120,1024,29,0,9.2079,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,64,6400,5120,11,0,32.0656,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,5120,3200,23,0,20.7411,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,128,4096,8192,65,0,44.9496,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,24576,4096,68,0,92.1811,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,4096,12288,65,0,64.1853,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,1280,5120,6,0,14.3507,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,5120,1024,65,0,11.1843,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,6400,5120,63,0,47.6153,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,5120,3200,62,0,22.1239,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,6000,6144,4096,68,0,856.6988,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,6000,4096,4096,0,0,614.3026,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,6000,28672,4096,70,0,4210.9884,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,6000,4096,14336,0,0,2014.8833,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,27000,1280,5120,0,0,1071.4072,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,27000,5120,1024,70,0,1043.1179,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,27000,6400,5120,70,0,5189.3743,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,27000,5120,3200,70,0,2713.2459,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2000,2048,1536,0,0,60.688,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2000,1536,1536,68,0,44.0468,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2000,17920,1536,0,0,385.0725,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2000,1536,8960,54,0,181.9933,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,500,2048,1536,62,0,21.1491,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,500,1536,1536,16,0,18.9451,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,500,17920,1536,0,0,104.1594,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,500,1536,8960,53,0,80.6257,a8w8_bpreshuffle_256x192x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,4608,7168,5,0,13.5191,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,3072,1536,11,0,5.2462,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,7168,2304,29,0,9.2998,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,1,512,7168,10,0,11.3898,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,4096,512,9,0,4.4342,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,7168,2048,10,0,8.8646,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,1536,7168,10,0,11.689,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,4608,7168,5,0,13.6163,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,3072,1536,5,0,5.553,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,7168,2304,29,0,9.4346,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,2,512,7168,10,0,11.4739,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,4096,512,9,0,4.479,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,7168,2048,10,0,8.819,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, 
++80,2,1536,7168,10,0,11.8851,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,4608,7168,5,0,13.6979,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,3072,1536,11,0,5.3998,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,7168,2304,29,0,9.5066,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,4,512,7168,10,0,11.561,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,4096,512,9,0,4.481,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,7168,2048,5,0,9.0518,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,1536,7168,10,0,12.0955,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,4608,7168,5,0,13.8067,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,3072,1536,5,0,5.4242,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,7168,2304,29,0,9.3802,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,8,512,7168,10,0,11.7323,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,4096,512,9,0,4.5118,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,7168,2048,10,0,9.0242,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,1536,7168,10,0,12.3663,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,4608,7168,11,0,13.2607,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,3072,1536,11,0,5.5282,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,7168,2304,29,0,9.5778,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,16,512,7168,10,0,10.5066,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,4096,512,9,0,4.439,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,7168,2048,11,0,9.2702,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,1536,7168,10,0,10.8903,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,4608,7168,12,0,18.5431,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,3072,1536,10,0,8.2266,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,7168,2304,29,0,12.2627,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,32,512,7168,10,0,10.8103,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,4096,512,9,0,5.1406,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,7168,2048,11,0,11.141,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,32,1536,7168,11,0,12.5051,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,4608,7168,7,0,32.0776,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,3072,1536,5,0,9.4782,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,7168,2304,30,0,18.4351,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,64,512,7168,10,0,10.8758,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,64,4096,512,9,0,6.0738,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,64,7168,2048,6,0,16.7399,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,1536,7168,19,0,16.8991,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,128,4608,7168,65,0,40.338,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,3072,1536,15,0,12.8339,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,128,7168,2304,0,0,25.6303,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,512,7168,11,0,12.1835,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,128,4096,512,23,0,9.4662,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,128,7168,2048,63,0,24.6415,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,1536,7168,5,0,26.0343,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,256,1536,7168,11,0,40.0387,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,512,1536,7168,0,0,62.1172,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,1536,7168,68,0,85.8797,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1536,1536,7168,0,0,105.359,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,1536,7168,54,0,149.0851,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,1536,7168,0,0,257.8883,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,1536,7168,73,0,503.2864,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,16384,1536,7168,68,0,937.0239,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,1536,7168,68,0,1153.4843,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,3072,1536,16,0,19.3087,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,512,3072,1536,68,0,27.5747,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,3072,1536,73,0,42.0535,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, 
++80,1536,3072,1536,71,0,54.2256,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,3072,1536,73,0,69.2076,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,4096,3072,1536,71,0,124.8954,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,3072,1536,71,0,237.0182,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,3072,1536,71,0,455.8066,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,3072,1536,71,0,564.1674,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16,576,7168,10,0,10.5618,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,576,7168,10,0,10.8582,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,64,576,7168,10,0,10.911,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,128,576,7168,5,0,12.303,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,256,576,7168,12,0,18.2567,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,512,576,7168,26,0,31.8751,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2,,,,, ++80,1024,576,7168,65,0,40.7496,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1536,576,7168,62,0,66.1536,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,576,7168,62,0,66.9628,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,576,7168,56,0,120.3046,a8w8_bpreshuffle_256x160x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,576,7168,68,0,225.7606,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,576,7168,68,0,370.6111,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,576,7168,68,0,442.0606,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,7168,2048,62,0,37.5135,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,512,7168,2048,73,0,53.5876,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1024,7168,2048,71,0,98.5793,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1536,7168,2048,71,0,143.1335,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,7168,2048,71,0,185.5725,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,7168,2048,71,0,346.1202,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,7168,2048,71,0,667.6494,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,7168,2048,71,0,1321.6889,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,20480,7168,2048,71,0,1645.0789,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,4608,7168,0,0,62.9052,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,512,4608,7168,0,0,105.3998,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1024,4608,7168,71,0,201.4857,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1536,4608,7168,68,0,293.1925,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,4608,7168,68,0,367.9423,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,4608,7168,68,0,720.9652,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,4608,7168,71,0,1398.8474,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,4608,7168,68,0,2779.3652,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,4608,7168,68,0,3422.9431,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,7168,2304,62,0,39.4879,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,512,7168,2304,73,0,56.3116,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1024,7168,2304,73,0,106.609,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1536,7168,2304,71,0,155.0772,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,7168,2304,71,0,202.949,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,7168,2304,71,0,380.2409,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,7168,2304,71,0,734.7959,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,7168,2304,71,0,1459.36,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,7168,2304,71,0,1813.8442,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,512,7168,6,0,17.6359,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,512,512,7168,20,0,29.6435,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,1024,512,7168,65,0,40.6252,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1536,512,7168,60,0,62.1261,a8w8_bpreshuffle_256x160x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,512,7168,0,0,62.9713,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,512,7168,70,0,104.4914,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,512,7168,71,0,197.0978,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,512,7168,70,0,358.1792,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, 
++80,20480,512,7168,70,0,412.8271,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,4096,512,62,0,11.6791,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,512,4096,512,73,0,16.7415,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1024,4096,512,71,0,27.3143,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1536,4096,512,73,0,35.0211,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,2048,4096,512,71,0,44.53,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,4096,512,71,0,76.8917,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,4096,512,71,0,142.8464,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,4096,512,71,0,273.0349,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,4096,512,71,0,337.2764,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16,7168,256,74,0,8.0662,a8w8_bpreshuffle_64x16x256x64_16x16_16x16_4x16x1_4x16x1_1x8x1x8_8x8x1_1x8_intrawave_v1,,,,, ++80,32,7168,256,72,0,8.889,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,64,7168,256,72,0,9.613,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,128,7168,256,74,0,9.6658,a8w8_bpreshuffle_64x16x256x64_16x16_16x16_4x16x1_4x16x1_1x8x1x8_8x8x1_1x8_intrawave_v1,,,,, ++80,256,7168,256,72,0,11.6959,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,512,7168,256,73,0,15.9855,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1024,7168,256,73,0,28.2775,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1536,7168,256,71,0,38.2028,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,7168,256,71,0,49.3564,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,7168,256,71,0,87.9366,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,7168,256,71,0,162.3181,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,7168,256,71,0,312.9447,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,20480,7168,256,71,0,387.2386,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,1280,8192,71,0,241.0021,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,8192,1024,71,0,231.1181,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,3456,5120,5,0,10.385,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,5120,1728,72,0,21.5079,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,2,3456,5120,5,0,10.4294,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,2,5120,1728,72,0,21.5563,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,4,3456,5120,11,0,10.4478,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,1728,72,0,21.6855,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,8,3456,5120,11,0,10.5266,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,1728,72,0,21.6267,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,16,3456,5120,5,0,10.2335,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,1728,72,0,21.6631,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,32,3456,5120,12,0,14.4795,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,5120,1728,72,0,21.4651,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,64,3456,5120,11,0,20.5687,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,5120,1728,72,0,21.7863,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,128,3456,5120,65,0,30.2347,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,5120,1728,72,0,22.0235,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,4000,896,5120,71,0,120.6204,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,5120,640,71,0,102.2151,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,3456,5120,71,0,408.3558,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,5120,1728,71,0,217.1533,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,768,4096,10,0,8.297,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,1,3072,4096,11,0,9.2223,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,4096,1536,5,0,5.3586,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,768,4096,19,0,9.1746,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,2,3072,4096,5,0,9.1542,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,4096,1536,5,0,5.4738,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,768,4096,10,0,8.8634,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,3072,4096,25,0,9.3898,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,4,4096,1536,5,0,5.5642,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,768,4096,10,0,8.8694,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,3072,4096,19,0,9.471,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, 
++80,8,4096,1536,5,0,5.503,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,768,4096,10,0,7.8354,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,3072,4096,19,0,9.7294,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,16,4096,1536,5,0,5.531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,768,4096,10,0,8.8938,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,3072,4096,12,0,12.1967,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,32,4096,1536,12,0,8.411,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,768,4096,5,0,9.1198,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,3072,4096,5,0,17.2243,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,4096,1536,16,0,10.203,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,768,4096,19,0,11.1494,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,128,3072,4096,65,0,25.2383,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,4096,1536,65,0,13.4171,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,768,4096,54,0,84.6818,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,4096,512,71,0,76.8086,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,3072,4096,71,0,302.4129,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,4096,1536,71,0,164.025,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,4352,5120,5,0,10.7058,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,1,5120,2176,23,0,11.7638,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,2,4352,5120,5,0,10.7722,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,2176,9,0,11.9822,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,4352,5120,5,0,10.823,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,2176,23,0,12.0187,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,8,4352,5120,11,0,10.9158,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,2176,23,0,12.0679,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,16,4352,5120,11,0,10.6386,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,2176,9,0,11.9514,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,4352,5120,12,0,14.6603,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, 
++80,32,5120,2176,23,0,12.7907,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,64,4352,5120,20,0,24.4903,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,64,5120,2176,23,0,15.7635,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,,,,, ++80,128,4352,5120,65,0,30.6107,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,128,5120,2176,62,0,17.1499,a8w8_bpreshuffle_256x128x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,4352,5120,71,0,516.1362,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4000,5120,2176,71,0,266.0923,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,32768,5120,1728,71,0,1597.5379,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,256,5120,1728,72,0,23.3143,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,512,5120,1728,73,0,33.4744,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,1024,5120,1728,71,0,58.0025,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,2048,5120,1728,71,0,108.4907,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,4096,5120,1728,71,0,207.5261,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,8192,5120,1728,71,0,406.1403,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,16384,5120,1728,71,0,803.6559,a8w8_bpreshuffle_256x128x128x64_16x16_16x16_4x64x1_4x64x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,7168,5120,10,0,13.8591,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,7168,5120,10,0,14.0039,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,7168,5120,10,0,14.0887,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,8,7168,5120,10,0,14.3395,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,16,7168,5120,10,0,15.5539,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,32,7168,5120,11,0,21.9443,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,64,7168,5120,6,0,34.2712,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,7168,5120,70,0,48.1389,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,5120,5120,11,0,10.943,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,5120,11,0,10.9603,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,5120,11,0,11.0667,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,5120,5,0,11.1911,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,5120,5,0,10.8615,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,32,5120,5120,12,0,14.7587,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,5120,6,0,25.0027,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,5120,5120,65,0,31.022,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,55296,5120,74,0,85.1319,a8w8_bpreshuffle_64x16x256x64_16x16_16x16_4x16x1_4x16x1_1x8x1x8_8x8x1_1x8_intrawave_v1,,,,, ++80,2,55296,5120,17,0,85.9039,a8w8_bpreshuffle_256x16x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,4,55296,5120,16,0,89.6547,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,8,55296,5120,21,0,91.2155,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,16,55296,5120,7,0,93.6895,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,32,55296,5120,72,0,118.9508,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,64,55296,5120,73,0,150.9754,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,55296,5120,68,0,220.3102,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,5120,27648,11,0,44.1312,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,27648,11,0,44.5736,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,27648,11,0,44.648,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,27648,5,0,44.7924,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,27648,19,0,40.9632,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,32,5120,27648,12,0,61.5157,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,27648,7,0,108.0988,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,5120,27648,65,0,140.143,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,27648,5120,9,0,41.0924,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,2,27648,5120,9,0,40.822,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,,,,, ++80,4,27648,5120,20,0,42.9072,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,8,27648,5120,6,0,43.7536,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,16,27648,5120,6,0,44.3848,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,32,27648,5120,7,0,67.7973,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,64,27648,5120,72,0,101.0467,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,128,27648,5120,68,0,113.7247,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,5120,13824,5,0,22.7535,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, 
++80,2,5120,13824,5,0,22.8015,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,13824,11,0,22.9779,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,13824,11,0,23.2415,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,13824,25,0,22.5139,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,32,5120,13824,12,0,32.206,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,13824,7,0,57.1009,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,5120,13824,65,0,73.0905,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,34816,5120,5,0,51.5449,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,34816,5120,19,0,51.5629,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,4,34816,5120,19,0,54.1213,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,8,34816,5120,5,0,56.0481,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,34816,5120,7,0,57.6501,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,32,34816,5120,72,0,80.7439,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,64,34816,5120,73,0,104.3024,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,34816,5120,70,0,154.6787,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,5120,17408,5,0,29.0232,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,17408,5,0,29.148,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,17408,11,0,29.2448,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,17408,5,0,29.406,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,17408,25,0,27.6132,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,32,5120,17408,12,0,39.7036,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,17408,7,0,70.9002,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,5120,17408,65,0,90.4219,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,10240,5120,5,0,16.5003,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,10240,5120,5,0,16.6059,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,10240,5120,5,0,16.7935,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,10240,5120,6,0,16.9183,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,16,10240,5120,6,0,17.8779,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, 
++80,32,10240,5120,12,0,26.1739,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,10240,5120,7,0,44.4204,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,10240,5120,63,0,49.2661,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,5120,8192,5,0,16.0963,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,8192,5,0,16.1551,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,8192,11,0,16.2663,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,8192,5,0,16.4523,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,8192,11,0,16.0223,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,32,5120,8192,12,0,21.5807,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,8192,21,0,37.4184,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v2,,,,, ++80,128,5120,8192,65,0,45.7517,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,51200,5120,16,0,71.3974,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,2,51200,5120,6,0,78.0846,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,4,51200,5120,16,0,79.5634,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,8,51200,5120,6,0,84.0159,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,16,51200,5120,6,0,85.6199,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,32,51200,5120,72,0,114.2116,a8w8_bpreshuffle_128x32x256x64_16x16_16x16_4x32x1_4x32x1_1x8x1x16_8x8x1_1x4_intrawave_v1,,,,, ++80,64,51200,5120,73,0,149.6178,a8w8_bpreshuffle_256x64x256x64_16x16_16x16_4x64x1_4x64x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,51200,5120,0,0,196.1245,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, ++80,1,5120,25600,11,0,41.34,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,2,5120,25600,11,0,41.4056,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,4,5120,25600,11,0,41.6168,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,8,5120,25600,5,0,41.8272,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,,,,, ++80,16,5120,25600,19,0,38.6636,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,,,,, ++80,32,5120,25600,12,0,57.7321,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,,,,, ++80,64,5120,25600,7,0,100.664,a8w8_bpreshuffle_256x16x256x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,,,,, ++80,128,5120,25600,65,0,130.2685,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,,,,, \ No newline at end of file diff --git a/3rdparty/aiter/BUILD b/3rdparty/aiter/BUILD index 04a3707f2..ce7666673 100644 --- 
a/3rdparty/aiter/BUILD +++ b/3rdparty/aiter/BUILD @@ -1,64 +1,133 @@ +load( + "@local_config_rocm//rocm:build_defs.bzl", + "rocm_default_copts", +) + +genrule( + name = "config_h", + srcs = [ + "3rdparty/composable_kernel/include/ck/config.h.in", + ], + outs = [ + "3rdparty/composable_kernel/include/ck/config.h", + ], + cmd = """ + awk '{gsub(/^#cmakedefine DTYPES \"@DTYPES@\"/, "/* #undef DTYPES*/"); + gsub(/^#cmakedefine CK_ENABLE_ALL_DTYPES @CK_ENABLE_ALL_DTYPES@/, "#define CK_ENABLE_ALL_DTYPES ON"); + gsub(/^#cmakedefine CK_ENABLE_INT8 @CK_ENABLE_INT8@/, "/* #undef CK_ENABLE_INT8*/"); + gsub(/^#cmakedefine CK_ENABLE_FP8 @CK_ENABLE_FP8@/, "/* #undef CK_ENABLE_FP8*/"); + gsub(/^#cmakedefine CK_ENABLE_BF8 @CK_ENABLE_BF8@/, "/* #undef CK_ENABLE_BF8*/"); + gsub(/^#cmakedefine CK_ENABLE_FP16 @CK_ENABLE_FP16@/, "/* #undef CK_ENABLE_FP16*/"); + gsub(/^#cmakedefine CK_ENABLE_BF16 @CK_ENABLE_BF16@/, "/* #undef CK_ENABLE_BF16*/"); + gsub(/^#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@/, "/* #undef CK_ENABLE_FP32*/"); + gsub(/^#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@/, "/* #undef CK_ENABLE_FP64*/"); + gsub(/^#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@/, "/* #undef CK_ENABLE_DL_KERNELS*/"); + gsub(/^#cmakedefine CK_ENABLE_DPP_KERNELS @CK_ENABLE_DPP_KERNELS@/, "/* #undef CK_ENABLE_DPP_KERNELS*/"); + gsub(/^#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@/, "/* #undef CK_ENABLE_INSTANCES_ONLY*/"); + gsub(/^#cmakedefine CK_USE_XDL @CK_USE_XDL@/, "#define CK_USE_XDL ON"); + gsub(/^#cmakedefine CK_USE_WMMA @CK_USE_WMMA@/, "/* #undef CK_USE_WMMA*/"); + gsub(/^#cmakedefine CK_USE_GFX94 @CK_USE_GFX94@/, "/* #undef CK_USE_GFX94*/"); + gsub(/^#cmakedefine CK_USE_OCP_FP8 @CK_USE_OCP_FP8@/, "/* #undef CK_USE_OCP_FP8*/"); + gsub(/^#cmakedefine CK_USE_FNUZ_FP8 @CK_USE_FNUZ_FP8@/, "/* #undef CK_USE_FNUZ_FP8*/"); + gsub(/^#cmakedefine CK_USE_FP8_ON_UNSUPPORTED_ARCH @CK_USE_FP8_ON_UNSUPPORTED_ARCH@/, "/* #undef CK_USE_FP8_ON_UNSUPPORTED_ARCH*/"); + gsub(/^#cmakedefine CK_USE_NATIVE_MX_SUPPORT @CK_USE_NATIVE_MX_SUPPORT@/, "/* #undef CK_USE_NATIVE_MX_SUPPORT*/"); + gsub(/^#cmakedefine CK_USE_WMMA @CK_USE_WMMA@/, "/* #undef CK_USE_WMMA*/"); + gsub(/^#cmakedefine/, "//cmakedefine");print;}' $(<) > $(@) + """, +) + +cc_library( + name = "ck_headers_real", + hdrs = glob([ + "3rdparty/composable_kernel/include/**/*.h", + "3rdparty/composable_kernel/include/**/*.inc", + "3rdparty/composable_kernel/include/**/*.hpp", + ]), + copts = rocm_default_copts() + ["-std=c++20"], + strip_include_prefix = "3rdparty/composable_kernel/include", + visibility = ["//visibility:public"], + deps = [ + "@local_config_rocm//rocm:rocm_headers", + ":config_h" + ], +) + +cc_library( + name = "ck_library_headers", + srcs = glob(["3rdparty/composable_kernel/library/src/utility/**/*.cpp"]), + hdrs = glob([ + "3rdparty/composable_kernel/library/include/**/*.h", + "3rdparty/composable_kernel/library/include/**/*.inc", + "3rdparty/composable_kernel/library/include/**/*.hpp", + ]), + strip_include_prefix = "3rdparty/composable_kernel/library/include", + copts = rocm_default_copts() + ["-std=c++20"], + deps = [ + ":ck_headers_real", + ], +) + +cc_library( + name = "ck_fmha_example_headers", + hdrs = glob([ + "3rdparty/composable_kernel/example/ck_tile/01_fmha/*.hpp", + ]), + copts = rocm_default_copts() + ["-std=c++20"], + deps = [ + ":ck_headers_real", + ":ck_library_headers", + ], + strip_include_prefix = "3rdparty/composable_kernel/example/ck_tile/01_fmha", + visibility = ["//visibility:public"], +) + genrule( name = 
"cpp_libraries", srcs = glob([ "**/*", - ]) + ["@composable_kernel_archive//:config_h",], + ]) + [":config_h"], outs = [ "aiter/jit/libmodule_aiter_enum.so", "aiter/jit/libmodule_custom_all_reduce.so", - # "csrc/cpp_itfs/mla/libasm_mla_decode_fwd_torch.so", - # "aiter/jit/libmodule_attention.so", - # "aiter/jit/libmodule_norm.so", - # "aiter/jit/libmodule_cache.so", - # "aiter/jit/libmodule_mha_fwd.so", + "aiter/jit/libmodule_norm.so", + "aiter/jit/libmodule_mha_fwd.so", "aiter/jit/libmodule_quant.so", "aiter/jit/libmodule_gemm_a8w8_blockscale.so", "aiter/jit/libmodule_moe_sorting.so", "aiter/jit/libmodule_moe_asm.so", "aiter/jit/libmodule_pa.so", + "aiter/jit/libmodule_attention_asm.so", "aiter/jit/libmodule_gemm_a8w8_bpreshuffle.so", - "aiter/jit/libmodule_moe.so", "aiter/jit/libmodule_activation.so", + "aiter/jit/libmodule_rmsnorm.so", + "aiter/jit/libmodule_smoothquant.so", + "aiter/jit/libmodule_moe_ck2stages.so" ], cmd = """ - awk '{gsub(/^#cmakedefine DTYPES \"@DTYPES@\"/, "/* #undef DTYPES*/"); - gsub(/^#cmakedefine CK_ENABLE_ALL_DTYPES @CK_ENABLE_ALL_DTYPES@/, "#define CK_ENABLE_ALL_DTYPES ON"); - gsub(/^#cmakedefine CK_ENABLE_INT8 @CK_ENABLE_INT8@/, "/* #undef CK_ENABLE_INT8*/"); - gsub(/^#cmakedefine CK_ENABLE_FP8 @CK_ENABLE_FP8@/, "/* #undef CK_ENABLE_FP8*/"); - gsub(/^#cmakedefine CK_ENABLE_BF8 @CK_ENABLE_BF8@/, "/* #undef CK_ENABLE_BF8*/"); - gsub(/^#cmakedefine CK_ENABLE_FP16 @CK_ENABLE_FP16@/, "/* #undef CK_ENABLE_FP16*/"); - gsub(/^#cmakedefine CK_ENABLE_BF16 @CK_ENABLE_BF16@/, "/* #undef CK_ENABLE_BF16*/"); - gsub(/^#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@/, "/* #undef CK_ENABLE_FP32*/"); - gsub(/^#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@/, "/* #undef CK_ENABLE_FP64*/"); - gsub(/^#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@/, "/* #undef CK_ENABLE_DL_KERNELS*/"); - gsub(/^#cmakedefine CK_ENABLE_DPP_KERNELS @CK_ENABLE_DPP_KERNELS@/, "/* #undef CK_ENABLE_DPP_KERNELS*/"); - gsub(/^#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@/, "/* #undef CK_ENABLE_INSTANCES_ONLY*/"); - gsub(/^#cmakedefine CK_USE_XDL @CK_USE_XDL@/, "#define CK_USE_XDL ON"); - gsub(/^#cmakedefine CK_USE_WMMA @CK_USE_WMMA@/, "/* #undef CK_USE_WMMA*/"); - gsub(/^#cmakedefine CK_USE_GFX94 @CK_USE_GFX94@/, "/* #undef CK_USE_GFX94*/"); - gsub(/^#cmakedefine CK_USE_OCP_FP8 @CK_USE_OCP_FP8@/, "/* #undef CK_USE_OCP_FP8*/"); - gsub(/^#cmakedefine CK_USE_FNUZ_FP8 @CK_USE_FNUZ_FP8@/, "/* #undef CK_USE_FNUZ_FP8*/"); - gsub(/^#cmakedefine CK_USE_FP8_ON_UNSUPPORTED_ARCH @CK_USE_FP8_ON_UNSUPPORTED_ARCH@/, "/* #undef CK_USE_FP8_ON_UNSUPPORTED_ARCH*/"); - gsub(/^#cmakedefine CK_USE_NATIVE_MX_SUPPORT @CK_USE_NATIVE_MX_SUPPORT@/, "/* #undef CK_USE_NATIVE_MX_SUPPORT*/"); - gsub(/^#cmakedefine/, "//cmakedefine");print;}' external/aiter_src/3rdparty/composable_kernel/include/ck/config.h.in > external/aiter_src/3rdparty/composable_kernel/include/ck/config.h; cd external/aiter_src; - find . -name lock | xargs rm -f; + export PYTHONPATH=$${PWD}:$${PYTHONPATH:-}; + find . 
-name lock_* | xargs rm -f; /opt/conda310/bin/python -m pip install -r requirements.txt -i https://artifacts.antgroup-inc.cn/simple/ --extra-index-url=https://artlab.alibaba-inc.com/1/PYPI/py-central/ --extra-index-url=https://artlab.alibaba-inc.com/1/PYPI/pytorch/ --extra-index-url=http://artlab.alibaba-inc.com/1/pypi/rtp_diffusion --trusted-host=artlab.alibaba-inc.com; /opt/conda310/bin/python -m pip install ninja -i https://artifacts.antgroup-inc.cn/simple/ --extra-index-url=https://artlab.alibaba-inc.com/1/PYPI/py-central/ --extra-index-url=https://artlab.alibaba-inc.com/1/PYPI/pytorch/ --extra-index-url=http://artlab.alibaba-inc.com/1/pypi/rtp_diffusion --trusted-host=artlab.alibaba-inc.com; /opt/conda310/bin/python -m pip install packaging -i https://artifacts.antgroup-inc.cn/simple/ --extra-index-url=https://artlab.alibaba-inc.com/1/PYPI/py-central/ --extra-index-url=https://artlab.alibaba-inc.com/1/PYPI/pytorch/ --extra-index-url=http://artlab.alibaba-inc.com/1/pypi/rtp_diffusion --trusted-host=artlab.alibaba-inc.com; - GPU_ARCHS=gfx942 ROCM_HOME=/opt/rocm LD_LIBRARY_PATH=/opt/amdgpu/lib64 PATH=/opt/rocm/bin:/opt/conda310/bin:$$PATH /opt/conda310/bin/python build_aiter_module.py; + AITER_SYMBOL_VISIBLE=1 GPU_ARCHS=gfx942 ROCM_HOME=/opt/rocm LD_LIBRARY_PATH=/opt/amdgpu/lib64 PATH=/opt/rocm/bin:/opt/conda310/bin:$$PATH /opt/conda310/bin/python build_aiter_module.py; cd ../..; cp external/aiter_src/aiter/jit/module_aiter_enum.so $(location aiter/jit/libmodule_aiter_enum.so); cp external/aiter_src/aiter/jit/module_custom_all_reduce.so $(location aiter/jit/libmodule_custom_all_reduce.so); cp external/aiter_src/aiter/jit/module_quant.so $(location aiter/jit/libmodule_quant.so); + cp external/aiter_src/aiter/jit/module_smoothquant.so $(location aiter/jit/libmodule_smoothquant.so); cp external/aiter_src/aiter/jit/module_moe_sorting.so $(location aiter/jit/libmodule_moe_sorting.so); cp external/aiter_src/aiter/jit/module_moe_asm.so $(location aiter/jit/libmodule_moe_asm.so); - cp external/aiter_src/aiter/jit/module_moe.so $(location aiter/jit/libmodule_moe.so); cp external/aiter_src/aiter/jit/module_gemm_a8w8_blockscale.so $(location aiter/jit/libmodule_gemm_a8w8_blockscale.so); cp external/aiter_src/aiter/jit/module_pa.so $(location aiter/jit/libmodule_pa.so); + cp external/aiter_src/aiter/jit/module_attention_asm.so $(location aiter/jit/libmodule_attention_asm.so); cp external/aiter_src/aiter/jit/module_gemm_a8w8_bpreshuffle.so $(location aiter/jit/libmodule_gemm_a8w8_bpreshuffle.so); cp external/aiter_src/aiter/jit/module_activation.so $(location aiter/jit/libmodule_activation.so); + cp external/aiter_src/aiter/jit/module_norm.so $(location aiter/jit/libmodule_norm.so); + cp external/aiter_src/aiter/jit/module_rmsnorm.so $(location aiter/jit/libmodule_rmsnorm.so); + cp external/aiter_src/aiter/jit/module_mha_fwd.so $(location aiter/jit/libmodule_mha_fwd.so); + cp external/aiter_src/aiter/jit/module_moe_ck2stages.so $(location aiter/jit/libmodule_moe_ck2stages.so); """, visibility = ["//visibility:public"], tags = ["rocm","local"], @@ -75,26 +144,17 @@ cc_library( tags = ["rocm","local"], ) -# cc_library( -# name = "decode_mla", -# srcs = ["csrc/cpp_itfs/mla/libasm_mla_decode_fwd_torch.so"], -# hdrs = ["csrc/cpp_itfs/mla/asm_mla_decode_fwd_torch.h"], -# deps = [":cpp_libraries"], -# copts = [], -# # strip_include_prefix = "csrc/cpp_itfs/", -# visibility = ["//visibility:public"], -# tags = ["rocm","local"], -# ) - -# cc_library( -# name = "module_mha_fwd", -# srcs = 
["aiter/jit/libmodule_mha_fwd.so"], -# hdrs = ["csrc/include/mha_fwd.h"], -# deps = [":cpp_libraries"], -# copts = [], -# # strip_include_prefix = "csrc/include/", -# visibility = ["//visibility:public"], -# ) +cc_library( + name = "module_mha_fwd", + srcs = ["aiter/jit/libmodule_mha_fwd.so"], + hdrs = ["csrc/include/mha_fwd.h", "csrc/include/aiter_hip_common.h"], + deps = [":cpp_libraries", ":ck_fmha_example_headers"], + copts = ["-std=c++20"], + linkopts = [], + strip_include_prefix = "csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm","local"], +) cc_library( name = "module_aiter_enum", @@ -119,23 +179,24 @@ cc_library( ) cc_library( - name = "module_gemm_a8w8_blockscale", - srcs = ["aiter/jit/libmodule_gemm_a8w8_blockscale.so"], - hdrs = ["csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h"], + name = "module_smoothquant", + srcs = ["aiter/jit/libmodule_smoothquant.so"], + hdrs = ["csrc/include/smoothquant.h"], deps = [":cpp_libraries"], copts = [], - strip_include_prefix = "csrc/ck_gemm_a8w8_blockscale/include/", + linkopts = [], + strip_include_prefix = "csrc/include/", visibility = ["//visibility:public"], tags = ["rocm","local"], ) cc_library( - name = "module_moe", - srcs = ["aiter/jit/libmodule_moe.so"], - hdrs = ["csrc/include/moe_ck.h"], + name = "module_gemm_a8w8_blockscale", + srcs = ["aiter/jit/libmodule_gemm_a8w8_blockscale.so"], + hdrs = ["csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h"], deps = [":cpp_libraries"], copts = [], - strip_include_prefix = "csrc/include/", + strip_include_prefix = "csrc/ck_gemm_a8w8_blockscale/include/", visibility = ["//visibility:public"], tags = ["rocm","local"], ) @@ -175,8 +236,14 @@ cc_library( cc_library( name = "module_pa", - srcs = ["aiter/jit/libmodule_pa.so"], - hdrs = ["csrc/include/attention.h"], + srcs = [ + "aiter/jit/libmodule_pa.so", + "aiter/jit/libmodule_attention_asm.so" + ], + hdrs = [ + "csrc/include/attention.h", + "csrc/include/attention_asm.h" + ], deps = [":cpp_libraries"], copts = [], strip_include_prefix = "csrc/include/", @@ -194,3 +261,41 @@ cc_library( visibility = ["//visibility:public"], tags = ["rocm","local"], ) + +cc_library( + name = "module_rmsnorm", + srcs = ["aiter/jit/libmodule_rmsnorm.so"], + hdrs = ["csrc/include/rmsnorm.h"], + deps = [":cpp_libraries"], + copts = [], + linkopts = [], + strip_include_prefix = "csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm","local"], +) + +cc_library( + name = "module_norm", + srcs = ["aiter/jit/libmodule_norm.so"], + hdrs = ["csrc/include/norm.h"], + deps = [":cpp_libraries"], + copts = [], + linkopts = [], + strip_include_prefix = "csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm","local"], +) + +cc_library( + name = "module_moe_ck2stages", + srcs = [ + "aiter/jit/libmodule_moe_ck2stages.so" + ], + hdrs = ["csrc/include/moe_ck.h"], + deps = [":cpp_libraries"], + copts = [], + linkopts = [], + strip_include_prefix = "csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm","local"], +) \ No newline at end of file diff --git a/3rdparty/aiter/aiter-flash_attn.patch b/3rdparty/aiter/aiter-flash_attn.patch new file mode 100644 index 000000000..ea752e61b --- /dev/null +++ b/3rdparty/aiter/aiter-flash_attn.patch @@ -0,0 +1,13 @@ +diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py +index 0892939..b84e620 100644 +--- a/aiter/ops/mha.py ++++ b/aiter/ops/mha.py +@@ -1014,7 +1014,7 @@ def _flash_attn_forward( + ret = ret and (not swa) + ret = ret and (q.dtype == dtypes.bf16) 
+ ret = ret and ((return_lse and gfx == "gfx950") or (gfx == "gfx942")) +- return ret ++ return + + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + if can_impl_fmha_v3_fwd(): \ No newline at end of file diff --git a/3rdparty/aiter/aiter-fmha.patch b/3rdparty/aiter/aiter-fmha.patch new file mode 100644 index 000000000..51c150634 --- /dev/null +++ b/3rdparty/aiter/aiter-fmha.patch @@ -0,0 +1,22 @@ +--- aiter/jit/optCompilerConfig.json ++++ aiter/jit/optCompilerConfig.json +@@ -619,6 +619,7 @@ + "verbose": "False", + "hip_clang_path": "os.environ.get('MHA_HIP_CLANG_PATH')", + "blob_gen_cmd": [ ++ "f'{get_asm_dir()}/fmha_v3_fwd/codegen.py --output_dir {{}}'", + "f'{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd --receipt 600 --output_dir {{}}'", + "f'{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 3 --output_dir {{}}'" + ] + +--- csrc/cpp_itfs/mha_fwd_generate.py ++++ csrc/cpp_itfs/mha_fwd_generate.py +@@ -150,7 +150,7 @@ COMBINED_API = """t = fmha_fwd_v3(traits, args, stream_config); + API_MAP = { + 1: FMHA_FWD_API.format(F_inner_dispatch=V3_API), + 2: FMHA_FWD_API.format(F_inner_dispatch=V2_API), +- 3: FMHA_FWD_API.format(F_inner_dispatch=V2_API) + FMHA_FWD_SPLITKV_API, ++ 3: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API), + 4: FMHA_BATCH_PREFILL_API, + 5: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API) + + FMHA_FWD_SPLITKV_API \ No newline at end of file diff --git a/3rdparty/aiter/refine-aiter-asm-dir.patch b/3rdparty/aiter/refine-aiter-asm-dir.patch new file mode 100644 index 000000000..fd7ee0281 --- /dev/null +++ b/3rdparty/aiter/refine-aiter-asm-dir.patch @@ -0,0 +1,47 @@ +diff --git a/aiter/jit/core.py b/aiter/jit/core.py +index 712feea0..012db7e6 100644 +--- a/aiter/jit/core.py ++++ b/aiter/jit/core.py +@@ -62,35 +62,19 @@ this_dir = os.path.dirname(os.path.abspath(__file__)) + AITER_ROOT_DIR = os.path.abspath(f"{this_dir}/../../") + AITER_LOG_MORE = int(os.getenv("AITER_LOG_MORE", 0)) + +-find_aiter = importlib.util.find_spec("aiter") +-if find_aiter is not None: +- if find_aiter.submodule_search_locations: +- package_path = find_aiter.submodule_search_locations[0] +- elif find_aiter.origin: +- package_path = find_aiter.origin +- package_path = os.path.dirname(package_path) +- package_parent_path = os.path.dirname(package_path) +- import site +- +- site_packages_dirs = site.getsitepackages() +- # develop mode +- isDevelopMode = (package_path not in site_packages_dirs) and ( +- package_parent_path not in site_packages_dirs +- ) +- if isDevelopMode: +- AITER_META_DIR = AITER_ROOT_DIR +- # install mode +- else: +- AITER_META_DIR = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta/") ++meta_path = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta") ++if os.path.exists(meta_path): ++ AITER_META_DIR = meta_path + else: +- AITER_META_DIR = AITER_ROOT_DIR +- logger.warning("aiter is not installed.") ++ AITER_META_DIR = os.path.abspath(AITER_ROOT_DIR) ++ + sys.path.insert(0, AITER_META_DIR) + AITER_CSRC_DIR = f"{AITER_META_DIR}/csrc" + AITER_GRADLIB_DIR = f"{AITER_META_DIR}/gradlib" + gfx = get_gfx() + AITER_ASM_DIR = f"{AITER_META_DIR}/hsa/{gfx}/" +-os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR ++if "AITER_ASM_DIR" not in os.environ: ++ os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR + CK_3RDPARTY_DIR = os.environ.get( + "CK_DIR", f"{AITER_META_DIR}/3rdparty/composable_kernel" + ) diff --git a/3rdparty/aiter/rtp-llm.patch b/3rdparty/aiter/rtp-llm.patch index fba2a8268..5c9ff9b21 100644 --- a/3rdparty/aiter/rtp-llm.patch +++ b/3rdparty/aiter/rtp-llm.patch @@ -1,15 +1,11 @@ ---- 
aiter/jit/core.py -+++ aiter/jit/core.py -@@ -114,10 +114,12 @@ def get_user_jit_dir(): - shutil.copytree(this_dir, home_jit_dir) - return home_jit_dir - -+CK_DIR = os.environ.get("CK_DIR", f"{AITER_ROOT_DIR}/3rdparty/composable_kernel") - - bd_dir = f"{get_user_jit_dir()}/build" - # copy ck to build, thus hippify under bd_dir - if multiprocessing.current_process().name == "MainProcess": -+ shutil.copytree(CK_DIR, f'{bd_dir}/ck', dirs_exist_ok=True) - os.makedirs(bd_dir, exist_ok=True) - # if os.path.exists(f"{bd_dir}/ck/library"): - # shutil.rmtree(f"{bd_dir}/ck/library") +--- csrc/py_itfs_cu/asm_pa.cu ++++ csrc/py_itfs_cu/asm_pa.cu +@@ -97,7 +97,7 @@ torch::Tensor pa_fwd(torch::Tensor& Q, // [num_seqs, num_heads, head_size] + int num_heads = Q.size(1); + int head_size = Q.size(2); + int num_kv_heads = K.size(1); +- int block_size = K.size(3); ++ int block_size = K.size(2); + const int gqa_ratio = num_heads / num_kv_heads; + TORCH_CHECK(block_size == 16, __func__, " for now only support block_size == 16"); + diff --git a/3rdparty/aiter/silu.patch b/3rdparty/aiter/silu.patch new file mode 100644 index 000000000..b71d52646 --- /dev/null +++ b/3rdparty/aiter/silu.patch @@ -0,0 +1,151 @@ +--- csrc/kernels/activation_kernels.cu ++++ csrc/kernels/activation_kernels.cu +@@ -18,6 +18,7 @@ + #include + #include + ++#include + #include + + #include "hip_compat.h" +@@ -27,6 +28,88 @@ + + using fp8_type = ck_tile::fp8_t; + ++namespace { ++#define FLASHINFER_INLINE inline __attribute__((always_inline)) __device__ ++ ++template ++struct vec_t { ++ FLASHINFER_INLINE float_t& operator[](size_t i); ++ FLASHINFER_INLINE const float_t& operator[](size_t i) const; ++ FLASHINFER_INLINE void load(const float_t* ptr); ++ FLASHINFER_INLINE void store(float_t* ptr) const; ++ FLASHINFER_INLINE float_t* ptr(); ++}; ++ ++template ++struct vec_t { ++ static_assert(vec_size % 8 == 0, "Invalid vector size"); ++ int4 data[vec_size / 8]; ++ ++ FLASHINFER_INLINE c10::BFloat16& operator[](size_t i) { return ((c10::BFloat16*)data)[i]; } ++ FLASHINFER_INLINE const c10::BFloat16& operator[](size_t i) const { ++ return ((const c10::BFloat16*)data)[i]; ++ } ++ FLASHINFER_INLINE c10::BFloat16* ptr() { return reinterpret_cast(&data); } ++ FLASHINFER_INLINE void load(const c10::BFloat16* ptr) { ++#pragma unoll ++ for (size_t i = 0; i < vec_size / 8; ++i) { ++ data[i] = ((int4*)ptr)[i]; ++ } ++ } ++ FLASHINFER_INLINE void store(c10::BFloat16* ptr) const { ++#pragma unoll ++ for (size_t i = 0; i < vec_size / 8; ++i) { ++ ((int4*)ptr)[i] = data[i]; ++ } ++ } ++}; ++ ++ ++template ++struct vec_t { ++ static_assert(vec_size % 8 == 0, "Invalid vector size"); ++ int4 data[vec_size / 8]; ++ FLASHINFER_INLINE c10::Half& operator[](size_t i) { return ((c10::Half*)data)[i]; } ++ FLASHINFER_INLINE const c10::Half& operator[](size_t i) const { return ((const c10::Half*)data)[i]; } ++ FLASHINFER_INLINE c10::Half* ptr() { return reinterpret_cast(&data); } ++ FLASHINFER_INLINE void load(const c10::Half* ptr) { ++#pragma unroll ++ for (size_t i = 0; i < vec_size / 8; ++i) { ++ data[i] = ((int4*)ptr)[i]; ++ } ++ } ++ FLASHINFER_INLINE void store(c10::Half* ptr) const { ++#pragma unroll ++ for (size_t i = 0; i < vec_size / 8; ++i) { ++ ((int4*)ptr)[i] = data[i]; ++ } ++ } ++}; ++ ++template ++struct vec_t { ++ static_assert(vec_size % 4 == 0, "Invalid vector size"); ++ float4 data[vec_size / 4]; ++ ++ FLASHINFER_INLINE float& operator[](size_t i) { return ((float*)(data))[i]; } ++ FLASHINFER_INLINE const float& operator[](size_t i) const { return 
((const float*)(data))[i]; } ++ FLASHINFER_INLINE float* ptr() { return reinterpret_cast(&data); } ++ FLASHINFER_INLINE void load(const float* ptr) { ++#pragma unroll ++ for (size_t i = 0; i < vec_size / 4; ++i) { ++ data[i] = ((float4*)ptr)[i]; ++ } ++ } ++ FLASHINFER_INLINE void store(float* ptr) const { ++#pragma unroll ++ for (size_t i = 0; i < vec_size / 4; ++i) { ++ ((float4*)ptr)[i] = data[i]; ++ } ++ } ++}; ++ ++} ++ + namespace vllm + { + +@@ -37,12 +120,25 @@ + const scalar_t *__restrict__ input, // [..., 2, d] + const int d) + { ++ constexpr uint32_t vec_size = 16 / sizeof(scalar_t); + const int64_t token_idx = blockIdx.x; +- for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) +- { +- const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); +- const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); +- out[token_idx * d + idx] = ACT_FN(x) * y; ++ const int64_t thread_idx = threadIdx.x; ++ const int64_t stride = blockDim.x; ++ const int64_t offset = token_idx * 2 * d; ++ const scalar_t* x_ptr = input + offset; ++ const scalar_t* y_ptr = x_ptr + d; ++ const int64_t iters = d / vec_size; ++ out += token_idx * d; ++ ++ for (uint32_t idx = thread_idx; idx < iters; idx += stride) { ++ vec_t x_vec, y_vec, out_vec; ++ x_vec.load(x_ptr + idx * vec_size); ++ y_vec.load(y_ptr + idx * vec_size); ++ #pragma unroll ++ for (uint32_t i = 0; i < vec_size; ++i) { ++ out_vec[i] = ACT_FN(x_vec[i]) * y_vec[i]; ++ } ++ out_vec.store(out + idx * vec_size); + } + } + +@@ -105,6 +201,7 @@ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ ++ assert(d % 8 == 0); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel", [&] { vllm::act_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + +--- aiter/jit/optCompilerConfig.json ++++ aiter/jit/optCompilerConfig.json +@@ -17,7 +17,7 @@ + "f'{AITER_CSRC_DIR}/kernels/activation_kernels.cu'" + ], + "flags_extra_cc": [], +- "flags_extra_hip": [], ++ "flags_extra_hip": ["'-ffast-math'"], + "extra_ldflags": "None", + "extra_include": [ + "f'{AITER_CSRC_DIR}/include/ck_tile'" + diff --git a/3rdparty/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/3rdparty/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl index 165471ed4..8c305a816 100755 --- a/3rdparty/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +++ b/3rdparty/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl @@ -143,8 +143,8 @@ def InvokeHipcc(argv, log=False): defines = ''.join([' -D' + define for define in defines]) undefines = GetOptionValue(argv, 'U') undefines = ''.join([' -U' + define for define in undefines]) - std_options = GetOptionValue(argv, 'std') - hipcc_allowed_std_options = ["c++11", "c++14", "c++17"] + std_options = GetOptionValue(argv, 'std') + ["c++20"] + hipcc_allowed_std_options = ["c++11", "c++14", "c++17", "c++20"] std_options = ''.join([' -std=' + define for define in std_options if define in hipcc_allowed_std_options]) diff --git a/BUILD.aiter b/BUILD.aiter index e94f2b91c..2d93c71e5 100644 --- a/BUILD.aiter +++ b/BUILD.aiter @@ -1,3 +1,8 @@ +load( + "@local_config_rocm//rocm:build_defs.bzl", + "rocm_default_copts", +) + cc_library( name = "aiter_so", srcs = glob([ @@ -7,6 +12,15 @@ cc_library( tags = ["rocm"], ) +cc_library( + name = "aiter_meta", + data = glob([ + "aiter_meta/**", + ]), + visibility = ["//visibility:public"], + tags = ["rocm"], +) + cc_library( name = "aiter_headers", hdrs = 
glob([ @@ -17,6 +31,87 @@ cc_library( tags = ["rocm"], ) +genrule( + name = "config_h", + srcs = [ + "aiter_meta/3rdparty/composable_kernel/include/ck/config.h.in", + ], + outs = [ + "aiter_meta/3rdparty/composable_kernel/include/ck/config.h", + ], + cmd = """ + awk '{gsub(/^#cmakedefine DTYPES \"@DTYPES@\"/, "/* #undef DTYPES*/"); + gsub(/^#cmakedefine CK_ENABLE_ALL_DTYPES @CK_ENABLE_ALL_DTYPES@/, "#define CK_ENABLE_ALL_DTYPES ON"); + gsub(/^#cmakedefine CK_ENABLE_INT8 @CK_ENABLE_INT8@/, "/* #undef CK_ENABLE_INT8*/"); + gsub(/^#cmakedefine CK_ENABLE_FP8 @CK_ENABLE_FP8@/, "/* #undef CK_ENABLE_FP8*/"); + gsub(/^#cmakedefine CK_ENABLE_BF8 @CK_ENABLE_BF8@/, "/* #undef CK_ENABLE_BF8*/"); + gsub(/^#cmakedefine CK_ENABLE_FP16 @CK_ENABLE_FP16@/, "/* #undef CK_ENABLE_FP16*/"); + gsub(/^#cmakedefine CK_ENABLE_BF16 @CK_ENABLE_BF16@/, "/* #undef CK_ENABLE_BF16*/"); + gsub(/^#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@/, "/* #undef CK_ENABLE_FP32*/"); + gsub(/^#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@/, "/* #undef CK_ENABLE_FP64*/"); + gsub(/^#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@/, "/* #undef CK_ENABLE_DL_KERNELS*/"); + gsub(/^#cmakedefine CK_ENABLE_DPP_KERNELS @CK_ENABLE_DPP_KERNELS@/, "/* #undef CK_ENABLE_DPP_KERNELS*/"); + gsub(/^#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@/, "/* #undef CK_ENABLE_INSTANCES_ONLY*/"); + gsub(/^#cmakedefine CK_USE_XDL @CK_USE_XDL@/, "#define CK_USE_XDL ON"); + gsub(/^#cmakedefine CK_USE_WMMA @CK_USE_WMMA@/, "/* #undef CK_USE_WMMA*/"); + gsub(/^#cmakedefine CK_USE_GFX94 @CK_USE_GFX94@/, "/* #undef CK_USE_GFX94*/"); + gsub(/^#cmakedefine CK_USE_OCP_FP8 @CK_USE_OCP_FP8@/, "/* #undef CK_USE_OCP_FP8*/"); + gsub(/^#cmakedefine CK_USE_FNUZ_FP8 @CK_USE_FNUZ_FP8@/, "/* #undef CK_USE_FNUZ_FP8*/"); + gsub(/^#cmakedefine CK_USE_FP8_ON_UNSUPPORTED_ARCH @CK_USE_FP8_ON_UNSUPPORTED_ARCH@/, "/* #undef CK_USE_FP8_ON_UNSUPPORTED_ARCH*/"); + gsub(/^#cmakedefine CK_USE_NATIVE_MX_SUPPORT @CK_USE_NATIVE_MX_SUPPORT@/, "/* #undef CK_USE_NATIVE_MX_SUPPORT*/"); + gsub(/^#cmakedefine CK_USE_WMMA @CK_USE_WMMA@/, "/* #undef CK_USE_WMMA*/"); + gsub(/^#cmakedefine/, "//cmakedefine");print;}' $(<) > $(@) + """, +) + +cc_library( + name = "ck_headers_real", + hdrs = glob([ + "aiter_meta/3rdparty/composable_kernel/include/**/*.h", + "aiter_meta/3rdparty/composable_kernel/include/**/*.inc", + "aiter_meta/3rdparty/composable_kernel/include/**/*.hpp", + ]) + ["aiter_meta/3rdparty/composable_kernel/include/ck/config.h"], + copts = rocm_default_copts() + ["-std=c++20"], + strip_include_prefix = "aiter_meta/3rdparty/composable_kernel/include/", + visibility = ["//visibility:public"], + deps = [ + "@local_config_rocm//rocm:rocm_headers", + ":config_h" + ], + tags = ["rocm"], +) + +cc_library( + name = "ck_library_headers", + srcs = glob(["aiter_meta/3rdparty/composable_kernel/library/src/utility/**/*.cpp"]), + hdrs = glob([ + "aiter_meta/3rdparty/composable_kernel/library/include/**/*.h", + "aiter_meta/3rdparty/composable_kernel/library/include/**/*.inc", + "aiter_meta/3rdparty/composable_kernel/library/include/**/*.hpp", + ]), + strip_include_prefix = "aiter_meta/3rdparty/composable_kernel/library/include/", + copts = rocm_default_copts() + ["-std=c++20"], + deps = [ + ":ck_headers_real", + ], + tags = ["rocm"], +) + +cc_library( + name = "ck_fmha_example_headers", + hdrs = glob([ + "aiter_meta/3rdparty/composable_kernel/example/ck_tile/01_fmha/*.hpp", + ]), + copts = rocm_default_copts() + ["-std=c++20"], + deps = [ + ":ck_headers_real", + ":ck_library_headers", + 
], + strip_include_prefix = "aiter_meta/3rdparty/composable_kernel/example/ck_tile/01_fmha", + visibility = ["//visibility:public"], + tags = ["rocm"], +) + cc_library( name = "module_custom_all_reduce", srcs = ["aiter/jit/module_custom_all_reduce.so"], @@ -60,29 +155,30 @@ cc_library( ) cc_library( - name = "module_gemm_a8w8_blockscale", - srcs = ["aiter/jit/module_gemm_a8w8_blockscale.so"], - hdrs = ["aiter_meta/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h"], + name = "module_smoothquant", + srcs = ["aiter/jit/module_smoothquant.so"], + hdrs = ["aiter_meta/csrc/include/smoothquant.h"], deps = [ ":aiter_so", ":aiter_headers", ], copts = [], - strip_include_prefix = "aiter_meta/csrc/ck_gemm_a8w8_blockscale/include/", + linkopts = [], + strip_include_prefix = "aiter_meta/csrc/include/", visibility = ["//visibility:public"], tags = ["rocm"], ) cc_library( - name = "module_moe", - srcs = ["aiter/jit/module_moe.so"], - hdrs = ["aiter_meta/csrc/include/moe_ck.h"], + name = "module_gemm_a8w8_blockscale", + srcs = ["aiter/jit/module_gemm_a8w8_blockscale.so"], + hdrs = ["aiter_meta/csrc/ck_gemm_a8w8_blockscale/include/gemm_a8w8_blockscale.h"], deps = [ ":aiter_so", ":aiter_headers", ], copts = [], - strip_include_prefix = "aiter_meta/csrc/include/", + strip_include_prefix = "aiter_meta/csrc/ck_gemm_a8w8_blockscale/include/", visibility = ["//visibility:public"], tags = ["rocm"], ) @@ -131,8 +227,14 @@ cc_library( cc_library( name = "module_pa", - srcs = ["aiter/jit/module_pa.so"], - hdrs = ["aiter_meta/csrc/include/attention.h"], + srcs = [ + "aiter/jit/module_pa.so", + "aiter/jit/module_attention_asm.so" + ], + hdrs = [ + "aiter_meta/csrc/include/attention.h", + "aiter_meta/csrc/include/attention_asm.h" + ], deps = [ ":aiter_so", ":aiter_headers", @@ -155,4 +257,65 @@ cc_library( strip_include_prefix = "aiter_meta/csrc/include/", visibility = ["//visibility:public"], tags = ["rocm"], +) + +cc_library( + name = "module_mha_fwd", + srcs = ["aiter/jit/module_mha_fwd.so"], + hdrs = ["aiter_meta/csrc/include/mha_fwd.h", "aiter_meta/csrc/include/aiter_hip_common.h"], + deps = [ + ":aiter_so", + ":aiter_headers", + ":ck_fmha_example_headers", + ], + copts = [], + linkopts = [], + strip_include_prefix = "aiter_meta/csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm"], +) + +cc_library( + name = "module_rmsnorm", + srcs = ["aiter/jit/module_rmsnorm.so"], + hdrs = ["aiter_meta/csrc/include/rmsnorm.h"], + deps = [ + ":aiter_so", + ":aiter_headers", + ], + copts = [], + linkopts = [], + strip_include_prefix = "aiter_meta/csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm"], +) + +cc_library( + name = "module_norm", + srcs = ["aiter/jit/module_norm.so"], + hdrs = ["aiter_meta/csrc/include/norm.h"], + deps = [ + ":aiter_so", + ":aiter_headers", + ], + copts = [], + linkopts = [], + strip_include_prefix = "aiter_meta/csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm"], +) + +cc_library( + name = "module_moe_ck2stages", + srcs = ["aiter/jit/module_moe_ck2stages.so"], + hdrs = ["aiter_meta/csrc/include/moe_ck.h"], + deps = [ + ":aiter_so", + ":aiter_headers", + ], + copts = [], + linkopts = [], + strip_include_prefix = "aiter_meta/csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm"], ) \ No newline at end of file diff --git a/WORKSPACE b/WORKSPACE index ce91a45b3..8ff5f9cac 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -42,4 +42,10 @@ load("@pip_gpu_rocm_torch//:requirements.bzl", pip_gpu_rocm_torch_install_deps = 
pip_gpu_rocm_torch_install_deps() load("//:def.bzl", "read_release_version") -read_release_version(name = "release_version") \ No newline at end of file +read_release_version(name = "release_version") +load("//:patched_repo.bzl", "patched_pip_repository") + +# Create the patched version of the repository +patched_pip_repository( + name = "patched_aiter" +) diff --git a/patched_repo.bzl b/patched_repo.bzl new file mode 100644 index 000000000..d01eabecc --- /dev/null +++ b/patched_repo.bzl @@ -0,0 +1,27 @@ +def _patched_pip_repository_impl(repository_ctx): + # Get the path of the original pip repository: note that we use a Label to locate the original repository's BUILD file and then take its directory + original_path = repository_ctx.path(Label("@pip_gpu_rocm_torch_aiter//:BUILD.bazel")).dirname + + original_path_str = str(original_path) + + # Copy the contents of the original repository into the root of the current repository + repository_ctx.execute([ + "cp", "-r", original_path_str + "/.", repository_ctx.path("") + ]) + + patch_path = repository_ctx.path(Label("//3rdparty/aiter:refine-aiter-asm-dir.patch")) + patch_path_aiter = repository_ctx.path(Label("//3rdparty/aiter:aiter-flash_attn.patch")) + patch_path_str = str(patch_path) + + result = repository_ctx.execute([ + "sh", "-c", + "cd site-packages && patch -p1 -i " + patch_path_str + " && patch -p1 -i " + str(patch_path_aiter) + ]) + + if result.return_code != 0: + fail("Patch failed: %s" % result.stderr) + +patched_pip_repository = repository_rule( + implementation = _patched_pip_repository_impl, + attrs = {}, +) \ No newline at end of file diff --git a/rtp_llm/BUILD b/rtp_llm/BUILD index 6da22a002..ec8aaf97f 100755 --- a/rtp_llm/BUILD +++ b/rtp_llm/BUILD @@ -434,7 +434,7 @@ py_library( ":async_model", ":embedding", "//rtp_llm/tools/convert:convert", - ":aiter", + "@patched_aiter//:pkg", ], "//conditions:default": [ ":models", diff --git a/rtp_llm/config/gpt_init_model_parameters.py b/rtp_llm/config/gpt_init_model_parameters.py index 642e8aae3..7e9a59bf7 100644 --- a/rtp_llm/config/gpt_init_model_parameters.py +++ b/rtp_llm/config/gpt_init_model_parameters.py @@ -647,12 +647,17 @@ def update_gpt_init_params_from_env( rocm_hipblaslt_config=get_env_str( "ROCM_HIPBLASLT_CONFIG", "gemm_config.csv" ), + use_swizzleA = ( + get_env_bool("USE_SWIZZLEA", False) + and get_env_str("MODEL_TYPE", "") in ("qwen_2", "qwen_3") + ), ft_disable_custom_ar=get_env_bool("FT_DISABLE_CUSTOM_AR", True), enable_cuda_graph=get_env_bool("ENABLE_CUDA_GRAPH", False), enable_cuda_graph_debug_mode=get_env_bool( "ENABLE_CUDA_GRAPH_DEBUG_MODE", False ), use_aiter_pa=get_env_bool("USE_AITER_PA", True), + use_asm_pa=get_env_bool("USE_ASM_PA", True), enable_native_cuda_graph=get_env_bool("ENABLE_NATIVE_CUDA_GRAPH", False), num_native_cuda_graph=get_env_int("NUM_NATIVE_CUDA_GRAPH", 200), ) @@ -819,14 +824,14 @@ def update_inter_padding_size(self, tp_size: int, ep_size: int, dp_size: int): inter_size + ( get_pad_size(inter_size, align_size) - if self.quant_algo.isQuant() + if (self.quant_algo.isQuant() or self.gpt_init_params.hw_kernel_config.use_swizzleA) else 0 ) ) self.layer_inter_padding_size = layer_inter_padding_size self.inter_padding_size = self.inter_size + ( get_pad_size(self.inter_size, align_size) - if self.quant_algo.isQuant() + if (self.quant_algo.isQuant() or self.gpt_init_params.hw_kernel_config.use_swizzleA) else 0 ) if self.head_num_kv <= 0: @@ -1144,6 +1149,9 @@ def update_common( logging.info(f"pre_allocate_op_mem: {self.pre_allocate_op_mem}") logging.info(f"tp_split_emb_and_lm_head: {self.tp_split_emb_and_lm_head}") + if os.environ.get("ROCM_KV_CACHE_DATATYPE", "") == "fp8": + self.kv_cache_data_type = WEIGHT_TYPE.FP8.to_str() + # use
environment variables to update stop_words_str and stop_words_id env_stop_words_str = self.py_env_configs.generate_env_config.stop_words_str env_stop_words_id = self.py_env_configs.generate_env_config.stop_words_list diff --git a/rtp_llm/config/py_config_modules.py b/rtp_llm/config/py_config_modules.py index 55d0a0463..449ce65d6 100644 --- a/rtp_llm/config/py_config_modules.py +++ b/rtp_llm/config/py_config_modules.py @@ -757,6 +757,7 @@ def __init__(self): self.enable_cuda_graph: bool = False self.enable_cuda_graph_debug_mode: bool = False self.use_aiter_pa: bool = True + self.use_asm_pa: bool = True self.enable_native_cuda_graph: bool = False self.num_native_cuda_graph: int = 200 @@ -782,6 +783,7 @@ def update_from_env(self): "ENABLE_CUDA_GRAPH_DEBUG_MODE", self.enable_cuda_graph_debug_mode ) self.use_aiter_pa = get_env_bool("USE_AITER_PA", self.use_aiter_pa) + self.use_asm_pa = get_env_bool("USE_ASM_PA", self.use_asm_pa) self.enable_native_cuda_graph = get_env_bool( "ENABLE_NATIVE_CUDA_GRAPH", self.enable_native_cuda_graph ) @@ -800,6 +802,7 @@ def to_string(self): f"enable_cuda_graph: {self.enable_cuda_graph}\n" f"enable_cuda_graph_debug_mode: {self.enable_cuda_graph_debug_mode}\n" f"use_aiter_pa: {self.use_aiter_pa}\n" + f"use_asm_pa: {self.use_asm_pa}\n" f"enable_native_cuda_graph: {self.enable_native_cuda_graph}\n" f"num_native_cuda_graph: {self.num_native_cuda_graph}" ) diff --git a/rtp_llm/cpp/cache/BUILD b/rtp_llm/cpp/cache/BUILD index 8ffc6a751..29f82b640 100644 --- a/rtp_llm/cpp/cache/BUILD +++ b/rtp_llm/cpp/cache/BUILD @@ -54,6 +54,7 @@ cc_library( ], "//conditions:default": [], }), + copts = copts(), visibility = ["//visibility:public"], ) diff --git a/rtp_llm/cpp/cache/KVCacheAllocator.cc b/rtp_llm/cpp/cache/KVCacheAllocator.cc index 98390b3e3..00a0c5075 100644 --- a/rtp_llm/cpp/cache/KVCacheAllocator.cc +++ b/rtp_llm/cpp/cache/KVCacheAllocator.cc @@ -5,6 +5,9 @@ #include "rtp_llm/cpp/disaggregate/cache_store/NormalCacheStore.h" #include "rtp_llm/cpp/core/Buffer.h" #include "rtp_llm/cpp/core/Types.h" +#if USING_ROCM +#include +#endif #ifdef ENABLE_FP8 #include #endif @@ -87,7 +90,7 @@ void KVCacheAllocator::initKVCacheScale() { (int8_t*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2 + kv_cache_.k_scale->sizeBytes()); } -#ifdef ENABLE_FP8 +#if defined(ENABLE_FP8) || defined(USING_ROCM) else if (config_.dtype == rtp_llm::DataType::TYPE_FP8_E4M3) { kv_cache_.k_scale = std::make_unique(rtp_llm::MemoryType::MEMORY_GPU, @@ -96,7 +99,11 @@ void KVCacheAllocator::initKVCacheScale() { (size_t)config_.block_nums, (size_t)config_.local_head_num_kv, (size_t)config_.seq_size_per_block}, +#ifdef USING_ROCM + (__hip_fp8_e4m3_fnuz*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2); +#else (__nv_fp8_e4m3*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2); +#endif kv_cache_.v_scale = std::make_unique( rtp_llm::MemoryType::MEMORY_GPU, rtp_llm::DataType::TYPE_FP32, @@ -104,7 +111,13 @@ void KVCacheAllocator::initKVCacheScale() { (size_t)config_.block_nums, (size_t)config_.local_head_num_kv, (size_t)config_.seq_size_per_block}, +#ifdef USING_ROCM + (__hip_fp8_e4m3_fnuz*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2 + kv_cache_.k_scale->sizeBytes()); +#else (__nv_fp8_e4m3*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2 + kv_cache_.k_scale->sizeBytes()); +#endif + Buffer2torchTensor(kv_cache_.k_scale, false).fill_(1.0); + Buffer2torchTensor(kv_cache_.v_scale, false).fill_(1.0); } #endif } diff --git a/rtp_llm/cpp/config/ConfigModules.cc 
b/rtp_llm/cpp/config/ConfigModules.cc index e8f3b01f1..4449f19bb 100644 --- a/rtp_llm/cpp/config/ConfigModules.cc +++ b/rtp_llm/cpp/config/ConfigModules.cc @@ -174,9 +174,11 @@ void HWKernelConfig::update_from_env_for_test() { enable_multi_block_mode = bool_from_env_for_test("ENABLE_MULTI_BLOCK_MODE", true); ft_disable_custom_ar = bool_from_env_for_test("FT_DISABLE_CUSTOM_AR", true); rocm_hipblaslt_config = autil::EnvUtil::getEnv("ROCM_HIPBLASLT_CONFIG", "gemm_config.csv"); + use_swizzleA = bool_from_env_for_test("USE_SWIZZLEA", false); enable_cuda_graph = bool_from_env_for_test("ENABLE_CUDA_GRAPH", false); enable_cuda_graph_debug_mode = bool_from_env_for_test("ENABLE_CUDA_GRAPH_DEBUG_MODE", false); use_aiter_pa = bool_from_env_for_test("USE_AITER_PA", true); + use_asm_pa = bool_from_env_for_test("USE_ASM_PA", true); enable_native_cuda_graph = bool_from_env_for_test("ENABLE_NATIVE_CUDA_GRAPH", false); num_native_cuda_graph = autil::EnvUtil::getEnv("NUM_NATIVE_CUDA_GRAPH", 200); } @@ -189,9 +191,11 @@ std::string HWKernelConfig::to_string() const { << "enable_multi_block_mode: " << enable_multi_block_mode << "\n" << "ft_disable_custom_ar: " << ft_disable_custom_ar << "\n" << "rocm_hipblaslt_config: " << rocm_hipblaslt_config << "\n" + << "use_swizzleA: " << use_swizzleA << "\n" << "enable_cuda_graph: " << enable_cuda_graph << "\n" << "enable_cuda_graph_debug_mode" << enable_cuda_graph_debug_mode << "\n" - << "use_aiter_pa" << use_aiter_pa << "\n" + << "use_aiter_pa: " << use_aiter_pa << "\n" + << "use_asm_pa: " << use_asm_pa << "\n" << "enable_native_cuda_graph" << enable_native_cuda_graph << "\n" << "num_native_cuda_graph" << num_native_cuda_graph << "\n"; return oss.str(); diff --git a/rtp_llm/cpp/config/ConfigModules.h b/rtp_llm/cpp/config/ConfigModules.h index 2deba6c07..b42dd42dd 100644 --- a/rtp_llm/cpp/config/ConfigModules.h +++ b/rtp_llm/cpp/config/ConfigModules.h @@ -87,9 +87,11 @@ struct HWKernelConfig { bool enable_multi_block_mode = true; bool ft_disable_custom_ar = true; std::string rocm_hipblaslt_config = "gemm_config.csv"; + bool use_swizzleA = false; bool enable_cuda_graph = false; bool enable_cuda_graph_debug_mode = false; bool use_aiter_pa = true; + bool use_asm_pa = true; bool enable_native_cuda_graph = false; int num_native_cuda_graph = 200; std::string to_string() const; diff --git a/rtp_llm/cpp/devices/DeviceBase.h b/rtp_llm/cpp/devices/DeviceBase.h index 12e83a79e..e0a0baa86 100644 --- a/rtp_llm/cpp/devices/DeviceBase.h +++ b/rtp_llm/cpp/devices/DeviceBase.h @@ -160,6 +160,10 @@ class DeviceBase: public DeviceOps { return native_graph_capturing_; } + virtual BufferPtr getRotaryEmbeddingCoefficientCache(const RopeConfig & rope_config) { + throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); + } + public: // device-independence op implementations void batchCopy(const BatchCopyParams& params) override; diff --git a/rtp_llm/cpp/devices/DeviceData.h b/rtp_llm/cpp/devices/DeviceData.h index 5e604f9e5..6b5c56d4f 100644 --- a/rtp_llm/cpp/devices/DeviceData.h +++ b/rtp_llm/cpp/devices/DeviceData.h @@ -71,6 +71,7 @@ struct DeviceInitParams { bool use_deepep_moe = false; int user_deep_gemm_num_sm = -1; bool use_aiter_pa = true; + bool use_asm_pa = true; bool use_deepep_internode = false; bool use_deepep_low_latency = false; bool is_mtp = false; diff --git a/rtp_llm/cpp/devices/DeviceFactory.cc b/rtp_llm/cpp/devices/DeviceFactory.cc index 004b4c09e..efe4f194a 100644 --- a/rtp_llm/cpp/devices/DeviceFactory.cc +++ b/rtp_llm/cpp/devices/DeviceFactory.cc @@ -125,6 +125,7 @@ void 
DeviceFactory::initDevices(const GptInitParameter& params) { device_params.enable_layer_micro_batch); device_params.user_deep_gemm_num_sm = params.hw_kernel_config.deep_gemm_num_sm; device_params.use_aiter_pa = params.hw_kernel_config.use_aiter_pa; + device_params.use_asm_pa = params.hw_kernel_config.use_asm_pa; device_params.use_deepep_moe = params.moe_config.use_deepep_moe; device_params.use_deepep_internode = params.moe_config.use_deepep_internode; device_params.use_deepep_low_latency = params.moe_config.use_deepep_low_latency; diff --git a/rtp_llm/cpp/devices/OpData.h b/rtp_llm/cpp/devices/OpData.h index f10b04112..cbcf0aa7a 100644 --- a/rtp_llm/cpp/devices/OpData.h +++ b/rtp_llm/cpp/devices/OpData.h @@ -614,6 +614,7 @@ struct AttentionModuleParams { const AttentionConfigs& configs; const QScheme qscheme; const DataType compute_type = DataType::TYPE_INVALID; + const BufferPtr rotary_embedding_coefficient_cache = nullptr; }; struct MlaRotaryWriteKVCacheParams { @@ -698,6 +699,7 @@ struct AttentionLayerParams { const DataType compute_type; bool enable_sp; size_t pad_token_num; + const BufferPtr rotary_embedding_coefficient_cache = nullptr; }; struct MoeConfigs { diff --git a/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc b/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc index edc1c6a39..838fa9c28 100644 --- a/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc +++ b/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc @@ -149,7 +149,8 @@ BufferPtr DeviceBase::attentionAttn(const AttentionLayerParams& params) { params.weights, params.configs, params.qscheme, - params.compute_type}); + params.compute_type, + params.rotary_embedding_coefficient_cache}); } if (context_batch_size) { auto context_qkv = qkv.view(generate_batch_size, context_token_num); @@ -165,7 +166,8 @@ BufferPtr DeviceBase::attentionAttn(const AttentionLayerParams& params) { params.weights, params.configs, params.qscheme, - params.compute_type}); + params.compute_type, + params.rotary_embedding_coefficient_cache}); } if (layer_kv_cache) { params.common.kv_cache->kv_cache_block_id = kv_cache_block_id; @@ -259,7 +261,8 @@ AttentionLayerOutput DeviceBase::attentionLayer(const AttentionLayerParams& para params.qscheme, params.compute_type, params.enable_sp, - params.pad_token_num}); + params.pad_token_num, + params.rotary_embedding_coefficient_cache}); return {attentionOutGemm({params.layer_id, *attn_out, params.output, diff --git a/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp b/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp index 510aa836d..3061b19cc 100644 --- a/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp +++ b/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp @@ -7,6 +7,7 @@ #ifdef USING_ROCM #include "rtp_llm/cpp/devices/rocm_impl/aiterPA.h" +#include "rtp_llm/cpp/config/StaticConfig.h" #endif #ifdef USING_CUDA12 @@ -94,8 +95,11 @@ struct AttentionImpl: torch::nn::Module { attention_mask = torch::zeros({batch_size, 1, seq_len, kv_seq_len}); } auto scores = torch::softmax((attn_weights / sqrtf(head_dim * 1.0f) + *attention_mask), -1); - +#ifdef USING_ROCM + auto output = torch::matmul(scores.to(torch::kFloat32), v.to(torch::kFloat32)); +#else auto output = torch::matmul(scores, v); +#endif auto transpose_output = output.transpose(1, 2); return {q, k, v, attn_weights, scores, output, transpose_output}; } @@ -164,8 +168,11 @@ void AttentionOpTest::contextAttentionOpTest(size_t batch_size, attention.ptr()->to(torch::Device(torch::kCPU)); auto state_dict = attention.ptr()->named_parameters(); torch::NoGradGuard no_grad; 
- +#ifdef USING_ROCM + auto tensor_options = torch::TensorOptions(torch::kBFloat16).device(torch::Device(torch::kCPU)); +#else auto tensor_options = torch::TensorOptions(torch::kFloat).device(torch::Device(torch::kCPU)); +#endif auto int_tensor_options = torch::TensorOptions(torch::kInt).device(torch::Device(torch::kCPU)); auto query_states_host = @@ -204,9 +211,26 @@ void AttentionOpTest::contextAttentionOpTest(size_t batch_size, auto cu_seqlens_device = createDeviceBuffer(cu_seqlens_host); auto attention_mask_device = createDeviceBuffer(attention_mask_host); auto scale_device = createDeviceBuffer(scale_host); - auto rope_config = RopeConfig({RopeStyle::No, (int)head_dim, 10000, 1, 2048, 1, 1}); +#ifdef USING_ROCM + auto rope_config = RopeConfig({rtp_llm::RopeStyle::Base, 128, 1000000}); + size_t tokensPerBlock = 16; + int block_num = batch_size * ((seq_len + tokensPerBlock - 1) / tokensPerBlock + 1); + rtp_llm::CacheConfig cache_conf(rtp_llm::KVCacheParam({1, (uint)block_num, (uint)num_key_value_heads, (uint)head_dim, (uint)tokensPerBlock, DataType::TYPE_BF16})); + auto kv_cache_block_id = device_->allocateBuffer({ + rtp_llm::DataType::TYPE_INT32, {batch_size, block_num / batch_size}, rtp_llm::AllocationType::HOST + }); + + cache_manager_ = std::make_shared(cache_conf, device_);; + auto kv_cache_buffer = cache_manager_->kvCacheBuffer(); + auto layer_k_cache_buffer = kv_cache_buffer.k_blocks->index(0); + auto layer_v_cache_buffer = kv_cache_buffer.v_blocks->index(0); + auto common_inputs = AttentionCommonInputs({input_lengths, sequence_lengths}); + common_inputs.kv_cache = KvCacheInfo({(int)kv_cache_buffer.k_blocks->shape()[0], kv_cache_block_id, layer_k_cache_buffer, layer_v_cache_buffer}); +#else + auto rope_config = RopeConfig({RopeStyle::No, (int)head_dim, 10000, 1, 2048, 1, 1}); auto common_inputs = AttentionCommonInputs({input_lengths, sequence_lengths}); +#endif common_inputs.cu_seqlens = move(cu_seqlens_device); common_inputs.cu_kv_seqlens = common_inputs.cu_seqlens; common_inputs.padding_offset = move(padding_offset_device); @@ -220,7 +244,11 @@ void AttentionOpTest::contextAttentionOpTest(size_t batch_size, auto buffer_nullptr = BufferPtr(nullptr); auto attention_weight = AttentionLayerWeights(); +#ifdef USING_ROCM + attention_weight.qkv_weight = make_shared(DenseWeights(buffer_nullptr)); +#else attention_weight.qkv_weight = make_shared(DenseWeights(buffer_nullptr, bias_device)); +#endif attention_weight.static_scale_reciprocal_weight = make_shared(DenseWeights(scale_device)); @@ -229,13 +257,17 @@ void AttentionOpTest::contextAttentionOpTest(size_t batch_size, auto output_data_type = qscheme == QScheme::Qfp8PerTensor ? 
DataType::TYPE_FP8_E4M3 : qkv_input_device->type(); auto qkv_output = device_->allocateBuffer({output_data_type, {batch_size, seq_len, num_heads, head_dim}}); +#ifdef USING_ROCM + device_->contextAttention( + {0, *qkv_input_device, *qkv_output, common_inputs, attention_weight, attention_config, qscheme, DataType::TYPE_INVALID, ((ROCmDevice*)device_)->getRotaryEmbeddingCoefficientCache(rope_config)}); + auto result_ref = attention->forward(query_states_host, key_states_host, value_states_host, attention_mask_host, std::nullopt, std::nullopt, true, rope_config.base, rope_config.dim); +#else device_->contextAttention( {0, *qkv_input_device, *qkv_output, common_inputs, attention_weight, attention_config, qscheme}); - auto result_ref = attention->forward(query_states_host, key_states_host, value_states_host, attention_mask_host); - +#endif auto result = bufferToTensor(*qkv_output); - assertTensorClose(result_ref[6], result.to(result_ref[6].dtype())); + assertTensorClose(result_ref[6], result.to(result_ref[6].dtype()), 1e-2, 1e-2); } void AttentionOpTest::selfAttentionOpTest(size_t batch_size, @@ -248,9 +280,12 @@ void AttentionOpTest::selfAttentionOpTest(size_t batch_size, attention.ptr()->to(torch::Device(torch::kCPU)); auto state_dict = attention.ptr()->named_parameters(); torch::NoGradGuard no_grad; - +#ifdef USING_ROCM + auto tensor_options = torch::TensorOptions(torch::kBFloat16).device(torch::Device(torch::kCPU)); +#else auto tensor_options = torch::TensorOptions(torch::kFloat).device(torch::Device(torch::kCPU)); auto half_tensor_options = torch::TensorOptions(torch::kHalf).device(torch::Device(torch::kCPU)); +#endif auto int_tensor_options = torch::TensorOptions(torch::kInt).device(torch::Device(torch::kCPU)); auto query_states_host = @@ -290,7 +325,12 @@ void AttentionOpTest::selfAttentionOpTest(size_t batch_size, padding_kv_seq_len = (kv_seq_len == 0) ? 2 * tokensPerBlock : padding_kv_seq_len; auto kvcache_pad = torch::zeros({1, (int)batch_size, 2, (int)padding_kv_seq_len, (int)num_key_value_heads * (int)head_dim}, - half_tensor_options); +#ifdef USING_ROCM + tensor_options +#else + half_tensor_options +#endif +); auto k_cache_host = kvcache_pad @@ -318,8 +358,11 @@ void AttentionOpTest::selfAttentionOpTest(size_t batch_size, auto qkv_states_device = createDeviceBuffer(qkv_states_host); auto sequence_lengths_device = createDeviceBuffer(sequence_lengths_host); auto input_lengths_device = createDeviceBuffer(input_lengths_host); - +#ifdef USING_ROCM + auto rope_config = RopeConfig({rtp_llm::RopeStyle::Base, 128, 1000000}); +#else auto rope_config = RopeConfig({RopeStyle::No, (int)head_dim, 10000, 1, 2048, 1, 1}); +#endif // cache manager need one block for preserve and every seq need one block for preserve. 
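// Editor's note (not part of the patch): with RopeStyle::Base the ROCm path passes
// ROCmDevice::getRotaryEmbeddingCoefficientCache(rope_config) into the attention call
// below; as added later in this patch, that helper allocates an fp32 device buffer of
// shape {max_seq_len, dim / 2, 2} and fills it via invokeRotaryEmbeddingCoefficientCache,
// i.e. a precomputed table of per-position rotation coefficients (presumably cos/sin
// pairs) that the fused QKV bias/transpose kernels consume instead of recomputing them.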
#ifdef USING_ROCM @@ -367,12 +410,17 @@ void AttentionOpTest::selfAttentionOpTest(size_t batch_size, #endif auto qkv_output = device_->allocateBuffer({qkv_states_device->type(), {token_num, num_heads, head_dim}}); +#ifdef USING_ROCM + device_->decoderSelfAttention( + {0, *qkv_states_device, *qkv_output, common_inputs, attention_weight, attention_config, QScheme::NoQuantize, DataType::TYPE_INVALID, ((ROCmDevice*)device_)->getRotaryEmbeddingCoefficientCache(rope_config)}); + auto result_ref = attention->forward( + query_states_host, key_states_host, value_states_host, attention_mask_host, k_cache_host, v_cache_host, true, rope_config.base, rope_config.dim); +#else device_->decoderSelfAttention( {0, *qkv_states_device, *qkv_output, common_inputs, attention_weight, attention_config}); - auto result_ref = attention->forward( query_states_host, key_states_host, value_states_host, attention_mask_host, k_cache_host, v_cache_host); - +#endif auto result = bufferToTensor(*qkv_output); assertTensorClose(result_ref[6].to(result.dtype()), result, 1e-2, 1e-2); } @@ -412,7 +460,7 @@ void AttentionOpTest::aiterPageAttentionOpTest(size_t batch_size, size_t tokens_per_block = 16; size_t padding_kv_seq_len = ((kv_seq_len + tokens_per_block - 1) / tokens_per_block + 1) * tokens_per_block; padding_kv_seq_len = (kv_seq_len == 0) ? 2 * tokens_per_block : padding_kv_seq_len; - auto kvcache_pad = torch::rand( // hangy: [1, 2, 2, 256, 4 * 128] + auto kvcache_pad = torch::rand( {1, (int)batch_size, 2, (int)padding_kv_seq_len, (int)num_key_value_heads * (int)head_dim}, bf16_tensor_options); auto kvcache_pad_fp8 = @@ -427,7 +475,7 @@ void AttentionOpTest::aiterPageAttentionOpTest(size_t batch_size, .clone(); auto v_cache_host = kvcache_pad - .index( // hangy: [batch_size, num_key_value_heads, kv_seq_len, head_dim] + .index( {0, torch::indexing::Slice(), 1, torch::indexing::Slice(0, kv_seq_len), torch::indexing::Slice()}) .reshape({(int)batch_size, (int)kv_seq_len, (int)num_key_value_heads, (int)head_dim}) .contiguous() @@ -459,7 +507,7 @@ void AttentionOpTest::aiterPageAttentionOpTest(size_t batch_size, {1, (uint)block_num, (uint)num_key_value_heads, (uint)head_dim, (uint)tokens_per_block, DataType::TYPE_BF16})); cache_manager_ = nullptr; auto kv_cache_block_id = allocateKVBlocks( - cache_conf, input_lengths, kvcache_pad); // hangy: copy kv cache content from kvcache_pad_fp8 to RTP local KV + cache_conf, input_lengths, kvcache_pad); // cache, kv_cache_block_id = [batch_size, xxx] auto kv_cache_buffer = cache_manager_->kvCacheBuffer(); auto common_inputs = AttentionCommonInputs({input_lengths_device, sequence_lengths_device}); @@ -489,7 +537,7 @@ void AttentionOpTest::aiterPageAttentionOpTest(size_t batch_size, {"kv_cache_page_List"}); KVBlockArray kv_block_array = device->getKVBlockArray(params, *kv_cache_page_List, batch_size, false); - runAiterPA(params, device, *qkv_states_device); + runAiterAsmPA(params, device, *qkv_states_device); device->syncAndCheck(); auto q_host_fp32 = query_states_host.to(tensor_options); diff --git a/rtp_llm/cpp/devices/base_tests/GemmOpTest.hpp b/rtp_llm/cpp/devices/base_tests/GemmOpTest.hpp index 00ec5d11f..b52577bc0 100644 --- a/rtp_llm/cpp/devices/base_tests/GemmOpTest.hpp +++ b/rtp_llm/cpp/devices/base_tests/GemmOpTest.hpp @@ -1,5 +1,9 @@ #pragma once #include "rtp_llm/cpp/devices/testing/TestBase.h" +#if USING_ROCM +#include "rtp_llm/cpp/rocm/datatype_interface.h" +#include "rtp_llm/cpp/rocm/TensorDataManipulation.h" +#endif #include using namespace rtp_llm; @@ -49,6 +53,24 @@ class 
GemmOpTest: public DeviceTestBase { auto A = tensorToBuffer(input.A); auto B = tensorToBuffer(input.B); auto D = device_->allocateBuffer({A->type(), {A->shape()[0], B->shape()[1]}}); + // Gemm B in device_->gemm(params); is A +#if USING_ROCM + auto B_ptr = B->data(); + auto testSwizzle = bool(autil::EnvUtil::getEnv("TEST_SWIZZLEA", 0L)); + if (testSwizzle){ + std::vector Ashape = A->shape(); + std::vector Bshape = B->shape(); + size_t dim = A->dim(); + size_t m = Ashape[dim - 2]; + size_t k = Ashape[dim - 1]; + size_t n = Bshape[dim - 1]; + std::vector src(n * k, hip_bfloat16{}); + std::vector dst(n * k, hip_bfloat16{}); + hipMemcpy(src.data(), B_ptr, n * k * sizeof(hip_bfloat16), hipMemcpyDeviceToHost); + swizzleTensor(dst.data(), src.data(), n, k, true); + hipMemcpy(const_cast(B_ptr), dst.data(), n * k * sizeof(hip_bfloat16), hipMemcpyHostToDevice); + } +#endif GemmParams params{*A, *B, std::nullopt, D}; device_->gemm(params); return GemmOpTestOutput({bufferToTensor(*D)}); @@ -159,7 +181,7 @@ class GemmOpTest: public DeviceTestBase { auto input = PrepareGemmOpInput(m, n, k, dtype); auto result = BasicGemmOpRun(input); auto result_ref = BasicGemmTorchRefRun(input); - assertTensorClose(result.C.to(result_ref.C.type()), result_ref.C); + assertTensorClose(result.C.to(result_ref.C.type()), result_ref.C, 1e-2, 1e-2); } void TransposeGemmOpTest( @@ -244,4 +266,53 @@ class GemmOpTest: public DeviceTestBase { auto result_ref = GemmOpTestOutput({torch::matmul(input.A.to(torch::kFloat), input.B.t().to(torch::kFloat))}); assertTensorClose(result.C.to(result_ref.C.type()), result_ref.C, 1e-2, 1e-2); } -}; \ No newline at end of file + +#if USING_ROCM + void calculateKforSwizzling(hipDataType datatype, size_t& MiK, size_t& MiKv, size_t& PackK) + { + switch(datatype) + { + case HIP_R_32F: + MiK = 4; + MiKv = 1; + break; + case HIP_R_16F: + case HIP_R_16BF: + MiK = 16; + MiKv = 4; + break; + case HIP_R_8F_E4M3_FNUZ: + case HIP_R_8F_E5M2_FNUZ: + MiK = 32; + MiKv = 8; + break; + default: + std::cerr << "unsupported datatype in calculateKforSwizzling" << '\n'; + } + + PackK = 16 / MiKv / realDataTypeSize(datatype); + } + + template + void swizzleTensor(T* dst, const T* src, size_t m, size_t k, bool colMaj) + { + using Tensor = Tensor::Manipulation::Tensor; + size_t MiM = 16; + size_t MiK = 0, MiKv = 0, PackK = 0; + calculateKforSwizzling(hipblaslt_type2datatype(), MiK, MiKv, PackK); + auto tmpTensor = Tensor::create({m, k}); + memcpy(tmpTensor.template as(), src, m * k * sizeof(T)); + + if(colMaj) + { + auto orgTensor = Tensor::create({k, m}); + memcpy(orgTensor.template as(), src, m * k * sizeof(T)); + tmpTensor = permute(orgTensor, {1, 0}); + } + + tmpTensor.reshape({m / MiM, MiM, k / (MiK * PackK), MiK / MiKv, MiKv * PackK}); + Tensor permuted = permute(tmpTensor, {0, 2, 3, 1, 4}); + memcpy(dst, permuted.template as(), m * k * sizeof(T)); + } +#endif +}; diff --git a/rtp_llm/cpp/devices/rocm_impl/BUILD b/rtp_llm/cpp/devices/rocm_impl/BUILD index 9b1a33f6b..0a9cf17b4 100644 --- a/rtp_llm/cpp/devices/rocm_impl/BUILD +++ b/rtp_llm/cpp/devices/rocm_impl/BUILD @@ -15,7 +15,6 @@ cc_library( "@local_config_rocm//rocm:rocblas", "@local_config_rocm//rocm:hipblaslt", "@local_config_rocm//rocm:rccl", - "@composable_kernel//:ck_fmha_example", "//rtp_llm/cpp/devices:devices_base", "//rtp_llm/cpp/devices:devices_base_impl", "//rtp_llm/cpp/core:allocator", @@ -31,24 +30,32 @@ cc_library( "@//:using_aiter_src": [ "@aiter_src//:module_aiter_enum", "@aiter_src//:module_quant", + "@aiter_src//:module_smoothquant", 
"@aiter_src//:module_gemm_a8w8_blockscale", "@aiter_src//:module_gemm_a8w8_bpreshuffle", "@aiter_src//:module_moe_sorting", "@aiter_src//:module_moe_asm", "@aiter_src//:module_pa", - "@aiter_src//:module_moe", "@aiter_src//:module_activation", + "@aiter_src//:module_rmsnorm", + "@aiter_src//:module_norm", + "@aiter_src//:module_mha_fwd", + "@aiter_src//:module_moe_ck2stages", ], "//conditions:default": [ "@aiter//:module_aiter_enum", "@aiter//:module_quant", + "@aiter//:module_smoothquant", "@aiter//:module_gemm_a8w8_blockscale", "@aiter//:module_gemm_a8w8_bpreshuffle", "@aiter//:module_moe_sorting", "@aiter//:module_moe_asm", "@aiter//:module_pa", - "@aiter//:module_moe", "@aiter//:module_activation", + "@aiter//:module_rmsnorm", + "@aiter//:module_norm", + "@aiter//:module_mha_fwd", + "@aiter//:module_moe_ck2stages", ], }), visibility = ["//visibility:public"], diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc index 38fecf8b2..ee66de135 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc @@ -504,12 +504,12 @@ KVBlockArray ROCmDevice::getKVBlockArray(const AttentionModuleParams& params, kv_cache_buffer.mScaleBytesPerBlock = k_scale[0].sizeBytes(); } KvCacheDataType cache_type = KvCacheDataType::BASE; -#ifdef ENABLE_FP8 +#if defined(ENABLE_FP8) if (use_fp8_fmha_) { cache_type = KvCacheDataType::FP8; } else #endif - if (use_fp8_fmha) { + if (use_fp8_fmha) { cache_type = KvCacheDataType::FP8; } else if (kv_cache->k_scale_buffer && params.configs.kv_cache_dtype == KvCacheDataType::INT8) { RTP_LLM_LOG_DEBUG("now use kv_cache int8"); @@ -602,9 +602,7 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& kv_cache_block_id = allocateBuffer({DataType::TYPE_INT32, {batch_size, 1, 2, max_blocks_per_batch}, AllocationType::DEVICE}, {"kv_cache_block_id"}); - - kv_block_array = getKVBlockArray(params, *kv_cache_block_id, batch_size, false); - + kv_block_array = getKVBlockArray(params, *kv_cache_block_id, batch_size, params.common.kv_cache->k_cache_buffer->type() == DataType::TYPE_FP8_E4M3); prefix_prompt_param.kv_block_array = kv_block_array; if (params.common.prefix_prompt_lengths) { @@ -625,7 +623,8 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& if (prefix_prompt_param.max_prefix_prompt_length > 0) { if (init_params_.use_aiter_pa) { - DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, + if (init_params_.use_asm_pa) { + DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, invokeLoadPrefixKVCacheAiter, q_output->data(), k_output->data(), @@ -639,6 +638,9 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& scale_out_ptr, int8_mode, stream_); + } else { + RUNTIME_ASSERT_OP_ARG(init_params_.use_asm_pa, "Should use asm_pa"); + } } else { DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, invokeLoadPrefixKVCache, @@ -668,7 +670,8 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& RTP_LLM_LOG_DEBUG("skip_add_bias_transpose: %d", skip_add_bias_transpose); if (!skip_add_bias_transpose) { if (init_params_.use_aiter_pa) { - DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, + if (init_params_.use_asm_pa) { + DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, invokeAddFusedQKVBiasTransposePrefill, q_output->data(), k_output->data(), @@ -700,7 +703,11 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& store_q, store_kv, store_cache, + 
params.rotary_embedding_coefficient_cache ? params.rotary_embedding_coefficient_cache->data() : nullptr, stream_); + } else { + RUNTIME_ASSERT_OP_ARG(init_params_.use_asm_pa, "Should use asm_pa"); + } check_cuda_error(); } else { DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, @@ -1029,13 +1036,20 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara {DataType::TYPE_INT32, {batch_size, 1, 2, max_blocks_per_batch}, AllocationType::DEVICE}, {"kv_cache_offset"}); if (init_params_.use_aiter_pa) { - KVBlockArray kv_block_array = getKVBlockArray(params, *kv_cache_offset, batch_size, false, true); PrefixPromptBatchWeightsParam prefix_prompt_param; - auto offset_kv_block_array = OffsetIndexedKVBlockArray( - kv_block_array, - (rtp_llm::KVBlockArrayForContextFMHA::DataType*)params.common.kv_cache->kv_cache_block_id->data(), - params.common.kv_cache->k_cache_buffer->shape()[0] * params.common.kv_cache->layer_num); - prefix_prompt_param.offset_kv_block_array = offset_kv_block_array; + if (init_params_.use_asm_pa) { + KVBlockArray kv_block_array = getKVBlockArray(params, *kv_cache_offset, batch_size, params.common.kv_cache->k_cache_buffer->type() == DataType::TYPE_FP8_E4M3, false); + prefix_prompt_param.kv_block_array = kv_block_array; + } + else { + KVBlockArray kv_block_array = getKVBlockArray(params, *kv_cache_offset, batch_size, params.common.kv_cache->k_cache_buffer->type() == DataType::TYPE_FP8_E4M3, true); + auto offset_kv_block_array = OffsetIndexedKVBlockArray( + kv_block_array, + (rtp_llm::KVBlockArrayForContextFMHA::DataType*)params.common.kv_cache->kv_cache_block_id->data(), + params.common.kv_cache->k_cache_buffer->shape()[0] * params.common.kv_cache->layer_num); + prefix_prompt_param.kv_block_array = kv_block_array; + prefix_prompt_param.offset_kv_block_array = offset_kv_block_array; + } auto token_num = params.input.shape()[0]; auto decoder_batch_size = params.common.decoder_batch_size; @@ -1057,7 +1071,8 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara && !params.configs.fuse_qkv_add_bias); printBufferData(*params.common.input_lengths, "input_lengths"); if (!skip_add_bias_transpose) { - DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, + if (init_params_.use_asm_pa) { + DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, invokeAddFusedQKVBiasTransposeDecode, q_output->data(), nullptr, @@ -1089,14 +1104,23 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara store_q, store_kv, store_cache, + params.rotary_embedding_coefficient_cache ? 
params.rotary_embedding_coefficient_cache->data() : nullptr, stream_); + } else { + RUNTIME_ASSERT_OP_ARG(init_params_.use_asm_pa, "Should use asm_pa"); + } check_cuda_error(); DEBUG_PRINT_PARAMS(params, this, "decode_writeKVCache", q_output); - runAiterPA(params, this, *q_output); + if (init_params_.use_asm_pa) { + runAiterAsmPA(params, this, *q_output); + } + else { + runAiterPA(params, this, *q_output); + } check_cuda_error(); } } else { - KVBlockArray kv_block_array = getKVBlockArray(params, *kv_cache_offset, batch_size, false); + KVBlockArray kv_block_array = getKVBlockArray(params, *kv_cache_offset, batch_size, params.common.kv_cache->k_cache_buffer->type() == DataType::TYPE_FP8_E4M3); DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, selfAttentionwrapper, @@ -1114,4 +1138,12 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara } } +BufferPtr ROCmDevice::getRotaryEmbeddingCoefficientCache(const RopeConfig & rope_config) { + size_t max_seq_len = 1048576; + auto rotary_embedding_coefficient_cache = allocateBuffer({rtp_llm::DataType::TYPE_FP32, {max_seq_len, (size_t)rope_config.dim / 2, 2}, rtp_llm::AllocationType::DEVICE}); + invokeRotaryEmbeddingCoefficientCache((float2 *)rotary_embedding_coefficient_cache->data(), max_seq_len, rope_config, stream_); + syncAndCheck(); + return rotary_embedding_coefficient_cache; +} + } // namespace rtp_llm diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc index c4382b9c4..aa8118f38 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc @@ -123,7 +123,7 @@ ROCmDevice::ROCmDevice(const DeviceInitParams& params): DeviceBase(params) { hipblas_mm_wrapper_->setStream(stream_); fmha_runner_.reset(new rocmFmhaWrapper()); fmha_runner_->init(stream_); - moe_runner_.reset(new rocmMoeWrapper()); + //moe_runner_.reset(new rocmMoeWrapper()); ck_gemm_runner_.reset(new rocmCKGemmWrapper()); // select mla type diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h b/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h index 1111eb675..69fcfbeb9 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h @@ -18,7 +18,7 @@ #include "rtp_llm/cpp/rocm/hipblasMMWrapper.h" #include "rtp_llm/cpp/rocm/rocmFmhaWrapper.h" #include "rtp_llm/cpp/rocm/quantizePreprocessors.h" -#include "rtp_llm/cpp/rocm/rocmMoeWrapper.h" +//#include "rtp_llm/cpp/rocm/rocmMoeWrapper.h" #include "rtp_llm/cpp/rocm/rocmCKGemmWrapper.h" #include "rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h" #include "rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h" @@ -230,6 +230,7 @@ class ROCmDevice: public DeviceBase { int batch_size, bool use_fp8_fmha, bool use_offset_array = false); + BufferPtr getRotaryEmbeddingCoefficientCache(const RopeConfig & rope_config) override; std::shared_ptr getNativeGraphRunner() override { return std::make_shared>(this); @@ -242,6 +243,7 @@ class ROCmDevice: public DeviceBase { protected: void InvokeROCmDeepGemm(const GemmParams& params, BufferPtr output); void InvokeROCmPTPCGemm(const GemmParams& params, BufferPtr output); + void HipblasltPTPCGemm(const GemmParams& params, BufferPtr output); // void prepareCommBuffer(const PrepareCommBufferParams& params) override; public: @@ -303,7 +305,7 @@ class ROCmDevice: public DeviceBase { NcclParam& nccl_param); NcclParam getNcclParam(ParallelMode mode); // moe - std::unique_ptr moe_runner_; + //std::unique_ptr moe_runner_; // for custom allreduce use std::unique_ptr 
custom_allreduce_comm_ = nullptr; diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmFfnLayer.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmFfnLayer.cc index f5e97cb47..1bd1e60f7 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmFfnLayer.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmFfnLayer.cc @@ -13,12 +13,6 @@ #include "moe_sorting.h" #include "moe_ck.h" -// #include "aiter_meta/csrc/include/aiter_enum.h" -// #include "aiter_meta/csrc/include/moe_op.h" -// #include "aiter_meta/csrc/include/quant.h" -// #include "aiter_meta/csrc/include/moe_sorting.h" -// #include "aiter_meta/csrc/include/moe_ck.h" - using namespace std; namespace rtp_llm { @@ -332,6 +326,7 @@ FfnLayerOutput ROCmDevice::moeFfn(const FfnLayerParams& params, const MoeGateSel const size_t num_token = hidden.shape()[0]; const size_t model_dim = hidden.shape()[1]; + const int inter_dim = static_cast(params.weights.moe_down_weight->kernel->shape()[2]); const size_t num_expert = moe_conf.expert_num; const size_t num_expert_per_rank = moe_conf.expert_num / moe_conf.ep_size; const size_t topk = moe_conf.top_k; @@ -350,96 +345,131 @@ FfnLayerOutput ROCmDevice::moeFfn(const FfnLayerParams& params, const MoeGateSel torch::Tensor topk_ids_tensor = Buffer2torchTensor(*(gate_outputs.expert_ids), false); torch::Tensor topk_weights_tensor = Buffer2torchTensor(*(gate_outputs.expert_scales), false); - // FIXME(liyangcheng.lyc): Is this division correct? I refer to it from - // vLLM(https://github.com/vllm-project/vllm/blob/5ebf66748b8b67731972c389d879ca69c68dc2c4/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py#L23) - if (params.qscheme == QScheme::Qfp8PerTokenBlock) { - RTP_LLM_CHECK_WITH_INFO(dtype == DataType::TYPE_BF16, - "input hidden datatype should be bf16 when using Qfp8PerTokenBlock"); - // fp8 w8a8 block scaled moe - const int block_scale_n = 128; - const int block_scale_k = 128; - const int unit_size = 32; // used in moe_sorting, meaning? - - torch::Tensor hidden_quant_tensor, hidden_quant_scale_tensor; - BufferPtr hidden_quant, hidden_quant_scale; - QBufferPtr q_hidden; - if (params.input.isQBuffer()) { - const QBuffer& qhidden = reinterpret_cast(hidden); - hidden_quant_tensor = Buffer2torchTensor(qhidden.kernel(), false).view({(int)num_token, (int)model_dim}); - hidden_quant_scale_tensor = Buffer2torchTensor(qhidden.scales(), false).t().contiguous(); - } else { - q_hidden = std::dynamic_pointer_cast( - quantize(QuantizeParams(hidden, DataType::TYPE_QFP8_E4M3, 1, QScheme::Qfp8PerTokenBlock, 128, 0))); - hidden_quant_tensor = Buffer2torchTensor(q_hidden->kernelPtr(), false); - hidden_quant_scale_tensor = Buffer2torchTensor(q_hidden->scalesPtr(), false).t().contiguous(); - } + // get input + torch::Tensor hidden_tensor; + std::optional hidden_scale_tensor; + QBufferPtr q_hidden; + if (params.qscheme == QScheme::NoQuantize) { + hidden_tensor = Buffer2torchTensor(hidden, false); + } else if (params.input.isQBuffer()) { + const QBuffer& qhidden = reinterpret_cast(hidden); + hidden_tensor = Buffer2torchTensor(qhidden.kernel(), false).view({(int)num_token, (int)model_dim}); + hidden_scale_tensor = Buffer2torchTensor(qhidden.scales(), false); + } else { + // ignore groupSize when using per_token quantization + q_hidden = std::dynamic_pointer_cast( + quantize(QuantizeParams(hidden, DataType::TYPE_QFP8_E4M3, 1, params.qscheme, 128, 0))); + hidden_tensor = Buffer2torchTensor(q_hidden->kernel(), false); + hidden_scale_tensor = Buffer2torchTensor(q_hidden->scales(), false); + } - // step 2. 
prepare w1 and w2 + // get w1 and w2 + torch::Tensor w1_tensor, w2_tensor; + std::optional w1_scale_tensor, w2_scale_tensor; + if (params.qscheme == QScheme::NoQuantize) { + w1_tensor = Buffer2torchTensor(*(params.weights.moe_gate_weight->kernel), false); + w2_tensor = Buffer2torchTensor(*(params.weights.moe_down_weight->kernel), false); + } else { const QBuffer& qmoe_gate_weight = reinterpret_cast(*(params.weights.moe_gate_weight->kernel)); - Buffer w1 = qmoe_gate_weight.kernel(); - Buffer w1_scale = qmoe_gate_weight.scales(); const QBuffer& qmoe_down_weight = reinterpret_cast(*(params.weights.moe_down_weight->kernel)); - Buffer w2 = qmoe_down_weight.kernel(); - Buffer w2_scale = qmoe_down_weight.scales(); - - torch::Tensor w1_tensor = Buffer2torchTensor(w1, false); - torch::Tensor w1_scale_tensor = Buffer2torchTensor(w1_scale, false); - torch::Tensor w2_tensor = Buffer2torchTensor(w2, false); - torch::Tensor w2_scale_tensor = Buffer2torchTensor(w2_scale, false); - - w1_scale_tensor = w1_scale_tensor.view({(int)num_expert_per_rank, -1}); - w2_scale_tensor = w2_scale_tensor.view({(int)num_expert_per_rank, -1}); - - // step 3. moe sorting - const int max_num_token_padded = topk_ids_tensor.numel() + num_expert * unit_size - topk; - const int max_num_m_block = (max_num_token_padded + unit_size - 1) / unit_size; - - BufferPtr sorted_ids = - allocateBuffer({DataType::TYPE_INT32, {(size_t)max_num_token_padded}}, {"rocm_moe_sorted_ids"}); - BufferPtr sorted_weights = - allocateBuffer({DataType::TYPE_FP32, {(size_t)max_num_token_padded}}, {"rocm_moe_sorted_weights"}); - BufferPtr sorted_expert_ids = - allocateBuffer({DataType::TYPE_INT32, {(size_t)max_num_m_block}}, {"rocm_moe_sorted_expert_ids"}); - BufferPtr num_valid_ids = allocateBuffer({DataType::TYPE_INT32, {1}}, {"rocm_moe_num_valid_ids"}); - BufferPtr local_expert_mask = - allocateBuffer({DataType::TYPE_INT32, {(size_t)num_expert}}, {"rocm_moe_local_expert_mask"}); - torch::Tensor local_expert_mask_tensor = Buffer2torchTensor(*local_expert_mask, false); - - local_expert_mask_tensor.zero_(); - if (init_params_.use_deepep_moe) { - // deepep has already offset the topk_ids and set the masked expert to num_expert_per_rank - local_expert_mask_tensor.index_put_({torch::indexing::Slice(0, num_expert_per_rank)}, - torch::ones(num_expert_per_rank, torch::device(torch::kCUDA))); - } else { - local_expert_mask_tensor.index_put_({torch::indexing::Slice(moe_conf.ep_rank * num_expert_per_rank, - (moe_conf.ep_rank + 1) * num_expert_per_rank)}, - torch::ones(num_expert_per_rank, torch::device(torch::kCUDA))); - } + w1_tensor = Buffer2torchTensor(qmoe_gate_weight.kernel(), false); + w1_scale_tensor = Buffer2torchTensor(qmoe_gate_weight.scales(), false); + w2_tensor = Buffer2torchTensor(qmoe_down_weight.kernel(), false); + w2_scale_tensor = Buffer2torchTensor(qmoe_down_weight.scales(), false); + } - torch::Tensor sorted_ids_tensor = Buffer2torchTensor(*sorted_ids, false); - torch::Tensor sorted_weights_tensor = Buffer2torchTensor(*sorted_weights, false); - torch::Tensor sorted_expert_ids_tensor = Buffer2torchTensor(*sorted_expert_ids, false); - torch::Tensor num_valid_ids_tensor = Buffer2torchTensor(*num_valid_ids, false); + // get expert mask when ep_size > 1 + BufferPtr local_expert_mask = + allocateBuffer({DataType::TYPE_INT32, {(size_t)num_expert}}, {"rocm_moe_local_expert_mask"}); + torch::Tensor local_expert_mask_tensor = Buffer2torchTensor(*local_expert_mask, false); + local_expert_mask_tensor.zero_(); + if (init_params_.use_deepep_moe) { + // deepep 
has already offset the topk_ids and set the masked expert to num_expert_per_rank + local_expert_mask_tensor.index_put_({torch::indexing::Slice(0, num_expert_per_rank)}, + torch::ones(num_expert_per_rank, torch::device(torch::kCUDA))); + } else { + local_expert_mask_tensor.index_put_({torch::indexing::Slice(moe_conf.ep_rank * num_expert_per_rank, + (moe_conf.ep_rank + 1) * num_expert_per_rank)}, + torch::ones(num_expert_per_rank, torch::device(torch::kCUDA))); + } - torch::Tensor moe_out_tensor = Buffer2torchTensor(*moe_out_final, false); + // moe sorting + int unit_size = 32; + if (params.qscheme != QScheme::Qfp8PerTokenBlock) { + // get block_size_m, i.e. unit_size + const int cu_num = rocmDevProp.multiProcessorCount; + const int tile_n = 128; + const int tg_n = (inter_dim + tile_n - 1) / tile_n; + int min_rnd = std::numeric_limits::max(); + int min_empty = std::numeric_limits::max(); + const std::vector unit_size_support_list = {32, 64, 128}; + + for (const int& el : unit_size_support_list) { + int max_num_tokens = num_token * topk + num_expert_per_rank * el - topk; + int tg_num = tg_n * (max_num_tokens + el - 1) / el; + int rnd = (tg_num + cu_num - 1) / cu_num; + int empty = cu_num - tg_num % cu_num; + + if (rnd < min_rnd) { + min_rnd = rnd; + min_empty = empty; + unit_size = el; + } else if (rnd == min_rnd) { + if (empty < min_empty) { + min_empty = empty; + unit_size = el; + } + } + } + } - // invoke aiter moe_sorting kernel - moe_sorting_fwd( - /*topk_ids=*/topk_ids_tensor, - /*topk_weights=*/topk_weights_tensor, - /*sorted_token_ids=*/sorted_ids_tensor, - /*sorted_weights=*/sorted_weights_tensor, - /*sorted_expert_ids=*/sorted_expert_ids_tensor, - /*num_valid_ids=*/num_valid_ids_tensor, - /*moe_buf=*/moe_out_tensor, - /*num_experts=*/num_expert, - /*unit_size=*/unit_size, - /*local_expert_mask=*/local_expert_mask_tensor); + const int max_num_token_padded = topk_ids_tensor.numel() + num_expert * unit_size - topk; + const int max_num_m_block = (max_num_token_padded + unit_size - 1) / unit_size; + + BufferPtr sorted_ids = + allocateBuffer({DataType::TYPE_INT32, {(size_t)max_num_token_padded}}, {"rocm_moe_sorted_ids"}); + BufferPtr sorted_weights = + allocateBuffer({DataType::TYPE_FP32, {(size_t)max_num_token_padded}}, {"rocm_moe_sorted_weights"}); + BufferPtr sorted_expert_ids = + allocateBuffer({DataType::TYPE_INT32, {(size_t)max_num_m_block}}, {"rocm_moe_sorted_expert_ids"}); + BufferPtr num_valid_ids = allocateBuffer({DataType::TYPE_INT32, {1}}, {"rocm_moe_num_valid_ids"}); + + torch::Tensor sorted_ids_tensor = Buffer2torchTensor(*sorted_ids, false); + torch::Tensor sorted_weights_tensor = Buffer2torchTensor(*sorted_weights, false); + torch::Tensor sorted_expert_ids_tensor = Buffer2torchTensor(*sorted_expert_ids, false); + torch::Tensor num_valid_ids_tensor = Buffer2torchTensor(*num_valid_ids, false); + + torch::Tensor moe_out_tensor = Buffer2torchTensor(*moe_out_final, false); + + // invoke aiter moe_sorting kernel + moe_sorting_fwd( + /*topk_ids=*/topk_ids_tensor, + /*topk_weights=*/topk_weights_tensor, + /*sorted_token_ids=*/sorted_ids_tensor, + /*sorted_weights=*/sorted_weights_tensor, + /*sorted_expert_ids=*/sorted_expert_ids_tensor, + /*num_valid_ids=*/num_valid_ids_tensor, + /*moe_buf=*/moe_out_tensor, + /*num_experts=*/num_expert, + /*unit_size=*/unit_size, + /*local_expert_mask=*/local_expert_mask_tensor, + /*num_local_tokens*/ std::nullopt, + /*dispatch_policy*/ 0); - // step 3.4 invoke fused_moe function + if (params.qscheme == QScheme::Qfp8PerTokenBlock) { + 
RTP_LLM_CHECK_WITH_INFO(dtype == DataType::TYPE_BF16, + "input hidden datatype should be bf16 when using Qfp8PerTokenBlock"); + const int block_scale_n = 128; + const int block_scale_k = 128; + hidden_scale_tensor = hidden_scale_tensor.value().t().contiguous(); + w1_scale_tensor = w1_scale_tensor.value().view({(int)num_expert_per_rank, -1}); + w2_scale_tensor = w2_scale_tensor.value().view({(int)num_expert_per_rank, -1}); + std::string fmoe_fp8_block_scale_g1u1_name = ""; + + // invoke aiter moe kernel fmoe_fp8_blockscale_g1u1( /*out=*/moe_out_tensor, - /*input=*/hidden_quant_tensor, + /*input=*/hidden_tensor, /*gate=*/w1_tensor, /*down=*/w2_tensor, /*sorted_token_ids=*/sorted_ids_tensor, @@ -447,60 +477,98 @@ FfnLayerOutput ROCmDevice::moeFfn(const FfnLayerParams& params, const MoeGateSel /*sorted_expert_ids=*/sorted_expert_ids_tensor, /*num_valid_ids=*/num_valid_ids_tensor, /*topk=*/topk, - /*input_scale=*/hidden_quant_scale_tensor, - /*fc1_scale=*/w1_scale_tensor, - /*fc2_scale=*/w2_scale_tensor, + /*input_scale=*/hidden_scale_tensor.value(), + /*fc1_scale=*/w1_scale_tensor.value(), + /*fc2_scale=*/w2_scale_tensor.value(), + /*kernel_name*/ fmoe_fp8_block_scale_g1u1_name, /*fc_scale_blkn=*/block_scale_n, /*fc_scale_blkk*/ block_scale_k, /*fc2_smooth_scale=*/nullopt, /*activation*/ ::ActivationType::Silu); - - printBufferData(*moe_out_final, "rocm_moe_out_final"); - - } else if (params.qscheme == QScheme::NoQuantize) { - const int unit_size = 32; - - torch::Tensor hidden_tensor = Buffer2torchTensor(hidden, false); - torch::Tensor w1_tensor = Buffer2torchTensor(*(params.weights.moe_gate_weight->kernel), false); - torch::Tensor w2_tensor = Buffer2torchTensor(*(params.weights.moe_down_weight->kernel), false); - - // step 1. prepare expert mask - BufferPtr local_expert_mask = - allocateBuffer({DataType::TYPE_INT32, {(size_t)num_expert}}, {"rocm_moe_local_expert_mask"}); - torch::Tensor local_expert_mask_tensor = Buffer2torchTensor(*local_expert_mask, false); - local_expert_mask_tensor.zero_(); - if (init_params_.use_deepep_moe) { - // deepep has already offset the topk_ids and set the masked expert to num_expert_per_rank - local_expert_mask_tensor.index_put_({torch::indexing::Slice(0, num_expert_per_rank)}, - torch::ones(num_expert_per_rank, torch::device(torch::kCUDA))); + } else { + BufferPtr a2 = allocateBuffer({dtype, {num_token, topk, (size_t)inter_dim}}, {"rocm_a2"}); + torch::Tensor a2_tensor = Buffer2torchTensor(*a2, false); + // FIXME(liyangcheng.lyc): workaround for two stage moe accuracy issue, see + // https://github.com/ROCm/aiter/issues/566 + a2_tensor.zero_(); + std::optional a2_scale_tensor; + QBufferPtr a2_q; + std::string ck_moe_stage1_kernel_name = ""; + std::string asm_moe_stage1_kernel_name = ""; + std::string ck_moe_stage2_kernel_name = ""; + + auto aiterQscheme = [qscheme = params.qscheme]() { + switch (qscheme) { + case QScheme::NoQuantize: + return ::QuantType::No; + case QScheme::Qfp8PerToken: + return ::QuantType::per_Token; + default: + RTP_LLM_FAIL("[ROCm moeFfn]: quant type %d not implemented yet", (int)qscheme); + } + }(); + + // invoke aiter two stage moe kernels + if (params.qscheme == QScheme::NoQuantize) { + ck_moe_stage1( + /*hidden_states*/ hidden_tensor, + /*w1*/ w1_tensor, + /*w2*/ w2_tensor, + /*sorted_token_ids*/ sorted_ids_tensor, + /*sorted_expert_ids*/ sorted_expert_ids_tensor, + /*num_valid_ids*/ num_valid_ids_tensor, + /*out*/ a2_tensor, + /*topk*/ topk, + /*kernelName*/ ck_moe_stage1_kernel_name, + /*w1_scale*/ w1_scale_tensor, + /*a1_scale*/ 
hidden_scale_tensor, + /*block_m*/ unit_size, + /*sorted_weights*/ std::nullopt, + /*quant_type*/ static_cast(aiterQscheme), + /*activation*/ static_cast(::ActivationType::Silu)); } else { - local_expert_mask_tensor.index_put_({torch::indexing::Slice(moe_conf.ep_rank * num_expert_per_rank, - (moe_conf.ep_rank + 1) * num_expert_per_rank)}, - torch::ones(num_expert_per_rank, torch::device(torch::kCUDA))); + moe_stage1_g1u1( + /*input*/ hidden_tensor, + /*w1*/ w1_tensor, + /*w2*/ w2_tensor, + /*sorted_token_ids*/ sorted_ids_tensor, + /*sorted_expert_ids*/ sorted_expert_ids_tensor, + /*num_valid_ids*/ num_valid_ids_tensor, + /*out*/ a2_tensor, + /*inter_dim*/ inter_dim, + /*kernelName*/ asm_moe_stage1_kernel_name, + /*block_m*/ unit_size, + /*ksplit*/ 0, + /*activation*/ ::ActivationType::Silu, + /*quant_type*/ aiterQscheme, + /*a1_scale*/ hidden_scale_tensor, + /*w1_scale*/ w1_scale_tensor, + /*sorted_weights*/ std::nullopt); } - // step 2. invoke ck_moe function - auto moe_out_tensor = ck_moe(hidden_tensor, - w1_tensor, - w2_tensor, - topk_weights_tensor, - topk_ids_tensor, - nullopt, - nullopt, - nullopt, - nullopt, - unit_size, - local_expert_mask_tensor); - - BufferPtr moe_out_tensor_buffer = torchTensor2Buffer(moe_out_tensor); - copy({*moe_out_final, *moe_out_tensor_buffer, false, DeviceStream::DEFAULT, false}); - return FfnLayerOutput{moe_out_final}; - } else if (params.qscheme == QScheme::Qfp8PerToken) { - RTP_LLM_FAIL("[ROCm moeFfn]: quant type %d not implemented yet", (int)params.qscheme); - } else { - RTP_LLM_FAIL("[ROCm moeFfn]: quant type %d not implemented yet", (int)params.qscheme); + if (params.qscheme != QScheme::NoQuantize) { + a2_q = std::dynamic_pointer_cast( + quantize(QuantizeParams(*a2, DataType::TYPE_QFP8_E4M3, 1, params.qscheme, 128, 0))); + a2_tensor = Buffer2torchTensor(a2_q->kernel(), false).view({(int)num_token, (int)topk, inter_dim}); + a2_scale_tensor = Buffer2torchTensor(a2_q->scales(), false); + } + ck_moe_stage2( + /*inter_states*/ a2_tensor, + /*w1*/ w1_tensor, + /*w2*/ w2_tensor, + /*sorted_token_ids*/ sorted_ids_tensor, + /*sorted_expert_ids*/ sorted_expert_ids_tensor, + /*num_valid_ids*/ num_valid_ids_tensor, + /*out*/ moe_out_tensor, + /*topk*/ topk, + /*kernelName*/ ck_moe_stage2_kernel_name, + /*w2_scale*/ w2_scale_tensor, + /*a2_scale*/ a2_scale_tensor, + /*block_m*/ unit_size, + /*sorted_weights*/ sorted_weights_tensor, + /*quant_type*/ static_cast(aiterQscheme), + /*activation*/ static_cast(::ActivationType::Silu)); } - return {moe_out_final}; } diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmGemmOp.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmGemmOp.cc index 7eeba1de6..4777bf48e 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmGemmOp.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmGemmOp.cc @@ -254,6 +254,33 @@ void ROCmDevice::InvokeROCmPTPCGemm(const GemmParams& params, BufferPtr output) gemm_a8w8_bpreshuffle(A_quant_tensor, W_kernel_tensor, A_quant_scale_tensor, W_scale_tensor, output_tensor); } +void ROCmDevice::HipblasltPTPCGemm(const GemmParams& params, BufferPtr output) { + RTP_LLM_LOG_DEBUG("use hipBLASLt ptpc gemm."); + ROCmGemmArguments arguments(params); + + QBufferPtr q_hidden = std::dynamic_pointer_cast( + quantize(QuantizeParams(params.A, DataType::TYPE_QFP8_E4M3, 1, QScheme::Qfp8PerToken, 0, 0))); + + BufferPtr A_quant_buffer = q_hidden->kernelPtr(); + BufferPtr A_scales = q_hidden->scalesPtr(); + BufferPtr W_kernel = reinterpret_cast(params.B).kernelPtr(); + BufferPtr W_scales = reinterpret_cast(params.B).scalesPtr(); + + auto A = 
A_quant_buffer->data(); + auto B = W_kernel->data(); + auto D = output->data(); + auto a_op = opConvert(params.transA); + auto b_op = opConvert(params.transB); + + hipblas_mm_wrapper_->setStream(current_stream_); + hipblas_mm_wrapper_->setGemmConfig(dtypeConvert(A_quant_buffer->type()), dtypeConvert(W_kernel->type()), + dtypeConvert(output->type()), dtypeConvert(arguments.compute_type)); + + hipblas_mm_wrapper_->FP8_Gemm(b_op, a_op, arguments.n, arguments.m, arguments.k, B, arguments.ldb, + A, arguments.lda, D, arguments.ldc, reinterpret_cast(W_scales->data()), + reinterpret_cast(A_scales->data()), arguments.alpha, arguments.beta); +} + /// @brief basic gemm ops /// @details D = alpha * op(A) * op(B) + beta * C /// A [b, ..., m, k] @@ -371,7 +398,12 @@ BufferPtr ROCmDevice::gemm(const GemmParams& params) { if (kernel_K == scale_K * 128) { InvokeROCmDeepGemm(params, output); } else if (1 == scale_K && scale_N == kernel_N) { - InvokeROCmPTPCGemm(params, output); + if (hipblas_mm_wrapper_->use_swizzleA() || hipblas_mm_wrapper_->test_swizzleA()){ + HipblasltPTPCGemm(params, output); + } + else { + InvokeROCmPTPCGemm(params, output); + } } else { ROCM_FAIL( "[GEMM]: Other FP8 weight quantization not implemented, with weight kernel [%d, %d], weight scales [%d, %d]", diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmLayernorm.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmLayernorm.cc index 4a2751c4d..22f387bd9 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmLayernorm.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmLayernorm.cc @@ -1,7 +1,5 @@ -#include "layernorm2d_fwd.hpp" -#include "rmsnorm2d_fwd.hpp" -// #include "aiter_meta/3rdparty/composable_kernel/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp" -// #include "aiter_meta/3rdparty/composable_kernel/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp" +#include "norm.h" +#include "rmsnorm.h" #include "rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h" #include "rtp_llm/cpp/devices/rocm_impl/ROCmAllocator.h" #include "rtp_llm/cpp/core/TrackerAllocator.h" @@ -20,6 +18,8 @@ #include "rtp_llm/cpp/kernels/alpha_layernorm_kernels.h" #include "rtp_llm/cpp/kernels/rmsnormKernels.h" #include "rtp_llm/cpp/kernels/rocm/fused_qk_rmsnorm.h" +#include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h" + namespace rtp_llm { using namespace rocm; @@ -169,7 +169,7 @@ LayernormOutput ROCmDevice::layernorm(const LayernormParams& params) { m, n, stream_); - check_cuda_error(); + ROCM_CHECK_ERROR(); return LayernormOutput({std::move(norm_output), nullptr}); } else if (params.alpha != 0.f) { DISPATCH_CUDA_FUNCTION_DATA_TYPE(data_type, @@ -184,47 +184,68 @@ LayernormOutput ROCmDevice::layernorm(const LayernormParams& params) { m, n, stream_); - check_cuda_error(); + ROCM_CHECK_ERROR(); return LayernormOutput({std::move(norm_output), nullptr}); } else { throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); } } + if ((params.qscheme == QScheme::NoQuantize || params.qscheme == QScheme::Qfp8PerToken) && data_type != DataType::TYPE_FP32 && ((params.norm_type == NormType::layernorm && beta && !params.residual1) || (params.norm_type == NormType::rmsnorm))) + { + int fused_add = params.residual1 ? 1: 0; + int xbias = params.bias? 1: 0; - if (!(norm_type == NormType::layernorm || norm_type == NormType::rmsnorm)) { - throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); - } - - auto quant_data_type = (params.qscheme == QScheme::Qfp8PerTensor) ? 
DataType::TYPE_FP8_E4M3 : DataType::TYPE_INT8; - - if (params.residual1.has_value() || params.bias.has_value()) { - if (params.norm_type == NormType::layernorm) { - if ((!params.bias.has_value()) && (data_type == DataType::TYPE_FP16 && m > 32 && n <= 768)) { - layernorm2d_fwd_traits traits{"fp16", "fp16", "fp32", "fp32", 0, 1, 0}; - layernorm2d_fwd_args args{input->data(), - params.residual1.value().get().data(), - nullptr, - nullptr, - gamma, - beta, + auto input_tensor = Buffer2torchTensor(input, false); + auto out_tensor = Buffer2torchTensor(norm_output, false); + auto weight_tensor = Buffer2torchTensor(*norm_weight->get().gamma.get(), false); - norm_output->data(), - (params.before_norm_output == nullptr) ? input->data() : - params.before_norm_output->data(), - nullptr, - nullptr, // p_mean, unsupported yet - nullptr, // p_invStd, unsupported yet + if (params.norm_type == NormType::layernorm) + { + std::optional bias_tensor; + if (xbias) + bias_tensor = Buffer2torchTensor(params.bias.value().get(), false); - static_cast(eps), - static_cast(m), - static_cast(n), - static_cast(n), // x row_stride - static_cast(n), // x residule row stride - static_cast(n), // y row stride - static_cast(n)}; // y residule row stride + auto beta_tensor = Buffer2torchTensor(*norm_weight->get().beta.get(), false); + if (fused_add) + { + auto residual_in_tensor = Buffer2torchTensor(params.residual1.value().get(), false); + auto residual_out_tensor = Buffer2torchTensor((params.before_norm_output == nullptr) ? params.input:params.before_norm_output, false); + layernorm2d_with_add(out_tensor, input_tensor, residual_in_tensor, residual_out_tensor, weight_tensor, beta_tensor, static_cast(eps), bias_tensor); + } + else + { + auto res_tensor = layernorm2d(input_tensor, weight_tensor, beta_tensor, static_cast(eps), bias_tensor); + copy({*norm_output, *torchTensor2Buffer(res_tensor)}); + } + + } + else if(params.norm_type == NormType::rmsnorm) + { + if (fused_add) + { + auto residual_in_tensor = Buffer2torchTensor(params.residual1.value().get(), false); + auto residual_out_tensor = Buffer2torchTensor((params.before_norm_output == nullptr) ? params.input:params.before_norm_output, false); + rmsnorm2d_with_add(out_tensor, input_tensor, residual_in_tensor, residual_out_tensor, weight_tensor, static_cast(eps), 0); + } + else + { + auto res_tensor = rmsnorm2d(input_tensor, weight_tensor, static_cast(eps), 0); + copy({*norm_output, *torchTensor2Buffer(res_tensor)}); + } - layernorm2d_fwd(traits, args, {stream_, false, 0, 0, 1}); - } else { + } + else + { + throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); + } + } + else + { + auto quant_data_type = (params.qscheme == QScheme::Qfp8PerTensor) ? DataType::TYPE_FP8_E4M3 : DataType::TYPE_INT8; + if (params.norm_type == NormType::layernorm) + { + if (params.residual1.has_value() || params.bias.has_value()) + { DISPATCH_CUDA_FUNCTION_DATA_TYPE( data_type, invokeGeneralAddBiasResidualLayerNorm, @@ -246,115 +267,77 @@ LayernormOutput ROCmDevice::layernorm(const LayernormParams& params) { quant_output, // out_quant params.return_normed_output); } - check_cuda_error(); - return LayernormOutput({norm_output, params.before_norm_output}); - } else if (params.norm_type == NormType::rmsnorm) { - DISPATCH_CUDA_FUNCTION_COMPUTE_QUANT_TYPES( - data_type, - quant_data_type, - invokeAddBiasResidualRmsNorm, - params.before_norm_output->data(), // or null - norm_output->data(), - input->data(), - params.bias ? params.bias.value().get().data() : nullptr, - params.residual1 ? 
params.residual1.value().get().data() : nullptr, - params.residual2 ? params.residual2.value().get().data() : nullptr, - gamma, - beta, - eps, - m, - n, - stream_, - nullptr, // scale - scales_ptr, // dynamic_scale - quant_output // out_quant - ); - check_cuda_error(); - return LayernormOutput({norm_output, params.before_norm_output}); + else + { + DISPATCH_CUDA_FUNCTION_DATA_TYPE( + data_type, + invokeGeneralLayerNorm, + nullptr, + norm_output->data(), + input->data(), + gamma, + beta, + eps, + m, + n, + stream_, + true, // use_diff_of_squares + nullptr, // scale + scales_ptr, // dynamic_scale + quant_output, // out_quant + params.return_normed_output); + } } - } else { - if (params.norm_type == NormType::layernorm) { - if (data_type == DataType::TYPE_FP16 && m > 32 && n <= 768) { - layernorm2d_fwd_traits traits{"fp16", "fp16", "fp32", "fp32", 0, 0, 0}; - layernorm2d_fwd_args args{input->data(), - nullptr, - nullptr, - nullptr, - gamma, - beta, - - norm_output->data(), - nullptr, - nullptr, - nullptr, // p_mean, unsupported yet - nullptr, // p_invStd, unsupported yet - - static_cast(eps), - static_cast(m), - static_cast(n), - static_cast(n), // x row_stride - static_cast(n), // x residule row stride - static_cast(n), // y row stride - static_cast(n)}; // y residule row stride - - layernorm2d_fwd(traits, args, {stream_, false, 0, 0, 1}); - } else { - DISPATCH_CUDA_FUNCTION_DATA_TYPE(data_type, - invokeGeneralLayerNorm, - nullptr, - norm_output->data(), - input->data(), - gamma, - beta, - eps, - m, - n, - stream_, - true, // use_diff_of_squares - nullptr, // scale - scales_ptr, // dynamic_scale - quant_output, // out_quant - params.return_normed_output); + else if(params.norm_type == NormType::rmsnorm) + { + if (params.residual1.has_value() || params.bias.has_value()) + { + DISPATCH_CUDA_FUNCTION_COMPUTE_QUANT_TYPES( + data_type, + quant_data_type, + invokeAddBiasResidualRmsNorm, + params.before_norm_output->data(), // or null + norm_output->data(), + input->data(), + params.bias ? params.bias.value().get().data() : nullptr, + params.residual1 ? params.residual1.value().get().data() : nullptr, + params.residual2 ? 
params.residual2.value().get().data() : nullptr, + gamma, + beta, + eps, + m, + n, + stream_, + nullptr, // scale + scales_ptr, // dynamic_scale + quant_output); // out_quant } - check_cuda_error(); - return LayernormOutput({norm_output, params.before_norm_output}); - } else if (params.norm_type == NormType::rmsnorm) { - std::string prec_i; - if (data_type == DataType::TYPE_FP16) - prec_i = "fp16"; - else if (data_type == DataType::TYPE_BF16) - prec_i = "bf16"; else - throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); - - rmsnorm2d_fwd_traits traits{prec_i, prec_i, "fp32", "fp32", 0, 0, 0, 0}; - - rmsnorm2d_fwd_args args{input->data(), - nullptr, - nullptr, - gamma, - norm_output->data(), - nullptr, - nullptr, - nullptr, - nullptr, - static_cast(eps), - static_cast(m), - static_cast(n), - static_cast(n), - static_cast(n), - static_cast(n), - static_cast(n)}; - - float run_time = rmsnorm2d_fwd(traits, args, {stream_, false, 0, 0, 1}); - - // std::cout << "rmsnorm2d_fwd run_time: " << run_time * 1.E3 << " us"<< std::endl; - - check_cuda_error(); - return LayernormOutput({norm_output, params.before_norm_output}); + { + DISPATCH_CUDA_FUNCTION_COMPUTE_QUANT_TYPES( + data_type, + quant_data_type, + invokeGeneralRmsNorm, + norm_output->data(), + input->data(), + gamma, + beta, + eps, + m, + n, + stream_, + nullptr, // scale + scales_ptr, // dynamic_scale + quant_output); // out_quant + } + } + else + { + throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); } } - throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); + ROCM_CHECK_ERROR(); + return LayernormOutput({norm_output, params.before_norm_output}); } #define ARGS_DISPATCH(Atype, Dtype, out, bias, gate, gate_bias, m, n, stream) \ diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmQuantizeOp.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmQuantizeOp.cc index fe31ac225..877ec6e29 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmQuantizeOp.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmQuantizeOp.cc @@ -10,7 +10,7 @@ namespace rtp_llm { using namespace rocm; BufferPtr ROCmDevice::quantize(const QuantizeParams& params) { - ROCM_CHECK_VALUE((params.input.dim() == 2), "quantize only support 2D."); + ROCM_CHECK_VALUE((params.input.dim() == 2 || params.input.dim() == 3), "quantize only support 2D or 3D."); ROCM_CHECK_VALUE((params.input.type() == DataType::TYPE_FP16 || params.input.type() == DataType::TYPE_FP32 || params.input.type() == DataType::TYPE_BF16), "quantize only support half or float quantize. 
but get %d.", @@ -50,9 +50,10 @@ BufferPtr ROCmDevice::quantize(const QuantizeParams& params) { "Qfp8PerToken only support qtype = TYPE_QFP8_E4M3"); ROCM_CHECK_VALUE((params.axis == 1), "Qfp8PerToken only support axis = 1"); size_t num_token = params.input.shape()[0]; - size_t model_dim = params.input.shape()[1]; + auto scale_shape = params.input.shape(); + scale_shape.back() = 1; kernel = allocateBuffer({DataType::TYPE_FP8_E4M3, params.input.shape()}, {"quant_kernel"}); - scales = allocateBuffer({DataType::TYPE_FP32, {num_token, 1}}, {"quant_scale"}); + scales = allocateBuffer({DataType::TYPE_FP32, scale_shape}, {"quant_scale"}); zeros = BufferPtr(new Buffer(MemoryType::MEMORY_GPU, DataType::TYPE_INVALID, {0}, nullptr)); if (num_token > 0) { torch::Tensor input_tensor = Buffer2torchTensor(params.input, false); @@ -91,7 +92,10 @@ BufferPtr ROCmDevice::quantize(const QuantizeParams& params) { /*out=*/kernel_tensor, /*input=*/input_tensor, /*scales=*/scales_tensor, - /*scale_ub=*/std::nullopt); + /*scale_ub=*/std::nullopt, + /*shuffle_case=*/false, + /*num_rows=*/std::nullopt, + /*num_rows_factor*/1); } } else { ROCM_FAIL("other quantize not implemented"); diff --git a/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc b/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc index 9ed518e47..2f6413a9c 100644 --- a/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc +++ b/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc @@ -10,10 +10,49 @@ inline torch::Tensor Buffer2torchTensorCustom(const Buffer& buf, std::vector 3) { + query = query.reshape({query.size(0), query.size(1), -1}); + } + size_t num_heads = params.configs.head_num; + int64_t partition_size = 256; + int64_t max_seq_len = params.common.decoder_max_seq_len + 1; + + auto key_cache = Buffer2torchTensor(params.common.kv_cache->k_cache_buffer,false); + auto value_cache = Buffer2torchTensor(params.common.kv_cache->v_cache_buffer,false); + + auto block_tables = Buffer2torchTensor(params.common.kv_cache->kv_cache_block_id,false); + + auto context_lens = Buffer2torchTensor(params.common.sequence_lengths,false); + context_lens = context_lens + 1; + + int max_num_blocks = block_tables.size(1); + std::optional K_QScale = std::nullopt; + std::optional V_QScale = std::nullopt; + std::optional out_opt = out; + if (key_cache.dtype() == at::kFloat8_e4m3fnuz) { + K_QScale = Buffer2torchTensor(params.common.kv_cache->k_scale_buffer,false); + V_QScale = Buffer2torchTensor(params.common.kv_cache->v_scale_buffer,false); + pa_fwd(query, key_cache, value_cache, block_tables, context_lens, max_num_blocks, max_seq_len, K_QScale, V_QScale, out_opt, std::nullopt, 2); + } else { + pa_fwd(query, key_cache, value_cache, block_tables, context_lens, max_num_blocks, max_seq_len, K_QScale, V_QScale, out_opt); + } +} + +void runAiterPA(const AttentionModuleParams& params, + rtp_llm::DeviceBase* device, + Buffer& q_tmp) { + auto out = Buffer2torchTensor(params.output,false); + auto query = Buffer2torchTensor(q_tmp,false); + if (q_tmp.shape().size() < 3) { throw std::runtime_error("aiter_paged_attention only support 3-dim input"); } else if (q_tmp.shape().size() > 3) { @@ -99,4 +138,5 @@ void runAiterPA(const AttentionModuleParams& params, rtp_llm::DeviceBase* device partition_size); return; } -} // namespace rtp_llm + +} // namespace rtp_llm diff --git a/rtp_llm/cpp/devices/rocm_impl/aiterPA.h b/rtp_llm/cpp/devices/rocm_impl/aiterPA.h index fb5ad937c..16cc69251 100644 --- a/rtp_llm/cpp/devices/rocm_impl/aiterPA.h +++ b/rtp_llm/cpp/devices/rocm_impl/aiterPA.h @@ -2,8 +2,11 @@ #include 
"rtp_llm/cpp/devices/DeviceBase.h" // #include "aiter_meta/csrc/include/attention.h" #include "attention.h" +#include "attention_asm.h" namespace rtp_llm { +void runAiterAsmPA(const AttentionModuleParams& params, + rtp_llm::DeviceBase* device, Buffer& q_tmp); void runAiterPA(const AttentionModuleParams& params, rtp_llm::DeviceBase* device, Buffer& q_tmp); } // namespace rtp_llm diff --git a/rtp_llm/cpp/devices/rocm_impl/test/BUILD b/rtp_llm/cpp/devices/rocm_impl/test/BUILD index 315392c19..fb305d9a8 100644 --- a/rtp_llm/cpp/devices/rocm_impl/test/BUILD +++ b/rtp_llm/cpp/devices/rocm_impl/test/BUILD @@ -118,14 +118,14 @@ cc_test( "@//:using_aiter_src": [ "@aiter_src//:module_aiter_enum", "@aiter_src//:module_moe_asm", - "@aiter_src//:module_moe", + "@aiter_src//:module_moe_ck2stages", "@aiter_src//:module_quant", "@aiter_src//:module_moe_sorting", ], "//conditions:default": [ "@aiter//:module_aiter_enum", "@aiter//:module_moe_asm", - "@aiter//:module_moe", + "@aiter//:module_moe_ck2stages", "@aiter//:module_quant", "@aiter//:module_moe_sorting", ], @@ -139,7 +139,7 @@ cc_test( "ops/ROCmAttentionOpTest.cc", ], data=[], - env=device_test_envs(), + env = device_test_envs() | { "AITER_ASM_DIR": "./external/aiter/aiter_meta/hsa/gfx942/"}, copts=test_copts + copts(), linkopts=test_linkopts, deps=test_deps, diff --git a/rtp_llm/cpp/devices/rocm_impl/test/ops/LayernormTest.cc b/rtp_llm/cpp/devices/rocm_impl/test/ops/LayernormTest.cc index e9f54292a..7b57f5e46 100644 --- a/rtp_llm/cpp/devices/rocm_impl/test/ops/LayernormTest.cc +++ b/rtp_llm/cpp/devices/rocm_impl/test/ops/LayernormTest.cc @@ -39,7 +39,7 @@ TEST_F(LayerNormTest, testSimpleLayernorm) { printf("testing m = %d, n = %d \n", m, n); testGeneralLayernorm(DataType::TYPE_FP16, NormType::layernorm, m, n); testGeneralLayernorm(DataType::TYPE_BF16, NormType::layernorm, m, n); - // testGeneralLayernorm(DataType::TYPE_FP32, NormType::layernorm, m, n); + testGeneralLayernorm(DataType::TYPE_FP32, NormType::layernorm, m, n); } } } diff --git a/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmAttentionOpTest.cc b/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmAttentionOpTest.cc index 52d86a02b..a6e53a57c 100644 --- a/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmAttentionOpTest.cc +++ b/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmAttentionOpTest.cc @@ -28,11 +28,16 @@ TEST_F(AttentionOpTest, SelfAttentionOpTest) { std::vector kv_seq = {4096}; for (auto batch_size : batch) { for (auto seq_len : seq) { - for (auto kv_seq_len : kv_seq) { - size_t num_heads = 64; - size_t num_key_value_heads = 4; - size_t head_dim = 128; - selfAttentionOpTest(batch_size, seq_len, kv_seq_len, num_heads, num_key_value_heads, head_dim); + for (auto kv_seq_len: kv_seq) { + size_t num_heads = 8; + size_t num_key_value_heads = 1; + size_t head_dim = 128; + selfAttentionOpTest(batch_size, + seq_len, + kv_seq_len, + num_heads, + num_key_value_heads, + head_dim); } } } @@ -89,7 +94,7 @@ TEST_F(AttentionOpTest, MultiBlockSelfAttentionOpTest) { } } } - +*/ TEST_F(AttentionOpTest, ContextAttentionOpTest) { printf("Runing ContextAttentionOpTest\n"); auto device_init_params = DeviceInitParams(); @@ -101,19 +106,20 @@ TEST_F(AttentionOpTest, ContextAttentionOpTest) { // ASSERT_TRUE(!static_cast(device_)->use_trtv2_fmha); // ASSERT_TRUE(!static_cast(device_)->use_openSource_fmha); // ASSERT_TRUE(!static_cast(device_)->use_trtv1_fmha); - std::vector batch = {1, 2, 4, 8}; - std::vector seq = {1, 10, 20, 30}; + std::vector batch = {1}; + std::vector seq = {4000}; for (auto batch_size : batch) { for (auto 
seq_len : seq) { - size_t num_heads = 64; - size_t num_key_value_heads = num_heads; - size_t head_dim = 64; + size_t num_heads = 8; + size_t num_key_value_heads = 1; + size_t head_dim = 128; size_t dim = head_dim; contextAttentionOpTest(batch_size, seq_len, num_heads, num_key_value_heads, head_dim); } } } +/* TEST_F(AttentionOpTest, AiterPageAttentionOpTest) { device_ = new ROCmDevice(DeviceInitParams()); device_->init(); @@ -123,8 +129,8 @@ TEST_F(AttentionOpTest, AiterPageAttentionOpTest) { for (auto batch_size : batch) { for (auto seq_len : seq) { for (auto kv_seq_len: kv_seq) { - size_t num_key_value_heads = 4; - size_t num_heads = 64; + size_t num_key_value_heads = 1; + size_t num_heads = 8; size_t head_dim = 128; aiterPageAttentionOpTest(batch_size, seq_len, @@ -137,6 +143,7 @@ TEST_F(AttentionOpTest, AiterPageAttentionOpTest) { } } */ + // TEST_F(AttentionOpTest, OpenSourceFMHAContextAttentionOpTest) { // setenv("ENABLE_TRT_FMHA", "OFF", 1); // setenv("ENABLE_TRTV1_FMHA", "OFF", 1); diff --git a/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmGemmOpTest.cc b/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmGemmOpTest.cc index c4098b21a..a36512e80 100644 --- a/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmGemmOpTest.cc +++ b/rtp_llm/cpp/devices/rocm_impl/test/ops/ROCmGemmOpTest.cc @@ -56,14 +56,14 @@ TEST_F(ROCmGemmOpTest, Q4x2GemmOpTest) { } TEST_F(ROCmGemmOpTest, BasicGemmOpTest) { - BasicGemmOpTest(2, 1024, 2048, DataType::TYPE_FP16); + BasicGemmOpTest(2, 1024, 2048, DataType::TYPE_BF16); BasicGemmOpTest(8, 1024, 2048, DataType::TYPE_FP16); BasicGemmOpTest(1024, 1024, 2048, DataType::TYPE_FP16); BasicGemmOpTest(4096, 1024, 2048, DataType::TYPE_FP16); - BasicGemmOpTest(2, 1024, 2048, DataType::TYPE_FP32); - BasicGemmOpTest(8, 1024, 2048, DataType::TYPE_FP32); - BasicGemmOpTest(1024, 1024, 2048, DataType::TYPE_FP32); - BasicGemmOpTest(4096, 1024, 2048, DataType::TYPE_FP32); + BasicGemmOpTest(2, 1024, 2048, DataType::TYPE_BF16); + BasicGemmOpTest(8, 1024, 2048, DataType::TYPE_BF16); + BasicGemmOpTest(1024, 1024, 2048, DataType::TYPE_BF16); + BasicGemmOpTest(4096, 1024, 2048, DataType::TYPE_BF16); } TEST_F(ROCmGemmOpTest, BasicFP8GemmOpTest) { @@ -98,7 +98,7 @@ TEST_F(ROCmGemmOpTest, TransposeGemmOpTest) { size_t m = 5; size_t n = 1024; size_t k = 4096; - TransposeGemmOpTest(none, none, m, k, k, n, DataType::TYPE_FP16); + TransposeGemmOpTest(none, tran, m, k, n, k, DataType::TYPE_FP16); TransposeGemmOpTest(tran, tran, k, m, n, k, DataType::TYPE_FP16); TransposeGemmOpTest(tran, none, k, m, k, n, DataType::TYPE_FP16); diff --git a/rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h b/rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h index 17b01978c..64ef84ffe 100644 --- a/rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h +++ b/rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h @@ -224,12 +224,35 @@ struct KVBlockArray: public KVBlockArrayForContextFMHA { + getLocalIdx(globalTokenIdx) * vectorize_size + channelIdx % vectorize_size; } - __host__ __device__ inline int32_t - getVLocalIdx(int32_t globalTokenIdx, int32_t headIdx, int32_t dimsPerHead, int32_t channelIdx) const { + template + __host__ __device__ inline int32_t getVLocalIdx(int32_t globalTokenIdx, + int32_t headIdx, + int32_t dimsPerHead, + int32_t channelIdx) const { + constexpr int element_size = ElementSizeInBytes::value; + static_assert(16 % element_size == 0, + "kv cache element size must divide 16"); + constexpr int vectorize_size = 16 / element_size; + + assert(mTokensPerBlock % vectorize_size == 0); + int32_t localTokenIdx = getLocalIdx(globalTokenIdx); 
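+        // Illustrative note: vectorize_size depends on the cache element width,
+        // e.g. 16 for a 1-byte (fp8/int8) cache element and 8 for a 2-byte
+        // (fp16/bf16) element, so tokens within a block are grouped in chunks of
+        // vectorize_size along the innermost dimension of the layout described below.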
+ // shape: [numHeads, mTokensPerBlock/vs, dimsPerHead, vs] + // stride: [mTokensPerBlock*dimsPerHead, dimsPerHead*vs, vs, 1] + return headIdx * dimsPerHead * mTokensPerBlock + + localTokenIdx / vectorize_size * dimsPerHead * vectorize_size + + channelIdx * vectorize_size + + localTokenIdx % vectorize_size; + } + + __host__ __device__ inline int32_t getVLocalIdx(int32_t globalTokenIdx, + int32_t headIdx, + int32_t dimsPerHead, + int32_t channelIdx) const { // shape: [numHeads, dimsPerHead, mTokensPerBlock] // stride: [dimsPerHead*mTokensPerBlock, mTokensPerBlock, 1] return headIdx * dimsPerHead * mTokensPerBlock + channelIdx * mTokensPerBlock + getLocalIdx(globalTokenIdx); } + #endif __host__ __device__ inline void* getScaleBlockPtr(DataType const* offsets, int32_t tokenIdx) const { @@ -272,6 +295,7 @@ struct OffsetIndexedKVBlockArray: public KVBlockArray { OffsetIndexedKVBlockArray() = default; OffsetIndexedKVBlockArray(KVBlockArray& base, DataType* raw_block_table, int32_t kv_block_offset): KVBlockArray(base), kv_block_offset_(kv_block_offset) { + cache_type = base.cache_type; data = raw_block_table; } __host__ __device__ inline DataType const* getRowPtr(KVIdxType kvIdx, int32_t seqIdx) const { diff --git a/rtp_llm/cpp/kernels/rocm/fused_qk_rmsnorm.cu b/rtp_llm/cpp/kernels/rocm/fused_qk_rmsnorm.cu index 43ca9e9ee..366d09d29 100644 --- a/rtp_llm/cpp/kernels/rocm/fused_qk_rmsnorm.cu +++ b/rtp_llm/cpp/kernels/rocm/fused_qk_rmsnorm.cu @@ -17,6 +17,27 @@ __inline__ __device__ Tf compute_rmsnorm(Tf val, float s_variance, const T* gamm return ret; } +namespace functor { + +template +__device__ __forceinline__ Tf compute_rmsnorm(Tf val, float s_variance, const Tf gamma, const T* beta, const int i) { + Tf ret = val * s_variance * gamma; + if constexpr (IS_BETA) { + ret = ret + cuda_cast(beta[i]); + } + return ret; +} + +template +__device__ __forceinline__ T warpReduceSum(T val) { + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { + val = add(val, __shfl_xor(val, offset, warpSize)); + } + return val; +} + +} + template __global__ void fusedQkRmsNorm(T* __restrict input, const T* __restrict q_gamma, @@ -70,6 +91,100 @@ __global__ void fusedQkRmsNorm(T* __restrict input, } } +template +__global__ void fusedQkRmsNormOpt(T* __restrict input, + const T* __restrict q_gamma, + const T* __restrict q_bias, + const T* __restrict k_gamma, + const T* __restrict k_bias, + const int q_group_num, + const int k_group_num, + const float eps, + const int n, + const int norm_size, + const float inv_norm_size) { + constexpr auto num_elems_T = num_elems::value; + using float_packed_t = typename packed_as::type; + constexpr int vec_size = num_elems::value; + + const int elem_idx = threadIdx.x; + const int sample_idx = blockIdx.y; + const int group_idx = blockIdx.x; + T* group_start = input + sample_idx * (n / vec_size) + group_idx * (norm_size / vec_size); + + const T* gamma_ptr = group_idx < q_group_num ? q_gamma : k_gamma; + const T* bias_ptr = group_idx < q_group_num ? 
q_bias : k_bias; + const auto gamma = cuda_cast(gamma_ptr[elem_idx]); + + float square_sum = 0.0f; + T packed_val = group_start[elem_idx]; + auto val = cuda_cast(packed_val); + square_sum += cuda_sum(val * val); + + float variance = functor::warpReduceSum(square_sum) * inv_norm_size; + float scale = rsqrtf(variance + eps); + + const float_packed_t val_f = cuda_cast(packed_val); + const T out = + cuda_cast(functor::compute_rmsnorm(val_f, scale, gamma, bias_ptr, elem_idx)); + group_start[elem_idx] = cuda_cast(out); +} + +template +void invokeFusedQkRmsNormOpt(T* __restrict input, + const T* __restrict q_gamma, + const T* __restrict q_bias, + const T* __restrict k_gamma, + const T* __restrict k_bias, + const float layernorm_eps, + const int q_group_num, + const int k_group_num, + const int m, + const int n, + const int norm_size, + cudaStream_t stream) { + constexpr size_t vec_size = 2; + constexpr size_t warp_size = 64; + + if (n % norm_size != 0) { + throw std::invalid_argument("n must be divisible by norm_size"); + } + if (norm_size % (warp_size * vec_size) != 0) { + throw std::invalid_argument("norm_size must be multiple of " + std::to_string(warp_size * vec_size)); + } + + dim3 grid(q_group_num + k_group_num, m); + dim3 block(warp_size); + + using Tp = typename packed_as::type; + bool is_bias = k_bias != nullptr && q_bias != nullptr; + if (is_bias) { + fusedQkRmsNormOpt<<>>(reinterpret_cast(input), + reinterpret_cast(q_gamma), + reinterpret_cast(q_bias), + reinterpret_cast(k_gamma), + reinterpret_cast(k_bias), + q_group_num, + k_group_num, + layernorm_eps, + n, + norm_size, + 1.0f / norm_size); + } else { + fusedQkRmsNormOpt<<>>(reinterpret_cast(input), + reinterpret_cast(q_gamma), + nullptr, + reinterpret_cast(k_gamma), + nullptr, + q_group_num, + k_group_num, + layernorm_eps, + n, + norm_size, + 1.0f / norm_size); + } +} + template void invokeFusedQkRmsNorm(T* __restrict input, const T* __restrict q_gamma, @@ -83,6 +198,11 @@ void invokeFusedQkRmsNorm(T* __restrict input, const int n, const int norm_size, cudaStream_t stream) { + if (norm_size == 128) { + invokeFusedQkRmsNormOpt(input, q_gamma, q_bias, k_gamma, k_bias, layernorm_eps, + q_group_num, k_group_num, m, n, norm_size, stream); + return; + } constexpr size_t vec_size = 2; constexpr size_t warp_size = 32; @@ -144,4 +264,4 @@ INSTANTIATE_FUSED_QK_RMSNORM(__nv_bfloat16); #endif #undef INSTANTIATE_FUSED_QK_RMSNORM -} // namespace rtp_llm \ No newline at end of file +} // namespace rtp_llm diff --git a/rtp_llm/cpp/kernels/rotary_position_embedding.h b/rtp_llm/cpp/kernels/rotary_position_embedding.h index 453c32e69..08a6c1479 100644 --- a/rtp_llm/cpp/kernels/rotary_position_embedding.h +++ b/rtp_llm/cpp/kernels/rotary_position_embedding.h @@ -455,62 +455,97 @@ __device__ __inline__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloa template __device__ __inline__ void -apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (2 * tid >= rot_embed_dim) { return; } - float2 coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); - q = rotary_embedding_transform(q, coef); + float2 coef; + if (rotary_embedding_coefficient_cache) { + coef = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + tid]; + } else { + coef = rotary_embedding_coefficient(2 * tid, 
rot_embed_dim, t_step, base, rope_init); + } + q = rotary_embedding_transform(q, coef); } template __device__ __inline__ void -apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (4 * tid >= rot_embed_dim) { return; } Float4_& q_ = *reinterpret_cast(&q); - float2 coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); - float2 coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); - q_.x = rotary_embedding_transform(q_.x, coef0); - q_.y = rotary_embedding_transform(q_.y, coef1); + float2 coef0; + float2 coef1; + if (rotary_embedding_coefficient_cache) { + coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid]; + coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; + } else { + coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); + coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); + } + q_.x = rotary_embedding_transform(q_.x, coef0); + q_.y = rotary_embedding_transform(q_.y, coef1); } template __device__ __inline__ void -apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (2 * tid >= rot_embed_dim) { return; } - float2 coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); - q = rotary_embedding_transform(q, coef); + float2 coef; + if (rotary_embedding_coefficient_cache) { + coef = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + tid]; + } else { + coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); + } + q = rotary_embedding_transform(q, coef); } template __device__ __inline__ void -apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (4 * tid >= rot_embed_dim) { return; } - float2 coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); - float2 coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); - q.x = rotary_embedding_transform(q.x, coef0); - q.y = rotary_embedding_transform(q.y, coef1); + float2 coef0; + float2 coef1; + if (rotary_embedding_coefficient_cache) { + coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid]; + coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; + } else { + coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); + coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); + } + q.x = rotary_embedding_transform(q.x, coef0); + q.y = rotary_embedding_transform(q.y, coef1); } template __device__ __inline__ void -apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int 
t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (8 * tid >= rot_embed_dim) { return; } - float2 coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base, rope_init); - float2 coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base, rope_init); - float2 coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base, rope_init); - float2 coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base, rope_init); + float2 coef0; + float2 coef1; + float2 coef2; + float2 coef3; + if (rotary_embedding_coefficient_cache) { + coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid]; + coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 1]; + coef2 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 2]; + coef3 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 3]; + } else { + coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base, rope_init); + coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base, rope_init); + coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base, rope_init); + coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base, rope_init); + } q.x = rotary_embedding_transform(q.x, coef0); q.y = rotary_embedding_transform(q.y, coef1); @@ -522,40 +557,61 @@ apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, float b template __device__ __inline__ void apply_rotary_embedding( - __nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { + __nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (2 * tid >= rot_embed_dim) { return; } - float2 coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); - q = rotary_embedding_transform(q, coef); + float2 coef; + if (rotary_embedding_coefficient_cache) { + coef = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + tid]; + } else { + coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); + } + q = rotary_embedding_transform(q, coef); } template __device__ __inline__ void -apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (4 * tid >= rot_embed_dim) { return; } - float2 coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); - float2 coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); - + float2 coef0; + float2 coef1; + if (rotary_embedding_coefficient_cache) { + coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid]; + coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; + } else { + coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); + coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); + } q.x = rotary_embedding_transform(q.x, coef0); q.y = rotary_embedding_transform(q.y, coef1); } template __device__ __inline__ void -apply_rotary_embedding(bf16_8_t& q, 
int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init) { +apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { if (8 * tid >= rot_embed_dim) { return; } - float2 coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base, rope_init); - float2 coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base, rope_init); - float2 coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base, rope_init); - float2 coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base, rope_init); - + float2 coef0; + float2 coef1; + float2 coef2; + float2 coef3; + if (rotary_embedding_coefficient_cache) { + coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid]; + coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 1]; + coef2 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 2]; + coef3 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 3]; + } else { + coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base, rope_init); + coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base, rope_init); + coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base, rope_init); + coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base, rope_init); + } q.x = rotary_embedding_transform(q.x, coef0); q.y = rotary_embedding_transform(q.y, coef1); q.z = rotary_embedding_transform(q.z, coef2); @@ -572,7 +628,8 @@ __device__ __inline__ void normal_rope(vector_t& x, const int dim, const float base, const RopeInit& rope_init, - const int offset = 0) { + const int offset = 0, + const float2 * rotary_embedding_coefficient_cache=nullptr) { const int vec_size = vector_size::size; const int rope_idx = tidx * vec_size - offset; const bool work = (rope_idx >= 0 && rope_idx < dim); @@ -583,10 +640,9 @@ __device__ __inline__ void normal_rope(vector_t& x, } __syncthreads(); - if (work) { RotaryHalfRead(x, smem, rope_tidx, dim / 2); - apply_rotary_embedding(x, rope_tidx, dim, seqidx, base, rope_init); + apply_rotary_embedding(x, rope_tidx, dim, seqidx, base, rope_init, rotary_embedding_coefficient_cache); RotaryHalfWrite(x, smem, rope_tidx, dim / 2); } @@ -647,14 +703,14 @@ get_qwen_dynamic_ntk_base(const int dim, const float base, const int seq_len, co template __device__ inline void -apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int seqidx, int seq_len) { +apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int seqidx, int seq_len, const float2 * rotary_embedding_coefficient_cache=nullptr) { auto base = rope_config.base; auto dim = rope_config.dim; switch (ROPE_STYLE) { case RopeStyle::No: break; case RopeStyle::Base: - normal_rope(x, smem, tidx, seqidx, dim, base, LinearScaleRope{rope_config.scale}); + normal_rope(x, smem, tidx, seqidx, dim, base, LinearScaleRope{rope_config.scale}, 0, rotary_embedding_coefficient_cache); break; case RopeStyle::Glm2: // only do rotary embedding for [..., d / 2] @@ -682,7 +738,8 @@ apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int se rope_config.scale, rope_config.extrapolation_factor, rope_config.mscale}, - rope_config.offset); + rope_config.offset, + rotary_embedding_coefficient_cache); break; case RopeStyle::QwenDynamicNTK: if (seq_len 
> rope_config.max_pos) { @@ -708,18 +765,19 @@ apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int se } template -__device__ inline void context_rope(RopeConfig rope_config, - vector_t& q, - vector_t& k, - scalar_t* smem, - int tidx, - int seqidx, - int position_id, - int seq_len, - int input_len, - bool PREFIX_PROMPT, - int prefix_prompt_length, - int count_length) { +__device__ inline void context_rope(RopeConfig rope_config, + vector_t& q, + vector_t& k, + scalar_t* smem, + int tidx, + int seqidx, + int position_id, + int seq_len, + int input_len, + bool PREFIX_PROMPT, + int prefix_prompt_length, + int count_length, + const float2 * rotary_embedding_coefficient_cache=nullptr) { if (PREFIX_PROMPT && count_length) { input_len = input_len + prefix_prompt_length; seqidx = seqidx + prefix_prompt_length; @@ -728,9 +786,9 @@ __device__ inline void context_rope(RopeConfig rope_config, seqidx = position_id; } - apply_rope(rope_config, q, smem, tidx, seqidx, seq_len); + apply_rope(rope_config, q, smem, tidx, seqidx, seq_len, rotary_embedding_coefficient_cache); - apply_rope(rope_config, k, smem, tidx, seqidx, seq_len); + apply_rope(rope_config, k, smem, tidx, seqidx, seq_len, rotary_embedding_coefficient_cache); } template @@ -749,7 +807,8 @@ __device__ inline void attention_rope(RopeConfig rope_config, [[maybe_unused]] int prefix_prompt_length, #pragma nv_diagnostic pop int count_prefix_length, - bool handle_kv) { + bool handle_kv, + const float2 * rotary_embedding_coefficient_cache=nullptr) { if (count_prefix_length) { prefix_prompt_length = 0; } @@ -762,10 +821,10 @@ __device__ inline void attention_rope(RopeConfig rope_config, tlength = tlength - prefix_prompt_length; } - apply_rope(rope_config, q, smem, tidx, tlength, seq_len); + apply_rope(rope_config, q, smem, tidx, tlength, seq_len, rotary_embedding_coefficient_cache); if (handle_kv) { - apply_rope(rope_config, k, smem, tidx, tlength, seq_len); + apply_rope(rope_config, k, smem, tidx, tlength, seq_len, rotary_embedding_coefficient_cache); } } diff --git a/rtp_llm/cpp/kernels/unfused_attention_kernels.cu b/rtp_llm/cpp/kernels/unfused_attention_kernels.cu index 5ec9322ec..a0dd7955d 100644 --- a/rtp_llm/cpp/kernels/unfused_attention_kernels.cu +++ b/rtp_llm/cpp/kernels/unfused_attention_kernels.cu @@ -25,6 +25,7 @@ #include "rtp_llm/cpp/cuda/cuda_host_utils.h" #endif #if USING_ROCM +typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3; #include "rtp_llm/cpp/rocm/cuda_shims.h" #endif #include @@ -2370,6 +2371,280 @@ void invokeDecodeAddFusedQKVBiasTranspose(T* q_buf, } #if USING_ROCM + +template +__global__ void add_fusedQKV_bias_transpose_prefill_kernel_v1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam param, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* __restrict qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int batch_size, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head, + RopeConfig rope_config, + const bool use_logn_attn, + bool store_qkv, + bool store_q, + bool store_kv, + bool store_cache, + const float2* rotary_embedding_coefficient_cache) { + // This kernel add bias to QKV, which has shape [batch_size, seq_len, 3, + // head_num, size_per_head], and QKV split to 3 split buffer q, k, v and + // transpose them to [batch_size, head_num, seq_len, size_per_head]. For q and + // k, also apply the rotary embedding. 
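+    // Compared with the original prefill kernel, this v1 variant can read
+    // precomputed rotary (cos, sin) pairs from rotary_embedding_coefficient_cache
+    // (laid out as [t_step, rot_embed_dim / 2]) instead of recomputing them per
+    // thread; a null cache pointer falls back to the on-the-fly computation.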
+ + // When we pass prefix prompt, this kernel also concatenate the prefix prompt + // and key/value along seq_len dimension like [prompt, key/value]. So, the + // final shape of q is same ([batch_size, head_num, seq_len, size_per_head]), + // but the shapes of key and values become [batch_size, head_num, + // max_prefix_prompt_length + seq_len, size_per_head]. + + // NOTE: QKV src shape (batch_size, seq_len, 3, head_num, size_per_head) + // QKV dst shape (3, batch_size, head_num, seq_len, size_per_head) + extern __shared__ __align__(sizeof(float2)) char smem_[]; // align on largest vector type + + static constexpr bool ENABLE_8BITS_CACHE = sizeof(Tcache) == 1; + +#ifdef ENABLE_FP8 + // Quantized output only supports fp8 currently. + using QuantizedEltType = __nv_fp8_e4m3; + using QuantizedVecType = typename Vec_t::QuantizedType; +#endif + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + const int token_idx = blockIdx.x; + const int token_padding_offset = padding_offset == nullptr ? 0 : padding_offset[token_idx]; + const int tgt_token_idx = token_idx + token_padding_offset; + + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + const int total_seq_len = param.max_prefix_prompt_length + seq_len; + + if (tidx * vec_size >= size_per_head) { + return; + } + + const int prefix_prompt_length = PREFIX_PROMPT ? param.d_prefix_prompt_lengths[batch_idx] : 0; + const int hidden_idx = head_idx * size_per_head + tidx * vec_size; + const int n = head_num * size_per_head; + const int kv_n = head_num_kv * size_per_head; // MQA + // the [0..seq_len) indices really handle KV [max_pp_len..seq_len+max_pp_len) + // and Q [0..seq_len) + // Note: if !PREFIX_PROMPT, max_pp_len = 0, so it's no-op + const int dst_kv_seq_idx = seq_idx + prefix_prompt_length; + + // NOTE: q has seq len excluding prefix prompt + // src QKV: [batch, time, 3, head, hidden] + const int src_q_idx = token_idx * (n + 2 * kv_n) + hidden_idx; + const int src_k_idx = token_idx * (n + 2 * kv_n) + hidden_idx + n; + const int src_v_idx = token_idx * (n + 2 * kv_n) + hidden_idx + kv_n + n; + + Vec_t q, k, v; + q = *reinterpret_cast(&QKV[src_q_idx]); + + if (head_idx < head_num_kv) { + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + } + + if (qkv_bias) { + Vec_t q_bias, k_bias, v_bias; + q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); + q = add(q, q_bias); + + if (head_idx < head_num_kv) { + k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n]); + v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n + kv_n]); + k = add(k, k_bias); + v = add(v, v_bias); + } + } + int position_id = -1; + if (rope_config.style == RopeStyle::Mrope) { + int rope_dim = rope_config.mrope_dim1 + rope_config.mrope_dim2 + rope_config.mrope_dim3; + int now_idx = tidx % rope_dim, now_dim = 0; + if (now_idx >= rope_config.mrope_dim1 + rope_config.mrope_dim2) { + now_dim = 2; + } else if (now_idx >= rope_config.mrope_dim1) { + now_dim = 1; + } + position_id = position_ids[token_idx * rope_config.index_factor + now_dim]; + } else if (position_ids) { + position_id = position_ids[token_idx * rope_config.index_factor]; + } + const int pre_len = cu_seqlens[batch_idx]; + const int input_len = cu_seqlens[batch_idx + 1] - pre_len; + context_rope(rope_config, + q, + k, + reinterpret_cast(smem_), + tidx, + seq_idx, + position_id, + seq_len, + input_len, + PREFIX_PROMPT, + prefix_prompt_length, + 
param.count_length, + rotary_embedding_coefficient_cache); + + if (use_logn_attn) { + logn_attention(q, seq_idx, rope_config.max_pos); + } + + __syncthreads(); + + if (store_qkv) { + *reinterpret_cast(&QKV[src_q_idx]) = q; + if (head_idx < head_num_kv) { +#ifdef ENABLE_FP8 + if (QuantizedQKV != nullptr) { + // use 1.0f scale currently for qkv input of FP8 FMHA. + convert_to_fp8( + reinterpret_cast(reinterpret_cast(QuantizedQKV) + src_k_idx), + k); + convert_to_fp8( + reinterpret_cast(reinterpret_cast(QuantizedQKV) + src_v_idx), + v); + } +#endif + *reinterpret_cast(&QKV[src_k_idx]) = k; + *reinterpret_cast(&QKV[src_v_idx]) = v; + } +#ifdef ENABLE_FP8 + if (QuantizedQKV != nullptr) { + size_t dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + if constexpr (USE_PAGED_FMHA) { + dest_q_idx = + (pre_len + seq_idx) * size_per_head * head_num + head_idx * size_per_head + tidx * vec_size; + } + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + QuantizedVecType* quantized_q_ptr = + USE_PAGED_FMHA ? reinterpret_ptr(q_buf, dest_q_idx) : + reinterpret_ptr(QuantizedQKV, src_q_idx); + convert_to_fp8(quantized_q_ptr, q); + } +#endif + } + + if (store_q) { + size_t dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + if constexpr (USE_PAGED_FMHA) { + dest_q_idx = (pre_len + seq_idx) * size_per_head * head_num + head_idx * size_per_head + tidx * vec_size; + } + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + } + + if (store_kv) { + const int dest_kv_idx = batch_idx * size_per_head * total_seq_len * head_num_kv + + head_idx * size_per_head * total_seq_len + dst_kv_seq_idx * size_per_head + + tidx * vec_size; + + if (head_idx < head_num_kv) { + *reinterpret_cast(&k_buf[dest_kv_idx]) = k; + *reinterpret_cast(&v_buf[dest_kv_idx]) = v; + } + } + + if (store_cache) { + if (head_idx < head_num_kv) { + KVBlockArray kv_block_array = param.kv_block_array; + Tcache* k_cache = reinterpret_cast(kv_block_array.getKBlockPtr(batch_idx, dst_kv_seq_idx)); + Tcache* v_cache = reinterpret_cast(kv_block_array.getVBlockPtr(batch_idx, dst_kv_seq_idx)); + +#pragma unroll + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inKBlockIdx = kv_block_array.getKLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + k_cache[inKBlockIdx] = reinterpret_cast(&k)[vec_i]; + + const int inVBlockIdx = kv_block_array.getVLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + v_cache[inVBlockIdx] = reinterpret_cast(&v)[vec_i]; + } + } + } +} + +template +void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param_ptr, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + const float2 * rotary_embedding_coefficient_cache, + cudaStream_t stream) { + auto& param = *param_ptr; + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + size_t smem_size = rope_config.style == 
RopeStyle::No ? 0 : 2 * rope_config.dim * sizeof(T); + + FT_SWITCH(param.max_prefix_prompt_length != 0, PREFIX_PROMPT, [&] { + FT_SWITCH(use_paged_fmha, USE_PAGED_FMHA, [&] { + FT_SWITCH_KV_CACHE_TYPE_CASE(param.kv_block_array.cache_type, Tcache, [&] { + FT_ROPE_SWITCH(rope_config.style, ROPE_STYLE, [&] { + add_fusedQKV_bias_transpose_prefill_kernel_v1 + <<>>(q_buf, + k_buf, + v_buf, + param, + QKV, + QuantizedQKV, + position_ids, + qkv_bias, + padding_offset, + cu_seqlens, + batch_size, + seq_len, + head_num, + head_num_kv, + size_per_head, + rope_config, + use_logn_attn, + store_qkv, + store_q, + store_kv, + store_cache, + rotary_embedding_coefficient_cache); + }); + }); + }); + }); +} + + template __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* q_buf, T* k_buf, @@ -2379,19 +2654,20 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* void* QuantizedQKV, const int* position_ids, const T* __restrict qkv_bias, - const int* padding_offset, - const int* cu_seqlens, - const int batch_size, - const int seq_len, - const int head_num, - const int head_num_kv, - const int size_per_head, - RopeConfig rope_config, - const bool use_logn_attn, - bool store_qkv, - bool store_q, - bool store_kv, - bool store_cache) { + const int* padding_offset, + const int* cu_seqlens, + const int batch_size, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head, + RopeConfig rope_config, + const bool use_logn_attn, + bool store_qkv, + bool store_q, + bool store_kv, + bool store_cache, + const float2* rotary_embedding_coefficient_cache) { // This kernel add bias to QKV, which has shape [batch_size, seq_len, 3, // head_num, size_per_head], and QKV split to 3 split buffer q, k, v and // transpose them to [batch_size, head_num, seq_len, size_per_head]. 
For q and @@ -2492,7 +2768,8 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* input_len, PREFIX_PROMPT, prefix_prompt_length, - param.count_length); + param.count_length, + rotary_embedding_coefficient_cache); if (use_logn_attn) { logn_attention(q, seq_idx, rope_config.max_pos); @@ -2559,30 +2836,28 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* KVBlockArray kv_block_array = param.kv_block_array; Tcache* k_cache = reinterpret_cast(kv_block_array.getKBlockPtr(batch_idx, dst_kv_seq_idx)); Tcache* v_cache = reinterpret_cast(kv_block_array.getVBlockPtr(batch_idx, dst_kv_seq_idx)); - if constexpr (ENABLE_8BITS_CACHE) { - float* k_scale_ptr = reinterpret_cast(kv_block_array.getKScalePtr(batch_idx, dst_kv_seq_idx)); - float* v_scale_ptr = reinterpret_cast(kv_block_array.getVScalePtr(batch_idx, dst_kv_seq_idx)); - const int inBlockIdx = - kv_block_array.getKVLocalIdx(dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size); - const int inScaleIdx = kv_block_array.getKVScaleLocalIdx(dst_kv_seq_idx, head_idx); + if constexpr (std::is_same::value) { + float* k_scale_ptr = reinterpret_cast(kv_block_array.getKScalePtr(batch_idx, dst_kv_seq_idx)); + float* v_scale_ptr = reinterpret_cast(kv_block_array.getVScalePtr(batch_idx, dst_kv_seq_idx)); + const int inScaleIdx = kv_block_array.getKVScaleLocalIdx(dst_kv_seq_idx, head_idx); + __shared__ float s_max[2]; - if constexpr (std::is_same::value) { - float local_max[2]; - local_max[0] = vector_abs_max(k); - local_max[1] = vector_abs_max(v); - blockReduceMaxV2(local_max); - if (threadIdx.x == 0) { - s_max[0] = local_max[0]; - s_max[1] = local_max[1]; - } - } else { - s_max[0] = float(1 << (8 - 1)); - s_max[1] = float(1 << (8 - 1)); + s_max[0] = float(1 << (8 - 1)); + s_max[1] = float(1 << (8 - 1)); + +#pragma unroll + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inKBlockIdx = kv_block_array.getKLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + + const int inVBlockIdx = kv_block_array.getVLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + + // convert_to_fp8(reinterpret_cast<__nv_fp8_e4m3*>(k_cache) + inKBlockIdx, float(reinterpret_cast(&k)[vec_i]) * float(1 << (8 - 1)) / s_max[0]); + k_cache[inKBlockIdx] = Tcache(float(reinterpret_cast(&k)[vec_i]) * (float(1 << (8 - 1)) / s_max[0])); + v_cache[inVBlockIdx] = Tcache(float(reinterpret_cast(&v)[vec_i]) * (float(1 << (8 - 1)) / s_max[1])); } - __syncthreads(); - store_8bits_kv_cache_vec(k_cache, k, inBlockIdx, float(1 << (8 - 1)) / s_max[0]); - store_8bits_kv_cache_vec(v_cache, v, inBlockIdx, float(1 << (8 - 1)) / s_max[1]); if (tidx == 0) { *reinterpret_cast(&k_scale_ptr[inScaleIdx]) = s_max[0] / float(1 << (8 - 1)); *reinterpret_cast(&v_scale_ptr[inScaleIdx]) = s_max[1] / float(1 << (8 - 1)); @@ -2594,8 +2869,9 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); k_cache[inKBlockIdx] = reinterpret_cast(&k)[vec_i]; - const int inVBlockIdx = - kv_block_array.getVLocalIdx(dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + const int inVBlockIdx = kv_block_array.getVLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + v_cache[inVBlockIdx] = reinterpret_cast(&v)[vec_i]; } } @@ -2629,6 +2905,7 @@ void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, + const float2* rotary_embedding_coefficient_cache, cudaStream_t stream) { 
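+    // Note: the new trailing rotary_embedding_coefficient_cache argument is simply
+    // forwarded to the kernel; callers may pass nullptr to keep the original
+    // per-thread coefficient computation.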
auto& param = *param_ptr; dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -2660,37 +2937,171 @@ void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, store_qkv, store_q, store_kv, - store_cache); + store_cache, + rotary_embedding_coefficient_cache); }); }); }); }); } +template +__global__ void add_fusedQKV_bias_transpose_decode_kernel_v1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam param, + const int* input_lengths, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* __restrict qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head, + RopeConfig rope_config, + const bool use_logn_attn, + bool store_qkv, + bool store_q, + bool store_kv, + bool store_cache) { + extern __shared__ __align__(sizeof(float2)) char smem_[]; + + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + const int token_idx = blockIdx.x; + const int token_padding_offset = padding_offset == nullptr ? 0 : padding_offset[token_idx]; + const int tgt_token_idx = token_idx + token_padding_offset; + + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + + if (tidx * vec_size >= size_per_head) { + return; + } + + const int prefix_prompt_length = PREFIX_PROMPT ? param.d_prefix_prompt_lengths[batch_idx] : 0; + const int sequence_length = sequence_lengths[batch_idx]; + const int tlength = sequence_length + param.max_prefix_prompt_length; + const int hidden_idx = head_idx * size_per_head + tidx * vec_size; + const int n = head_num * size_per_head; + const int kv_n = head_num_kv * size_per_head; // MQA + // the [0..seq_len) indices really handle KV [max_pp_len..seq_len+max_pp_len) + // and Q [0..seq_len) + // Note: if !PREFIX_PROMPT, max_pp_len = 0, so it's no-op + const int dst_kv_seq_idx = seq_idx + tlength; + + // NOTE: q has seq len excluding prefix prompt + // src QKV: [batch, time, 3, head, hidden] + const int src_q_idx = token_idx * (n + 2 * kv_n) + hidden_idx; + const int src_k_idx = token_idx * (n + 2 * kv_n) + hidden_idx + n; + const int src_v_idx = token_idx * (n + 2 * kv_n) + hidden_idx + kv_n + n; + + Vec_t q, k, v; + q = *reinterpret_cast(&QKV[src_q_idx]); + + if (head_idx < head_num_kv) { + k = *reinterpret_cast(&QKV[src_k_idx]); + v = *reinterpret_cast(&QKV[src_v_idx]); + } + + if (qkv_bias) { + Vec_t q_bias, k_bias, v_bias; + q_bias = *reinterpret_cast(&qkv_bias[hidden_idx]); + q = add(q, q_bias); + + if (head_idx < head_num_kv) { + k_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n]); + v_bias = *reinterpret_cast(&qkv_bias[hidden_idx + n + kv_n]); + k = add(k, k_bias); + v = add(v, v_bias); + } + } + + // refer to the implementation of hipify decode attention + const auto batch_beam_idx = blockIdx.y; + const int position_id = position_ids == nullptr ? -1 : position_ids[token_idx * rope_config.index_factor]; + + const int input_len = (input_lengths == nullptr) ? 
0 : input_lengths[batch_beam_idx]; + const int timestep = tlength; + attention_rope(rope_config, + q, + k, + reinterpret_cast(smem_), + tidx, + tlength, + tlength, // timestep, + sequence_length, + position_id, + input_len, + prefix_prompt_length, + true /*count_prefix_length*/, + true /*HANDLE_KV*/); + + if (use_logn_attn) { + logn_attention(q, tlength, rope_config.max_pos); + } + + __syncthreads(); + + if (store_q) { + size_t dest_q_idx = batch_idx * size_per_head * seq_len * head_num + head_idx * size_per_head * seq_len + + seq_idx * size_per_head + tidx * vec_size; + *reinterpret_cast(&q_buf[dest_q_idx]) = q; + } + + if (store_cache) { + if (head_idx < head_num_kv) { + OffsetIndexedKVBlockArray offset_kv_block_array = param.offset_kv_block_array; + Tcache* k_cache = reinterpret_cast(offset_kv_block_array.getKBlockPtr(batch_idx, dst_kv_seq_idx)); + Tcache* v_cache = reinterpret_cast(offset_kv_block_array.getVBlockPtr(batch_idx, dst_kv_seq_idx)); + +#pragma unroll + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inKBlockIdx = offset_kv_block_array.getKLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + k_cache[inKBlockIdx] = reinterpret_cast(&k)[vec_i]; + + const int inVBlockIdx = offset_kv_block_array.getVLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + v_cache[inVBlockIdx] = reinterpret_cast(&v)[vec_i]; + } + } + } +} + template __global__ void add_fusedQKV_bias_transpose_decode_kernel(T* q_buf, T* k_buf, T* v_buf, PrefixPromptBatchWeightsParam param, const int* input_lengths, - T* QKV, - void* QuantizedQKV, - const int* position_ids, - const T* __restrict qkv_bias, - const int* padding_offset, - const int* cu_seqlens, - const int* sequence_lengths, - const int batch_size, - const int seq_len, - const int head_num, - const int head_num_kv, - const int size_per_head, - RopeConfig rope_config, - const bool use_logn_attn, - bool store_qkv, - bool store_q, - bool store_kv, - bool store_cache) { + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* __restrict qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head, + RopeConfig rope_config, + const bool use_logn_attn, + bool store_qkv, + bool store_q, + bool store_kv, + bool store_cache, + const float2* rotary_embedding_coefficient_cache) { extern __shared__ __align__(sizeof(float2)) char smem_[]; constexpr int vec_size = Vec_t::size; @@ -2699,8 +3110,9 @@ __global__ void add_fusedQKV_bias_transpose_decode_kernel(T* const int token_padding_offset = padding_offset == nullptr ? 
0 : padding_offset[token_idx]; const int tgt_token_idx = token_idx + token_padding_offset; - const int batch_idx = tgt_token_idx / seq_len; - const int seq_idx = tgt_token_idx % seq_len; + const int batch_idx = tgt_token_idx / seq_len; + const int seq_idx = tgt_token_idx % seq_len; + static constexpr bool ENABLE_8BITS_CACHE = sizeof(Tcache) == 1; const int head_idx = blockIdx.y; const int tidx = threadIdx.x; @@ -2765,7 +3177,8 @@ __global__ void add_fusedQKV_bias_transpose_decode_kernel(T* input_len, prefix_prompt_length, true /*count_prefix_length*/, - true /*HANDLE_KV*/); + true /*HANDLE_KV*/, + rotary_embedding_coefficient_cache); if (use_logn_attn) { logn_attention(q, tlength, rope_config.max_pos); @@ -2778,27 +3191,120 @@ __global__ void add_fusedQKV_bias_transpose_decode_kernel(T* + seq_idx * size_per_head + tidx * vec_size; *reinterpret_cast(&q_buf[dest_q_idx]) = q; } - if (store_cache) { if (head_idx < head_num_kv) { - OffsetIndexedKVBlockArray offset_kv_block_array = param.offset_kv_block_array; - Tcache* k_cache = reinterpret_cast(offset_kv_block_array.getKBlockPtr(batch_idx, dst_kv_seq_idx)); - Tcache* v_cache = reinterpret_cast(offset_kv_block_array.getVBlockPtr(batch_idx, dst_kv_seq_idx)); + KVBlockArray kv_block_array = param.kv_block_array; + Tcache* k_cache = reinterpret_cast(kv_block_array.getKBlockPtr(batch_idx, dst_kv_seq_idx)); + Tcache* v_cache = reinterpret_cast(kv_block_array.getVBlockPtr(batch_idx, dst_kv_seq_idx)); + if constexpr (std::is_same::value) { + float* k_scale_ptr = reinterpret_cast(kv_block_array.getKScalePtr(batch_idx, dst_kv_seq_idx)); + float* v_scale_ptr = reinterpret_cast(kv_block_array.getVScalePtr(batch_idx, dst_kv_seq_idx)); + const int inScaleIdx = kv_block_array.getKVScaleLocalIdx(dst_kv_seq_idx, head_idx); + __shared__ float s_max[2]; + s_max[0] = float(1 << (8 - 1)); + s_max[1] = float(1 << (8 - 1)); #pragma unroll - for (int vec_i = 0; vec_i < vec_size; vec_i++) { - const int inKBlockIdx = offset_kv_block_array.getKLocalIdx( - dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); - k_cache[inKBlockIdx] = reinterpret_cast(&k)[vec_i]; + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inKBlockIdx = kv_block_array.getKLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); - const int inVBlockIdx = offset_kv_block_array.getVLocalIdx( - dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); - v_cache[inVBlockIdx] = reinterpret_cast(&v)[vec_i]; + const int inVBlockIdx = kv_block_array.getVLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + + k_cache[inKBlockIdx] = Tcache(float(reinterpret_cast(&k)[vec_i]) * (float(1 << (8 - 1)) / s_max[0])); + v_cache[inVBlockIdx] = Tcache(float(reinterpret_cast(&v)[vec_i]) * (float(1 << (8 - 1)) / s_max[1])); + } + + if (tidx == 0) { + *reinterpret_cast(&k_scale_ptr[inScaleIdx]) = s_max[0] / float(1 << (8 - 1)); + *reinterpret_cast(&v_scale_ptr[inScaleIdx]) = s_max[1] / float(1 << (8 - 1)); + } + } else { +#pragma unroll + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inKBlockIdx = kv_block_array.getKLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + k_cache[inKBlockIdx] = reinterpret_cast(&k)[vec_i]; + + const int inVBlockIdx = kv_block_array.getVLocalIdx( + dst_kv_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + + v_cache[inVBlockIdx] = reinterpret_cast(&v)[vec_i]; + } } } } } +template +void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, + T* k_buf, + T* v_buf, + 
PrefixPromptBatchWeightsParam* param_ptr, + const int* input_lengths, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + cudaStream_t stream) { + auto& param = *param_ptr; + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(token_num, head_num); + size_t smem_size = rope_config.style == RopeStyle::No ? 0 : 2 * rope_config.dim * sizeof(T); + + FT_SWITCH(param.max_prefix_prompt_length != 0, PREFIX_PROMPT, [&] { + FT_SWITCH(use_paged_fmha, USE_PAGED_FMHA, [&] { + FT_SWITCH_KV_CACHE_TYPE_CASE(param.kv_block_array.cache_type, Tcache, [&] { + FT_ROPE_SWITCH(rope_config.style, ROPE_STYLE, [&] { + add_fusedQKV_bias_transpose_decode_kernel_v1 + <<>>(q_buf, + k_buf, + v_buf, + param, + input_lengths, + QKV, + QuantizedQKV, + position_ids, + qkv_bias, + padding_offset, + cu_seqlens, + sequence_lengths, + batch_size, + seq_len, + head_num, + head_num_kv, + size_per_head, + rope_config, + use_logn_attn, + store_qkv, + store_q, + store_kv, + store_cache); + }); + }); + }); + }); +} + template void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, T* k_buf, @@ -2827,6 +3333,7 @@ void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, + const float2* rotary_embedding_coefficient_cache, cudaStream_t stream) { auto& param = *param_ptr; dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -2860,7 +3367,8 @@ void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, store_qkv, store_q, store_kv, - store_cache); + store_cache, + rotary_embedding_coefficient_cache); }); }); }); @@ -3022,7 +3530,7 @@ __global__ void gather_sequences_kernel_combined_v2(T* output_q, } template -__global__ void load_prefix_KVCache_kernel_aiter(T* q_buf, +__global__ void load_prefix_KVCache_kernel_aiter_v1(T* q_buf, T* k_buf, T* v_buf, PrefixPromptBatchWeightsParam param, @@ -3091,6 +3599,78 @@ __global__ void load_prefix_KVCache_kernel_aiter(T* q } } } + + +template +__global__ void load_prefix_KVCache_kernel_aiter(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam param, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head) { + static constexpr bool ENABLE_8BITS_CACHE = sizeof(Tcache) == 1; + + constexpr int vec_size = Vec_t::size; + using Vec_t = typename Vec_t::Type; + + const int head_idx = blockIdx.y; + const int tidx = threadIdx.x; + const int total_seq_len = param.max_prefix_prompt_length + seq_len; + + if (tidx * vec_size >= size_per_head) { + return; + } + // NOTE: blockIdx.x < batch_size * param.max_prefix_prompt_length really handles prefix prompts + + if (head_idx < head_num_kv) { + const int prompt_batch_idx = blockIdx.x / param.max_prefix_prompt_length; + const int prompt_seq_idx = blockIdx.x % param.max_prefix_prompt_length; + const int prompt_length = param.d_prefix_prompt_lengths[prompt_batch_idx]; + + if (prompt_seq_idx < prompt_length) { + const int dest_kv_idx = prompt_batch_idx * size_per_head * total_seq_len * head_num_kv + + head_idx * size_per_head * total_seq_len + prompt_seq_idx * size_per_head + 
+ tidx * vec_size; + if (param.kv_block_array.mMaxSeqs > 0) { + Tcache* k_cache = + reinterpret_cast(param.kv_block_array.getKBlockPtr(prompt_batch_idx, prompt_seq_idx)); + Tcache* v_cache = + reinterpret_cast(param.kv_block_array.getVBlockPtr(prompt_batch_idx, prompt_seq_idx)); + const int inKBlockIdx = param.kv_block_array.getKLocalIdx( + prompt_seq_idx, head_idx, size_per_head, tidx * vec_size); + + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inVBlockIdx = param.kv_block_array.getVLocalIdx( + prompt_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + v_buf[dest_kv_idx + vec_i] = *reinterpret_cast(&v_cache[inVBlockIdx]); + } + + if constexpr (ENABLE_8BITS_CACHE) { + float* k_scale_ptr = + reinterpret_cast(param.kv_block_array.getKScalePtr(prompt_batch_idx, prompt_seq_idx)); + float* v_scale_ptr = + reinterpret_cast(param.kv_block_array.getVScalePtr(prompt_batch_idx, prompt_seq_idx)); + int inScaleIdx = param.kv_block_array.getKVScaleLocalIdx(prompt_seq_idx, head_idx); + for (int vec_i = 0; vec_i < vec_size; vec_i++) { + const int inVBlockIdx = param.kv_block_array.getVLocalIdx( + prompt_seq_idx, head_idx, size_per_head, tidx * vec_size + vec_i); + load_8bits_kv_cache_vec(reinterpret_cast(&v_buf[dest_kv_idx]), + v_cache, + inVBlockIdx, + v_scale_ptr[inScaleIdx]); + } + load_8bits_kv_cache_vec( + reinterpret_cast(&k_buf[dest_kv_idx]), k_cache, inKBlockIdx, k_scale_ptr[inScaleIdx]); + } else { + *reinterpret_cast(&k_buf[dest_kv_idx]) = + *reinterpret_cast(&k_cache[inKBlockIdx]); + } + } + } + } +} #endif template @@ -3159,6 +3739,29 @@ void invokeGatherSequencesCombined(T* output_q, size_per_head); } +template +void invokeLoadPrefixKVCacheAiterV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param_ptr, + const int batch_size, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head, + const float* scale, + const int int8_mode, + cudaStream_t stream) { + auto& param = *param_ptr; + dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); + dim3 grid(batch_size * param.max_prefix_prompt_length, head_num); + + FT_SWITCH_KV_CACHE_TYPE_CASE(param.kv_block_array.cache_type, Tcache, [&] { + load_prefix_KVCache_kernel_aiter_v1 + <<>>(q_buf, k_buf, v_buf, param, seq_len, head_num, head_num_kv, size_per_head); + }); +} + template void invokeLoadPrefixKVCacheAiter(T* q_buf, T* k_buf, @@ -3313,6 +3916,42 @@ INSTANTIATEDECODEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); #endif #undef INSTANTIATEDECODEADDFUSEDQKVBIASTRANSPOSE #if USING_ROCM + +#define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(T) \ + template void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, \ + T* k_buf, \ + T* v_buf, \ + PrefixPromptBatchWeightsParam* param, \ + T* QKV, \ + void* QuantizedQKV, \ + const int* position_ids, \ + const T* qkv_bias, \ + const int* padding_offset, \ + const int* cu_seqlens, \ + const int batch_size, \ + const int seq_len, \ + const int token_num, \ + const int head_num, \ + const int head_num_kv, \ + const int size_per_head, \ + const RopeConfig rope_config, \ + const bool use_logn_attn, \ + const float* scale, \ + const int int8_mode, \ + const bool use_paged_fmha, \ + const bool store_qkv, \ + const bool store_q, \ + const bool store_kv, \ + const bool store_cache, \ + const float2 * rotary_embedding_coefficient_cache,\ + cudaStream_t stream) +INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(float); +INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(half); +#ifdef ENABLE_BF16 
+INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(__nv_bfloat16); +#endif +#undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1 + #define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(T) \ template void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, \ T* k_buf, \ @@ -3339,7 +3978,8 @@ INSTANTIATEDECODEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); const bool store_q, \ const bool store_kv, \ const bool store_cache, \ - cudaStream_t stream) + const float2* rotary_embedding_coefficient_cache, \ + cudaStream_t stream) INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(float); INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(half); #ifdef ENABLE_BF16 @@ -3347,6 +3987,42 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(__nv_bfloat16); #endif #undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL +#define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(T) \ + template void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, \ + T* k_buf, \ + T* v_buf, \ + PrefixPromptBatchWeightsParam* param, \ + const int* input_lengths, \ + T* QKV, \ + void* QuantizedQKV, \ + const int* position_ids, \ + const T* qkv_bias, \ + const int* padding_offset, \ + const int* cu_seqlens, \ + const int* sequence_lengths, \ + const int batch_size, \ + const int seq_len, \ + const int token_num, \ + const int head_num, \ + const int head_num_kv, \ + const int size_per_head, \ + const RopeConfig rope_config, \ + const bool use_logn_attn, \ + const float* scale, \ + const int int8_mode, \ + const bool use_paged_fmha, \ + const bool store_qkv, \ + const bool store_q, \ + const bool store_kv, \ + const bool store_cache, \ + cudaStream_t stream) +INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(float); +INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(half); +#ifdef ENABLE_BF16 +INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(__nv_bfloat16); +#endif +#undef INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1 + #define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODE(T) \ template void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, \ T* k_buf, \ @@ -3375,7 +4051,8 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(__nv_bfloat16); const bool store_q, \ const bool store_kv, \ const bool store_cache, \ - cudaStream_t stream) + const float2* rotary_embedding_coefficient_cache, \ + cudaStream_t stream) INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODE(float); INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODE(half); #ifdef ENABLE_BF16 @@ -3385,6 +4062,26 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODE(__nv_bfloat16); #endif #if USING_ROCM +#define INSTANTIATEINVOKELOADPREFIXKVCACHEAITERV1(T) \ + template void invokeLoadPrefixKVCacheAiterV1(T* q_buf, \ + T* k_buf, \ + T* v_buf, \ + PrefixPromptBatchWeightsParam* param, \ + const int batch_size, \ + const int seq_len, \ + const int head_num, \ + const int head_num_kv, \ + const int size_per_head, \ + const float* scale, \ + const int int8_mode, \ + cudaStream_t stream) +INSTANTIATEINVOKELOADPREFIXKVCACHEAITERV1(float); +INSTANTIATEINVOKELOADPREFIXKVCACHEAITERV1(half); +#ifdef ENABLE_BF16 +INSTANTIATEINVOKELOADPREFIXKVCACHEAITERV1(__nv_bfloat16); +#endif +#undef INSTANTIATEINVOKELOADPREFIXKVCACHEAITERV1 + #define INSTANTIATEINVOKELOADPREFIXKVCACHEAITER(T) \ template void invokeLoadPrefixKVCacheAiter(T* q_buf, \ T* k_buf, \ @@ -3449,4 +4146,21 @@ INSTANTIATEINVOKELOADPREFIXKVCACHE(__nv_bfloat16); +__global__ void +cache_rotary_embedding_coefficient(float2* rotary_embedding_coefficient_cache, int stride, RopeConfig rope_config) { + int tid = threadIdx.x; + int t_step = blockIdx.x; + // only support RopeStyle::Base for now. 
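+    // Each thread writes the float2 rotary coefficient (cos/sin pair) for position
+    // t_step and channel pair tid; the cache is indexed as [t_step, dim / 2] with
+    // stride == rope_config.dim / 2 (see the launcher below).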
+ rotary_embedding_coefficient_cache[t_step * stride + tid] = rotary_embedding_coefficient( + 2 * tid, rope_config.dim, t_step, rope_config.base, LinearScaleRope{rope_config.scale}); +} + +void invokeRotaryEmbeddingCoefficientCache(float2* rotary_embedding_coefficient_cache, + int max_seq_len, + RopeConfig rope_config, + cudaStream_t stream) { + cache_rotary_embedding_coefficient<<>>( + rotary_embedding_coefficient_cache, rope_config.dim / 2, rope_config); +} + } // namespace rtp_llm diff --git a/rtp_llm/cpp/kernels/unfused_attention_kernels.h b/rtp_llm/cpp/kernels/unfused_attention_kernels.h index 1e7dae06a..ef6d5de3b 100644 --- a/rtp_llm/cpp/kernels/unfused_attention_kernels.h +++ b/rtp_llm/cpp/kernels/unfused_attention_kernels.h @@ -17,6 +17,9 @@ #include "rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h" #include "rtp_llm/cpp/model_utils/RopeConfig.h" +#if USING_ROCM +#include "hip/amd_detail/amd_hip_vector_types.h" +#endif namespace rtp_llm { @@ -139,6 +142,35 @@ void invokeGatherSequencesCombined(T* output_q, int size_per_head, cudaStream_t stream); +template +void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + const float2 * rotary_embedding_coefficient_cache, + cudaStream_t stream); + template void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, T* k_buf, @@ -165,8 +197,38 @@ void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, + const float2* rotary_embedding_coefficient_cache, cudaStream_t stream); template +void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param, + const int* input_lengths, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + cudaStream_t stream); +template void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, T* k_buf, T* v_buf, @@ -194,7 +256,37 @@ void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, + const float2* rotary_embedding_coefficient_cache, cudaStream_t stream); +template +void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param, + const int* input_lengths, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int* sequence_lengths, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const 
RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + cudaStream_t stream); #endif template @@ -232,6 +324,20 @@ void invokeLoadPrefixKVCache(T* q_buf, cudaStream_t stream); #if USING_ROCM +template +void invokeLoadPrefixKVCacheAiterV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param, + const int batch_size, + const int seq_len, + const int head_num, + const int head_num_kv, + const int size_per_head, + const float* scale, + const int int8_mode, + cudaStream_t stream); + template void invokeLoadPrefixKVCacheAiter(T* q_buf, T* k_buf, @@ -251,4 +357,8 @@ void invokeLoadPrefixKVCacheAiter(T* q_buf, +void invokeRotaryEmbeddingCoefficientCache(float2* rotary_embedding_coefficient_cache, + int max_seq_len, + RopeConfig rope_config, + cudaStream_t stream); } // namespace rtp_llm diff --git a/rtp_llm/cpp/models/GptModel.cc b/rtp_llm/cpp/models/GptModel.cc index aa4a5e58e..7236f8cad 100644 --- a/rtp_llm/cpp/models/GptModel.cc +++ b/rtp_llm/cpp/models/GptModel.cc @@ -45,6 +45,12 @@ GptModel::GptModel(const GptModelInitParams& params): overall_expert_stats_ = device_->createMoeExpertStates( {layer_num_, moe_conf.ep_size, moe_conf.expert_num, moe_conf.expert_num + moe_conf.extra_expert_num}); } +#if USING_ROCM + auto & rope_config = params.description.attention_conf.rope_config; + if (rope_config.style == RopeStyle::Base) { + rotary_embedding_coefficient_cache_ = device_->getRotaryEmbeddingCoefficientCache(rope_config); + } +#endif } void getPaddingOffsetAndCuSeqLens(int32_t* padding_offset, @@ -1304,7 +1310,8 @@ AttentionBlockOutputs GptModel::forwardAttentionBlock(const GptLayerInputs& description_.act_qscheme, description_.compute_type, enable_sp, - inputs.pad_token_num}); + inputs.pad_token_num, + rotary_embedding_coefficient_cache_}); if (description_.attention_conf.use_mla && device_->mla_ops_type != rtp_llm::MlaOpsType::MHA) { attn_output = device_->mlaAttentionLayer(attn_params); } else { diff --git a/rtp_llm/cpp/models/GptModel.h b/rtp_llm/cpp/models/GptModel.h index ccb6ffb1b..a7334bb55 100644 --- a/rtp_llm/cpp/models/GptModel.h +++ b/rtp_llm/cpp/models/GptModel.h @@ -234,6 +234,7 @@ class GptModel { rtp_llm::BufferPtr v_scale_buffer_; rtp_llm::BufferPtr residual_scale_fp32_; rtp_llm::BufferPtr residual_scale_; + rtp_llm::BufferPtr rotary_embedding_coefficient_cache_ = nullptr; public: rtp_llm::Weights weights_; diff --git a/rtp_llm/cpp/pybind/ConfigInit.cc b/rtp_llm/cpp/pybind/ConfigInit.cc index 86952516e..0de9d9717 100644 --- a/rtp_llm/cpp/pybind/ConfigInit.cc +++ b/rtp_llm/cpp/pybind/ConfigInit.cc @@ -214,16 +214,18 @@ void register_profiling_debug_logging_config(pybind11::module& m) { void register_hwkernel_config(pybind11::module& m) { pybind11::class_(m, "HWKernelConfig") - .def(pybind11::init(), + .def(pybind11::init(), pybind11::arg("deep_gemm_num_sm") = -1, pybind11::arg("arm_gemm_use_kai") = false, pybind11::arg("enable_stable_scatter_add") = false, pybind11::arg("enable_multi_block_mode") = true, pybind11::arg("ft_disable_custom_ar") = true, pybind11::arg("rocm_hipblaslt_config") = "gemm_config.csv", + pybind11::arg("use_swizzleA") = false, pybind11::arg("enable_cuda_graph") = false, pybind11::arg("enable_cuda_graph_debug_mode") = false, pybind11::arg("use_aiter_pa") = true, + pybind11::arg("use_asm_pa") = true, pybind11::arg("enable_native_cuda_graph") = false, 
pybind11::arg("num_native_cuda_graph") = 200) .def("to_string", &HWKernelConfig::to_string) @@ -234,9 +236,11 @@ void register_hwkernel_config(pybind11::module& m) { .def_readwrite("enable_multi_block_mode", &HWKernelConfig::enable_multi_block_mode) .def_readwrite("ft_disable_custom_ar", &HWKernelConfig::ft_disable_custom_ar) .def_readwrite("rocm_hipblaslt_config", &HWKernelConfig::rocm_hipblaslt_config) + .def_readwrite("use_swizzleA", &HWKernelConfig::use_swizzleA) .def_readwrite("enable_cuda_graph", &HWKernelConfig::enable_cuda_graph) .def_readwrite("enable_cuda_graph_debug_mode", &HWKernelConfig::enable_cuda_graph_debug_mode) .def_readwrite("use_aiter_pa", &HWKernelConfig::use_aiter_pa) + .def_readwrite("use_asm_pa", &HWKernelConfig::use_asm_pa) .def_readwrite("enable_native_cuda_graph", &HWKernelConfig::enable_native_cuda_graph) .def_readwrite("num_native_cuda_graph", &HWKernelConfig::num_native_cuda_graph); } diff --git a/rtp_llm/cpp/rocm/BUILD b/rtp_llm/cpp/rocm/BUILD index 567aaf5e9..75c782630 100644 --- a/rtp_llm/cpp/rocm/BUILD +++ b/rtp_llm/cpp/rocm/BUILD @@ -38,12 +38,12 @@ cc_library( srcs = glob([ "*.cc", "int4_gemm_kernels/*.cc", - ]), + ], exclude=["rocmMoeWrapper.cc"]), hdrs = glob([ "*.h", "int4_gemm_kernels/*.h", "*.cuh", - ]), + ], exclude=["rocmMoeWrapper.h"]), deps = [ "@local_config_rocm//rocm:rocm", "@local_config_rocm//rocm:rocm_headers", @@ -53,15 +53,16 @@ cc_library( "//rtp_llm/cpp/utils:core_utils", "//rtp_llm/cpp/core:allocator", "//rtp_llm/cpp/core:types", - "@composable_kernel//:ck_fmha_example", - "@composable_kernel//:ck_layernorm2d_example", - "@composable_kernel//:ck_rmsnorm2d_example", - "@composable_kernel//:ck_fused_moe_example", "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - ], + ] + select({ + "@//:using_aiter_src": [ + "@aiter_src//:module_mha_fwd", + ], + "//conditions:default": ["@aiter//:module_mha_fwd",], + }), copts = rocm_copts(), include_prefix = "src", visibility = ["//visibility:public"], diff --git a/rtp_llm/cpp/rocm/TensorDataManipulation.h b/rtp_llm/cpp/rocm/TensorDataManipulation.h new file mode 100644 index 000000000..70b133c5b --- /dev/null +++ b/rtp_llm/cpp/rocm/TensorDataManipulation.h @@ -0,0 +1,412 @@ +#pragma once +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +typedef SSIZE_T ssize_t; +#endif + +namespace Tensor +{ + namespace Manipulation + { + + using Shape = std::vector; + using Strides = std::vector; + using Indices = std::vector; + using Permutation = std::vector; + //shape: [M, N, K] + //strides: [N * K, K, 1] + + class TensorDesc + { + public: + explicit TensorDesc(std::initializer_list shape) + : shape(shape) + { + strides.assign(shape.size(), 1); + + for(ssize_t i = strides.size() - 2; i >= 0; --i) + { + strides[i] = strides[i + 1] * this->shape[i + 1]; + } + } + + explicit TensorDesc(const Shape& shape) + : shape(shape) + { + strides.assign(shape.size(), 1); + + for(int i = strides.size() - 2; i >= 0; --i) + { + strides[i] = strides[i + 1] * this->shape[i + 1]; + } + } + + TensorDesc(std::initializer_list shape, std::initializer_list strides) + : shape(shape) + , strides(strides) + { + } + + TensorDesc(const Shape& shape, const Strides& strides) + : shape(shape) + , strides(strides) + { + } + + size_t stride(size_t i) const + { + return strides.at(i); + } + + size_t numDims() const + { + return shape.size(); + } + + size_t dim(size_t i) const + { + return 
shape.at(i); + } + + const Shape& getShape() const + { + return shape; + } + + void setShape(const Shape& shape) + { + this->shape = shape; + strides.assign(shape.size(), 1); + + for(int i = strides.size() - 2; i >= 0; --i) + { + strides[i] = strides[i + 1] * this->shape[i + 1]; + } + } + + friend std::ostream& operator<<(std::ostream& os, const TensorDesc& desc) + { + os << "Shape: ["; + for(auto i : desc.shape) + { + os << i << ", "; + } + os << "]\n"; + os << "Strides: ["; + for(auto i : desc.strides) + { + os << i << ", "; + } + os << "]\n"; + return os; + } + + std::size_t flattenSize() const + { + size_t s{1}; + for(auto i : shape) + { + s *= i; + } + return s; + } + + bool isShapeCompatible(const Shape& shape) const + { + TensorDesc newDesc(shape); + return flattenSize() == newDesc.flattenSize(); + } + + bool canShapePadTo(const Shape& shape) const + { + if(this->shape.size() != shape.size()) + { + return false; + } + + for(size_t i = 0; i < this->shape.size(); ++i) + { + if(this->shape.at(i) > shape.at(i)) + { + return false; + } + } + + return true; + } + + private: + Shape shape; + Strides strides; + }; + + class Tensor + { + public: + template + static Tensor create(const Shape shape) + { + return Tensor(shape, sizeof(T)); + } + + Tensor(const Shape shape, size_t elementSize) + : desc(shape) + , elementSize(elementSize) + , data(new char[elementSize * desc.flattenSize()]) + { + } + + template + const T* as() const + { + return reinterpret_cast(data.get()); + } + + template + T* as() + { + return reinterpret_cast(data.get()); + } + + template + const T& getValue(const Indices& indices) const + { + size_t offset{}; + + for(size_t i = 0; i < indices.size(); ++i) + { + const auto idx = indices[i]; + offset += desc.stride(i) * idx; + } + + return as()[offset]; + } + + template + const T& setValue(const Indices& indices, const T& value) + { + size_t offset{}; + + for(size_t i = 0; i < indices.size(); ++i) + { + const auto idx = indices[i]; + offset += desc.stride(i) * idx; + } + + as()[offset] = value; + return value; + } + + friend std::ostream& operator<<(std::ostream& os, const Tensor& t) + { + os << t.desc; + return os; + } + + const TensorDesc& getDesc() const + { + return desc; + } + + size_t getElementSize() const + { + return elementSize; + } + + size_t getNumBytes() const + { + return getDesc().flattenSize() * getElementSize(); + } + + void reshape(const Shape& shape) + { + if(desc.isShapeCompatible(shape)) + { + desc.setShape(shape); + return; + } + assert(false && "Incompatible shape"); + } + + private: + size_t elementSize{}; + TensorDesc desc; + std::unique_ptr data; + }; + + Indices permute(const Indices& indices, const Permutation& perm) + { + assert(indices.size() == perm.size()); + Indices newIndices = indices; + for(size_t i = 0; i < perm.size(); ++i) + { + newIndices[i] = indices.at(perm.at(i)); + } + return newIndices; + } + + using IterateCallback = std::function; + using IterateDimCallback = std::function; + + void iterate( + const Shape& shape, + size_t dim, + Indices& indices, + IterateCallback callback, + IterateDimCallback dimEnterCallback = [](size_t) {}, + IterateDimCallback dimLeaveCallback = [](size_t) {}) + { + + if(dim == shape.size()) + { + callback(indices); + return; + } + + dimEnterCallback(dim); + + for(size_t i = 0; i < shape.at(dim); ++i) + { + indices[dim] = i; + iterate(shape, dim + 1, indices, callback, dimEnterCallback, dimLeaveCallback); + } + + dimLeaveCallback(dim); + } + + template + void permute(Tensor& dst, const Tensor& src, const 
Permutation& perm) + { + Indices indices(src.getDesc().numDims(), 0); + + iterate( + src.getDesc().getShape(), 0, indices, [&dst, &src, &perm](const Indices& indices) { + Indices dstIndices = permute(indices, perm); + auto&& value = src.getValue(indices); + dst.setValue(dstIndices, value); + }); + } + + template + Tensor permute(const Tensor& tensor, const Permutation& perm) + { + assert(tensor.getDesc().numDims() == perm.size()); + assert(sizeof(T) == tensor.getElementSize()); + Shape newShape = permute(tensor.getDesc().getShape(), perm); + Tensor permuted(newShape, tensor.getElementSize()); + permute(permuted, tensor, perm); + return permuted; + } + + template + Tensor pad(const Tensor& src, const Shape& newShape, T padVal) + { + assert(src.getDesc().canShapePadTo(newShape) && "Invalid shape for padding"); + Tensor dst(newShape, sizeof(T)); + Indices indices(src.getDesc().numDims(), 0); + + iterate(dst.getDesc().getShape(), 0, indices, [&dst, &padVal](const Indices& indices) { + dst.setValue(indices, padVal); + }); + + iterate(src.getDesc().getShape(), 0, indices, [&dst, &src](const Indices& indices) { + auto&& value = src.getValue(indices); + dst.setValue(indices, value); + }); + return dst; + } + + Tensor pad(const Tensor& tensor, + const Shape& newShape, + const void* padValPtr, + size_t padValSize) + { + switch(padValSize) + { + case 1: + return pad(tensor, newShape, *static_cast(padValPtr)); + case 2: + return pad(tensor, newShape, *static_cast(padValPtr)); + case 4: + return pad(tensor, newShape, *static_cast(padValPtr)); + case 8: + return pad(tensor, newShape, *static_cast(padValPtr)); + default: + assert(false && "Unsupported element size"); + } + + return Tensor({0}, tensor.getElementSize()); + } + + Tensor permute(const Tensor& tensor, const Permutation& perm) + { + Shape newShape = permute(tensor.getDesc().getShape(), perm); + Tensor permuted(newShape, tensor.getElementSize()); + switch(tensor.getElementSize()) + { + case 1: + permute(permuted, tensor, perm); + break; + case 2: + permute(permuted, tensor, perm); + break; + case 4: + permute(permuted, tensor, perm); + break; + case 8: + permute(permuted, tensor, perm); + break; + default: + assert(false && "Unsupported element size"); + } + return permuted; + } + + template + void printTensorData(std::ostream& os, const Tensor& tensor) + { + const auto* data = tensor.as(); + const auto numElements = tensor.getDesc().flattenSize(); + os << "["; + + for(size_t i = 0; i < numElements; ++i) + { + os << float(data[i]) << ", "; + } + + os << "]\n"; + } + + template + void printTensorDataMultiDims(std::ostream& os, const Tensor& tensor) + { + os << "["; + + Indices indices(tensor.getDesc().numDims(), 0); + + iterate( + tensor.getDesc().getShape(), + 0, + indices, + [&os, &tensor](const Indices& idx) { + os << float(tensor.getValue(idx)) << ", "; + }, + [&os](size_t dim) { os << "["; }, + [&os, &tensor](size_t dim) { + os << "], "; + + if(dim + 1 == tensor.getDesc().numDims()) + { + os << '\n'; + } + }); + + os << "]\n"; + } + } +} diff --git a/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc b/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc index 66b1b5b15..b9ebc3b74 100644 --- a/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc +++ b/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc @@ -35,7 +35,7 @@ bool CustomAllReduceComm::checkAllReduceAvailable(size_t elts_total_num, DataTyp } if (world_size == 2 or support_nv_link_) { - return elts_total_size < comm_buf_threshold_; + return elts_total_size <= comm_buf_threshold_; } return false; @@ -50,15 
+50,17 @@ void CustomAllReduceComm::allReduce(torch::Tensor& input_tensor, torch::Tensor& } void CustomAllReduceComm::registerGraphBuffers() { - auto [handle, offset] = aiter::get_graph_buffer_ipc_meta(fa_); - auto _handles = all_gather(handle.data_ptr(), handle.numel(), at::hip::getCurrentHIPStream().stream()); - auto _offsets = all_gather(offset.data(), sizeof(int64_t) * offset.size(), at::hip::getCurrentHIPStream().stream()); - std::vector handles(world_size_); - std::vector> offsets(world_size_); + auto handle_and_offset = aiter::get_graph_buffer_ipc_meta(fa_); // tuple> -> vector size=2 + auto handle = handle_and_offset[0]; + auto offset = handle_and_offset[1]; + + auto _handles = all_gather(handle.data_ptr(), handle.element_size() * handle.numel(), at::hip::getCurrentHIPStream().stream()); + auto _offsets = all_gather(offset.data_ptr(), offset.element_size() * offset.numel(), at::hip::getCurrentHIPStream().stream()); + std::vector handles(world_size_); // vector -> vector + std::vector offsets(world_size_); // vector> -> vector for (int i = 0; i < world_size_; ++i) { - handles[i] = std::string(_handles[i].data(), handle.numel()); - offsets[i] = std::vector(_offsets[i].size() / sizeof(int64_t)); - std::memcpy(offsets[i].data(), _offsets[i].data(), _offsets[i].size()); + handles[i] = torch::from_blob(_handles[i].data(), handle.sizes(), handle.dtype()); + offsets[i] = torch::from_blob(_offsets[i].data(), offset.sizes(), offset.dtype()); } aiter::register_graph_buffers(fa_, handles, offsets); } @@ -89,10 +91,10 @@ void CustomAllReduceComm::init(const NcclParam& nccl_para, hipStream_t stream) { comm_buf_threshold_, }, torch::dtype(torch::kUInt8).device(torch::kCUDA)); - rank_data_ = torch::empty({8 * 1024 * 1024}, torch::dtype(torch::kUInt8).device(torch::kCUDA)); + rank_data_ = torch::empty({16 * 1024 * 1024}, torch::dtype(torch::kUInt8).device(torch::kCUDA)); - std::vector meta_handles = prepareP2PBuffer_(nccl_para, meta_, stream); - std::vector buffer_handles = prepareP2PBuffer_(nccl_para, buffer_, stream); + std::vector meta_handles = prepareP2PBuffer_(nccl_para, meta_, stream); + std::vector buffer_handles = prepareP2PBuffer_(nccl_para, buffer_, stream); std::vector meta_offsets(world_size_, 0); std::vector buffer_offsets(world_size_, 0); @@ -103,7 +105,7 @@ void CustomAllReduceComm::init(const NcclParam& nccl_para, hipStream_t stream) { nccl_para_ = nccl_para; } -std::vector +std::vector CustomAllReduceComm::prepareP2PBuffer_(const NcclParam& nccl_para, torch::Tensor& local_buffer, hipStream_t stream) { // malloc serial handle buffer char* serial_handle_buffer_ptr; @@ -125,16 +127,17 @@ CustomAllReduceComm::prepareP2PBuffer_(const NcclParam& nccl_para, torch::Tensor serial_handle_buffer_ptr, serial_handle_buffer_ptr, HIP_IPC_HANDLE_SIZE, rank_index_, nccl_para, stream); ROCM_CHECK(hipStreamSynchronize(stream)); - // deserialize all ranks' hipIpcMemHandle, and convert to std::string for aiter use - std::vector handles(world_size_); + // deserialize all ranks' hipIpcMemHandle, and convert to std::tensor for aiter use + std::vector handles(world_size_); + auto options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); for (size_t i = 0; i < handles.size(); ++i) { char tmp[HIP_IPC_HANDLE_SIZE]; - ROCM_CHECK(hipMemcpyAsync(tmp, - serial_handle_buffer_ptr + HIP_IPC_HANDLE_SIZE * i, - HIP_IPC_HANDLE_SIZE, - hipMemcpyDeviceToHost, - stream)); - handles[i] = std::string(tmp, HIP_IPC_HANDLE_SIZE); + handles[i] = torch::empty({static_cast(HIP_IPC_HANDLE_SIZE)}, options); + 
ROCM_CHECK(hipMemcpyAsync(handles[i].data_ptr(), + serial_handle_buffer_ptr + HIP_IPC_HANDLE_SIZE * i, + HIP_IPC_HANDLE_SIZE, + hipMemcpyDeviceToHost, + stream)); } ROCM_CHECK(hipFreeAsync(serial_handle_buffer_ptr, stream)); @@ -178,7 +181,7 @@ bool CustomAllReduceComm::shouldCustomAR(const std::vector& tp_ranks, si } size_t CustomAllReduceComm::getCommBufThreshold() { - int64_t custom_ar_size_threshold = 8192 * 1024 * 8; + int64_t custom_ar_size_threshold = 8192 * 1024 * 16; return custom_ar_size_threshold; } diff --git a/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h b/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h index 7c2987597..1c3743a41 100644 --- a/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h +++ b/rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h @@ -41,7 +41,7 @@ class CustomAllReduceComm { return HIP_IPC_HANDLE_SIZE * world_size; } - std::vector + std::vector prepareP2PBuffer_(const NcclParam& nccl_para, torch::Tensor& local_buffer, hipStream_t stream); const size_t rank_ = 0; diff --git a/rtp_llm/cpp/rocm/datatype_interface.h b/rtp_llm/cpp/rocm/datatype_interface.h new file mode 100644 index 000000000..a23831162 --- /dev/null +++ b/rtp_llm/cpp/rocm/datatype_interface.h @@ -0,0 +1,95 @@ +#pragma once +#include + +#include + +union computeTypeInterface +{ + float f32; + double f64; + hipblasLtHalf f16; + int32_t i32; +}; + +template +constexpr auto hipblaslt_type2datatype() +{ + if(std::is_same{}) + return HIP_R_16F; + if(std::is_same{}) + return HIP_R_16BF; + if(std::is_same{}) + return HIP_R_32F; + if(std::is_same{}) + return HIP_R_64F; + if(std::is_same{}) + return HIP_R_8F_E4M3_FNUZ; + if(std::is_same{}) + return HIP_R_8F_E5M2_FNUZ; +#ifdef ROCM_USE_FLOAT8 + if(std::is_same{}) + return HIP_R_8F_E4M3; + if(std::is_same{}) + return HIP_R_8F_E5M2; +#endif + if(std::is_same{}) + return HIP_R_32I; + if(std::is_same{}) + return HIP_R_8I; + + return HIP_R_16F; // testing purposes we default to f32 ex +} + +inline hipDataType computeTypeToRealDataType(hipblasComputeType_t ctype) +{ + static const std::map ctypeMap{ + {HIPBLAS_COMPUTE_16F, HIP_R_16F}, + {HIPBLAS_COMPUTE_16F_PEDANTIC, HIP_R_16F}, + {HIPBLAS_COMPUTE_32F, HIP_R_32F}, + {HIPBLAS_COMPUTE_32F_PEDANTIC, HIP_R_32F}, + {HIPBLAS_COMPUTE_32F_FAST_16F, HIP_R_32F}, + {HIPBLAS_COMPUTE_32F_FAST_16BF, HIP_R_32F}, + {HIPBLAS_COMPUTE_32F_FAST_TF32, HIP_R_32F}, + {HIPBLAS_COMPUTE_64F, HIP_R_64F}, + {HIPBLAS_COMPUTE_64F_PEDANTIC, HIP_R_64F}, + {HIPBLAS_COMPUTE_32I, HIP_R_32I}, + {HIPBLAS_COMPUTE_32I_PEDANTIC, HIP_R_32I}}; + + return ctypeMap.at(ctype); +} + +inline std::size_t realDataTypeSize(hipDataType dtype) +{ + // These types were not defined in older versions of ROCm, so need to be handled specially here. 
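+    // HIP_R_4F_E2M1_EXT / HIP_R_6F_E2M3_EXT / HIP_R_6F_E3M2_EXT are sub-byte
+    // formats; this helper reports them as 1 byte, the smallest size it can express.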
+ auto const dtype_int = static_cast(dtype); + if(dtype_int == HIP_R_4F_E2M1_EXT || dtype_int == HIP_R_6F_E2M3_EXT + || dtype_int == HIP_R_6F_E3M2_EXT) + { + return 1; + } + + static const std::map dtypeMap{ + {HIP_R_32F, 4}, + {HIP_R_64F, 8}, + {HIP_R_16F, 2}, + {HIP_R_8I, 1}, + {HIP_R_8U, 1}, + {HIP_R_32I, 4}, + {HIP_R_32U, 4}, + {HIP_R_16BF, 2}, + {HIP_R_4I, 1}, + {HIP_R_4U, 1}, + {HIP_R_16I, 2}, + {HIP_R_16U, 2}, + {HIP_R_64I, 8}, + {HIP_R_64U, 8}, + {HIP_R_8F_E4M3_FNUZ, 1}, + {HIP_R_8F_E5M2_FNUZ, 1}, +#ifdef ROCM_USE_FLOAT8 + {HIP_R_8F_E4M3, 1}, + {HIP_R_8F_E5M2, 1}, +#endif + }; + + return dtypeMap.at(dtype); +} diff --git a/rtp_llm/cpp/rocm/hipblasMMWrapper.cc b/rtp_llm/cpp/rocm/hipblasMMWrapper.cc index 325eb6777..83dc6b44a 100644 --- a/rtp_llm/cpp/rocm/hipblasMMWrapper.cc +++ b/rtp_llm/cpp/rocm/hipblasMMWrapper.cc @@ -1,5 +1,8 @@ #include "hipblasMMWrapper.h" #include "rtp_llm/cpp/config/ConfigModules.h" +#include "datatype_interface.h" +#include "TensorDataManipulation.h" +#include namespace rtp_llm { namespace rocm { @@ -13,6 +16,8 @@ hipblasMMWrapper::hipblasMMWrapper(hipblasHandle_t hipblas_handle, RTP_LLM_LOG_DEBUG(__PRETTY_FUNCTION__); hipblas_workspace_ = allocator_->malloc(HIPBLAS_WORKSPACE_SIZE); std::string config_path = hw_kernel_config.rocm_hipblaslt_config; + use_swizzleA_ = hw_kernel_config.use_swizzleA; + test_swizzleA_ = bool(autil::EnvUtil::getEnv("TEST_SWIZZLEA", 0L)); if (config_path.empty()) { RTP_LLM_LOG_WARNING("ROCM_HIPBLASLT_CONFIG not set. Defaulting to gemm_config.csv."); config_path = "gemm_config.csv"; @@ -31,6 +36,14 @@ hipblasMMWrapper::~hipblasMMWrapper() { allocator_->free((void**)(&hipblas_workspace_)); } +bool hipblasMMWrapper::use_swizzleA() { + return use_swizzleA_; +} + +bool hipblasMMWrapper::test_swizzleA() { + return test_swizzleA_; +} + hipblasDatatype_t hipblasMMWrapper::getHipBlasDataType(hipDataType data_type) { if (data_type == HIP_R_16F) { return HIPBLAS_R_16F; @@ -80,6 +93,129 @@ void hipblasMMWrapper::setGemmConfig(hipDataType aType, hipDataType bType, hipDa computeType_ = computeType; } +void hipblasMMWrapper::FP8_Gemm(hipblasOperation_t transa, + hipblasOperation_t transb, + const int m, + const int n, + const int k, + const void* A, + const int lda, + const void* B, + const int ldb, + void* C, + const int ldc, + const float* d_scale_a, + const float* d_scale_b, + float alpha_, + float beta_) { + + RTP_LLM_LOG_DEBUG(__PRETTY_FUNCTION__); + + hipblasLtMatrixLayout_t ADesc, BDesc, CDesc; + hipblasLtMatmulDesc_t matmul; + ROCM_CHECK(hipblasLtMatmulDescCreate(&matmul, HIPBLAS_COMPUTE_32F, computeType_)); + + RTP_LLM_CHECK_WITH_INFO( + Atype_ == HIP_R_8F_E4M3_FNUZ, + "Unexpected Atype_: %d (expected: %d)", static_cast(Atype_), static_cast(HIP_R_8F_E4M3_FNUZ) + ); + RTP_LLM_CHECK_WITH_INFO( + transa == HIPBLAS_OP_N, + "Unexpected transa: %d (expected: %d)", static_cast(transa), static_cast(HIPBLAS_OP_N) + ); + + if (use_swizzleA_ || test_swizzleA_){ + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&ADesc, Atype_, k, m, k)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&BDesc, Btype_, transb == HIPBLAS_OP_N ? k : n, transb == HIPBLAS_OP_N ? 
n : k, ldb)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&CDesc, Ctype_, m, n, ldc)); + + hipblasLtOrder_t orderA = HIPBLASLT_ORDER_COL16_4R16; + ROCM_CHECK(hipblasLtMatrixLayoutSetAttribute(ADesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA))); + + hipblasOperation_t trans_a = HIPBLAS_OP_T; + hipblasOperation_t trans_b = transb; + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &trans_a, sizeof(int32_t))); + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &trans_b, sizeof(int32_t))); + } + else { + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&ADesc, Atype_, transa == HIPBLAS_OP_N ? m : k, transa == HIPBLAS_OP_N ? k : m, lda)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&BDesc, Btype_, transb == HIPBLAS_OP_N ? k : n, transb == HIPBLAS_OP_N ? n : k, ldb)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&CDesc, Ctype_, m, n, ldc)); + + hipblasOperation_t trans_a = transa; + hipblasOperation_t trans_b = transb; + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &trans_a, sizeof(int32_t))); + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &trans_b, sizeof(int32_t))); + } + + hipblasLtMatmulMatrixScale_t a_mode = HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F; + hipblasLtMatmulMatrixScale_t b_mode = HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F; + + ROCM_CHECK(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, &a_mode, sizeof(uint32_t))); + ROCM_CHECK(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, &b_mode, sizeof(uint32_t))); + + ROCM_CHECK(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a, sizeof(d_scale_a))); + ROCM_CHECK(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b, sizeof(d_scale_b))); + + const int request_solutions = 1; + hipblasLtMatmulHeuristicResult_t heuristicResult[request_solutions]; + int returnedAlgoCount = 0; + ROCM_CHECK(hipblasLtMatmulAlgoGetHeuristic(hipblaslt_handle_, + matmul, + ADesc, + BDesc, + CDesc, + CDesc, + blasLtPrefer, + request_solutions, + heuristicResult, + &returnedAlgoCount)); + + if(returnedAlgoCount == 0) { + std::cerr << "No valid solution found in hipblasMMWrapper::FP8_Gemm" << std::endl; + return; + } + + void* workSpace = hipblas_workspace_; + int workspaceSize = HIPBLAS_WORKSPACE_SIZE; + const void* alpha = reinterpret_cast(&alpha_); + const void* beta = reinterpret_cast(&beta_); + + hipblasStatus_t blaslt_status; + blaslt_status = hipblasLtMatmul(hipblaslt_handle_, + matmul, + alpha, + A, + ADesc, + B, + BDesc, + beta, + C, + CDesc, + C, + CDesc, + &heuristicResult[0].algo, + workSpace, + workspaceSize, + stream_); + + if (blaslt_status != HIPBLAS_STATUS_SUCCESS) { + std::cerr << "hipblasMMWrapper::FP8_Gemm failed" << std::endl; + return; + } + + ROCM_CHECK(hipblasLtMatmulDescDestroy(matmul)); + ROCM_CHECK(hipblasLtMatrixLayoutDestroy(ADesc)); + ROCM_CHECK(hipblasLtMatrixLayoutDestroy(BDesc)); + ROCM_CHECK(hipblasLtMatrixLayoutDestroy(CDesc)); + +} + + void hipblasMMWrapper::Gemm(hipblasOperation_t transa, hipblasOperation_t transb, const int m, @@ -143,20 +279,36 @@ void hipblasMMWrapper::Gemm(hipblasOperation_t transa, workspaceSize, stream_)); } else { - hipblasLtMatrixLayout_t ADesc, BDesc, CDesc; - ROCM_CHECK(hipblasLtMatrixLayoutCreate( - &ADesc, Atype_, transa == HIPBLAS_OP_N ? m : k, transa == HIPBLAS_OP_N ? 
k : m, lda)); - ROCM_CHECK(hipblasLtMatrixLayoutCreate( - &BDesc, Btype_, transb == HIPBLAS_OP_N ? k : n, transb == HIPBLAS_OP_N ? n : k, ldb)); - ROCM_CHECK(hipblasLtMatrixLayoutCreate(&CDesc, Ctype_, m, n, ldc)); + hipblasLtMatrixLayout_t ADesc, BDesc, CDesc; hipblasLtMatmulDesc_t matmul; ROCM_CHECK(hipblasLtMatmulDescCreate(&matmul, HIPBLAS_COMPUTE_32F, computeType_)); - hipblasOperation_t trans_a = transa; - hipblasOperation_t trans_b = transb; - ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &trans_a, sizeof(int32_t))); - ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &trans_b, sizeof(int32_t))); + if ((use_swizzleA_ || test_swizzleA_) && transa==HIPBLAS_OP_N && (Atype_ == HIP_R_16BF || Atype_ == HIP_R_16F)){ + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&ADesc, Atype_, k, m, k)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&BDesc, Btype_, transb == HIPBLAS_OP_N ? k : n, transb == HIPBLAS_OP_N ? n : k, ldb)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&CDesc, Ctype_, m, n, ldc)); + hipblasOperation_t trans_a = HIPBLAS_OP_T; + hipblasOperation_t trans_b = transb; + + hipblasLtOrder_t orderA = HIPBLASLT_ORDER_COL16_4R8; + ROCM_CHECK(hipblasLtMatrixLayoutSetAttribute(ADesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA))); + + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &trans_a, sizeof(int32_t))); + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &trans_b, sizeof(int32_t))); + } + else{ + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&ADesc, Atype_, transa == HIPBLAS_OP_N ? m : k, transa == HIPBLAS_OP_N ? k : m, lda)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&BDesc, Btype_, transb == HIPBLAS_OP_N ? k : n, transb == HIPBLAS_OP_N ? 
n : k, ldb)); + ROCM_CHECK(hipblasLtMatrixLayoutCreate(&CDesc, Ctype_, m, n, ldc)); + hipblasOperation_t trans_a = transa; + hipblasOperation_t trans_b = transb; + + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &trans_a, sizeof(int32_t))); + ROCM_CHECK(hipblasLtMatmulDescSetAttribute(matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &trans_b, sizeof(int32_t))); + } + + const int request_solutions = 1; hipblasLtMatmulHeuristicResult_t heuristicResult[request_solutions]; int returnedAlgoCount = 0; diff --git a/rtp_llm/cpp/rocm/hipblasMMWrapper.h b/rtp_llm/cpp/rocm/hipblasMMWrapper.h index 3361701a1..44acc77f2 100644 --- a/rtp_llm/cpp/rocm/hipblasMMWrapper.h +++ b/rtp_llm/cpp/rocm/hipblasMMWrapper.h @@ -23,6 +23,9 @@ class hipblasMMWrapper { hipDataType Ctype_; hipDataType computeType_; + bool use_swizzleA_; + bool test_swizzleA_; + hipStream_t stream_; rocm::hipblasAlgoMap hipblas_algo_map_; @@ -69,6 +72,22 @@ class hipblasMMWrapper { const int ldc, float alpha_ = float(1.0f), float beta_ = float(0.0f)); + + void FP8_Gemm(hipblasOperation_t transa, + hipblasOperation_t transb, + const int m, + const int n, + const int k, + const void* A, + const int lda, + const void* B, + const int ldb, + void* C, + const int ldc, + const float* d_scale_a, + const float* d_scale_b, + float alpha_ = float(1.0f), + float beta_ = float(0.0f)); void GemmBiasAct(hipblasOperation_t transa, hipblasOperation_t transb, @@ -119,6 +138,9 @@ class hipblasMMWrapper { stream_ = stream; hipblasSetStream(hipblas_handle_, stream_); } + + bool use_swizzleA(); + bool test_swizzleA(); }; } // namespace rocm } // namespace rtp_llm diff --git a/rtp_llm/cpp/rocm/rocmFmhaWrapper.cc b/rtp_llm/cpp/rocm/rocmFmhaWrapper.cc index 39dac1d02..d03f7eb32 100644 --- a/rtp_llm/cpp/rocm/rocmFmhaWrapper.cc +++ b/rtp_llm/cpp/rocm/rocmFmhaWrapper.cc @@ -1,9 +1,7 @@ #include "rocmFmhaWrapper.h" -#include "fmha_fwd.hpp" +#include "mha_fwd.h" #include "ck_tile/host.hpp" -#include "mask.hpp" #include "utils.hpp" -#include "bias.hpp" #include "rtp_llm/cpp/utils/Logger.h" // #include "aiter_meta/3rdparty/composable_kernel/example/ck_tile/01_fmha/mask.hpp" @@ -140,17 +138,17 @@ uint32_t rocmFmhaWrapper::runCKFmha(void* q, // ck_tile::DeviceMem lse_acc_buf(lse_acc_host.get_element_space_size_in_bytes()); auto has_logits_soft_cap = false; - auto fmha_traits = fmha_fwd_traits{hdim_q, - hdim_v, - data_type, - mode == mode_enum::group, - is_v_rowmajor, - has_logits_soft_cap, - mask.type, - bias.type, - lse, - p_drop > 0.0f, - squant}; + // auto fmha_traits = fmha_fwd_traits{hdim_q, + // hdim_v, + // data_type, + // mode == mode_enum::group, + // is_v_rowmajor, + // has_logits_soft_cap, + // mask.type, + // bias.type, + // lse, + // p_drop > 0.0f, + // squant}; auto fmha_args = [&]() { assert(nhead % nhead_k == 0); @@ -283,8 +281,13 @@ uint32_t rocmFmhaWrapper::runCKFmha(void* q, 1, // nrepeat_ // false // }; - - float run_time = fmha_fwd(fmha_traits, fmha_args, stream_config); + float run_time; + // if (data_type == "bf16" && size_per_head_ == 128 && msk_str == "b") + // run_time = aiter::mha_fwd( + // fmha_args, stream_config, data_type, mode == mode_enum::group, mask.type, bias.type, lse, true); + // else + run_time = aiter::mha_fwd( + fmha_args, stream_config, data_type, mode == mode_enum::group, mask.type, bias.type, lse, false); // std::cout << "\nrun_time for ck fmha_fwd: " << run_time << std::endl; if (run_time < 0) { CK_FAIL("fmha_fwd faild"); @@ -670,17 +673,17 @@ uint32_t rocmFmhaWrapper::runCKFmhaMLA(void* q, // 
ck_tile::DeviceMem lse_acc_buf(lse_acc_host.get_element_space_size_in_bytes()); auto has_logits_soft_cap = false; - auto fmha_traits = fmha_fwd_traits{hdim_q, - hdim_v, - data_type, - mode == mode_enum::group, - is_v_rowmajor, - has_logits_soft_cap, - mask.type, - bias.type, - lse, - p_drop > 0.0f, - squant}; + // auto fmha_traits = fmha_fwd_traits{hdim_q, + // hdim_v, + // data_type, + // mode == mode_enum::group, + // is_v_rowmajor, + // has_logits_soft_cap, + // mask.type, + // bias.type, + // lse, + // p_drop > 0.0f, + // squant}; auto fmha_args = [&]() { assert(nhead % nhead_k == 0); @@ -813,8 +816,13 @@ uint32_t rocmFmhaWrapper::runCKFmhaMLA(void* q, 1, // nrepeat_ // false // }; - - float run_time = fmha_fwd(fmha_traits, fmha_args, stream_config); + float run_time; + if (data_type == "bf16" && size_per_head_ == 128 && msk_str == "b") + run_time = aiter::mha_fwd( + fmha_args, stream_config, data_type, mode == mode_enum::group, mask.type, bias.type, lse, true); + else + run_time = aiter::mha_fwd( + fmha_args, stream_config, data_type, mode == mode_enum::group, mask.type, bias.type, lse, false); // std::cout << "\nrun_time for ck fmha_fwd: " << run_time << std::endl; if (run_time < 0) { CK_FAIL("fmha_fwd faild"); diff --git a/rtp_llm/cpp/utils/utils.h b/rtp_llm/cpp/utils/utils.h index 7b8005fee..fc49e8e8d 100644 --- a/rtp_llm/cpp/utils/utils.h +++ b/rtp_llm/cpp/utils/utils.h @@ -51,7 +51,7 @@ return __VA_ARGS__(); \ } -#ifdef ENABLE_FP8 +#if defined(ENABLE_FP8) || defined(USING_ROCM) #define ENABLE_FP8_CASE(NAME, TYPE, ...) FT_SWITCH_ONE_CASE_T(NAME, KvCacheDataType::FP8, TYPE, __VA_ARGS__) #else #define ENABLE_FP8_CASE(NAME, TYPE, ...) diff --git a/rtp_llm/device/device_impl.py b/rtp_llm/device/device_impl.py index 7434986b2..2a75770ef 100644 --- a/rtp_llm/device/device_impl.py +++ b/rtp_llm/device/device_impl.py @@ -7,7 +7,7 @@ from rtp_llm.device.device_base import DeviceBase, MemInfo from rtp_llm.ops import DeviceExporter from rtp_llm.utils.model_weight import W - +from rtp_llm.utils.swizzle_utils import swizzle_tensor class CpuImpl(DeviceBase): def __init__(self, exported_device: DeviceExporter): @@ -268,6 +268,9 @@ def shuffle_moe_weight( def shuffle_gemm_weight(self, x: torch.Tensor) -> torch.Tensor: return x + def swizzle_gemm_weight(self, src: torch.Tensor, col_maj: bool = False) -> torch.Tensor: + return src + def convert_fp8_weight_params( self, weight: torch.Tensor, weight_scale: torch.Tensor ): @@ -562,6 +565,10 @@ def shuffle_gemm_weight(self, x: torch.Tensor) -> torch.Tensor: x_ = x_.view(*x.shape) return x_ + def swizzle_gemm_weight(self, src: torch.Tensor, col_maj: bool = False) -> torch.Tensor: + src = swizzle_tensor(src, False) + return src + def convert_fp8_weight_params( self, weight: torch.Tensor, weight_scale: torch.Tensor ): diff --git a/rtp_llm/libs/BUILD b/rtp_llm/libs/BUILD index a74a33058..2514666d2 100644 --- a/rtp_llm/libs/BUILD +++ b/rtp_llm/libs/BUILD @@ -41,26 +41,36 @@ genrule( "libmodule_custom_all_reduce.so", "libmodule_moe_sorting.so", "libmodule_moe_asm.so", - "libmodule_moe.so", "libmodule_gemm_a8w8_bpreshuffle.so", "libmodule_gemm_a8w8_blockscale.so", "libmodule_quant.so", + "libmodule_smoothquant.so", "libmodule_pa.so", "libmodule_activation.so", + "libmodule_attention_asm.so", + "libmodule_mha_fwd.so", + "libmodule_norm.so", + "libmodule_rmsnorm.so", + "libmodule_moe_ck2stages.so" ], cmd = """ - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_aiter_enum.so $(location libmodule_aiter_enum.so); - cp 
./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_custom_all_reduce.so $(location libmodule_custom_all_reduce.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe_sorting.so $(location libmodule_moe_sorting.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe_asm.so $(location libmodule_moe_asm.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe.so $(location libmodule_moe.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_gemm_a8w8_bpreshuffle.so $(location libmodule_gemm_a8w8_bpreshuffle.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_gemm_a8w8_blockscale.so $(location libmodule_gemm_a8w8_blockscale.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_quant.so $(location libmodule_quant.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_pa.so $(location libmodule_pa.so); - cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_activation.so $(location libmodule_activation.so); - """, - tags = ["rocm", "local"], + cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_aiter_enum.so $(location libmodule_aiter_enum.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_custom_all_reduce.so $(location libmodule_custom_all_reduce.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe_sorting.so $(location libmodule_moe_sorting.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe_asm.so $(location libmodule_moe_asm.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_gemm_a8w8_bpreshuffle.so $(location libmodule_gemm_a8w8_bpreshuffle.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_gemm_a8w8_blockscale.so $(location libmodule_gemm_a8w8_blockscale.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_quant.so $(location libmodule_quant.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_smoothquant.so $(location libmodule_smoothquant.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_pa.so $(location libmodule_pa.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_activation.so $(location libmodule_activation.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_attention_asm.so $(location libmodule_attention_asm.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_mha_fwd.so $(location libmodule_mha_fwd.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_norm.so $(location libmodule_norm.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_rmsnorm.so $(location libmodule_rmsnorm.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe_ck2stages.so $(location libmodule_moe_ck2stages.so); +""", +tags = ["rocm", "local"], ) genrule( @@ -71,40 +81,36 @@ genrule( "module_custom_all_reduce.so", "module_moe_sorting.so", "module_moe_asm.so", - "module_moe.so", "module_gemm_a8w8_bpreshuffle.so", "module_gemm_a8w8_blockscale.so", "module_quant.so", + "module_smoothquant.so", "module_pa.so", "module_activation.so", + "module_attention_asm.so", + "module_mha_fwd.so", + "module_norm.so", + "module_rmsnorm.so", + "module_moe_ck2stages.so", ], cmd = """ cp external/aiter/aiter/jit/module_aiter_enum.so $(location module_aiter_enum.so); cp external/aiter/aiter/jit/module_custom_all_reduce.so $(location module_custom_all_reduce.so); cp 
external/aiter/aiter/jit/module_moe_sorting.so $(location module_moe_sorting.so); cp external/aiter/aiter/jit/module_moe_asm.so $(location module_moe_asm.so); - cp external/aiter/aiter/jit/module_moe.so $(location module_moe.so); cp external/aiter/aiter/jit/module_gemm_a8w8_bpreshuffle.so $(location module_gemm_a8w8_bpreshuffle.so); cp external/aiter/aiter/jit/module_gemm_a8w8_blockscale.so $(location module_gemm_a8w8_blockscale.so); cp external/aiter/aiter/jit/module_quant.so $(location module_quant.so); + cp external/aiter/aiter/jit/module_smoothquant.so $(location module_smoothquant.so); cp external/aiter/aiter/jit/module_pa.so $(location module_pa.so); cp external/aiter/aiter/jit/module_activation.so $(location module_activation.so); - """, - tags = ["rocm", "local"], -) - -genrule( - name = "ck_copy", - srcs = ["@composable_kernel_archive//:ck_fmha_rmsnorm2d_libraries"], - outs = [ - "libtile_rmsnorm2d_fwd.so", - "libtile_example_fmha_fwd.so", - ], - cmd = """ - cp external/composable_kernel_archive/libtile_example_fmha_fwd.so $(location libtile_example_fmha_fwd.so); - cp external/composable_kernel_archive/libtile_rmsnorm2d_fwd.so $(location libtile_rmsnorm2d_fwd.so); - """, - tags = ["rocm", "local"], + cp external/aiter/aiter/jit/module_attention_asm.so $(location module_attention_asm.so); + cp external/aiter/aiter/jit/module_mha_fwd.so $(location module_mha_fwd.so); + cp external/aiter/aiter/jit/module_norm.so $(location module_norm.so); + cp external/aiter/aiter/jit/module_rmsnorm.so $(location module_rmsnorm.so); + cp external/aiter/aiter/jit/module_moe_ck2stages.so $(location module_moe_ck2stages.so); +""", +tags = ["rocm", "local"], ) filegroup( diff --git a/rtp_llm/model_loader/per_channel_fp8_quant_weight.py b/rtp_llm/model_loader/per_channel_fp8_quant_weight.py index e274e97a4..07fc14315 100644 --- a/rtp_llm/model_loader/per_channel_fp8_quant_weight.py +++ b/rtp_llm/model_loader/per_channel_fp8_quant_weight.py @@ -366,9 +366,7 @@ def _postprocess( processed_res = super()._postprocess(tensor, device, load_config) kernel_weight = processed_res[self.kernel.name] if self.kernel.name not in [W.moe_w1, W.moe_w2]: - kernel_weight = load_config.exported_device.shuffle_gemm_weight( - kernel_weight - ) + kernel_weight = load_config.exported_device.swizzle_gemm_weight(kernel_weight, False) kernel_weight = ( kernel_weight.reshape(kernel_weight.shape[-1], -1) if kernel_weight.dim() == 2 diff --git a/rtp_llm/models/base_model.py b/rtp_llm/models/base_model.py index d3524964e..bb319789e 100644 --- a/rtp_llm/models/base_model.py +++ b/rtp_llm/models/base_model.py @@ -145,6 +145,7 @@ def from_config( ) -> "BaseModel": model = cls(config) model.load(parallel_info) + model.postprocess_weights() return model @staticmethod @@ -294,3 +295,6 @@ def eval_model_size(config: GptInitModelParameters): @staticmethod def eval_model_param_count(config: GptInitModelParameters): return config.model_param_count + + def postprocess_weights(self): + pass diff --git a/rtp_llm/models/qwen_v2.py b/rtp_llm/models/qwen_v2.py index adb0eeefd..be67f87bd 100644 --- a/rtp_llm/models/qwen_v2.py +++ b/rtp_llm/models/qwen_v2.py @@ -35,6 +35,7 @@ zeros, ) +from rtp_llm.utils.swizzle_utils import do_swizzle def scale_reshape(ts: List[torch.Tensor]): return ts[0].reshape(-1) @@ -416,6 +417,9 @@ def _from_config_json(config: GptInitModelParameters, config_json: Dict[str, Any def get_weight_cls(): return QWenV2Weight + def postprocess_weights(self): + if self.config.hw_kernel_config.use_swizzleA and 
self.weight.weights[0]["self_attention_weights.query_weight.kernel"].dtype != torch.float8_e4m3fnuz: + do_swizzle(self.weight.weights) class QWenV2Embedding(QWenV2): @classmethod diff --git a/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc b/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc index be9a9efe3..d5a9f1e90 100644 --- a/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc +++ b/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc @@ -76,7 +76,7 @@ std::tuple FusedRopeKVCachePrefillO if (hw_kernel_config_.use_aiter_pa) { hipStream_t stream_ = device_->getStream(); DISPATCH_CUDA_FUNCTION_DATA_TYPE(torchDTypeToDataType(qkv.dtype()), - invokeAddFusedQKVBiasTransposePrefill, + invokeAddFusedQKVBiasTransposePrefillV1, q_output.data_ptr(), k_output.data_ptr(), v_output.data_ptr(), @@ -97,11 +97,12 @@ std::tuple FusedRopeKVCachePrefillO attn_configs_.use_logn_attn, nullptr, 0, - false, // use_paged_fmha - store_qkv, // store_qkv - store_q, // store_q - store_kv, // store_kv - store_cache, // store_cache + false, // use_paged_fmha + store_qkv, // store_qkv + store_q, // store_q + store_kv, // store_kv + store_cache, // store_cache + nullptr, device_->getStream() // 必须作为最后一个参数 ); } else { @@ -273,7 +274,7 @@ torch::Tensor FusedRopeKVCacheDecodeOp::forward(const torch::Tensor& DISPATCH_CUDA_FUNCTION_DATA_TYPE( torchDTypeToDataType(qkv.dtype()), - invokeAddFusedQKVBiasTransposeDecode, + invokeAddFusedQKVBiasTransposeDecodeV1, q_output.data_ptr(), nullptr, nullptr, diff --git a/rtp_llm/models_py/modules/rocm/fmha.py b/rtp_llm/models_py/modules/rocm/fmha.py index fc0229d93..8167c3a4f 100644 --- a/rtp_llm/models_py/modules/rocm/fmha.py +++ b/rtp_llm/models_py/modules/rocm/fmha.py @@ -136,6 +136,7 @@ def __init__( self.head_dim = config.hidden_size // config.head_num self.head_num_kv = config.head_num_kv self.kv_cache_data_type = config.kv_cache_data_type + self.use_asm_pa = config.hw_kernel_config.use_asm_pa def support(self, attn_inputs: PyAttentionInputs) -> bool: return True @@ -168,21 +169,24 @@ def forward(self, query: torch.Tensor, kv_cache: Optional[KVCache] , fmha_params num_kv_heads = self.head_num_kv scale = 1.0 / (self.head_dim ** 0.5) alibi_slopes = None - - k_scale = kv_cache.k_scale_base if kv_cache and kv_cache.k_scale_base is not None else 1.0 - v_scale = kv_cache.v_scale_base if kv_cache and kv_cache.v_scale_base is not None else 1.0 + k_scale = kv_cache.k_scale_base if kv_cache and kv_cache.k_scale_base is not None else torch.tensor(1.0, device=query.device, dtype=query.dtype) + v_scale = kv_cache.v_scale_base if kv_cache and kv_cache.v_scale_base is not None else torch.tensor(1.0, device=query.device, dtype=query.dtype) max_num_blocks = block_tables_id_device.shape[1] - if os.environ.get('USE_ASM_PA'): - output = torch.ops.aiter.pa_fwd_asm( - query, - key_cache, - value_cache, + # for now not support fp8 + if self.use_asm_pa: + x = 16 // value_cache.element_size() + num_blocks, num_kv_heads, block_size, head_size = value_cache.shape + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, block_size // x, x) + value_cache = value_cache.permute(0, 1, 3, 2, 4).contiguous() + + output = aiter.pa_fwd_asm( + query, # [num_seqs, num_heads, head_size] + key_cache, # [num_blocks, num_kv_heads, block_size, head_size/x, x] + value_cache, # [num_blocks, num_kv_heads, block_size, head_size/x, x] block_tables_id_device, seq_lens, max_num_blocks, - k_scale, - v_scale, ) else : num_seqs, num_heads, head_size = query.shape @@ -213,8 +217,8 @@ def forward(self, query: 
torch.Tensor, kv_cache: Optional[KVCache] , fmha_params max_logits = torch.ones_like(exp_sums) kv_cache_dtype ="auto" - key_cache_reshaped = key_cache.permute(0,1,3,2) - value_cache_reshaped = value_cache.permute(0,1,3,2) + # key_cache_reshaped = key_cache.permute(0,1,3,2) + # value_cache_reshaped = value_cache.permute(0,1,3,2) aiter.paged_attention_rocm( output, @@ -222,8 +226,8 @@ def forward(self, query: torch.Tensor, kv_cache: Optional[KVCache] , fmha_params max_logits, tmp_output, query, - key_cache_reshaped, - value_cache_reshaped, + key_cache, + value_cache, num_kv_heads, float(scale), block_tables_id_device, diff --git a/rtp_llm/models_py/test/rocm_fmha_test.py b/rtp_llm/models_py/test/rocm_fmha_test.py index ceae74a60..fb52c39e0 100644 --- a/rtp_llm/models_py/test/rocm_fmha_test.py +++ b/rtp_llm/models_py/test/rocm_fmha_test.py @@ -432,8 +432,8 @@ def run_aiter( max_seq_len, alibi_slopes, kv_cache_dtype, - k_scale.item(), - v_scale.item(), + k_scale, + v_scale, fp8_out_scale if cpa_fp8_out else None, _PARTITION_SIZE_ROCM, ) diff --git a/rtp_llm/ops/libth_transformer.pyi b/rtp_llm/ops/libth_transformer.pyi index 6ed8e91e6..5701bb218 100644 --- a/rtp_llm/ops/libth_transformer.pyi +++ b/rtp_llm/ops/libth_transformer.pyi @@ -594,9 +594,11 @@ class HWKernelConfig: enable_stable_scatter_add: bool ft_disable_custom_ar: bool rocm_hipblaslt_config: str + use_swizzleA: bool enable_cuda_graph: bool enable_cuda_graph_debug_mode: bool use_aiter_pa: bool + use_asm_pa: bool enable_native_cuda_graph: bool num_native_cuda_graph: int def __init__( @@ -607,9 +609,11 @@ class HWKernelConfig: enable_multi_block_mode: bool = True, ft_disable_custom_ar: bool = True, rocm_hipblaslt_config: str = "gemm_config.csv", + use_swizzleA: bool = False, enable_cuda_graph: bool = False, enable_cuda_graph_debug_mode: bool = False, use_aiter_pa: bool = True, + use_asm_pa: bool = True, enable_native_cuda_graph: bool = False, num_native_cuda_graph: int = 200, ) -> None: ... 
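Illustrative sketch (not part of the patch): how the two new kernel flags surface on the Python side once the bindings above are in place. The import path is assumed from the location of the .pyi stub; the constructor keywords and to_string() follow the signature it declares.

    from rtp_llm.ops.libth_transformer import HWKernelConfig

    # ROCm-oriented configuration: swizzled-A GEMM off by default, AITER paged
    # attention and its ASM kernel on by default, matching the stub's defaults.
    hw_cfg = HWKernelConfig(
        rocm_hipblaslt_config="gemm_config.csv",
        use_swizzleA=False,   # exposed below as --use_swizzleA / USE_SWIZZLEA
        use_aiter_pa=True,
        use_asm_pa=True,      # exposed below as --use_asm_pa / USE_ASM_PA
    )
    print(hw_cfg.to_string())
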
diff --git a/rtp_llm/server/server_args/hw_kernel_group_args.py b/rtp_llm/server/server_args/hw_kernel_group_args.py index f34b6da67..affb27151 100644 --- a/rtp_llm/server/server_args/hw_kernel_group_args.py +++ b/rtp_llm/server/server_args/hw_kernel_group_args.py @@ -94,3 +94,19 @@ def init_hw_kernel_group_args(parser): default=True, help="Rocm是否使用AITER Attention", ) + + hw_kernel_group.add_argument( + "--use_asm_pa", + env_name="USE_ASM_PA", + type=str2bool, + default=True, + help="Rocm是否使用AITER ASM Attention", + ) + + hw_kernel_group.add_argument( + "--use_swizzleA", + env_name="USE_SWIZZLEA", + type=str2bool, + default=False, + help="hipBLASLt GEMM 是否使用 swizzle", + ) diff --git a/rtp_llm/server/server_args/test/server_args_test.py b/rtp_llm/server/server_args/test/server_args_test.py index d9cc733fd..f76e4cbfc 100644 --- a/rtp_llm/server/server_args/test/server_args_test.py +++ b/rtp_llm/server/server_args/test/server_args_test.py @@ -86,10 +86,12 @@ def test_default_args_env(self): self.assertEqual(env.get("ENABLE_STABLE_SCATTER_ADD"), "0") self.assertEqual(env.get("ENABLE_MULTI_BLOCK_MODE"), "1") self.assertEqual(env.get("ROCM_HIPBLASLT_CONFIG"), "gemm_config.csv") + self.assertEqual(env.get("USE_SWIZZLEA"), "0") # self.assertIsNone(env.get("FT_DISABLE_CUSTOM_AR")) self.assertEqual(env.get("ENABLE_CUDA_GRAPH"), "0") self.assertEqual(env.get("ENABLE_CUDA_GRAPH_DEBUG_MODE"), "0") self.assertEqual(env.get("USE_AITER_PA"), "1") + self.assertEqual(env.get("USE_ASM_PA"), "1") self.assertEqual(env.get("ENABLE_NATIVE_CUDA_GRAPH"), "0") self.assertEqual(env.get("NUM_NATIVE_CUDA_GRAPH"), "200") @@ -402,6 +404,8 @@ def test_all_args_set_env(self): "False", "--rocm_hipblaslt_config", "another_gemm_config.csv", + "--use_swizzleA", + "False", "--ft_disable_custom_ar", "False", "--enable_cuda_graph", @@ -410,6 +414,8 @@ def test_all_args_set_env(self): "True", "--use_aiter_pa", "False", + "--use_asm_pa", + "False", "--enable_native_cuda_graph", "True", "--num_native_cuda_graph", @@ -786,6 +792,7 @@ def test_all_args_set_env(self): self.assertEqual(env.get("ENABLE_CUDA_GRAPH"), "1") self.assertEqual(env.get("ENABLE_CUDA_GRAPH_DEBUG_MODE"), "1") self.assertEqual(env.get("USE_AITER_PA"), "0") + self.assertEqual(env.get("USE_ASM_PA"), "0") self.assertEqual(env.get("ENABLE_NATIVE_CUDA_GRAPH"), "1") self.assertEqual(env.get("NUM_NATIVE_CUDA_GRAPH"), "100") diff --git a/rtp_llm/utils/model_weight.py b/rtp_llm/utils/model_weight.py index 0cd7e9ecc..1ef3bfe23 100644 --- a/rtp_llm/utils/model_weight.py +++ b/rtp_llm/utils/model_weight.py @@ -27,8 +27,13 @@ def w_half2(ts: List[torch.Tensor], inter_size: int): def concat_0(ts: List[torch.Tensor]) -> torch.Tensor: if len(ts) == 1: return ts[0] - - return torch.concat(ts, dim=0).contiguous() + # torch.concat() dose not support fp8 in current rocm torch version + if ts[0].dtype in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz]: + dtype = ts[0].dtype + out_u8 = torch.concat([x.view(torch.uint8) for x in ts], dim=0).contiguous() + return out_u8.view(dtype) + else: + return torch.concat(ts, dim=0).contiguous() def concat_1(ts: List[torch.Tensor]) -> torch.Tensor: @@ -810,7 +815,12 @@ def transpose_q_rope( def pad_w13(ts: List[torch.Tensor], inter_padding_size: int, dim: int): w1 = pad([ts[0]], inter_padding_size, dim) w3 = pad([ts[1]], inter_padding_size, dim) - return torch.concat([w1, w3], dim=dim).contiguous() + if w1.dtype in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz]: + dtype = 
w1.dtype + out_u8 = torch.concat([w1.view(torch.uint8), w3.view(torch.uint8)], dim=dim).contiguous() + return out_u8.view(dtype) + else: + return torch.concat([w1, w3], dim=dim).contiguous() def transpose_w13(ts: List[torch.Tensor]): @@ -830,7 +840,13 @@ def concat_w13(ts: List[torch.Tensor]): def concat_w13_2(ts: List[torch.Tensor]): - return torch.concat(ts, dim=0).contiguous() + # torch.concat() dose not support fp8 in current rocm torch version + if ts[0].dtype in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz]: + dtype = ts[0].dtype + out_u8 = torch.concat([x.view(torch.uint8) for x in ts], dim=0).contiguous() + return out_u8.view(dtype) + else: + return torch.concat(ts, dim=0).contiguous() def ffn_sp_neg1_w13( diff --git a/rtp_llm/utils/swizzle_utils.py b/rtp_llm/utils/swizzle_utils.py new file mode 100644 index 000000000..66b8b0a17 --- /dev/null +++ b/rtp_llm/utils/swizzle_utils.py @@ -0,0 +1,53 @@ +import torch +from typing import List, Dict + +def calculate_k_for_swizzling(dtype: torch.dtype): + if dtype == torch.float32: + MiK, MiKv = 4, 1 + elif dtype in (torch.float16, torch.half, torch.bfloat16): + MiK, MiKv = 16, 4 + elif dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz): + MiK, MiKv = 32, 8 + else: + raise ValueError(f"unsupported datatype in calculateKforSwizzling: {dtype}") + elem_size = torch.zeros((), dtype=dtype).element_size() + PackK = 16 // MiKv // elem_size + return MiK, MiKv, PackK + +def swizzle_tensor( + src: torch.Tensor, + col_maj: bool = False, + MiM: int = 16) -> torch.Tensor: + tmp = src.clone() + + if col_maj: + k, m = src.shape + tmp = tmp.view(k, m).permute(1, 0).contiguous() + else: + m, k = src.shape + + MiK, MiKv, PackK = calculate_k_for_swizzling(src.dtype) + + if (MiK == 16): + assert m % 16 == 0, f"swizzle shape m = {m} must be divisible by 16" + assert k % 32 == 0, f"swizzle shape k = {k} must be divisible by 32" + elif (MiK == 32): + assert m % 16 == 0, f"swizzle shape m = {m} must be divisible by 16" + assert k % 64 == 0, f"swizzle shape k = {k} must be divisible by 64" + + tmp = tmp.view(m // MiM, MiM, k // (MiK * PackK), MiK // MiKv, MiKv * PackK) + tmp = tmp.permute(0, 2, 3, 1, 4).contiguous() + + dst = tmp.clone() + return dst.view(src.shape) + +def do_swizzle(weights: List[Dict[str, torch.Tensor]]): + target_layer_weight_names = [ + "self_attention_weights.query_weight.kernel", + "self_attention_weights.attention_output_weight.kernel", + "ffn_weights.intermediate_weight2.kernel", + "ffn_weights.intermediate_weight13.kernel" + ] + for layer_weights in weights: + for k in target_layer_weight_names: + layer_weights[k] = swizzle_tensor(layer_weights[k], True) diff --git a/tests/BUILD b/tests/BUILD index c836c78c6..b71d3d8da 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -45,7 +45,11 @@ cc_library( cc_binary( name = "test_ops", - deps = [":test_ops_libs"], + deps = select({ + "@//:using_cuda12":[":test_ops_libs"], + "@//:using_rocm": [":rocm_test_ops_libs"], + "//conditions:default": [], + }), linkshared = 1, visibility = ["//visibility:public"], ) @@ -240,7 +244,9 @@ cc_library( "mla/mla_context_attention.cc", "gemm/gemm_op_test.cc", "ffn/rocm_ffn_moe_fp8_test.cc", + "ffn/rocm_ffn_moe_fp8_ptpc_test.cc", "ffn/rocm_ffn_moe_bf16_test.cc", + "layernorm/fusedQkRmsNorm.cpp" ]), deps = torch_deps() + [ "//rtp_llm/cpp/devices/rocm_impl:rocm_impl", @@ -343,6 +349,28 @@ py_test( # ], #) +#ToDo: fix aiter import issue +# py_test( +# name = "rocm_ffn_moe_fp8_ptpc_test", +# 
srcs = [ +# "ffn/rocm_ffn_moe_fp8_ptpc_test.py" +# ], +# data = [ +# "//:th_transformer", +# ":rocm_test_ops", +# ], +# deps = [ +# "//rtp_llm:torch", +# "//rtp_llm:rtp_llm_lib", +# ], +# env = { +# "DEVICE_RESERVE_MEMORY_BYTES": "1024000", +# }, +# tags = [ +# "rocm", +# ], +# ) + #ToDo: fix aiter import issue #py_test( # name = "rocm_ffn_moe_bf16_test", diff --git a/tests/ffn/rocm_ffn_moe_fp8_ptpc_test.cc b/tests/ffn/rocm_ffn_moe_fp8_ptpc_test.cc new file mode 100644 index 000000000..95b0bc7dd --- /dev/null +++ b/tests/ffn/rocm_ffn_moe_fp8_ptpc_test.cc @@ -0,0 +1,122 @@ +#include "rtp_llm/cpp/core/Buffer.h" +#include "rtp_llm/cpp/core/QBuffer.h" +#include "rtp_llm/cpp/core/Types.h" +#include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h" +#include "rtp_llm/cpp/devices/DeviceBase.h" +#include "rtp_llm/cpp/devices/DeviceFactory.h" +#include "rtp_llm/cpp/devices/OpData.h" +#include "rtp_llm/cpp/devices/Weights.h" + +using namespace rtp_llm; + +namespace unittest { + +class ROCmFfnMoeFp8PTPCOp : public torch::jit::CustomClassHolder { + +public: + ROCmFfnMoeFp8PTPCOp(int64_t ep_rank, int64_t ep_size); + + void forward(torch::Tensor input, + torch::Tensor w1, + torch::Tensor w2, + torch::Tensor fc1_scale, + torch::Tensor fc2_scale, + torch::Tensor gating_weight, + torch::Tensor e_score_correction_bias, + int64_t topk, + int64_t num_expert_group, + int64_t topk_group, + torch::Tensor output); + +private: + DeviceBase* device_ = nullptr; + GptInitParameter params_; +}; + + +ROCmFfnMoeFp8PTPCOp::ROCmFfnMoeFp8PTPCOp(int64_t ep_rank, int64_t ep_size) { + // TODO: add ep parameters here + params_ = GptInitParameter(); + params_.dp_size_ = ep_size; + params_.dp_rank_ = ep_rank; + params_.ep_size_ = ep_size; + params_.ep_rank_ = ep_rank; + params_.nccl_ip_ = "localhost"; + params_.dp_tp_nccl_port_ = 50049; + DeviceFactory::initDevices(params_); + device_ = DeviceFactory::getDefaultDevice(); +} + + +void ROCmFfnMoeFp8PTPCOp::forward(torch::Tensor input, + torch::Tensor w1, + torch::Tensor w2, + torch::Tensor fc1_scale, + torch::Tensor fc2_scale, + torch::Tensor gating_weight, + torch::Tensor e_score_correction_bias, + int64_t topk, + int64_t num_expert_group, + int64_t topk_group, + torch::Tensor output) { + + size_t num_expert = static_cast(gating_weight.size(0)); + int64_t model_dim = input.size(1); + + // TODO: config ep parameters here + MoeConfigs moe_configs({ + .expert_num=num_expert, + .top_k=static_cast(topk), + .normalize_expert_scale=false, // FIXME(liyangcheng.lyc): has_moe_norm? 
+ .moe_inter_padding_size=model_dim, + .has_moe_norm=true, + .ep_rank=static_cast(params_.ep_rank_), + .ep_size=static_cast(params_.ep_size_), + .dp_rank=static_cast(params_.dp_rank_), + .dp_size=static_cast(params_.dp_size_), + .scoring_func=1, // FIXME(liyangcheng.lyc): useless now + .topk_group=static_cast(topk_group), + .n_group=static_cast(num_expert_group) + }); + + FfnConfigs ffn_configs({ + .activation_type=ActivationType::Swiglu, + .moe_configs=moe_configs + }); + + BufferPtr input_buffer = torchTensor2Buffer(input); + + BufferPtr w1_buffer = torchTensor2Buffer(w1); + BufferPtr w2_buffer = torchTensor2Buffer(w2); + BufferPtr fc1_scale_buffer = torchTensor2Buffer(fc1_scale); + BufferPtr fc2_scale_buffer = torchTensor2Buffer(fc2_scale); + + MemoryType zeros_type = fc1_scale_buffer->where(); + BufferPtr gate_weight_buffer = BufferPtr(new QBuffer(std::move(w1_buffer), std::move(fc1_scale_buffer), std::move(BufferPtr(new Buffer(zeros_type, DataType::TYPE_INVALID, {0}, nullptr))))); + BufferPtr down_weight_buffer = BufferPtr(new QBuffer(std::move(w2_buffer), std::move(fc2_scale_buffer), std::move(BufferPtr(new Buffer(zeros_type, DataType::TYPE_INVALID, {0}, nullptr))))); + + torch::Tensor gating_weight_t = gating_weight.transpose(0, 1).contiguous(); + BufferPtr gating_weight_buffer = torchTensor2Buffer(gating_weight_t); + + BufferPtr e_score_correction_bias_buffer = torchTensor2Buffer(e_score_correction_bias); + + FfnLayerWeights weights; + weights.moe_gate_weight = std::make_shared(DenseWeights(gate_weight_buffer)); + weights.moe_down_weight = std::make_shared(DenseWeights(down_weight_buffer)); + weights.moe_gating_weight = std::make_shared(DenseWeights(gating_weight_buffer)); + weights.e_score_correction_bias = e_score_correction_bias_buffer; + + FfnLayerParams ffn_layer_params(*input_buffer, ffn_configs, weights, std::nullopt, QScheme::Qfp8PerToken, DataType::TYPE_INVALID, nullptr); + + FfnLayerOutput ffn_output = device_->ffnLayer(ffn_layer_params); + + BufferPtr output_buffer = torchTensor2Buffer(output); + device_->copy({*output_buffer, *(ffn_output.hidden_states)}); +} + +} // namespace unittest + + +static auto ROCmFfnMoeFp8PTPCOp = torch::jit::class_("unittest", "ROCmFfnMoeFp8PTPCOp") + .def(torch::jit::init()) + .def("forward", &unittest::ROCmFfnMoeFp8PTPCOp::forward); diff --git a/tests/ffn/rocm_ffn_moe_fp8_ptpc_test.py b/tests/ffn/rocm_ffn_moe_fp8_ptpc_test.py new file mode 100644 index 000000000..0f27143c6 --- /dev/null +++ b/tests/ffn/rocm_ffn_moe_fp8_ptpc_test.py @@ -0,0 +1,353 @@ +import aiter +from aiter.ops.shuffle import shuffle_weight +from aiter import ActivationType, QuantType +from aiter.fused_moe import fused_topk, get_block_size_M, torch_moe_stage1, torch_moe_stage2 +from aiter.ops.moe_sorting import moe_sorting_fwd +from aiter.ops.quant import dynamic_per_token_scaled_quant, pertoken_quant +from aiter.ops.topk import biased_grouped_topk, biased_grouped_topk_torch +from aiter.test_common import checkAllclose +import multiprocessing as mp +import os +import torch +import tempfile +import unittest + +os.environ['DEVICE_RESERVE_MEMORY_BYTES'] = '128000000' + +torch.manual_seed(42) +torch.cuda.manual_seed(42) +torch.cuda.manual_seed_all(42) + + +torch.classes.load_library(os.environ['TEST_SRCDIR'] + "/rtp_llm/tests/librocm_test_ops.so") + +unit_size = 32 + +def aiter_moe_fp8_ptpc(input, w1_q, w2_q, w1_scale, w2_scale, gating_weight, correction_bias, topk, num_expert_group, topk_group): + dtype = torch.bfloat16 + quant_dtype = torch.float8_e4m3fnuz + + token, 
model_dim = input.shape + num_expert, _, inter_dim = w2_q.shape + + # calculate gating score and get topk weights, topk ids + score = torch.nn.functional.linear(input.type(torch.float32), gating_weight.type(torch.float32), None) + + topk_weights = torch.empty((token, topk), dtype=torch.float32, device='cuda') + topk_ids = torch.empty((token, topk), dtype=torch.int32, device='cuda') + + if num_expert_group > 1: # for deepseek-r1 + if correction_bias is not None: # use bias + biased_grouped_topk( + score, + correction_bias, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + True, + 1.0 + ) + else: # not implemented now + pass + else: # for qwen3 + topk_weights, topk_ids = fused_topk(input, score, topk, True) + + a_q = torch.empty((token, model_dim), dtype=quant_dtype, device='cuda') + a_scale = torch.empty((token, 1), dtype=torch.float32, device='cuda') + dynamic_per_token_scaled_quant(a_q, input, a_scale) + + unit_size = get_block_size_M(token, topk, num_expert, inter_dim) + + max_num_tokens_padded = topk_ids.numel() + num_expert * unit_size - topk + max_num_m_blocks = int((max_num_tokens_padded + unit_size - 1) // unit_size) + + sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device='cuda') + sorted_weights = torch.empty((max_num_tokens_padded,), dtype=torch.float32, device='cuda') + sorted_expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device='cuda') + num_valid_ids = torch.empty((1), dtype=torch.int32, device='cuda') + aiter_ref_output = torch.empty((token, model_dim), dtype=dtype, device='cuda') + + moe_sorting_fwd( + topk_ids, + topk_weights, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + aiter_ref_output, + num_expert, + unit_size, + None + ) + + a2 = torch.empty((token, topk, inter_dim), dtype=dtype, device='cuda') + + aiter.moe_stage1_g1u1( + a_q, + w1_q, + w2_q, + sorted_ids, + sorted_expert_ids, + num_valid_ids, + a2, + inter_dim, + "", # empty kernelName + unit_size, + 0, # 0 ksplit + ActivationType.Silu, + QuantType.per_Token, + a_scale, + w1_scale, + None, # doweight_stage1 is false + ) + + a2_q = torch.empty((token, topk, inter_dim), dtype=quant_dtype, device='cuda') + a2_scale = torch.empty((token, 1), dtype=torch.float32, device='cuda') + dynamic_per_token_scaled_quant(a2_q, a2, a2_scale) + + aiter.ck_moe_stage2( + a2_q, + w1_q, + w2_q, + sorted_ids, + sorted_expert_ids, + num_valid_ids, + aiter_ref_output, + topk, + "", + w2_scale, + a2_scale, + unit_size, + sorted_weights, + 2, + ) + + return aiter_ref_output + + +def torch_moe_fp8(input, w1_q, w2_q, w1_scale, w2_scale, gating_weight, correction_bias, topk, num_expert_group, topk_group): + dtype = torch.bfloat16 + quant_dtype = torch.float8_e4m3fnuz + + token, _ = input.shape + + # calculate gating score and get topk weights, topk ids + score = torch.nn.functional.linear(input.type(torch.float32), gating_weight.type(torch.float32), None) + + if num_expert_group > 1: # for deepseek-r1 + if correction_bias is not None: # use bias + topk_weights, topk_ids = biased_grouped_topk_torch( + score, + correction_bias, + topk, + True, + num_expert_group, + topk_group + ) + else: # not implemented now + pass + else: # for qwen3 + topk_weights, topk_ids = fused_topk(input, score, topk, True) + + a_q, a_scale = pertoken_quant(input, quant_dtype=quant_dtype) + + a2 = torch_moe_stage1( + a_q, + w1_q, + w2_q, + topk_weights, + topk_ids, + dtype, + ActivationType.Silu, + QuantType.per_Token, + a_scale, + w1_scale, + False + ) + + a2_q, a2_scale = 
pertoken_quant(a2, quant_dtype=quant_dtype) + + torch_out = torch_moe_stage2( + a2_q, + w1_q, + w2_q, + topk_weights, + topk_ids, + dtype, + QuantType.per_Token, + w2_scale, + a2_scale, + True + ) + + return torch_out + + +def subprocess_moe_fp8_ptpc(input_path, w1_q_path, w2_q_path, w1_scale_path, w2_scale_path, gating_weight_path, correction_bias_path, topk, num_expert_group, topk_group, ep_rank, ep_size): + dtype = torch.bfloat16 + + # load inputs/weights + input = torch.load(input_path, weights_only=True).cuda() + w1_q = torch.load(w1_q_path, weights_only=True).cuda() + w2_q = torch.load(w2_q_path, weights_only=True).cuda() + w1_scale = torch.load(w1_scale_path, weights_only=True).cuda() + w2_scale = torch.load(w2_scale_path, weights_only=True).cuda() + gating_weight = torch.load(gating_weight_path, weights_only=True).cuda() + correction_bias = torch.load(correction_bias_path, weights_only=True).cuda() + + num_expert_per_rank = gating_weight.shape[0] // ep_size + token_per_rank = input.shape[0] // ep_size + + input = input[ep_rank * token_per_rank : (ep_rank + 1) * token_per_rank] + + # before we split weights to each rank, calculate reference first + + # invoke torch ref op + torch_ref_output = torch_moe_fp8( + input.clone(), # input should be split first, weights not + w1_q.clone(), + w2_q.clone(), + w1_scale.clone(), + w2_scale.clone(), + gating_weight.clone(), + correction_bias.clone(), + topk, + num_expert_group, + topk_group + ) + + # for aiter op, do shuffle, it's ok shuffle first then split + w1_q = shuffle_weight(w1_q) + w2_q = shuffle_weight(w2_q) + + # invoke (python) aiter ref op + aiter_ref_output = aiter_moe_fp8_ptpc( + input.clone(), # input should be split first, weights not + w1_q.clone(), + w2_q.clone(), + w1_scale.clone(), + w2_scale.clone(), + gating_weight.clone(), + correction_bias.clone(), + topk, + num_expert_group, + topk_group + ) + + # split weights to each rank + w1_q = w1_q[ep_rank * num_expert_per_rank : (ep_rank + 1) * num_expert_per_rank] + w2_q = w2_q[ep_rank * num_expert_per_rank : (ep_rank + 1) * num_expert_per_rank] + w1_scale = w1_scale[ep_rank * num_expert_per_rank : (ep_rank + 1) * num_expert_per_rank] + w2_scale = w2_scale[ep_rank * num_expert_per_rank : (ep_rank + 1) * num_expert_per_rank] + + # init device + rocm_ffn_moe_fp8_ptpc_op = torch.classes.unittest.ROCmFfnMoeFp8PTPCOp(ep_rank, ep_size) + + # invoke rtp (c++ aiter) op + output = torch.empty_like(input, dtype=dtype, device='cuda') + rocm_ffn_moe_fp8_ptpc_op.forward( + input, + w1_q, + w2_q, + w1_scale, + w2_scale, + gating_weight, + correction_bias, + topk, + num_expert_group, + topk_group, + output + ) + + checkAllclose(torch_ref_output, output, rtol=0.05, atol=0.05, msg=f'[ep_size={ep_size}, ep_rank={ep_rank}]: python torch vs rtp') + checkAllclose(aiter_ref_output, output, rtol=0.05, atol=0.05, msg=f'[ep_size={ep_size}, ep_rank={ep_rank}]: python aiter vs rtp') + checkAllclose(torch_ref_output, aiter_ref_output, rtol=0.05, atol=0.05, msg=f'[ep_size={ep_size}, ep_rank={ep_rank}]: python torch vs python aiter') + + +class TestROCmFfnMoeFp8(unittest.TestCase): + + def _test_moe_fp8(self, token, model_dim, inter_dim, num_expert, num_shared_expert, topk, num_expert_group, topk_group, ep_size, dtype, quant_dtype): + assert dtype == torch.bfloat16 + assert quant_dtype == torch.float8_e4m3fnuz + + print(f'token={token}, model_dim={model_dim}, inter_dim={inter_dim}, num_expert={num_expert}, num_shared_expert={num_shared_expert}, topk={topk}, ep_size={ep_size}') + + # Note: num_expert is 
global, token is per rank + + # input + input = torch.randn((token * ep_size, model_dim), dtype=dtype) + + # w1 gate + up -> w1 + w1_gate = torch.randn((num_expert, inter_dim, model_dim), dtype=dtype) / 10 + w1_up = torch.randn((num_expert, inter_dim, model_dim), dtype=dtype) / 10 + w1 = torch.cat((w1_gate, w1_up), dim=1) + + # w2 + w2 = torch.randn((num_expert, model_dim, inter_dim), dtype=dtype) / 10 + + # ptpc quant w1 and w2 + w1_q, w1_scale = pertoken_quant(w1, quant_dtype=quant_dtype) + w2_q, w2_scale = pertoken_quant(w2, quant_dtype=quant_dtype) + + # gating weight, correction bias, num expert group, topk group + gating_weight = torch.randn((num_expert, model_dim), dtype=dtype) + correction_bias = torch.randn((num_expert,), dtype=torch.float32) + + # save all inputs/weights to disk file and load in new process + input_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_input_', suffix='.pt') + w1_q_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_w1_q_', suffix='.pt') + w2_q_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_w2_q_', suffix='.pt') + w1_scale_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_w1_scale_', suffix='.pt') + w2_scale_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_w2_scale_', suffix='.pt') + gating_weight_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_gating_weight_', suffix='.pt') + correction_bias_file = tempfile.NamedTemporaryFile(prefix='rocm_ffn_moe_fp8_ptpc_test_correction_bias_', suffix='.pt') + + torch.save(input, input_file) + torch.save(w1_q, w1_q_file) + torch.save(w2_q, w2_q_file) + torch.save(w1_scale, w1_scale_file) + torch.save(w2_scale, w2_scale_file) + torch.save(gating_weight, gating_weight_file) + torch.save(correction_bias, correction_bias_file) + + # start a new process to invoke rtp ffn layer + procs = list() + for ep_rank in range(ep_size): + os.environ['CUDA_VISIBLE_DEVICES'] = str(ep_rank) + proc = mp.Process(target=subprocess_moe_fp8_ptpc, args=( + input_file.name, + w1_q_file.name, + w2_q_file.name, + w1_scale_file.name, + w2_scale_file.name, + gating_weight_file.name, + correction_bias_file.name, + topk, + num_expert_group, + topk_group, + ep_rank, + ep_size + )) + proc.start() + procs.append(proc) + try: + [p.join() for p in procs] + except Exception: + [p.terminate() for p in procs] + [p.join() for p in procs] + + + # fp8 ptpc quant, for qwen3 + def test_moe_fp8_ptpc(self): + for ep_size in [1, 2]: + for dtype in [torch.bfloat16]: + for token in [1, 2, 5, 16, 32]: + for model_dim in [4096]: + for inter_dim in [1536]: + self._test_moe_fp8(token, model_dim, inter_dim, 128, 0, 8, 1, 1, ep_size, dtype, torch.float8_e4m3fnuz) + + +if __name__ == '__main__': + mp.set_start_method('spawn') + unittest.main() diff --git a/tests/gemm/rocm_ptpc_gemm_op_test.py b/tests/gemm/rocm_ptpc_gemm_op_test.py index 575162456..3b5531a33 100644 --- a/tests/gemm/rocm_ptpc_gemm_op_test.py +++ b/tests/gemm/rocm_ptpc_gemm_op_test.py @@ -36,6 +36,46 @@ def shuffle_weight(x, layout=(16, 16), use_int4=False): x_ = x_.view(*x.shape) return x_ +def calculate_k_for_swizzling(dtype: torch.dtype): + if dtype == torch.float32: + MiK, MiKv = 4, 1 + elif dtype in (torch.float16, torch.half, torch.bfloat16): + MiK, MiKv = 16, 4 + elif dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz): + MiK, MiKv = 32, 8 + else: + raise ValueError(f"unsupported datatype in calculateKforSwizzling: {dtype}") + elem_size 
= torch.zeros((), dtype=dtype).element_size() + PackK = 16 // MiKv // elem_size + return MiK, MiKv, PackK + +def swizzle_tensor( + src: torch.Tensor, + col_maj: bool = False, + MiM: int = 16) -> torch.Tensor: + tmp = src.clone() + + if col_maj: + k, m = src.shape + tmp = tmp.view(k, m).permute(1, 0).contiguous() + else: + m, k = src.shape + + MiK, MiKv, PackK = calculate_k_for_swizzling(src.dtype) + + if (MiK == 16): + assert m % 16 == 0, f"swizzle shape m = {m} must be divisible by 16" + assert k % 32 == 0, f"swizzle shape k = {k} must be divisible by 32" + elif (MiK == 32): + assert m % 16 == 0, f"swizzle shape m = {m} must be divisible by 16" + assert k % 64 == 0, f"swizzle shape k = {k} must be divisible by 64" + + tmp = tmp.view(m // MiM, MiM, k // (MiK * PackK), MiK // MiKv, MiKv * PackK) + tmp = tmp.permute(0, 2, 3, 1, 4).contiguous() + + dst = tmp.clone() + return dst.view(src.shape) + def detailed_assert_close(a, b, rtol, atol, msg=""): mismatch_mask = ~torch.isclose(a, b, rtol=rtol, atol=atol) @@ -83,14 +123,25 @@ def test_ptpc_gemm(self): torch_output = self._fp8_gemm_ref(input, weight_quant, weight_scale).to( "cpu" ) - - weight_quant_shuffle = shuffle_weight(weight_quant) - weight_quant_shuffle = weight_quant_shuffle.t() # k,n - weight_scale = weight_scale.t() # 1,n - custom_output = torch.zeros((m, n), device="cuda", dtype=torch.bfloat16) - self.gemm_op.forward( - input, weight_quant_shuffle, weight_scale, custom_output - ) + if os.environ.get("TEST_SWIZZLEA", None) == "1": + weight_quant_swizzle = swizzle_tensor(weight_quant, False) + weight_quant_swizzle = weight_quant_swizzle.t() + weight_scale = weight_scale.t() + custom_output = torch.zeros((m, n), device="cuda", dtype=torch.bfloat16) + + self.gemm_op.forward( + input, weight_quant_swizzle, weight_scale, custom_output + ) + else: + weight_quant_shuffle = shuffle_weight(weight_quant) + weight_quant_shuffle = weight_quant_shuffle.t() # k,n + + weight_scale = weight_scale.t() # 1,n + custom_output = torch.zeros((m, n), device="cuda", dtype=torch.bfloat16) + + self.gemm_op.forward( + input, weight_quant_shuffle, weight_scale, custom_output + ) custom_output = custom_output.to(torch.float32).to("cpu") detailed_assert_close( diff --git a/tests/layernorm/fusedQkRmsNorm.cpp b/tests/layernorm/fusedQkRmsNorm.cpp index 39d7e3c46..a7a7460d3 100644 --- a/tests/layernorm/fusedQkRmsNorm.cpp +++ b/tests/layernorm/fusedQkRmsNorm.cpp @@ -1,15 +1,12 @@ -#include "rtp_llm/cpp/kernels/layernorm_kernels.h" -#include "rtp_llm/cpp/kernels/fused_qk_rmsnorm.h" -#include "rtp_llm/cpp/devices/DeviceBase.h" -#include "rtp_llm/cpp/cuda/cuda_fp8_utils.h" -#include "rtp_llm/cpp/cuda/cuda_type_utils.cuh" -#include "torch/csrc/cuda/Stream.h" +#ifdef USING_ROCM +#include "rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h" +#else +#include "rtp_llm/cpp/devices/cuda_impl/CudaDevice.h" +#endif #include "rtp_llm/cpp/devices/DeviceFactory.h" #include "rtp_llm/cpp/devices/OpData.h" #include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h" #include "rtp_llm/cpp/core/BufferHelper.h" -#include -#include using namespace rtp_llm; @@ -41,7 +38,6 @@ void FusedQkRmsNormOp::forward(torch::Tensor input, int64_t q_group_num, int64_t k_group_num, int64_t norm_size) { - auto stream = at::cuda::getCurrentCUDAStream().stream(); auto gpt_params = GptInitParameter(); rtp_llm::DeviceFactory::initDevices(gpt_params); device_ = rtp_llm::DeviceFactory::getDefaultDevice(); From 16bc99661d2d33cc4b91f89f13b9a67ba11b3aa1 Mon Sep 17 00:00:00 2001 From: "ziyanyi.zyy" Date: Wed, 24 Sep 2025 
09:31:41 +0800 Subject: [PATCH 2/9] feat: open_source rocm amd optimizations --- open_source/bazel/arch_select.bzl | 4 ++-- open_source/deps/git.bzl | 18 +++++++++------- open_source/deps/http.bzl | 23 ++++++--------------- open_source/deps/requirements_lock_rocm.txt | 16 +++++++------- open_source/deps/requirements_rocm.txt | 10 ++++----- 5 files changed, 32 insertions(+), 39 deletions(-) diff --git a/open_source/bazel/arch_select.bzl b/open_source/bazel/arch_select.bzl index 1bbf5dc2a..4988c38a1 100644 --- a/open_source/bazel/arch_select.bzl +++ b/open_source/bazel/arch_select.bzl @@ -62,7 +62,7 @@ def subscribe_deps(): def whl_deps(): return select({ "@//:using_cuda12": ["torch==2.6.0+cu126"], - "@//:using_rocm": ["torch==2.1.2", "pyyaml"], + "@//:using_rocm": ["pyrsmi", "amdsmi@https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis%2FAMD%2Famd_smi%2Fali%2Famd_smi.tar", "aiter@https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl"], "//conditions:default": ["torch==2.1.2"], }) @@ -143,7 +143,7 @@ def deep_gemm_deps(): def kernel_so_deps(): return select({ "@//:using_cuda": [":libmmha1_so", ":libmmha2_so", ":libdmmha_so", ":libfa_so", ":libfpA_intB_so", ":libint8_gemm_so", ":libmoe_so", ":libmoe_sm90_so", ":libflashinfer_single_prefill_so", ":libflashinfer_single_decode_so", ":libflashinfer_batch_paged_prefill_so", ":libflashinfer_batch_paged_decode_so", ":libflashinfer_batch_ragged_prefill_so", ":libflashinfer_sm90_so", ":libdeepgemm_dpsk_inst_so", ":libdeepgemm_qwen_inst_so"], - "@//:using_rocm": [":libmmha1_so", ":libmmha2_so", ":libdmmha_so", ":ck_copy"], + "@//:using_rocm": [":libmmha1_so", ":libmmha2_so", ":libdmmha_so"], "//conditions:default":[], }) diff --git a/open_source/deps/git.bzl b/open_source/deps/git.bzl index 8c8f5e53f..fa693ad6f 100644 --- a/open_source/deps/git.bzl +++ b/open_source/deps/git.bzl @@ -10,9 +10,9 @@ def git_deps(): git_repository( name = "aiter_src", remote = "https://github.com/ROCm/aiter.git", - commit = "0884818336b46c458440cb7572c9ecff02b7034e", # MLA merge to main (#496) + commit = "94934e7d7cd5e11d81a2ded2a54d35f9cec4374d", # update codegen.py (#880) recursive_init_submodules = True, - patches = ["//3rdparty/aiter:rtp-llm.patch", "//3rdparty/aiter:0003-gemm_tune.patch"], + patches = ["//3rdparty/aiter:rtp-llm.patch", "//3rdparty/aiter:0003-gemm_tune.patch", "//3rdparty/aiter:aiter-fmha.patch", "//3rdparty/aiter:silu.patch"], patch_cmds = [ "echo 'from aiter.jit.core import compile_ops, get_args_of_build, build_module, get_module' >> build_aiter_module.py", "echo 'from typing import Dict' >> build_aiter_module.py", @@ -52,17 +52,21 @@ def git_deps(): "echo ' torch_exclude,' >> build_aiter_module.py", "echo ' )' >> build_aiter_module.py", "echo 'if __name__ == \"__main__\":' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_aiter_enum\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_custom_all_reduce\")' >> build_aiter_module.py", - "echo ' # build_aiter_module(\"module_attention\")' >> build_aiter_module.py", - "echo ' # build_aiter_module(\"module_norm\")' >> build_aiter_module.py", - "echo ' # build_aiter_module(\"module_cache\")' >> build_aiter_module.py", - "echo ' # build_aiter_module(\"module_mha_fwd\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_norm\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_rmsnorm\")' >> build_aiter_module.py", + "echo ' 
build_aiter_module(\"module_mha_fwd\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_gemm_a8w8_blockscale\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_quant\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_smoothquant\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_moe_sorting\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_moe_asm\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_pa\")' >> build_aiter_module.py", - "echo ' build_aiter_module(\"module_moe\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_attention_asm\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_activation\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_gemm_a8w8_bpreshuffle\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_moe_ck2stages\")' >> build_aiter_module.py", "echo 'echo \"building mla kernel\"' >> build_mla_kernel.sh", "echo 'so_file=\"./csrc/cpp_itfs/mla/asm_mla_decode_fwd_torch_lib.so\"' >> build_mla_kernel.sh", "echo 'if [ -f $so_file ]; then' >> build_mla_kernel.sh", diff --git a/open_source/deps/http.bzl b/open_source/deps/http.bzl index a2d43ba20..83942ae3d 100644 --- a/open_source/deps/http.bzl +++ b/open_source/deps/http.bzl @@ -1,5 +1,4 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file") -load("//3rdparty/composable_kernel:repo.bzl", "ck_repo") def clean_dep(dep): return str(Label(dep)) @@ -49,21 +48,11 @@ def http_deps(): build_file = clean_dep("//:BUILD.pytorch"), ) - http_archive( - name = "composable_kernel_archive", - sha256 = "b31d8b9b1ebf6d1937198b257a2c36c18c394895bce6325630669f957d583094", - urls = [ - "https://search-ad.oss-cn-hangzhou-zmf-internal.aliyuncs.com/amd_pkgs/composable_kernel_archive.tar.gz", - ], - build_file = clean_dep("//3rdparty/composable_kernel:ck.BUILD"), - strip_prefix = "composable_kernel_archive", - ) - http_archive( name = "torch_rocm", - sha256 = "8ccd35611d0f761e570f7904ecbbe27cfa4f48253abc48884b95e7bfaa936e7c", + sha256 = "39287cb8d52e4a71eb7194727365aa6be767da614fa5e44468155c90ba4a4e0c", urls = [ - "https://repo.radeon.com/rocm/manylinux/rocm-rel-6.3/torch-2.4.0%2Brocm6.3.0-cp310-cp310-linux_x86_64.whl" + "https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torch-2.4.1%2Brocm6.4.1.git4e7ae583-cp310-cp310-linux_x86_64.whl" ], type = "zip", build_file = clean_dep("//:BUILD.pytorch"), @@ -71,11 +60,13 @@ def http_deps(): http_archive( name = "aiter", - sha256 = "88178ba538a58fd82e2fbccfd9dcb3dbcc85eb5d75814ea5b4243b048beb5898", + sha256 = "08e90279560e2e066298e976b7a944d6de54e8b2559a207382b112cc60adcf58", urls = [ - "https://search-ad.oss-cn-hangzhou-zmf-internal.aliyuncs.com/amd_pkgs/aiter-0.1.0-py3-none-any.whl", + "https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl", ], type = "zip", + patches = ["//3rdparty/aiter:refine-aiter-asm-dir.patch", "//3rdparty/aiter:aiter-flash_attn.patch"], + patch_args = ["-p1"], build_file = clean_dep("//:BUILD.aiter"), ) @@ -131,5 +122,3 @@ def http_deps(): urls = ["http://search-ad.oss-cn-hangzhou-zmf.aliyuncs.com/pkg%2F3fs%2Fhf3fs-1.2.0-1.alios7.x86_64.rpm"], sha256 = "d5c9ce8474f6bf2177c11c4dc36acf633b5d4763353cd70156b0a0b2d54b8316" ) - - ck_repo(name = "composable_kernel") diff --git a/open_source/deps/requirements_lock_rocm.txt b/open_source/deps/requirements_lock_rocm.txt index 4dcfd1f93..78f8babe7 100644 --- 
a/open_source/deps/requirements_lock_rocm.txt +++ b/open_source/deps/requirements_lock_rocm.txt @@ -114,8 +114,8 @@ aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via aiohttp -aiter @ https://search-ad.oss-cn-hangzhou-zmf-internal.aliyuncs.com/amd_pkgs/aiter-0.1.0-py3-none-any.whl \ - --hash=sha256:88178ba538a58fd82e2fbccfd9dcb3dbcc85eb5d75814ea5b4243b048beb5898 +aiter @ https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl \ + --hash=sha256:08e90279560e2e066298e976b7a944d6de54e8b2559a207382b112cc60adcf58 # via -r open_source/deps/requirements_rocm.txt aliyun-python-sdk-core==2.15.2 \ --hash=sha256:54f66a53e193c61c5e16ea4505a0cab43543f8ad2ef22833f69c4d5e5151c17d @@ -2266,8 +2266,8 @@ python-dateutil==2.9.0.post0 \ # via # matplotlib # pandas -pytorch-triton-rocm @ http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Fpytorch_triton_rocm-3.0.0%2Brocm6.3.0.75cc27c26a-cp310-cp310-linux_x86_64.whl \ - --hash=sha256:aa87bc8a5b62a0cd6a3170fda1924c19e7c9801a0f868fad1035e1f8e0a8b0dc +pytorch-triton-rocm @ https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/pytorch_triton_rocm-3.0.0%2Brocm6.4.1.git75cc27c2-cp310-cp310-linux_x86_64.whl \ + --hash=sha256:350947d4a8a056ec3ff6f18263eae1ee34a21ea6597849fa9b0365989f978b12 # via # -r open_source/deps/requirements_rocm.txt # torch @@ -2975,8 +2975,8 @@ tokenizers==0.20.3 \ --hash=sha256:fbaf3ea28fedfb2283da60e710aff25492e795a7397cad8a50f1e079b65a5a70 \ --hash=sha256:ff1ef8bd47a02b0dc191688ccb4da53600df5d4c9a05a4b68e1e3de4823e78eb # via transformers -torch @ http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Ftorch-2.4.0%2Brocm6.3.0-cp310-cp310-linux_x86_64.whl \ - --hash=sha256:8ccd35611d0f761e570f7904ecbbe27cfa4f48253abc48884b95e7bfaa936e7c +torch @ https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torch-2.4.1%2Brocm6.4.1.git4e7ae583-cp310-cp310-linux_x86_64.whl \ + --hash=sha256:39287cb8d52e4a71eb7194727365aa6be767da614fa5e44468155c90ba4a4e0c # via # -r open_source/deps/requirements_rocm.txt # accelerate @@ -2984,8 +2984,8 @@ torch @ http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Ftorch-2.4.0%2Brocm # sentence-transformers # timm # torchvision -torchvision @ http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Ftorchvision-0.19.0%2Brocm6.3.0-cp310-cp310-linux_x86_64.whl \ - --hash=sha256:c5a47d66ead2620f62ba7921db5a8999f80fe190fbe71a614d26e167d22797ea +torchvision @ https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torchvision-0.19.0%2Brocm6.4.1.git4d41ad71-cp310-cp310-linux_x86_64.whl \ + --hash=sha256:772f6af6c11724639a460524d53521a31393c20e85818a3f4debf030c7f98aa1 # via # -r open_source/deps/requirements_rocm.txt # timm diff --git a/open_source/deps/requirements_rocm.txt b/open_source/deps/requirements_rocm.txt index ffd292300..4dfd1dbad 100644 --- a/open_source/deps/requirements_rocm.txt +++ b/open_source/deps/requirements_rocm.txt @@ -1,8 +1,8 @@ -r requirements_base.txt -http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Fpytorch_triton_rocm-3.0.0%2Brocm6.3.0.75cc27c26a-cp310-cp310-linux_x86_64.whl -http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Ftorch-2.4.0%2Brocm6.3.0-cp310-cp310-linux_x86_64.whl -http://rtp-maga.oss-cn-zhangjiakou.aliyuncs.com/amd%2Ftorchvision-0.19.0%2Brocm6.3.0-cp310-cp310-linux_x86_64.whl 
+https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/pytorch_triton_rocm-3.0.0%2Brocm6.4.1.git75cc27c2-cp310-cp310-linux_x86_64.whl +https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torch-2.4.1%2Brocm6.4.1.git4e7ae583-cp310-cp310-linux_x86_64.whl +https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torchvision-0.19.0%2Brocm6.4.1.git4d41ad71-cp310-cp310-linux_x86_64.whl pyrsmi pyyaml -https://search-ad.oss-cn-hangzhou-zmf-internal.aliyuncs.com/amd_pkgs/aiter-0.1.0-py3-none-any.whl -https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis%2FAMD%2Famd_smi%2Fali%2Famd_smi.tar \ No newline at end of file +https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl +https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis%2FAMD%2Famd_smi%2Fali%2Famd_smi.tar From e2a917a3979372835ee8714baa8d8a3ba5db537e Mon Sep 17 00:00:00 2001 From: Hang Yang Date: Mon, 29 Sep 2025 17:47:08 +0800 Subject: [PATCH 3/9] refactor rocm fp8 --- rtp_llm/config/gpt_init_model_parameters.py | 3 --- rtp_llm/cpp/cache/BUILD | 1 + rtp_llm/cpp/cache/KVCacheAllocator.cc | 15 +-------------- rtp_llm/cpp/core/BUILD | 8 ++++++++ rtp_llm/cpp/core/DeviceTypes.h | 9 +++++++++ rtp_llm/cpp/kernels/BUILD | 1 + rtp_llm/cpp/kernels/unfused_attention_kernels.cu | 2 +- 7 files changed, 21 insertions(+), 18 deletions(-) create mode 100644 rtp_llm/cpp/core/DeviceTypes.h diff --git a/rtp_llm/config/gpt_init_model_parameters.py b/rtp_llm/config/gpt_init_model_parameters.py index 7e9a59bf7..1b720a816 100644 --- a/rtp_llm/config/gpt_init_model_parameters.py +++ b/rtp_llm/config/gpt_init_model_parameters.py @@ -1149,9 +1149,6 @@ def update_common( logging.info(f"pre_allocate_op_mem: {self.pre_allocate_op_mem}") logging.info(f"tp_split_emb_and_lm_head: {self.tp_split_emb_and_lm_head}") - if os.environ.get("ROCM_KV_CACHE_DATATYPE", "") == "fp8": - self.kv_cache_data_type = WEIGHT_TYPE.FP8.to_str() - # use environment variables to update stop_words_str and stop_words_id env_stop_words_str = self.py_env_configs.generate_env_config.stop_words_str env_stop_words_id = self.py_env_configs.generate_env_config.stop_words_list diff --git a/rtp_llm/cpp/cache/BUILD b/rtp_llm/cpp/cache/BUILD index 29f82b640..98b6c1a6f 100644 --- a/rtp_llm/cpp/cache/BUILD +++ b/rtp_llm/cpp/cache/BUILD @@ -34,6 +34,7 @@ cc_library( "BlockLRUCache.cc", ], deps = torch_deps() + [ + "//rtp_llm/cpp/core:device_types_hdr", "//rtp_llm/cpp/metrics:metrics", "//rtp_llm/cpp/dataclass:dataclass", "//rtp_llm/cpp/model_rpc:model_rpc_pool", diff --git a/rtp_llm/cpp/cache/KVCacheAllocator.cc b/rtp_llm/cpp/cache/KVCacheAllocator.cc index 00a0c5075..38a1f1057 100644 --- a/rtp_llm/cpp/cache/KVCacheAllocator.cc +++ b/rtp_llm/cpp/cache/KVCacheAllocator.cc @@ -5,12 +5,7 @@ #include "rtp_llm/cpp/disaggregate/cache_store/NormalCacheStore.h" #include "rtp_llm/cpp/core/Buffer.h" #include "rtp_llm/cpp/core/Types.h" -#if USING_ROCM -#include -#endif -#ifdef ENABLE_FP8 -#include -#endif +#include "rtp_llm/cpp/core/DeviceTypes.h" using namespace std; @@ -99,11 +94,7 @@ void KVCacheAllocator::initKVCacheScale() { (size_t)config_.block_nums, (size_t)config_.local_head_num_kv, (size_t)config_.seq_size_per_block}, -#ifdef USING_ROCM - (__hip_fp8_e4m3_fnuz*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2); -#else (__nv_fp8_e4m3*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2); -#endif kv_cache_.v_scale = std::make_unique( rtp_llm::MemoryType::MEMORY_GPU, rtp_llm::DataType::TYPE_FP32, @@ -111,11 
+102,7 @@ void KVCacheAllocator::initKVCacheScale() { (size_t)config_.block_nums, (size_t)config_.local_head_num_kv, (size_t)config_.seq_size_per_block}, -#ifdef USING_ROCM - (__hip_fp8_e4m3_fnuz*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2 + kv_cache_.k_scale->sizeBytes()); -#else (__nv_fp8_e4m3*)cache_base_ptr_ + kv_cache_.k_blocks->sizeBytes() * 2 + kv_cache_.k_scale->sizeBytes()); -#endif Buffer2torchTensor(kv_cache_.k_scale, false).fill_(1.0); Buffer2torchTensor(kv_cache_.v_scale, false).fill_(1.0); } diff --git a/rtp_llm/cpp/core/BUILD b/rtp_llm/cpp/core/BUILD index 16f6488f7..65af0d86a 100644 --- a/rtp_llm/cpp/core/BUILD +++ b/rtp_llm/cpp/core/BUILD @@ -13,6 +13,14 @@ cc_library( visibility = ["//visibility:public"], ) +cc_library( + name = "device_types_hdr", + hdrs = [ + "DeviceTypes.h", + ], + visibility = ["//visibility:public"], +) + cc_library( name = "types", srcs = [ diff --git a/rtp_llm/cpp/core/DeviceTypes.h b/rtp_llm/cpp/core/DeviceTypes.h new file mode 100644 index 000000000..b992b84f3 --- /dev/null +++ b/rtp_llm/cpp/core/DeviceTypes.h @@ -0,0 +1,9 @@ +#pragma once + +#if USING_ROCM +#include +typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3; +#endif +#ifdef ENABLE_FP8 +#include +#endif diff --git a/rtp_llm/cpp/kernels/BUILD b/rtp_llm/cpp/kernels/BUILD index eacb107ed..0c982cb56 100644 --- a/rtp_llm/cpp/kernels/BUILD +++ b/rtp_llm/cpp/kernels/BUILD @@ -345,6 +345,7 @@ cc_library( "unfused_attention_kernels.h", ], deps = any_cuda_deps + [ + "//rtp_llm/cpp/core:device_types_hdr", "//rtp_llm/cpp/utils:core_utils", "//rtp_llm/cpp/model_utils:model_utils", "//rtp_llm/cpp/cuda:cuda_utils_cu", diff --git a/rtp_llm/cpp/kernels/unfused_attention_kernels.cu b/rtp_llm/cpp/kernels/unfused_attention_kernels.cu index a0dd7955d..105989625 100644 --- a/rtp_llm/cpp/kernels/unfused_attention_kernels.cu +++ b/rtp_llm/cpp/kernels/unfused_attention_kernels.cu @@ -21,11 +21,11 @@ #include "rtp_llm/cpp/kernels/rotary_position_embedding.h" #include "rtp_llm/cpp/kernels/unfused_attention_kernels.h" #include "rtp_llm/cpp/cuda/cuda_type_utils.cuh" +#include "rtp_llm/cpp/core/DeviceTypes.h" #if USING_CUDA #include "rtp_llm/cpp/cuda/cuda_host_utils.h" #endif #if USING_ROCM -typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3; #include "rtp_llm/cpp/rocm/cuda_shims.h" #endif #include From 141efff29a71398f67f6702ff0cb0fdfccc2784a Mon Sep 17 00:00:00 2001 From: Hang Yang Date: Wed, 8 Oct 2025 16:23:40 +0800 Subject: [PATCH 4/9] fix: rocm remove libstdc++ preload in .bazelrc --- .bazelrc | 1 - rtp_llm/cpp/devices/rocm_impl/test/BUILD | 1 - 2 files changed, 2 deletions(-) diff --git a/.bazelrc b/.bazelrc index 0ab78ab92..68145567f 100644 --- a/.bazelrc +++ b/.bazelrc @@ -167,7 +167,6 @@ build:asan --linkopt -fsanitize=address test:rocm --test_env PATH="/opt/rocm/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/opt/conda310/bin:/opt/conda310/condabin:/usr/share/Modules/bin:/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/X11R6/bin:/opt/cmake/cmake-3.26.4/bin:$PATH" test:rocm --test_env HOME=/home/admin test:rocm --test_env LD_LIBRARY_PATH="/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rocm/lib:/opt/conda310/lib/:/usr/lib64:/opt/amdgpu/lib64:$LD_LIBRARY_PATH" -test:rocm --test_env LD_PRELOAD="/opt/conda310/lib/libstdc++.so" test --test_env LD_LIBRARY_PATH="/opt/rocm/lib:/opt/conda310/lib/:/usr/local/nvidia/lib64:/usr/lib64:/usr/local/cuda/lib64:/opt/amdgpu/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" test --test_env OMP_NUM_THREADS=8 test --test_env FT_SERVER_TEST="1" diff --git 
a/rtp_llm/cpp/devices/rocm_impl/test/BUILD b/rtp_llm/cpp/devices/rocm_impl/test/BUILD index fb305d9a8..4e102cd0b 100644 --- a/rtp_llm/cpp/devices/rocm_impl/test/BUILD +++ b/rtp_llm/cpp/devices/rocm_impl/test/BUILD @@ -31,7 +31,6 @@ test_linkopts = [ "-lrocprofiler64v2", "-lrocprofiler-sdk", "-lhsa-runtime64", - "-lrocprofiler-register", "-lrocprofiler64", "-lrocprofiler-sdk-roctx", "-lhipsparse", From 3198ba52a6150eb00a5fd04a09f51036b44fedcf Mon Sep 17 00:00:00 2001 From: YilinZhao Date: Thu, 25 Sep 2025 16:12:49 +0800 Subject: [PATCH 5/9] refactor RotaryEmbedding and swizzle --- rtp_llm/config/gpt_init_model_parameters.py | 1 - rtp_llm/cpp/devices/BUILD | 1 + rtp_llm/cpp/devices/DeviceBase.h | 4 - rtp_llm/cpp/devices/OpData.h | 2 - .../cpp/devices/base_impl/AttentionLayer.cc | 9 +- .../devices/base_tests/AttentionOpTest.hpp | 15 +- .../cpp/devices/cuda_impl/CudaAttentionOp.cc | 50 +--- rtp_llm/cpp/devices/cuda_impl/CudaDevice.h | 3 - .../cpp/devices/rocm_impl/ROCmAttentionOp.cc | 24 +- rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h | 1 - rtp_llm/cpp/devices/rocm_impl/aiterPA.cc | 2 +- .../cpp/kernels/rotary_position_embedding.h | 88 +++---- .../cpp/kernels/unfused_attention_kernels.cu | 220 ++++++++---------- .../cpp/kernels/unfused_attention_kernels.h | 152 +++++------- rtp_llm/cpp/models/GptModel.cc | 9 +- rtp_llm/cpp/models/GptModel.h | 1 - rtp_llm/cpp/utils/BUILD | 20 +- rtp_llm/cpp/utils/RopeCosSin.cc | 54 +++++ rtp_llm/cpp/utils/RopeCosSin.h | 23 ++ rtp_llm/device/device_impl.py | 2 +- rtp_llm/model_loader/attn_weight.py | 48 +++- rtp_llm/model_loader/ffn_weight.py | 76 ++++++ rtp_llm/model_loader/load_config.py | 1 + rtp_llm/model_loader/model_weight_info.py | 2 + .../per_channel_fp8_quant_weight.py | 4 +- rtp_llm/models/base_model.py | 4 - rtp_llm/models/qwen_v2.py | 4 - .../bindings/cuda/FusedRopeKVCacheOp.cc | 2 + .../bindings/rocm/FusedRopeKVCacheOp.cc | 1 + 29 files changed, 454 insertions(+), 369 deletions(-) create mode 100644 rtp_llm/cpp/utils/RopeCosSin.cc create mode 100644 rtp_llm/cpp/utils/RopeCosSin.h diff --git a/rtp_llm/config/gpt_init_model_parameters.py b/rtp_llm/config/gpt_init_model_parameters.py index 1b720a816..b1e82d939 100644 --- a/rtp_llm/config/gpt_init_model_parameters.py +++ b/rtp_llm/config/gpt_init_model_parameters.py @@ -649,7 +649,6 @@ def update_gpt_init_params_from_env( ), use_swizzleA = ( get_env_bool("USE_SWIZZLEA", False) - and get_env_str("MODEL_TYPE", "") in ("qwen_2", "qwen_3") ), ft_disable_custom_ar=get_env_bool("FT_DISABLE_CUSTOM_AR", True), enable_cuda_graph=get_env_bool("ENABLE_CUDA_GRAPH", False), diff --git a/rtp_llm/cpp/devices/BUILD b/rtp_llm/cpp/devices/BUILD index 585258fd4..2081fe0c2 100644 --- a/rtp_llm/cpp/devices/BUILD +++ b/rtp_llm/cpp/devices/BUILD @@ -31,6 +31,7 @@ cc_library( "//rtp_llm/cpp/core:event", "//rtp_llm/cpp/utils:core_utils", "//rtp_llm/cpp/utils:kv_cache_utils", + "//rtp_llm/cpp/utils:calc_utils", "//rtp_llm/cpp/config:static_config", "//rtp_llm/cpp/model_utils:model_utils", "//rtp_llm/cpp/config:gpt_init_params", diff --git a/rtp_llm/cpp/devices/DeviceBase.h b/rtp_llm/cpp/devices/DeviceBase.h index e0a0baa86..12e83a79e 100644 --- a/rtp_llm/cpp/devices/DeviceBase.h +++ b/rtp_llm/cpp/devices/DeviceBase.h @@ -160,10 +160,6 @@ class DeviceBase: public DeviceOps { return native_graph_capturing_; } - virtual BufferPtr getRotaryEmbeddingCoefficientCache(const RopeConfig & rope_config) { - throw OpException(OpErrorType::ERROR_UNIMPLEMENTED); - } - public: // device-independence op implementations void batchCopy(const 
BatchCopyParams& params) override; diff --git a/rtp_llm/cpp/devices/OpData.h b/rtp_llm/cpp/devices/OpData.h index cbcf0aa7a..f10b04112 100644 --- a/rtp_llm/cpp/devices/OpData.h +++ b/rtp_llm/cpp/devices/OpData.h @@ -614,7 +614,6 @@ struct AttentionModuleParams { const AttentionConfigs& configs; const QScheme qscheme; const DataType compute_type = DataType::TYPE_INVALID; - const BufferPtr rotary_embedding_coefficient_cache = nullptr; }; struct MlaRotaryWriteKVCacheParams { @@ -699,7 +698,6 @@ struct AttentionLayerParams { const DataType compute_type; bool enable_sp; size_t pad_token_num; - const BufferPtr rotary_embedding_coefficient_cache = nullptr; }; struct MoeConfigs { diff --git a/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc b/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc index 838fa9c28..edc1c6a39 100644 --- a/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc +++ b/rtp_llm/cpp/devices/base_impl/AttentionLayer.cc @@ -149,8 +149,7 @@ BufferPtr DeviceBase::attentionAttn(const AttentionLayerParams& params) { params.weights, params.configs, params.qscheme, - params.compute_type, - params.rotary_embedding_coefficient_cache}); + params.compute_type}); } if (context_batch_size) { auto context_qkv = qkv.view(generate_batch_size, context_token_num); @@ -166,8 +165,7 @@ BufferPtr DeviceBase::attentionAttn(const AttentionLayerParams& params) { params.weights, params.configs, params.qscheme, - params.compute_type, - params.rotary_embedding_coefficient_cache}); + params.compute_type}); } if (layer_kv_cache) { params.common.kv_cache->kv_cache_block_id = kv_cache_block_id; @@ -261,8 +259,7 @@ AttentionLayerOutput DeviceBase::attentionLayer(const AttentionLayerParams& para params.qscheme, params.compute_type, params.enable_sp, - params.pad_token_num, - params.rotary_embedding_coefficient_cache}); + params.pad_token_num}); return {attentionOutGemm({params.layer_id, *attn_out, params.output, diff --git a/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp b/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp index 3061b19cc..8723a8c1d 100644 --- a/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp +++ b/rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp @@ -212,7 +212,7 @@ void AttentionOpTest::contextAttentionOpTest(size_t batch_size, auto attention_mask_device = createDeviceBuffer(attention_mask_host); auto scale_device = createDeviceBuffer(scale_host); #ifdef USING_ROCM - auto rope_config = RopeConfig({rtp_llm::RopeStyle::Base, 128, 1000000}); + auto rope_config = RopeConfig({RopeStyle::Base, (int)head_dim, 10000, 1, 2048, 1, 1}); size_t tokensPerBlock = 16; int block_num = batch_size * ((seq_len + tokensPerBlock - 1) / tokensPerBlock + 1); @@ -258,8 +258,10 @@ void AttentionOpTest::contextAttentionOpTest(size_t batch_size, auto output_data_type = qscheme == QScheme::Qfp8PerTensor ? 
DataType::TYPE_FP8_E4M3 : qkv_input_device->type(); auto qkv_output = device_->allocateBuffer({output_data_type, {batch_size, seq_len, num_heads, head_dim}}); #ifdef USING_ROCM + device_->initParamsRef().use_asm_pa = true; + device_->initParamsRef().max_seq_len = 150000; device_->contextAttention( - {0, *qkv_input_device, *qkv_output, common_inputs, attention_weight, attention_config, qscheme, DataType::TYPE_INVALID, ((ROCmDevice*)device_)->getRotaryEmbeddingCoefficientCache(rope_config)}); + {0, *qkv_input_device, *qkv_output, common_inputs, attention_weight, attention_config, qscheme, DataType::TYPE_INVALID}); auto result_ref = attention->forward(query_states_host, key_states_host, value_states_host, attention_mask_host, std::nullopt, std::nullopt, true, rope_config.base, rope_config.dim); #else device_->contextAttention( @@ -359,7 +361,7 @@ void AttentionOpTest::selfAttentionOpTest(size_t batch_size, auto sequence_lengths_device = createDeviceBuffer(sequence_lengths_host); auto input_lengths_device = createDeviceBuffer(input_lengths_host); #ifdef USING_ROCM - auto rope_config = RopeConfig({rtp_llm::RopeStyle::Base, 128, 1000000}); + auto rope_config = RopeConfig({RopeStyle::Base, (int)head_dim, 10000, 1, 2048, 1, 1}); #else auto rope_config = RopeConfig({RopeStyle::No, (int)head_dim, 10000, 1, 2048, 1, 1}); #endif @@ -411,8 +413,10 @@ void AttentionOpTest::selfAttentionOpTest(size_t batch_size, auto qkv_output = device_->allocateBuffer({qkv_states_device->type(), {token_num, num_heads, head_dim}}); #ifdef USING_ROCM + device_->initParamsRef().use_asm_pa = true; + device_->initParamsRef().max_seq_len = 150000; device_->decoderSelfAttention( - {0, *qkv_states_device, *qkv_output, common_inputs, attention_weight, attention_config, QScheme::NoQuantize, DataType::TYPE_INVALID, ((ROCmDevice*)device_)->getRotaryEmbeddingCoefficientCache(rope_config)}); + {0, *qkv_states_device, *qkv_output, common_inputs, attention_weight, attention_config, QScheme::NoQuantize, DataType::TYPE_INVALID}); auto result_ref = attention->forward( query_states_host, key_states_host, value_states_host, attention_mask_host, k_cache_host, v_cache_host, true, rope_config.base, rope_config.dim); #else @@ -500,7 +504,8 @@ void AttentionOpTest::aiterPageAttentionOpTest(size_t batch_size, auto qkv_states_device = createDeviceBuffer<__nv_bfloat16>(qkv_states_host); auto sequence_lengths_device = createDeviceBuffer(sequence_lengths_host); auto input_lengths_device = createDeviceBuffer(input_lengths_host); - auto rope_config = RopeConfig({RopeStyle::Base, (int)head_dim, 1000000, 1., 0., 0., 40960}); + // auto rope_config = RopeConfig({RopeStyle::Base, (int)head_dim, 1000000, 1., 0., 0., 40960}); + auto rope_config = RopeConfig({RopeStyle::Base, (int)head_dim, 10000, 1, 2048, 1, 1}); // cache manager need one block for preserve and every seq need one block for preserve. 
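    // i.e. (kv_seq_len + tokens_per_block - 1) / tokens_per_block is ceil(kv_seq_len / tokens_per_block),
    // the number of data blocks each sequence needs; the per-sequence "+ 1" and the trailing "+ 1" are the
    // reserve blocks described above.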
auto block_num = 2 * batch_size * ((kv_seq_len + tokens_per_block - 1) / tokens_per_block + 1) + 1; rtp_llm::CacheConfig cache_conf(rtp_llm::KVCacheParam( diff --git a/rtp_llm/cpp/devices/cuda_impl/CudaAttentionOp.cc b/rtp_llm/cpp/devices/cuda_impl/CudaAttentionOp.cc index 1bd39109e..be3f44a19 100644 --- a/rtp_llm/cpp/devices/cuda_impl/CudaAttentionOp.cc +++ b/rtp_llm/cpp/devices/cuda_impl/CudaAttentionOp.cc @@ -14,6 +14,7 @@ #include "rtp_llm/cpp/kernels/kv_cache/kv_cache_utils.h" #include "rtp_llm/cpp/kernels/kv_cache_kernels.h" #include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h" +#include "rtp_llm/cpp/utils/RopeCosSin.h" #ifdef USING_CUDA12 #include "rtp_llm/cpp/devices/cuda_impl/CudaXqa.h" @@ -24,55 +25,6 @@ using namespace rtp_llm; namespace rtp_llm { -torch::Tensor genNormalCosSin(int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings) { - auto inv_freq = - 1.0 / torch::pow(rope_theta, torch::arange(0, rope_dim, 2, torch::kInt64).to(torch::kFloat32) / rope_dim); - auto t = torch::arange(max_position_embeddings, torch::kInt64).to(torch::kFloat32); - t.div_(rope_scale); - auto freqs = torch::outer(t, inv_freq); - auto cos = freqs.cos().to(torch::kFloat32); - auto sin = freqs.sin().to(torch::kFloat32); - auto cos_sin = torch::stack({cos, sin}, 0).permute({1, 2, 0}).reshape({cos.size(0), -1}).contiguous(); - return cos_sin.cuda(); -} - -/** - * @brief Get the Rope Cos Sin object, TODO: move to python - * - * @param device - * @param rope_style - * @param rope_dim - * @param rope_theta - * @param rope_scale - * @param max_position_embeddings - * @return BufferPtr - */ -torch::Tensor -getRopeCosSin(RopeStyle rope_style, int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings) { - RTP_LLM_LOG_INFO("rope: style = %d, dim = %d, theta = %d, scale = %f, max_position_embeddings = %d", - rope_style, - rope_dim, - rope_theta, - rope_scale, - max_position_embeddings); - torch::Tensor cos_sin; - - switch (rope_style) { - case RopeStyle::No: - break; - - case RopeStyle::Base: - cos_sin = genNormalCosSin(rope_dim, rope_theta, rope_scale, max_position_embeddings); - break; - - default: - RTP_LLM_LOG_WARNING("unsupported rope_style = %d, not use rope_cache", rope_style); - break; - } - - return cos_sin; -} - ParamsPtr CudaDevice::prepareTrtAttn(const AttentionConfigs& configs, const BufferPtr& k_cache, const BufferPtr& kv_cache_block_id, diff --git a/rtp_llm/cpp/devices/cuda_impl/CudaDevice.h b/rtp_llm/cpp/devices/cuda_impl/CudaDevice.h index 9f9175c7a..0c15a366d 100644 --- a/rtp_llm/cpp/devices/cuda_impl/CudaDevice.h +++ b/rtp_llm/cpp/devices/cuda_impl/CudaDevice.h @@ -379,7 +379,4 @@ class CudaDevice: public DeviceBase { std::shared_ptr guard_; }; -torch::Tensor -getRopeCosSin(RopeStyle rope_style, int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings); - } // namespace rtp_llm diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc b/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc index ee66de135..cfa4b929f 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc @@ -10,7 +10,7 @@ #include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h" #include "rtp_llm/cpp/devices/rocm_impl/aiterPA.h" #include "rtp_llm/cpp/config/StaticConfig.h" - +#include "rtp_llm/cpp/utils/RopeCosSin.h" #include using namespace std; @@ -669,6 +669,11 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& && !params.configs.fuse_qkv_add_bias); 
RTP_LLM_LOG_DEBUG("skip_add_bias_transpose: %d", skip_add_bias_transpose); if (!skip_add_bias_transpose) { + static torch::Tensor cos_sin_cache = getRopeCosSin(params.configs.rope_config.style, + params.configs.rope_config.dim, + params.configs.rope_config.base, + params.configs.rope_config.scale, + init_params_.max_seq_len); if (init_params_.use_aiter_pa) { if (init_params_.use_asm_pa) { DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, @@ -703,7 +708,7 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams& store_q, store_kv, store_cache, - params.rotary_embedding_coefficient_cache ? params.rotary_embedding_coefficient_cache->data() : nullptr, + cos_sin_cache.defined() ? static_cast(cos_sin_cache.data_ptr()) : nullptr, stream_); } else { RUNTIME_ASSERT_OP_ARG(init_params_.use_asm_pa, "Should use asm_pa"); @@ -1071,6 +1076,11 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara && !params.configs.fuse_qkv_add_bias); printBufferData(*params.common.input_lengths, "input_lengths"); if (!skip_add_bias_transpose) { + static torch::Tensor cos_sin_cache = getRopeCosSin(params.configs.rope_config.style, + params.configs.rope_config.dim, + params.configs.rope_config.base, + params.configs.rope_config.scale, + init_params_.max_seq_len); if (init_params_.use_asm_pa) { DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype, invokeAddFusedQKVBiasTransposeDecode, @@ -1104,7 +1114,7 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara store_q, store_kv, store_cache, - params.rotary_embedding_coefficient_cache ? params.rotary_embedding_coefficient_cache->data() : nullptr, + cos_sin_cache.defined() ? static_cast(cos_sin_cache.data_ptr()) : nullptr, stream_); } else { RUNTIME_ASSERT_OP_ARG(init_params_.use_asm_pa, "Should use asm_pa"); @@ -1138,12 +1148,4 @@ AttentionModuleOutput ROCmDevice::decoderSelfAttention(const AttentionModulePara } } -BufferPtr ROCmDevice::getRotaryEmbeddingCoefficientCache(const RopeConfig & rope_config) { - size_t max_seq_len = 1048576; - auto rotary_embedding_coefficient_cache = allocateBuffer({rtp_llm::DataType::TYPE_FP32, {max_seq_len, (size_t)rope_config.dim / 2, 2}, rtp_llm::AllocationType::DEVICE}); - invokeRotaryEmbeddingCoefficientCache((float2 *)rotary_embedding_coefficient_cache->data(), max_seq_len, rope_config, stream_); - syncAndCheck(); - return rotary_embedding_coefficient_cache; -} - } // namespace rtp_llm diff --git a/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h b/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h index 69fcfbeb9..9fbe8634c 100644 --- a/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h +++ b/rtp_llm/cpp/devices/rocm_impl/ROCmDevice.h @@ -230,7 +230,6 @@ class ROCmDevice: public DeviceBase { int batch_size, bool use_fp8_fmha, bool use_offset_array = false); - BufferPtr getRotaryEmbeddingCoefficientCache(const RopeConfig & rope_config) override; std::shared_ptr getNativeGraphRunner() override { return std::make_shared>(this); diff --git a/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc b/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc index 2f6413a9c..648ff243a 100644 --- a/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc +++ b/rtp_llm/cpp/devices/rocm_impl/aiterPA.cc @@ -41,7 +41,7 @@ void runAiterAsmPA(const AttentionModuleParams& params, if (key_cache.dtype() == at::kFloat8_e4m3fnuz) { K_QScale = Buffer2torchTensor(params.common.kv_cache->k_scale_buffer,false); V_QScale = Buffer2torchTensor(params.common.kv_cache->v_scale_buffer,false); - pa_fwd(query, key_cache, value_cache, block_tables, context_lens, 
max_num_blocks, max_seq_len, K_QScale, V_QScale, out_opt, std::nullopt, 2); + pa_fwd(query, key_cache, value_cache, block_tables, context_lens, max_num_blocks, max_seq_len, K_QScale, V_QScale, out_opt, std::nullopt, 0); } else { pa_fwd(query, key_cache, value_cache, block_tables, context_lens, max_num_blocks, max_seq_len, K_QScale, V_QScale, out_opt); } diff --git a/rtp_llm/cpp/kernels/rotary_position_embedding.h b/rtp_llm/cpp/kernels/rotary_position_embedding.h index 08a6c1479..d7e2bfb9d 100644 --- a/rtp_llm/cpp/kernels/rotary_position_embedding.h +++ b/rtp_llm/cpp/kernels/rotary_position_embedding.h @@ -455,13 +455,13 @@ __device__ __inline__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloa template __device__ __inline__ void -apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (2 * tid >= rot_embed_dim) { return; } float2 coef; - if (rotary_embedding_coefficient_cache) { - coef = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + tid]; + if (cos_sin_cache) { + coef = cos_sin_cache[t_step * rot_embed_dim / 2 + tid]; } else { coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); } @@ -470,7 +470,7 @@ apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, float template __device__ __inline__ void -apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (4 * tid >= rot_embed_dim) { return; } @@ -478,9 +478,9 @@ apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, float Float4_& q_ = *reinterpret_cast(&q); float2 coef0; float2 coef1; - if (rotary_embedding_coefficient_cache) { - coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid]; - coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; + if (cos_sin_cache) { + coef0 = cos_sin_cache[t_step * rot_embed_dim / 2 + 2 * tid]; + coef1 = cos_sin_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; } else { coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); @@ -491,13 +491,13 @@ apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, float template __device__ __inline__ void -apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (2 * tid >= rot_embed_dim) { return; } float2 coef; - if (rotary_embedding_coefficient_cache) { - coef = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + tid]; + if (cos_sin_cache) { + coef = cos_sin_cache[t_step * rot_embed_dim / 2 + tid]; } else { coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); } @@ -506,16 +506,16 @@ apply_rotary_embedding(uint32_t& q, int tid, 
int rot_embed_dim, int t_step, floa template __device__ __inline__ void -apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (4 * tid >= rot_embed_dim) { return; } float2 coef0; float2 coef1; - if (rotary_embedding_coefficient_cache) { - coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid]; - coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; + if (cos_sin_cache) { + coef0 = cos_sin_cache[t_step * rot_embed_dim / 2 + 2 * tid]; + coef1 = cos_sin_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; } else { coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); @@ -526,7 +526,7 @@ apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, float b template __device__ __inline__ void -apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (8 * tid >= rot_embed_dim) { return; } @@ -535,11 +535,11 @@ apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, float b float2 coef1; float2 coef2; float2 coef3; - if (rotary_embedding_coefficient_cache) { - coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid]; - coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 1]; - coef2 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 2]; - coef3 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 3]; + if (cos_sin_cache) { + coef0 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid]; + coef1 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid + 1]; + coef2 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid + 2]; + coef3 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid + 3]; } else { coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base, rope_init); coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base, rope_init); @@ -557,13 +557,13 @@ apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, float b template __device__ __inline__ void apply_rotary_embedding( - __nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { + __nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (2 * tid >= rot_embed_dim) { return; } float2 coef; - if (rotary_embedding_coefficient_cache) { - coef = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + tid]; + if (cos_sin_cache) { + coef = cos_sin_cache[t_step * rot_embed_dim / 2 + tid]; } else { coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base, rope_init); } @@ -572,16 +572,16 @@ __device__ __inline__ void apply_rotary_embedding( template __device__ __inline__ void -apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, float 
base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (4 * tid >= rot_embed_dim) { return; } float2 coef0; float2 coef1; - if (rotary_embedding_coefficient_cache) { - coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid]; - coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; + if (cos_sin_cache) { + coef0 = cos_sin_cache[t_step * rot_embed_dim / 2 + 2 * tid]; + coef1 = cos_sin_cache[t_step * rot_embed_dim / 2 + 2 * tid + 1]; } else { coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base, rope_init); coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base, rope_init); @@ -592,7 +592,7 @@ apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, floa template __device__ __inline__ void -apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, float base, const RopeInit& rope_init, const float2* cos_sin_cache=nullptr) { if (8 * tid >= rot_embed_dim) { return; } @@ -601,11 +601,11 @@ apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, floa float2 coef1; float2 coef2; float2 coef3; - if (rotary_embedding_coefficient_cache) { - coef0 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid]; - coef1 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 1]; - coef2 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 2]; - coef3 = rotary_embedding_coefficient_cache[t_step * rot_embed_dim / 2 + 4 * tid + 3]; + if (cos_sin_cache) { + coef0 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid]; + coef1 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid + 1]; + coef2 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid + 2]; + coef3 = cos_sin_cache[t_step * rot_embed_dim / 2 + 4 * tid + 3]; } else { coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base, rope_init); coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base, rope_init); @@ -629,7 +629,7 @@ __device__ __inline__ void normal_rope(vector_t& x, const float base, const RopeInit& rope_init, const int offset = 0, - const float2 * rotary_embedding_coefficient_cache=nullptr) { + const float2* cos_sin_cache=nullptr) { const int vec_size = vector_size::size; const int rope_idx = tidx * vec_size - offset; const bool work = (rope_idx >= 0 && rope_idx < dim); @@ -642,7 +642,7 @@ __device__ __inline__ void normal_rope(vector_t& x, __syncthreads(); if (work) { RotaryHalfRead(x, smem, rope_tidx, dim / 2); - apply_rotary_embedding(x, rope_tidx, dim, seqidx, base, rope_init, rotary_embedding_coefficient_cache); + apply_rotary_embedding(x, rope_tidx, dim, seqidx, base, rope_init, cos_sin_cache); RotaryHalfWrite(x, smem, rope_tidx, dim / 2); } @@ -703,14 +703,14 @@ get_qwen_dynamic_ntk_base(const int dim, const float base, const int seq_len, co template __device__ inline void -apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int seqidx, int seq_len, const float2 * rotary_embedding_coefficient_cache=nullptr) { +apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int seqidx, int 
seq_len, const float2* cos_sin_cache=nullptr) { auto base = rope_config.base; auto dim = rope_config.dim; switch (ROPE_STYLE) { case RopeStyle::No: break; case RopeStyle::Base: - normal_rope(x, smem, tidx, seqidx, dim, base, LinearScaleRope{rope_config.scale}, 0, rotary_embedding_coefficient_cache); + normal_rope(x, smem, tidx, seqidx, dim, base, LinearScaleRope{rope_config.scale}, 0, cos_sin_cache); break; case RopeStyle::Glm2: // only do rotary embedding for [..., d / 2] @@ -739,7 +739,7 @@ apply_rope(RopeConfig rope_config, vector_t& x, scalar_t* smem, int tidx, int se rope_config.extrapolation_factor, rope_config.mscale}, rope_config.offset, - rotary_embedding_coefficient_cache); + cos_sin_cache); break; case RopeStyle::QwenDynamicNTK: if (seq_len > rope_config.max_pos) { @@ -777,7 +777,7 @@ __device__ inline void context_rope(RopeConfig rope_config, bool PREFIX_PROMPT, int prefix_prompt_length, int count_length, - const float2 * rotary_embedding_coefficient_cache=nullptr) { + const float2* cos_sin_cache=nullptr) { if (PREFIX_PROMPT && count_length) { input_len = input_len + prefix_prompt_length; seqidx = seqidx + prefix_prompt_length; @@ -786,9 +786,9 @@ __device__ inline void context_rope(RopeConfig rope_config, seqidx = position_id; } - apply_rope(rope_config, q, smem, tidx, seqidx, seq_len, rotary_embedding_coefficient_cache); + apply_rope(rope_config, q, smem, tidx, seqidx, seq_len, cos_sin_cache); - apply_rope(rope_config, k, smem, tidx, seqidx, seq_len, rotary_embedding_coefficient_cache); + apply_rope(rope_config, k, smem, tidx, seqidx, seq_len, cos_sin_cache); } template @@ -808,7 +808,7 @@ __device__ inline void attention_rope(RopeConfig rope_config, #pragma nv_diagnostic pop int count_prefix_length, bool handle_kv, - const float2 * rotary_embedding_coefficient_cache=nullptr) { + const float2* cos_sin_cache=nullptr) { if (count_prefix_length) { prefix_prompt_length = 0; } @@ -821,10 +821,10 @@ __device__ inline void attention_rope(RopeConfig rope_config, tlength = tlength - prefix_prompt_length; } - apply_rope(rope_config, q, smem, tidx, tlength, seq_len, rotary_embedding_coefficient_cache); + apply_rope(rope_config, q, smem, tidx, tlength, seq_len, cos_sin_cache); if (handle_kv) { - apply_rope(rope_config, k, smem, tidx, tlength, seq_len, rotary_embedding_coefficient_cache); + apply_rope(rope_config, k, smem, tidx, tlength, seq_len, cos_sin_cache); } } diff --git a/rtp_llm/cpp/kernels/unfused_attention_kernels.cu b/rtp_llm/cpp/kernels/unfused_attention_kernels.cu index 105989625..224354e76 100644 --- a/rtp_llm/cpp/kernels/unfused_attention_kernels.cu +++ b/rtp_llm/cpp/kernels/unfused_attention_kernels.cu @@ -2394,7 +2394,7 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel_v1(T* bool store_q, bool store_kv, bool store_cache, - const float2* rotary_embedding_coefficient_cache) { + const float2* cos_sin_cache) { // This kernel add bias to QKV, which has shape [batch_size, seq_len, 3, // head_num, size_per_head], and QKV split to 3 split buffer q, k, v and // transpose them to [batch_size, head_num, seq_len, size_per_head]. 
For q and @@ -2496,7 +2496,7 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel_v1(T* PREFIX_PROMPT, prefix_prompt_length, param.count_length, - rotary_embedding_coefficient_cache); + cos_sin_cache); if (use_logn_attn) { logn_attention(q, seq_idx, rope_config.max_pos); @@ -2580,32 +2580,32 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel_v1(T* template void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, - T* k_buf, - T* v_buf, - PrefixPromptBatchWeightsParam* param_ptr, - T* QKV, - void* QuantizedQKV, - const int* position_ids, - const T* qkv_bias, - const int* padding_offset, - const int* cu_seqlens, - const int batch_size, - const int seq_len, - const int token_num, - const int head_num, - const int head_num_kv, - const int size_per_head, - const RopeConfig rope_config, - const bool use_logn_attn, - const float* scale, - const int int8_mode, - const bool use_paged_fmha, - const bool store_qkv, - const bool store_q, - const bool store_kv, - const bool store_cache, - const float2 * rotary_embedding_coefficient_cache, - cudaStream_t stream) { + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param_ptr, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + const float2* cos_sin_cache, + cudaStream_t stream) { auto& param = *param_ptr; dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); dim3 grid(token_num, head_num); @@ -2637,7 +2637,7 @@ void invokeAddFusedQKVBiasTransposePrefillV1(T* q_bu store_q, store_kv, store_cache, - rotary_embedding_coefficient_cache); + cos_sin_cache); }); }); }); @@ -2667,7 +2667,7 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* bool store_q, bool store_kv, bool store_cache, - const float2* rotary_embedding_coefficient_cache) { + const float2* cos_sin_cache) { // This kernel add bias to QKV, which has shape [batch_size, seq_len, 3, // head_num, size_per_head], and QKV split to 3 split buffer q, k, v and // transpose them to [batch_size, head_num, seq_len, size_per_head]. 
For q and @@ -2769,7 +2769,7 @@ __global__ void add_fusedQKV_bias_transpose_prefill_kernel(T* PREFIX_PROMPT, prefix_prompt_length, param.count_length, - rotary_embedding_coefficient_cache); + cos_sin_cache); if (use_logn_attn) { logn_attention(q, seq_idx, rope_config.max_pos); @@ -2905,7 +2905,7 @@ void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, - const float2* rotary_embedding_coefficient_cache, + const float2* cos_sin_cache, cudaStream_t stream) { auto& param = *param_ptr; dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -2938,7 +2938,7 @@ void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, store_q, store_kv, store_cache, - rotary_embedding_coefficient_cache); + cos_sin_cache); }); }); }); @@ -3101,7 +3101,7 @@ __global__ void add_fusedQKV_bias_transpose_decode_kernel(T* bool store_q, bool store_kv, bool store_cache, - const float2* rotary_embedding_coefficient_cache) { + const float2* cos_sin_cache) { extern __shared__ __align__(sizeof(float2)) char smem_[]; constexpr int vec_size = Vec_t::size; @@ -3178,7 +3178,7 @@ __global__ void add_fusedQKV_bias_transpose_decode_kernel(T* prefix_prompt_length, true /*count_prefix_length*/, true /*HANDLE_KV*/, - rotary_embedding_coefficient_cache); + cos_sin_cache); if (use_logn_attn) { logn_attention(q, tlength, rope_config.max_pos); @@ -3265,6 +3265,7 @@ void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf const bool store_q, const bool store_kv, const bool store_cache, + const float2* cos_sin_cache, cudaStream_t stream) { auto& param = *param_ptr; dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -3333,7 +3334,7 @@ void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, - const float2* rotary_embedding_coefficient_cache, + const float2* cos_sin_cache, cudaStream_t stream) { auto& param = *param_ptr; dim3 block((size_per_head / Vec_t::size + 31) / 32 * 32); @@ -3368,7 +3369,7 @@ void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, store_q, store_kv, store_cache, - rotary_embedding_coefficient_cache); + cos_sin_cache); }); }); }); @@ -3917,34 +3918,34 @@ INSTANTIATEDECODEADDFUSEDQKVBIASTRANSPOSE(__nv_bfloat16); #undef INSTANTIATEDECODEADDFUSEDQKVBIASTRANSPOSE #if USING_ROCM -#define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(T) \ - template void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, \ - T* k_buf, \ - T* v_buf, \ - PrefixPromptBatchWeightsParam* param, \ - T* QKV, \ - void* QuantizedQKV, \ - const int* position_ids, \ - const T* qkv_bias, \ - const int* padding_offset, \ - const int* cu_seqlens, \ - const int batch_size, \ - const int seq_len, \ - const int token_num, \ - const int head_num, \ - const int head_num_kv, \ - const int size_per_head, \ - const RopeConfig rope_config, \ - const bool use_logn_attn, \ - const float* scale, \ - const int int8_mode, \ - const bool use_paged_fmha, \ - const bool store_qkv, \ - const bool store_q, \ - const bool store_kv, \ - const bool store_cache, \ - const float2 * rotary_embedding_coefficient_cache,\ - cudaStream_t stream) +#define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(T) \ + template void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, \ + T* k_buf, \ + T* v_buf, \ + PrefixPromptBatchWeightsParam* param, \ + T* QKV, \ + void* QuantizedQKV, \ + const int* position_ids, \ + const T* qkv_bias, \ + const int* padding_offset, \ + const int* cu_seqlens, \ + const int batch_size, \ + const int seq_len, \ + const int token_num, \ + const 
int head_num, \ + const int head_num_kv, \ + const int size_per_head, \ + const RopeConfig rope_config, \ + const bool use_logn_attn, \ + const float* scale, \ + const int int8_mode, \ + const bool use_paged_fmha, \ + const bool store_qkv, \ + const bool store_q, \ + const bool store_kv, \ + const bool store_cache, \ + const float2 * cos_sin_cache, \ + cudaStream_t stream) INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(float); INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(half); #ifdef ENABLE_BF16 @@ -3978,8 +3979,8 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILLV1(__nv_bfloat16); const bool store_q, \ const bool store_kv, \ const bool store_cache, \ - const float2* rotary_embedding_coefficient_cache, \ - cudaStream_t stream) + const float2* cos_sin_cache, \ + cudaStream_t stream) INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(float); INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(half); #ifdef ENABLE_BF16 @@ -3989,33 +3990,34 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSEPREFILL(__nv_bfloat16); #define INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(T) \ template void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, \ - T* k_buf, \ - T* v_buf, \ - PrefixPromptBatchWeightsParam* param, \ - const int* input_lengths, \ - T* QKV, \ - void* QuantizedQKV, \ - const int* position_ids, \ - const T* qkv_bias, \ - const int* padding_offset, \ - const int* cu_seqlens, \ - const int* sequence_lengths, \ - const int batch_size, \ - const int seq_len, \ - const int token_num, \ - const int head_num, \ - const int head_num_kv, \ - const int size_per_head, \ - const RopeConfig rope_config, \ - const bool use_logn_attn, \ - const float* scale, \ - const int int8_mode, \ - const bool use_paged_fmha, \ - const bool store_qkv, \ - const bool store_q, \ - const bool store_kv, \ - const bool store_cache, \ - cudaStream_t stream) + T* k_buf, \ + T* v_buf, \ + PrefixPromptBatchWeightsParam* param, \ + const int* input_lengths, \ + T* QKV, \ + void* QuantizedQKV, \ + const int* position_ids, \ + const T* qkv_bias, \ + const int* padding_offset, \ + const int* cu_seqlens, \ + const int* sequence_lengths, \ + const int batch_size, \ + const int seq_len, \ + const int token_num, \ + const int head_num, \ + const int head_num_kv, \ + const int size_per_head, \ + const RopeConfig rope_config, \ + const bool use_logn_attn, \ + const float* scale, \ + const int int8_mode, \ + const bool use_paged_fmha, \ + const bool store_qkv, \ + const bool store_q, \ + const bool store_kv, \ + const bool store_cache, \ + const float2* cos_sin_cache, \ + cudaStream_t stream) INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(float); INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(half); #ifdef ENABLE_BF16 @@ -4051,8 +4053,8 @@ INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODEV1(__nv_bfloat16); const bool store_q, \ const bool store_kv, \ const bool store_cache, \ - const float2* rotary_embedding_coefficient_cache, \ - cudaStream_t stream) + const float2* cos_sin_cache, \ + cudaStream_t stream) INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODE(float); INSTANTIATEADDFUSEDQKVBIASTRANSPOSEDECODE(half); #ifdef ENABLE_BF16 @@ -4137,30 +4139,4 @@ INSTANTIATEINVOKELOADPREFIXKVCACHE(__nv_bfloat16); #endif #undef INSTANTIATEINVOKELOADPREFIXKVCACHE - - - - - - - - - -__global__ void -cache_rotary_embedding_coefficient(float2* rotary_embedding_coefficient_cache, int stride, RopeConfig rope_config) { - int tid = threadIdx.x; - int t_step = blockIdx.x; - // only support RopeStyle::Base for now. 
- rotary_embedding_coefficient_cache[t_step * stride + tid] = rotary_embedding_coefficient( - 2 * tid, rope_config.dim, t_step, rope_config.base, LinearScaleRope{rope_config.scale}); -} - -void invokeRotaryEmbeddingCoefficientCache(float2* rotary_embedding_coefficient_cache, - int max_seq_len, - RopeConfig rope_config, - cudaStream_t stream) { - cache_rotary_embedding_coefficient<<>>( - rotary_embedding_coefficient_cache, rope_config.dim / 2, rope_config); -} - } // namespace rtp_llm diff --git a/rtp_llm/cpp/kernels/unfused_attention_kernels.h b/rtp_llm/cpp/kernels/unfused_attention_kernels.h index ef6d5de3b..d40edbce8 100644 --- a/rtp_llm/cpp/kernels/unfused_attention_kernels.h +++ b/rtp_llm/cpp/kernels/unfused_attention_kernels.h @@ -143,33 +143,33 @@ void invokeGatherSequencesCombined(T* output_q, cudaStream_t stream); template -void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, - T* k_buf, - T* v_buf, - PrefixPromptBatchWeightsParam* param, - T* QKV, - void* QuantizedQKV, - const int* position_ids, - const T* qkv_bias, - const int* padding_offset, - const int* cu_seqlens, - const int batch_size, - const int seq_len, - const int token_num, - const int head_num, - const int head_num_kv, - const int size_per_head, - const RopeConfig rope_config, - const bool use_logn_attn, - const float* scale, - const int int8_mode, - const bool use_paged_fmha, - const bool store_qkv, - const bool store_q, - const bool store_kv, - const bool store_cache, - const float2 * rotary_embedding_coefficient_cache, - cudaStream_t stream); +void invokeAddFusedQKVBiasTransposePrefillV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + const float2* cos_sin_cache, + cudaStream_t stream); template void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, @@ -197,37 +197,38 @@ void invokeAddFusedQKVBiasTransposePrefill(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, - const float2* rotary_embedding_coefficient_cache, + const float2* cos_sin_cache, cudaStream_t stream); template -void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, - T* k_buf, - T* v_buf, - PrefixPromptBatchWeightsParam* param, - const int* input_lengths, - T* QKV, - void* QuantizedQKV, - const int* position_ids, - const T* qkv_bias, - const int* padding_offset, - const int* cu_seqlens, - const int* sequence_lengths, - const int batch_size, - const int seq_len, - const int token_num, - const int head_num, - const int head_num_kv, - const int size_per_head, - const RopeConfig rope_config, - const bool use_logn_attn, - const float* scale, - const int int8_mode, - const bool use_paged_fmha, - const bool store_qkv, - const bool store_q, - const bool store_kv, - const bool store_cache, - cudaStream_t stream); +void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, + T* k_buf, + T* v_buf, + PrefixPromptBatchWeightsParam* param, + const int* input_lengths, + T* QKV, + void* QuantizedQKV, + const int* position_ids, + const T* qkv_bias, + const int* padding_offset, + const int* cu_seqlens, + const int* 
sequence_lengths, + const int batch_size, + const int seq_len, + const int token_num, + const int head_num, + const int head_num_kv, + const int size_per_head, + const RopeConfig rope_config, + const bool use_logn_attn, + const float* scale, + const int int8_mode, + const bool use_paged_fmha, + const bool store_qkv, + const bool store_q, + const bool store_kv, + const bool store_cache, + const float2* cos_sin_cache, + cudaStream_t stream); template void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, T* k_buf, @@ -256,37 +257,8 @@ void invokeAddFusedQKVBiasTransposeDecode(T* q_buf, const bool store_q, const bool store_kv, const bool store_cache, - const float2* rotary_embedding_coefficient_cache, + const float2* cos_sin_cache, cudaStream_t stream); -template -void invokeAddFusedQKVBiasTransposeDecodeV1(T* q_buf, - T* k_buf, - T* v_buf, - PrefixPromptBatchWeightsParam* param, - const int* input_lengths, - T* QKV, - void* QuantizedQKV, - const int* position_ids, - const T* qkv_bias, - const int* padding_offset, - const int* cu_seqlens, - const int* sequence_lengths, - const int batch_size, - const int seq_len, - const int token_num, - const int head_num, - const int head_num_kv, - const int size_per_head, - const RopeConfig rope_config, - const bool use_logn_attn, - const float* scale, - const int int8_mode, - const bool use_paged_fmha, - const bool store_qkv, - const bool store_q, - const bool store_kv, - const bool store_cache, - cudaStream_t stream); #endif template @@ -353,12 +325,4 @@ void invokeLoadPrefixKVCacheAiter(T* q_buf, cudaStream_t stream); #endif - - - - -void invokeRotaryEmbeddingCoefficientCache(float2* rotary_embedding_coefficient_cache, - int max_seq_len, - RopeConfig rope_config, - cudaStream_t stream); } // namespace rtp_llm diff --git a/rtp_llm/cpp/models/GptModel.cc b/rtp_llm/cpp/models/GptModel.cc index 7236f8cad..aa4a5e58e 100644 --- a/rtp_llm/cpp/models/GptModel.cc +++ b/rtp_llm/cpp/models/GptModel.cc @@ -45,12 +45,6 @@ GptModel::GptModel(const GptModelInitParams& params): overall_expert_stats_ = device_->createMoeExpertStates( {layer_num_, moe_conf.ep_size, moe_conf.expert_num, moe_conf.expert_num + moe_conf.extra_expert_num}); } -#if USING_ROCM - auto & rope_config = params.description.attention_conf.rope_config; - if (rope_config.style == RopeStyle::Base) { - rotary_embedding_coefficient_cache_ = device_->getRotaryEmbeddingCoefficientCache(rope_config); - } -#endif } void getPaddingOffsetAndCuSeqLens(int32_t* padding_offset, @@ -1310,8 +1304,7 @@ AttentionBlockOutputs GptModel::forwardAttentionBlock(const GptLayerInputs& description_.act_qscheme, description_.compute_type, enable_sp, - inputs.pad_token_num, - rotary_embedding_coefficient_cache_}); + inputs.pad_token_num}); if (description_.attention_conf.use_mla && device_->mla_ops_type != rtp_llm::MlaOpsType::MHA) { attn_output = device_->mlaAttentionLayer(attn_params); } else { diff --git a/rtp_llm/cpp/models/GptModel.h b/rtp_llm/cpp/models/GptModel.h index a7334bb55..ccb6ffb1b 100644 --- a/rtp_llm/cpp/models/GptModel.h +++ b/rtp_llm/cpp/models/GptModel.h @@ -234,7 +234,6 @@ class GptModel { rtp_llm::BufferPtr v_scale_buffer_; rtp_llm::BufferPtr residual_scale_fp32_; rtp_llm::BufferPtr residual_scale_; - rtp_llm::BufferPtr rotary_embedding_coefficient_cache_ = nullptr; public: rtp_llm::Weights weights_; diff --git a/rtp_llm/cpp/utils/BUILD b/rtp_llm/cpp/utils/BUILD index f5261a98a..02c2afcdf 100644 --- a/rtp_llm/cpp/utils/BUILD +++ b/rtp_llm/cpp/utils/BUILD @@ -24,12 +24,12 @@ cc_library( "StringUtil.h", 
"StatusUtil.h", "Exception.h", - "utils.h", + "utils.h" ], srcs = [ "AssertUtils.cc", "Logger.cc", - "Exception.cc", + "Exception.cc" ], deps = [ "@havenask//aios/alog:alog", @@ -143,4 +143,18 @@ cc_library( visibility = ["//visibility:public"], ) - +cc_library( + name = "calc_utils", + hdrs = [ + "RopeCosSin.h" + ], + srcs = [ + "RopeCosSin.cc" + ], + deps = torch_deps() + [ + ":core_utils", + "//rtp_llm/cpp/model_utils:model_utils" + ], + copts = copts(), + visibility = ["//visibility:public"], +) diff --git a/rtp_llm/cpp/utils/RopeCosSin.cc b/rtp_llm/cpp/utils/RopeCosSin.cc new file mode 100644 index 000000000..cc375463f --- /dev/null +++ b/rtp_llm/cpp/utils/RopeCosSin.cc @@ -0,0 +1,54 @@ +#include "rtp_llm/cpp/utils/RopeCosSin.h" +#include "rtp_llm/cpp/utils/Logger.h" + +namespace rtp_llm { +torch::Tensor genNormalCosSin(int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings) { + auto inv_freq = + 1.0 / torch::pow(rope_theta, torch::arange(0, rope_dim, 2, torch::kInt64).to(torch::kFloat32) / rope_dim); + auto t = torch::arange(max_position_embeddings, torch::kInt64).to(torch::kFloat32); + t.div_(rope_scale); + auto freqs = torch::outer(t, inv_freq); + auto cos = freqs.cos().to(torch::kFloat32); + auto sin = freqs.sin().to(torch::kFloat32); + auto cos_sin = torch::stack({cos, sin}, 0).permute({1, 2, 0}).reshape({cos.size(0), -1}).contiguous(); + return cos_sin.cuda(); +} + +/** + * @brief Get the Rope Cos Sin object, TODO: move to python + * + * @param device + * @param rope_style + * @param rope_dim + * @param rope_theta + * @param rope_scale + * @param max_position_embeddings + * @return BufferPtr + */ +torch::Tensor +getRopeCosSin(RopeStyle rope_style, int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings) { + RTP_LLM_LOG_INFO("rope: style = %d, dim = %d, theta = %d, scale = %f, max_position_embeddings = %d", + rope_style, + rope_dim, + rope_theta, + rope_scale, + max_position_embeddings); + torch::Tensor cos_sin; + + switch (rope_style) { + case RopeStyle::No: + break; + + case RopeStyle::Base: + cos_sin = genNormalCosSin(rope_dim, rope_theta, rope_scale, max_position_embeddings); + break; + + default: + RTP_LLM_LOG_WARNING("unsupported rope_style = %d, not use rope_cache", rope_style); + break; + } + + return cos_sin; +} + +} // namespace rtm_llm diff --git a/rtp_llm/cpp/utils/RopeCosSin.h b/rtp_llm/cpp/utils/RopeCosSin.h new file mode 100644 index 000000000..7c0e74986 --- /dev/null +++ b/rtp_llm/cpp/utils/RopeCosSin.h @@ -0,0 +1,23 @@ +#pragma once +#include +#include "rtp_llm/cpp/model_utils/RopeConfig.h" + +namespace rtp_llm { + +torch::Tensor genNormalCosSin(int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings); + +/** + * @brief Get the Rope Cos Sin object, TODO: move to python + * + * @param device + * @param rope_style + * @param rope_dim + * @param rope_theta + * @param rope_scale + * @param max_position_embeddings + * @return BufferPtr + */ +torch::Tensor +getRopeCosSin(RopeStyle rope_style, int rope_dim, int rope_theta, float rope_scale, int max_position_embeddings); + +} // namespace rtp_llm diff --git a/rtp_llm/device/device_impl.py b/rtp_llm/device/device_impl.py index 2a75770ef..7e68493ca 100644 --- a/rtp_llm/device/device_impl.py +++ b/rtp_llm/device/device_impl.py @@ -566,7 +566,7 @@ def shuffle_gemm_weight(self, x: torch.Tensor) -> torch.Tensor: return x_ def swizzle_gemm_weight(self, src: torch.Tensor, col_maj: bool = False) -> torch.Tensor: - src = swizzle_tensor(src, False) + src = 
swizzle_tensor(src, col_maj) return src def convert_fp8_weight_params( diff --git a/rtp_llm/model_loader/attn_weight.py b/rtp_llm/model_loader/attn_weight.py index 51a7e6738..9a261f9ed 100644 --- a/rtp_llm/model_loader/attn_weight.py +++ b/rtp_llm/model_loader/attn_weight.py @@ -1,10 +1,10 @@ -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List, Optional, Union, Dict import torch from pydantic import BaseModel - +from rtp_llm.model_loader.load_config import LoadConfig from rtp_llm.model_loader.weight_module import AtomicWeight -from rtp_llm.utils.model_weight import CkptWeightInfo, identity +from rtp_llm.utils.model_weight import CkptWeightInfo, identity, W class AttnConfig(BaseModel): @@ -30,6 +30,48 @@ def __init__( self.config = config super().__init__(name, weights, process_fun, data_type, *args, **kwargs) + def _swizzle_gemm_weight( + self, + name: str, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + load_config: LoadConfig, + ): + if name not in (W.attn_qkv_w, W.attn_o_w): + raise ValueError(f"unsupported swizzle name: {name}") + if isinstance(tensor, dict): + w = tensor.get(name) + if isinstance(w, torch.Tensor): + w = load_config.exported_device.swizzle_gemm_weight(w, w.dtype != torch.float8_e4m3fn) + tensor[name] = w + elif isinstance(w, dict): + self._swizzle_gemm_weight(name, w, load_config) + else: + raise TypeError(f"unsupported type at key {name}: {type(w)}") + + elif isinstance(tensor, torch.Tensor): + swizzled = load_config.exported_device.swizzle_gemm_weight(tensor, tensor.dtype != torch.float8_e4m3fn) + return swizzled + + def _postprocess( + self, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + device: str, + load_config: LoadConfig + ): + if load_config.use_swizzleA: + if isinstance(tensor, torch.Tensor): + if getattr(self, "name", None) in (W.attn_qkv_w, W.attn_o_w): + tensor = self._swizzle_gemm_weight(self.name, tensor, load_config) + return super()._postprocess(tensor, device, load_config) + + for key in (W.attn_qkv_w, W.attn_o_w): + w = tensor.get(key) + if isinstance(w, dict): + self._swizzle_gemm_weight(key, w, load_config) + elif isinstance(w, torch.Tensor): + self._swizzle_gemm_weight(key, tensor, load_config) + + return super()._postprocess(tensor, device, load_config) class MlaConfig(BaseModel): head_num: int = -1 diff --git a/rtp_llm/model_loader/ffn_weight.py b/rtp_llm/model_loader/ffn_weight.py index 34ffb4ba8..02587cd8e 100644 --- a/rtp_llm/model_loader/ffn_weight.py +++ b/rtp_llm/model_loader/ffn_weight.py @@ -267,6 +267,36 @@ def _split( return tensor return super()._split(tensor, load_config) + def _swizzle_gemm_weight( + self, + name: str, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + load_config: LoadConfig, + ): + w = tensor.get(name) + if isinstance(w, torch.Tensor): + w = load_config.exported_device.swizzle_gemm_weight(w, w.dtype != torch.float8_e4m3fn) + tensor[name] = w + elif isinstance(w, dict): + self._swizzle_gemm_weight(name, w, load_config) + else: + raise ValueError("unsupported type") + + def _postprocess( + self, tensor: Dict[str, torch.Tensor], device: str, load_config: LoadConfig + ): + if load_config.use_swizzleA: + ffn_w13 = tensor.get(W.ffn_w13) + ffn_w2 = tensor.get(W.ffn_w2) + for weight, keys in [(ffn_w13, [W.ffn_w13]), (ffn_w2, [W.ffn_w2]),]: + if isinstance(weight, dict): + for key in keys: + if key in weight: + self._swizzle_gemm_weight(key, weight, load_config) + else: + self._swizzle_gemm_weight(keys[0], tensor, load_config) + return 
super()._postprocess(tensor, device, load_config) + class MoeConfig(BaseModel): is_moe: bool = True @@ -357,6 +387,21 @@ def support( ) -> bool: return False + def _swizzle_gemm_weight( + self, + name: str, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + load_config: LoadConfig, + ): + w = tensor.get(name) + if isinstance(w, torch.Tensor): + w = load_config.exported_device.swizzle_gemm_weight(w, w.dtype != torch.float8_e4m3fn) + tensor[name] = w + elif isinstance(w, dict): + self._swizzle_gemm_weight(name, w, load_config) + else: + raise ValueError("unsupported type") + def _shuff_moe_weight( self, name: str, @@ -379,6 +424,7 @@ def _postprocess( ): moe_w1 = tensor.get(W.moe_w1) moe_w2 = tensor.get(W.moe_w2) + moe_gate = tensor.get(W.moe_gate) for weight, keys in [ (moe_w1, [W.moe_w1, W.moe_s1]), (moe_w2, [W.moe_w2, W.moe_s2]), @@ -389,6 +435,13 @@ def _postprocess( self._shuff_moe_weight(key, weight, load_config) else: self._shuff_moe_weight(keys[0], tensor, load_config) + if load_config.use_swizzleA: + if moe_gate is not None: + if isinstance(moe_gate, dict): + if W.moe_gate in moe_gate: + self._swizzle_gemm_weight(W.moe_gate, moe_gate, load_config) + else: + self._swizzle_gemm_weight(W.moe_gate, tensor, load_config) return super()._postprocess(tensor, device, load_config) @@ -446,6 +499,21 @@ def support( ) -> bool: return False + def _swizzle_gemm_weight( + self, + name: str, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + load_config: LoadConfig, + ): + w = tensor.get(name) + if isinstance(w, torch.Tensor): + w = load_config.exported_device.swizzle_gemm_weight(w, w.dtype != torch.float8_e4m3fn) + tensor[name] = w + elif isinstance(w, dict): + self._swizzle_gemm_weight(name, w, load_config) + else: + raise ValueError("unsupported type") + def _shuff_moe_weight( self, name: str, @@ -477,6 +545,7 @@ def _postprocess( ): moe_w1 = tensor.get(W.moe_w1) moe_w2 = tensor.get(W.moe_w2) + moe_gate = tensor.get(W.moe_gate) for weight, keys in [ (moe_w1, [W.moe_w1, W.moe_s1]), (moe_w2, [W.moe_w2, W.moe_s2]), @@ -487,4 +556,11 @@ def _postprocess( self._shuff_moe_weight(key, weight, load_config) else: self._shuff_moe_weight(keys[0], tensor, load_config) + if load_config.use_swizzleA: + if moe_gate is not None: + if isinstance(moe_gate, dict): + if W.moe_gate in moe_gate: + self._swizzle_gemm_weight(W.moe_gate, moe_gate, load_config) + else: + self._swizzle_gemm_weight(W.moe_gate, tensor, load_config) return super()._postprocess(tensor, device, load_config) diff --git a/rtp_llm/model_loader/load_config.py b/rtp_llm/model_loader/load_config.py index ca782241a..ff3c2b639 100644 --- a/rtp_llm/model_loader/load_config.py +++ b/rtp_llm/model_loader/load_config.py @@ -52,6 +52,7 @@ class LoadConfig(BaseModel): exported_device: Optional[Any] = None phy2log: Optional[List[List[int]]] = None + use_swizzleA: bool = False @field_validator("database", "compute_dtype", "quant_algo", "exported_device") @classmethod diff --git a/rtp_llm/model_loader/model_weight_info.py b/rtp_llm/model_loader/model_weight_info.py index 6e59d2d04..d10d33b51 100644 --- a/rtp_llm/model_loader/model_weight_info.py +++ b/rtp_llm/model_loader/model_weight_info.py @@ -146,6 +146,7 @@ class ModelDeployWeightInfo: def __init__(self, config: GptInitModelParameters, tp_size: int, tp_rank: int): self.config = config + self._use_swizzleA = config.hw_kernel_config.use_swizzleA self._use_qk_norm = config.qk_norm self._hidden_size = config.hidden_size self._inter_size = config.inter_size @@ -585,6 +586,7 @@ def 
create_load_config( is_ft_style_weight=database.is_ft_style, phy2log=self.config.phy2log, # Notice use config, because phy2log init after ModelDeployWeightInfo.__init__ exported_device=exported_device, + use_swizzleA=self._use_swizzleA ) return load_config diff --git a/rtp_llm/model_loader/per_channel_fp8_quant_weight.py b/rtp_llm/model_loader/per_channel_fp8_quant_weight.py index 07fc14315..810384585 100644 --- a/rtp_llm/model_loader/per_channel_fp8_quant_weight.py +++ b/rtp_llm/model_loader/per_channel_fp8_quant_weight.py @@ -365,8 +365,8 @@ def _postprocess( # need reshape for kernel weight processed_res = super()._postprocess(tensor, device, load_config) kernel_weight = processed_res[self.kernel.name] - if self.kernel.name not in [W.moe_w1, W.moe_w2]: - kernel_weight = load_config.exported_device.swizzle_gemm_weight(kernel_weight, False) + if self.kernel.name not in [W.moe_w1, W.moe_w2] and not load_config.use_swizzleA: + kernel_weight = load_config.exported_device.shuffle_gemm_weight(kernel_weight) kernel_weight = ( kernel_weight.reshape(kernel_weight.shape[-1], -1) if kernel_weight.dim() == 2 diff --git a/rtp_llm/models/base_model.py b/rtp_llm/models/base_model.py index bb319789e..d3524964e 100644 --- a/rtp_llm/models/base_model.py +++ b/rtp_llm/models/base_model.py @@ -145,7 +145,6 @@ def from_config( ) -> "BaseModel": model = cls(config) model.load(parallel_info) - model.postprocess_weights() return model @staticmethod @@ -295,6 +294,3 @@ def eval_model_size(config: GptInitModelParameters): @staticmethod def eval_model_param_count(config: GptInitModelParameters): return config.model_param_count - - def postprocess_weights(self): - pass diff --git a/rtp_llm/models/qwen_v2.py b/rtp_llm/models/qwen_v2.py index be67f87bd..adb0eeefd 100644 --- a/rtp_llm/models/qwen_v2.py +++ b/rtp_llm/models/qwen_v2.py @@ -35,7 +35,6 @@ zeros, ) -from rtp_llm.utils.swizzle_utils import do_swizzle def scale_reshape(ts: List[torch.Tensor]): return ts[0].reshape(-1) @@ -417,9 +416,6 @@ def _from_config_json(config: GptInitModelParameters, config_json: Dict[str, Any def get_weight_cls(): return QWenV2Weight - def postprocess_weights(self): - if self.config.hw_kernel_config.use_swizzleA and self.weight.weights[0]["self_attention_weights.query_weight.kernel"].dtype != torch.float8_e4m3fnuz: - do_swizzle(self.weight.weights) class QWenV2Embedding(QWenV2): @classmethod diff --git a/rtp_llm/models_py/bindings/cuda/FusedRopeKVCacheOp.cc b/rtp_llm/models_py/bindings/cuda/FusedRopeKVCacheOp.cc index 6a237f185..bd1d2fe11 100644 --- a/rtp_llm/models_py/bindings/cuda/FusedRopeKVCacheOp.cc +++ b/rtp_llm/models_py/bindings/cuda/FusedRopeKVCacheOp.cc @@ -5,6 +5,8 @@ #include "rtp_llm/cpp/utils/AssertUtils.h" #include "rtp_llm/cpp/devices/cuda_impl/CudaFlashInfer.h" #include "rtp_llm/cpp/core/BufferHelper.h" +#include "rtp_llm/cpp/utils/RopeCosSin.h" + namespace rtp_llm { FusedRopeKVCachePrefillOp::FusedRopeKVCachePrefillOp(const GptInitParameter& gpt_init_parameter): diff --git a/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc b/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc index d5a9f1e90..e9dfcf77a 100644 --- a/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc +++ b/rtp_llm/models_py/bindings/rocm/FusedRopeKVCacheOp.cc @@ -304,6 +304,7 @@ torch::Tensor FusedRopeKVCacheDecodeOp::forward(const torch::Tensor& store_q, store_kv, store_cache, + nullptr, device_->getStream()); } else { assert(false && "not implemented"); From 1471412aaf7fcd7dca5fe1796065e6eb8fb3c749 Mon Sep 17 00:00:00 2001 From: 
"fanfengfeng.fff" Date: Sat, 11 Oct 2025 11:44:12 +0800 Subject: [PATCH 6/9] fix: rm refine-aiter-asm-dir.patch --- 3rdparty/aiter/refine-aiter-asm-dir.patch | 47 ----------------------- WORKSPACE | 6 --- open_source/deps/http.bzl | 2 +- patched_repo.bzl | 27 ------------- rtp_llm/BUILD | 2 +- 5 files changed, 2 insertions(+), 82 deletions(-) delete mode 100644 3rdparty/aiter/refine-aiter-asm-dir.patch delete mode 100644 patched_repo.bzl diff --git a/3rdparty/aiter/refine-aiter-asm-dir.patch b/3rdparty/aiter/refine-aiter-asm-dir.patch deleted file mode 100644 index fd7ee0281..000000000 --- a/3rdparty/aiter/refine-aiter-asm-dir.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/aiter/jit/core.py b/aiter/jit/core.py -index 712feea0..012db7e6 100644 ---- a/aiter/jit/core.py -+++ b/aiter/jit/core.py -@@ -62,35 +62,19 @@ this_dir = os.path.dirname(os.path.abspath(__file__)) - AITER_ROOT_DIR = os.path.abspath(f"{this_dir}/../../") - AITER_LOG_MORE = int(os.getenv("AITER_LOG_MORE", 0)) - --find_aiter = importlib.util.find_spec("aiter") --if find_aiter is not None: -- if find_aiter.submodule_search_locations: -- package_path = find_aiter.submodule_search_locations[0] -- elif find_aiter.origin: -- package_path = find_aiter.origin -- package_path = os.path.dirname(package_path) -- package_parent_path = os.path.dirname(package_path) -- import site -- -- site_packages_dirs = site.getsitepackages() -- # develop mode -- isDevelopMode = (package_path not in site_packages_dirs) and ( -- package_parent_path not in site_packages_dirs -- ) -- if isDevelopMode: -- AITER_META_DIR = AITER_ROOT_DIR -- # install mode -- else: -- AITER_META_DIR = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta/") -+meta_path = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta") -+if os.path.exists(meta_path): -+ AITER_META_DIR = meta_path - else: -- AITER_META_DIR = AITER_ROOT_DIR -- logger.warning("aiter is not installed.") -+ AITER_META_DIR = os.path.abspath(AITER_ROOT_DIR) -+ - sys.path.insert(0, AITER_META_DIR) - AITER_CSRC_DIR = f"{AITER_META_DIR}/csrc" - AITER_GRADLIB_DIR = f"{AITER_META_DIR}/gradlib" - gfx = get_gfx() - AITER_ASM_DIR = f"{AITER_META_DIR}/hsa/{gfx}/" --os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR -+if "AITER_ASM_DIR" not in os.environ: -+ os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR - CK_3RDPARTY_DIR = os.environ.get( - "CK_DIR", f"{AITER_META_DIR}/3rdparty/composable_kernel" - ) diff --git a/WORKSPACE b/WORKSPACE index 8ff5f9cac..7a7e1f075 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -43,9 +43,3 @@ pip_gpu_rocm_torch_install_deps() load("//:def.bzl", "read_release_version") read_release_version(name = "release_version") -load("//:patched_repo.bzl", "patched_pip_repository") - -# 创建打过patch的版本 -patched_pip_repository( - name = "patched_aiter" -) diff --git a/open_source/deps/http.bzl b/open_source/deps/http.bzl index 83942ae3d..034d9284d 100644 --- a/open_source/deps/http.bzl +++ b/open_source/deps/http.bzl @@ -65,7 +65,7 @@ def http_deps(): "https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl", ], type = "zip", - patches = ["//3rdparty/aiter:refine-aiter-asm-dir.patch", "//3rdparty/aiter:aiter-flash_attn.patch"], + patches = ["//3rdparty/aiter:aiter-flash_attn.patch"], patch_args = ["-p1"], build_file = clean_dep("//:BUILD.aiter"), ) diff --git a/patched_repo.bzl b/patched_repo.bzl deleted file mode 100644 index d01eabecc..000000000 --- a/patched_repo.bzl +++ /dev/null @@ -1,27 +0,0 @@ -def _patched_pip_repository_impl(repository_ctx): - 
# 获取原始pip仓库路径:注意,这里我们使用Label来定位原始仓库的BUILD文件,然后取它的目录 - original_path = repository_ctx.path(Label("@pip_gpu_rocm_torch_aiter//:BUILD.bazel")).dirname - - original_path_str = str(original_path) - - # 复制原始仓库内容到当前仓库根目录 - repository_ctx.execute([ - "cp", "-r", original_path_str + "/.", repository_ctx.path("") - ]) - - patch_path = repository_ctx.path(Label("//3rdparty/aiter:refine-aiter-asm-dir.patch")) - patch_path_aiter = repository_ctx.path(Label("//3rdparty/aiter:aiter-flash_attn.patch")) - patch_path_str = str(patch_path) - - result = repository_ctx.execute([ - "sh", "-c", - "cd site-packages && patch -p1 -i " + patch_path_str + " && patch -p1 -i " + str(patch_path_aiter) - ]) - - if result.return_code != 0: - fail("Patch failed: %s" % result.stderr) - -patched_pip_repository = repository_rule( - implementation = _patched_pip_repository_impl, - attrs = {}, -) \ No newline at end of file diff --git a/rtp_llm/BUILD b/rtp_llm/BUILD index ec8aaf97f..6da22a002 100755 --- a/rtp_llm/BUILD +++ b/rtp_llm/BUILD @@ -434,7 +434,7 @@ py_library( ":async_model", ":embedding", "//rtp_llm/tools/convert:convert", - "@patched_aiter//:pkg", + ":aiter", ], "//conditions:default": [ ":models", From fe6f6edfd2bc7ad3c1fc881759127d7a609c9fc3 Mon Sep 17 00:00:00 2001 From: "fanfengfeng.fff" Date: Sat, 11 Oct 2025 11:53:25 +0800 Subject: [PATCH 7/9] fix: rm aiter-flash_attn.patch --- 3rdparty/aiter/aiter-flash_attn.patch | 13 ------------- open_source/deps/http.bzl | 2 -- 2 files changed, 15 deletions(-) delete mode 100644 3rdparty/aiter/aiter-flash_attn.patch diff --git a/3rdparty/aiter/aiter-flash_attn.patch b/3rdparty/aiter/aiter-flash_attn.patch deleted file mode 100644 index ea752e61b..000000000 --- a/3rdparty/aiter/aiter-flash_attn.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py -index 0892939..b84e620 100644 ---- a/aiter/ops/mha.py -+++ b/aiter/ops/mha.py -@@ -1014,7 +1014,7 @@ def _flash_attn_forward( - ret = ret and (not swa) - ret = ret and (q.dtype == dtypes.bf16) - ret = ret and ((return_lse and gfx == "gfx950") or (gfx == "gfx942")) -- return ret -+ return - - q, k, v = [maybe_contiguous(x) for x in (q, k, v)] - if can_impl_fmha_v3_fwd(): \ No newline at end of file diff --git a/open_source/deps/http.bzl b/open_source/deps/http.bzl index 034d9284d..20029ca1e 100644 --- a/open_source/deps/http.bzl +++ b/open_source/deps/http.bzl @@ -65,8 +65,6 @@ def http_deps(): "https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl", ], type = "zip", - patches = ["//3rdparty/aiter:aiter-flash_attn.patch"], - patch_args = ["-p1"], build_file = clean_dep("//:BUILD.aiter"), ) From 89d4c9615945168a3f8a36d9ef7fb05a1ef7ab78 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 11 Oct 2025 17:15:13 +0800 Subject: [PATCH 8/9] patch optimize and a8w8gemm --- 3rdparty/aiter/BUILD | 14 + 3rdparty/aiter/aiter-fmha.patch | 22 - 3rdparty/aiter/{silu.patch => aiter.patch} | 95 +++ 3rdparty/aiter/gemm_a8w8.patch | 549 ++++++++++++++++++ ...-gemm_tune.patch => gemm_blockscale.patch} | 0 3rdparty/aiter/rtp-llm.patch | 11 - BUILD.aiter | 15 + open_source/bazel/arch_select.bzl | 2 +- open_source/deps/git.bzl | 3 +- open_source/deps/http.bzl | 4 +- open_source/deps/requirements_lock_rocm.txt | 4 +- open_source/deps/requirements_rocm.txt | 2 +- rtp_llm/libs/BUILD | 7 +- 13 files changed, 686 insertions(+), 42 deletions(-) delete mode 100644 3rdparty/aiter/aiter-fmha.patch rename 3rdparty/aiter/{silu.patch => aiter.patch} (60%) 
create mode 100644 3rdparty/aiter/gemm_a8w8.patch rename 3rdparty/aiter/{0003-gemm_tune.patch => gemm_blockscale.patch} (100%) delete mode 100644 3rdparty/aiter/rtp-llm.patch diff --git a/3rdparty/aiter/BUILD b/3rdparty/aiter/BUILD index ce7666673..7be1faddc 100644 --- a/3rdparty/aiter/BUILD +++ b/3rdparty/aiter/BUILD @@ -101,6 +101,7 @@ genrule( "aiter/jit/libmodule_activation.so", "aiter/jit/libmodule_rmsnorm.so", "aiter/jit/libmodule_smoothquant.so", + "aiter/jit/libmodule_gemm_a8w8.so", "aiter/jit/libmodule_moe_ck2stages.so" ], cmd = """ @@ -127,6 +128,7 @@ genrule( cp external/aiter_src/aiter/jit/module_norm.so $(location aiter/jit/libmodule_norm.so); cp external/aiter_src/aiter/jit/module_rmsnorm.so $(location aiter/jit/libmodule_rmsnorm.so); cp external/aiter_src/aiter/jit/module_mha_fwd.so $(location aiter/jit/libmodule_mha_fwd.so); + cp external/aiter_src/aiter/jit/module_gemm_a8w8.so $(location aiter/jit/libmodule_gemm_a8w8.so); cp external/aiter_src/aiter/jit/module_moe_ck2stages.so $(location aiter/jit/libmodule_moe_ck2stages.so); """, visibility = ["//visibility:public"], @@ -286,6 +288,18 @@ cc_library( tags = ["rocm","local"], ) +cc_library( + name = "module_gemm_a8w8", + srcs = ["aiter/jit/libmodule_gemm_a8w8.so"], + hdrs = ["csrc/ck_gemm_a8w8/include/gemm_a8w8.h"], + deps = [":cpp_libraries"], + copts = [], + linkopts = [], + strip_include_prefix = "csrc/ck_gemm_a8w8/include/", + visibility = ["//visibility:public"], + tags = ["rocm","local"], +) + cc_library( name = "module_moe_ck2stages", srcs = [ diff --git a/3rdparty/aiter/aiter-fmha.patch b/3rdparty/aiter/aiter-fmha.patch deleted file mode 100644 index 51c150634..000000000 --- a/3rdparty/aiter/aiter-fmha.patch +++ /dev/null @@ -1,22 +0,0 @@ ---- aiter/jit/optCompilerConfig.json -+++ aiter/jit/optCompilerConfig.json -@@ -619,6 +619,7 @@ - "verbose": "False", - "hip_clang_path": "os.environ.get('MHA_HIP_CLANG_PATH')", - "blob_gen_cmd": [ -+ "f'{get_asm_dir()}/fmha_v3_fwd/codegen.py --output_dir {{}}'", - "f'{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd --receipt 600 --output_dir {{}}'", - "f'{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 3 --output_dir {{}}'" - ] - ---- csrc/cpp_itfs/mha_fwd_generate.py -+++ csrc/cpp_itfs/mha_fwd_generate.py -@@ -150,7 +150,7 @@ COMBINED_API = """t = fmha_fwd_v3(traits, args, stream_config); - API_MAP = { - 1: FMHA_FWD_API.format(F_inner_dispatch=V3_API), - 2: FMHA_FWD_API.format(F_inner_dispatch=V2_API), -- 3: FMHA_FWD_API.format(F_inner_dispatch=V2_API) + FMHA_FWD_SPLITKV_API, -+ 3: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API), - 4: FMHA_BATCH_PREFILL_API, - 5: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API) - + FMHA_FWD_SPLITKV_API \ No newline at end of file diff --git a/3rdparty/aiter/silu.patch b/3rdparty/aiter/aiter.patch similarity index 60% rename from 3rdparty/aiter/silu.patch rename to 3rdparty/aiter/aiter.patch index b71d52646..f181f4a06 100644 --- a/3rdparty/aiter/silu.patch +++ b/3rdparty/aiter/aiter.patch @@ -1,3 +1,97 @@ +--- aiter/jit/optCompilerConfig.json ++++ aiter/jit/optCompilerConfig.json +@@ -619,6 +619,7 @@ + "verbose": "False", + "hip_clang_path": "os.environ.get('MHA_HIP_CLANG_PATH')", + "blob_gen_cmd": [ ++ "f'{get_asm_dir()}/fmha_v3_fwd/codegen.py --output_dir {{}}'", + "f'{CK_DIR}/example/ck_tile/01_fmha/generate.py -d fwd --receipt 600 --output_dir {{}}'", + "f'{AITER_CSRC_DIR}/cpp_itfs/mha_fwd_generate.py --receipt 3 --output_dir {{}}'" + ] + +--- csrc/cpp_itfs/mha_fwd_generate.py ++++ csrc/cpp_itfs/mha_fwd_generate.py +@@ -150,7 
+150,7 @@ COMBINED_API = """t = fmha_fwd_v3(traits, args, stream_config); + API_MAP = { + 1: FMHA_FWD_API.format(F_inner_dispatch=V3_API), + 2: FMHA_FWD_API.format(F_inner_dispatch=V2_API), +- 3: FMHA_FWD_API.format(F_inner_dispatch=V2_API) + FMHA_FWD_SPLITKV_API, ++ 3: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API), + 4: FMHA_BATCH_PREFILL_API, + 5: FMHA_FWD_API.format(F_inner_dispatch=COMBINED_API) + + FMHA_FWD_SPLITKV_API + +--- csrc/py_itfs_cu/asm_pa.cu ++++ csrc/py_itfs_cu/asm_pa.cu +@@ -97,7 +97,7 @@ torch::Tensor pa_fwd(torch::Tensor& Q, // [num_seqs, num_heads, head_size] + int num_heads = Q.size(1); + int head_size = Q.size(2); + int num_kv_heads = K.size(1); +- int block_size = K.size(3); ++ int block_size = K.size(2); + const int gqa_ratio = num_heads / num_kv_heads; + TORCH_CHECK(block_size == 16, __func__, " for now only support block_size == 16"); + + +--- aiter/jit/core.py ++++ aiter/jit/core.py +@@ -62,35 +62,19 @@ this_dir = os.path.dirname(os.path.abspath(__file__)) + AITER_ROOT_DIR = os.path.abspath(f"{this_dir}/../../") + AITER_LOG_MORE = int(os.getenv("AITER_LOG_MORE", 0)) + +-find_aiter = importlib.util.find_spec("aiter") +-if find_aiter is not None: +- if find_aiter.submodule_search_locations: +- package_path = find_aiter.submodule_search_locations[0] +- elif find_aiter.origin: +- package_path = find_aiter.origin +- package_path = os.path.dirname(package_path) +- package_parent_path = os.path.dirname(package_path) +- import site +- +- site_packages_dirs = site.getsitepackages() +- # develop mode +- isDevelopMode = (package_path not in site_packages_dirs) and ( +- package_parent_path not in site_packages_dirs +- ) +- if isDevelopMode: +- AITER_META_DIR = AITER_ROOT_DIR +- # install mode +- else: +- AITER_META_DIR = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta/") ++meta_path = os.path.abspath(f"{AITER_ROOT_DIR}/aiter_meta") ++if os.path.exists(meta_path): ++ AITER_META_DIR = meta_path + else: +- AITER_META_DIR = AITER_ROOT_DIR +- logger.warning("aiter is not installed.") ++ AITER_META_DIR = os.path.abspath(AITER_ROOT_DIR) ++ + sys.path.insert(0, AITER_META_DIR) + AITER_CSRC_DIR = f"{AITER_META_DIR}/csrc" + AITER_GRADLIB_DIR = f"{AITER_META_DIR}/gradlib" + gfx = get_gfx() + AITER_ASM_DIR = f"{AITER_META_DIR}/hsa/{gfx}/" +-os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR ++if "AITER_ASM_DIR" not in os.environ: ++ os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR + CK_3RDPARTY_DIR = os.environ.get( + "CK_DIR", f"{AITER_META_DIR}/3rdparty/composable_kernel" + ) + + +--- aiter/ops/mha.py ++++ aiter/ops/mha.py +@@ -1014,7 +1014,7 @@ def _flash_attn_forward( + ret = ret and (not swa) + ret = ret and (q.dtype == dtypes.bf16) + ret = ret and ((return_lse and gfx == "gfx950") or (gfx == "gfx942")) +- return ret ++ return + + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + if can_impl_fmha_v3_fwd(): + --- csrc/kernels/activation_kernels.cu +++ csrc/kernels/activation_kernels.cu @@ -18,6 +18,7 @@ @@ -149,3 +243,4 @@ "extra_include": [ "f'{AITER_CSRC_DIR}/include/ck_tile'" + diff --git a/3rdparty/aiter/gemm_a8w8.patch b/3rdparty/aiter/gemm_a8w8.patch new file mode 100644 index 000000000..45ac18fe4 --- /dev/null +++ b/3rdparty/aiter/gemm_a8w8.patch @@ -0,0 +1,549 @@ +--- aiter/configs/a8w8_tuned_gemm.csv ++++ aiter/configs/a8w8_tuned_gemm.csv +@@ -1,482 +1,65 @@ + cu_num,M,N,K,kernelId,splitK,us,kernelName +-80,1,1280,8192,34,0,21.6639,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 
+-80,1,8192,1024,78,0,9.2458,a8w8_rowwise_64x16x16x128_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4x4x1_1x1_interwave_v2 +-80,32,1280,8192,34,0,18.5951,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,32,8192,1024,28,0,10.4978,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,64,1280,8192,34,0,18.7283,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-80,64,8192,1024,21,0,13.2415,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,128,1280,8192,30,0,23.3627,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-80,128,8192,1024,45,0,20.3239,a8w8_rowwise_256x128x64x128_32x32_2x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,192,1280,8192,23,0,33.1804,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,192,8192,1024,47,0,26.4371,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,256,1280,8192,23,0,33.568,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,256,8192,1024,13,0,30.0043,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,320,1280,8192,21,0,54.7817,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,320,8192,1024,47,0,40.0412,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,512,1280,8192,21,0,55.3717,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,512,8192,1024,13,0,53.1344,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,1024,1280,8192,13,0,89.511,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,1024,8192,1024,13,0,94.821,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,2048,1280,8192,13,0,141.1901,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,2048,8192,1024,13,0,161.9182,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,4096,1280,8192,13,0,276.5091,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,4096,8192,1024,13,0,309.2924,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,8192,1280,8192,0,0,528.1318,a8w8_rowwise_256x256x256x64_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4 +-80,8192,8192,1024,13,0,602.993,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-80,16384,1280,8192,0,0,1046.4171,a8w8_rowwise_256x256x256x64_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4 +-80,16384,8192,1024,41,0,1210.4862,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,1,1280,8192,34,0,17.6914,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,1280,8192,34,0,15.0987,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,1280,8192,34,0,14.9955,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,1280,8192,34,0,14.7691,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,128,1280,8192,34,0,13.9827,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 
+-256,256,1280,8192,30,0,16.2863,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,512,1280,8192,28,0,23.2059,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,1280,8192,21,0,27.7294,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,1280,8192,41,0,42.0169,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,4096,1280,8192,11,0,59.932,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,8192,1280,8192,4,0,95.3807,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,1280,8192,4,0,178.1119,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,32768,1280,8192,1,0,339.8668,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,2560,8192,34,0,17.4658,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,2560,8192,34,0,14.8202,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,2560,8192,34,0,14.7991,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,2560,8192,34,0,14.5967,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,128,2560,8192,30,0,16.5177,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,256,2560,8192,28,0,23.112,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,2560,8192,21,0,27.6646,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,2560,8192,41,0,42.1202,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,2048,2560,8192,11,0,61.0366,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,4096,2560,8192,4,0,90.7509,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,8192,2560,8192,4,0,173.6356,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,2560,8192,1,0,329.1334,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,2560,8192,1,0,599.612,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,10240,8192,34,0,20.5574,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,10240,8192,34,0,18.6072,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,10240,8192,30,0,20.8953,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,10240,8192,28,0,27.6213,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,10240,8192,15,0,31.2683,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,10240,8192,41,0,42.9954,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,512,10240,8192,11,0,61.9597,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,1024,10240,8192,4,0,89.391,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,10240,8192,4,0,170.1034,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 
+-256,4096,10240,8192,1,0,318.603,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,10240,8192,1,0,575.9516,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,10240,8192,1,0,1149.3883,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,10240,8192,1,0,2299.333,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,8192,1024,79,0,5.7218,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,16,8192,1024,79,0,5.4586,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,32,8192,1024,79,0,5.6281,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,64,192,1024,77,0,4.1582,a8w8_rowwise_64x16x16x64_16x16_1x1_4x16x1_4x16x1_1x16x1x4_4x4x1_1x1_interwave_v2 +-256,128,8192,1024,28,0,7.1186,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,8192,1024,21,0,8.6043,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,8192,1024,13,0,11.7749,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,8192,1024,13,0,17.3848,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,8192,1024,1,0,29.7892,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,8192,1024,1,0,55.6305,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,8192,1024,1,0,105.9654,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,8192,1024,10,0,198.2601,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,8192,1024,10,0,379.9256,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,8192,2048,34,0,7.1625,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,8192,2048,79,0,7.2552,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,32,8192,2048,76,0,7.2889,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 +-256,64,8192,2048,30,0,7.5798,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,8192,2048,23,0,10.2945,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,8192,2048,15,0,10.9387,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,8192,2048,41,0,16.8904,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,1024,8192,2048,41,0,27.2285,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,2048,8192,2048,1,0,42.7444,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,8192,2048,1,0,83.0059,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,8192,2048,1,0,159.9883,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,8192,2048,1,0,299.7045,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,8192,2048,1,0,594.1387,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,1,8192,8192,34,0,19.4709,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,8192,8192,34,0,17.2283,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,8192,8192,34,0,17.4586,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,8192,8192,30,0,19.4045,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,8192,8192,28,0,26.3513,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,8192,8192,15,0,31.6256,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,8192,8192,13,0,51.209,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,8192,8192,13,0,88.9386,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,8192,8192,1,0,125.3472,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,8192,8192,1,0,240.0438,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,8192,8192,1,0,469.2882,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,8192,8192,1,0,933.9446,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,8192,8192,1,0,1837.7153,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,7168,8192,34,0,19.015,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,7168,8192,34,0,16.6523,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,7168,8192,34,0,16.942,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,7168,8192,30,0,18.4286,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,7168,8192,28,0,26.0097,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,7168,8192,15,0,30.291,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,7168,8192,41,0,47.5197,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,1024,7168,8192,13,0,82.6278,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,7168,8192,2,0,116.6871,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,4096,7168,8192,2,0,219.2844,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,8192,7168,8192,2,0,420.8889,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,7168,8192,1,0,815.6995,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,7168,8192,1,0,1620.6254,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,57344,8192,76,0,88.899,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 +-256,16,57344,8192,80,0,90.9234,a8w8_rowwise_128x16x64x128_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,32,57344,8192,81,0,93.0943,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_interwave_v2 +-256,64,57344,8192,46,0,99.6635,a8w8_rowwise_256x64x128x128_32x32_1x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,128,57344,8192,13,0,108.1045,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,57344,8192,2,0,135.372,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,512,57344,8192,2,0,233.9286,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,1024,57344,8192,2,0,438.8629,a8w8_rowwise_256x256x224x128_32x32_2x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,57344,8192,1,0,814.7053,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,57344,8192,1,0,1617.8989,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,57344,8192,1,0,3164.0659,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,57344,8192,1,0,6467.762,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,57344,8192,1,0,12840.4952,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,8192,3584,34,0,11.2488,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,8192,3584,34,0,10.6477,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,8192,3584,34,0,9.4243,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,8192,3584,30,0,11.279,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,8192,3584,28,0,13.8708,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,8192,3584,15,0,16.5846,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,8192,3584,41,0,24.9561,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,1024,8192,3584,13,0,43.9928,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,8192,3584,1,0,65.1518,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,8192,3584,1,0,122.3648,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,8192,3584,1,0,232.9396,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,8192,3584,1,0,463.2471,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,8192,3584,1,0,899.1908,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,8192,7168,34,0,16.9371,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,8192,7168,34,0,15.2131,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,8192,7168,34,0,16.2902,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,8192,7168,30,0,17.495,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,8192,7168,28,0,24.2653,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,8192,7168,21,0,29.138,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,8192,7168,41,0,44.168,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,1024,8192,7168,13,0,79.3336,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,2048,8192,7168,1,0,113.2102,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,8192,7168,1,0,215.6239,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,8192,7168,1,0,412.978,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,8192,7168,1,0,819.3844,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,8192,7168,1,0,1640.4404,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,8192,28672,34,0,59.8294,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,8192,28672,34,0,55.9502,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,8192,28672,34,0,54.7382,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,8192,28672,30,0,61.7622,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,8192,28672,28,0,87.5085,a8w8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,8192,28672,15,0,102.0397,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,8192,28672,41,0,150.8316,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,1024,8192,28672,41,0,274.6402,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,2048,8192,28672,1,0,387.0603,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,8192,28672,1,0,764.1022,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,8192,28672,1,0,1532.6908,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,8192,28672,1,0,3088.0838,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,8192,28672,1,0,6209.7387,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,2304,16384,34,0,28.7938,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,4608,16384,34,0,30.4899,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,9216,16384,34,0,34.1179,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,13312,16384,34,0,43.9839,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,16384,2048,34,0,8.2219,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,16384,4096,34,0,15.9093,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,16384,6656,34,0,22.3992,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,16384,8192,34,0,29.3945,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,16384,13312,34,0,39.0876,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,16384,26624,34,0,73.9812,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,26624,16384,34,0,88.2574,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,53248,16384,57,0,170.1222,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_intrawave_v2 
+-256,16,2304,16384,34,0,23.498,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,4608,16384,34,0,25.7089,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,9216,16384,34,0,33.174,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,13312,16384,30,0,45.4589,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16,16384,2048,34,0,8.2068,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,16384,4096,30,0,16.22,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16,16384,6656,34,0,21.7401,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,16384,8192,30,0,29.6526,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16,16384,13312,34,0,38.6763,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,16384,26624,34,0,73.7744,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,26624,16384,30,0,86.8898,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16,53248,16384,34,0,180.7857,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,2304,16384,34,0,23.2436,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,4608,16384,34,0,24.957,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,9216,16384,30,0,34.6699,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,13312,16384,30,0,46.5921,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,16384,2048,30,0,10.4352,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,16384,4096,30,0,16.4693,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,16384,6656,30,0,22.4014,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,16384,8192,30,0,30.4324,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,16384,13312,30,0,39.8351,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,16384,26624,30,0,74.3742,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,26624,16384,30,0,89.3239,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,53248,16384,34,0,168.8762,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,2304,16384,34,0,23.2558,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,4608,16384,30,0,27.9735,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,9216,16384,23,0,41.0647,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,13312,16384,23,0,50.0251,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,16384,2048,23,0,11.4046,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,16384,4096,23,0,19.0989,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,64,16384,6656,23,0,26.4384,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,16384,8192,23,0,33.8597,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,16384,13312,23,0,46.8247,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,16384,26624,23,0,89.8884,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,26624,16384,23,0,91.2922,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,53248,16384,23,0,184.3385,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,2304,16384,30,0,26.5035,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,128,4608,16384,23,0,33.4047,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,9216,16384,23,0,59.454,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,13312,16384,23,0,65.9651,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,16384,2048,23,0,14.5668,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,16384,4096,23,0,23.3525,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,16384,6656,23,0,34.0329,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,16384,8192,23,0,45.1385,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,16384,13312,23,0,61.6432,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,16384,26624,23,0,118.3409,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,26624,16384,23,0,129.1529,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,53248,16384,23,0,230.1893,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,2304,16384,23,0,32.4817,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,4608,16384,23,0,55.0189,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,9216,16384,23,0,86.8248,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,13312,16384,23,0,113.4219,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,16384,2048,15,0,22.0982,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,16384,4096,15,0,39.8495,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,16384,6656,23,0,57.6756,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,16384,8192,15,0,72.9104,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,16384,13312,23,0,106.2833,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,16384,26624,23,0,202.2465,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,26624,16384,23,0,206.2403,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,53248,16384,40,0,368.0253,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 
+-256,512,2304,16384,23,0,54.2058,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,4608,16384,23,0,87.1647,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,9216,16384,23,0,141.9471,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,13312,16384,23,0,199.0315,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,16384,2048,15,0,38.4305,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,16384,4096,15,0,65.4454,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,16384,6656,15,0,102.3241,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,16384,8192,15,0,123.6929,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,16384,13312,15,0,191.2442,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,16384,26624,15,0,365.7012,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,26624,16384,15,0,349.3418,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,53248,16384,15,0,651.6344,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,2304,16384,23,0,86.7135,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,4608,16384,23,0,143.2677,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,9216,16384,15,0,243.9282,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,13312,16384,40,0,306.5211,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,1024,16384,2048,1,0,61.8687,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,16384,4096,1,0,108.4683,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,16384,6656,1,0,165.4754,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,16384,8192,1,0,202.2384,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,16384,13312,1,0,307.7016,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,16384,26624,1,0,603.8507,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,26624,16384,15,0,630.2527,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,53248,16384,40,0,1195.3563,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,2304,16384,23,0,145.2155,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,4608,16384,11,0,247.0665,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,2048,9216,16384,15,0,431.0243,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,13312,16384,40,0,609.7983,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,16384,2048,1,0,109.435,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,2048,16384,4096,1,0,192.1657,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,16384,6656,1,0,297.4271,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,16384,8192,1,0,351.3779,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,16384,13312,1,0,562.4285,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,16384,26624,1,0,1159.8861,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,26624,16384,40,0,1145.701,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,53248,16384,1,0,2196.6099,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,2304,16384,11,0,246.424,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,4096,4608,16384,15,0,433.3975,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,9216,16384,39,0,778.6417,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,13312,16384,10,0,1114.1158,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,16384,2048,1,0,205.0841,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,16384,4096,39,0,353.73,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,16384,6656,39,0,561.0198,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,16384,8192,39,0,657.0519,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,16384,13312,1,0,1080.1678,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,16384,26624,1,0,2197.8702,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,26624,16384,10,0,2199.7579,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,53248,16384,1,0,4075.2597,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,2304,16384,15,0,463.8213,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,4608,16384,39,0,844.5637,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,9216,16384,43,0,1557.3892,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1 +-256,8192,13312,16384,40,0,2189.1084,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,8192,16384,2048,39,0,392.4492,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,16384,4096,39,0,678.4652,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,16384,6656,39,0,1094.0621,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,16384,8192,1,0,1288.1354,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,16384,13312,1,0,2091.5748,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,16384,26624,1,0,4736.0103,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,8192,26624,16384,40,0,4074.1172,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,8192,53248,16384,40,0,8028.0784,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,2304,16384,15,0,878.7062,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,4608,16384,40,0,1632.4516,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,9216,16384,1,0,2949.0967,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,13312,16384,1,0,4134.6494,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,16384,2048,1,0,719.1952,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,16384,4096,39,0,1327.6342,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16384,16384,6656,39,0,2120.7951,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16384,16384,8192,1,0,2555.2385,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,16384,13312,1,0,4157.4955,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,16384,26624,1,0,9368.503,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,26624,16384,40,0,8090.2903,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,53248,16384,40,0,15996.8357,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,32768,2304,16384,1,0,1717.9546,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,4608,16384,40,0,3026.9648,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,32768,9216,16384,1,0,5884.19,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,13312,16384,40,0,8230.5195,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,32768,16384,2048,1,0,1409.7173,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,16384,4096,39,0,2546.0869,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32768,16384,6656,39,0,4073.3594,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32768,16384,8192,39,0,4965.434,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32768,16384,13312,1,0,8271.2663,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,16384,26624,1,0,19064.3766,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,26624,16384,40,0,16029.2644,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,32768,53248,16384,40,0,31823.1958,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,1,100,5120,34,0,8.7502,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,200,5120,34,0,9.0193,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,800,5120,34,0,11.5471,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 
+-256,1,5120,640,79,0,4.3227,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,1,5120,1280,34,0,6.1049,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,5120,3200,30,0,9.8138,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,1,5120,5120,34,0,12.3186,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,5120,6400,34,0,14.5779,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,5120,25600,23,0,47.1407,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1,6400,5120,34,0,12.7506,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,12800,5120,34,0,14.7015,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1,51200,5120,30,0,50.042,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16,100,5120,34,0,8.7447,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,200,5120,34,0,10.4705,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,800,5120,34,0,11.7219,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,5120,640,60,0,4.3842,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 +-256,16,5120,1280,79,0,6.1038,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,16,5120,3200,79,0,9.1289,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,16,5120,5120,34,0,11.1363,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,5120,6400,34,0,14.8838,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,5120,25600,34,0,38.7631,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,6400,5120,34,0,11.5523,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,12800,5120,34,0,14.2563,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,16,51200,5120,30,0,51.7349,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,100,5120,34,0,9.2459,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,200,5120,34,0,10.4825,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,800,5120,34,0,11.9874,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,5120,640,79,0,4.5986,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,32,5120,1280,76,0,6.0464,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 +-256,32,5120,3200,76,0,10.1191,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 +-256,32,5120,5120,34,0,10.4248,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,5120,6400,34,0,15.0885,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,5120,25600,34,0,38.3258,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 
+-256,32,6400,5120,34,0,10.9816,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,32,12800,5120,30,0,14.9578,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32,51200,5120,30,0,53.9026,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,100,5120,34,0,9.6716,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,200,5120,34,0,9.9254,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,800,5120,34,0,12.0241,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,64,5120,640,77,0,5.0956,a8w8_rowwise_64x16x16x64_16x16_1x1_4x16x1_4x16x1_1x16x1x4_4x4x1_1x1_interwave_v2 +-256,64,5120,1280,76,0,6.4203,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 +-256,64,5120,3200,76,0,10.7687,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 +-256,64,5120,5120,30,0,11.7059,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,5120,6400,30,0,16.907,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,5120,25600,30,0,43.5801,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,6400,5120,30,0,12.4817,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,64,12800,5120,23,0,18.5404,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,64,51200,5120,23,0,61.5287,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,100,5120,34,0,10.6683,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,128,200,5120,34,0,9.652,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,128,800,5120,34,0,11.5628,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,128,5120,640,79,0,5.566,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 +-256,128,5120,1280,23,0,7.6695,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,5120,3200,23,0,11.8363,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,5120,5120,23,0,14.1859,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,5120,6400,23,0,18.7826,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,5120,25600,23,0,54.0309,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,6400,5120,23,0,14.96,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,12800,5120,23,0,25.5896,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,128,51200,5120,23,0,82.0559,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,100,5120,34,0,10.9556,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,256,200,5120,34,0,9.8147,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,256,800,5120,34,0,11.8423,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 
+-256,256,5120,640,81,0,6.6084,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_interwave_v2 +-256,256,5120,1280,15,0,8.2757,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,5120,3200,15,0,15.7837,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,5120,5120,23,0,21.8405,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,5120,6400,21,0,23.7832,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,5120,25600,23,0,86.0166,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,6400,5120,23,0,22.7861,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,12800,5120,15,0,42.5463,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,256,51200,5120,40,0,120.9285,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,512,100,5120,34,0,11.1055,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,512,200,5120,34,0,11.8774,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,512,800,5120,30,0,13.1908,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,512,5120,640,47,0,8.4036,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,5120,1280,22,0,12.2285,a8w8_rowwise_256x64x96x256_16x16_2x3_16x16x1_16x16x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,512,5120,3200,13,0,21.8498,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,5120,5120,23,0,33.1376,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,5120,6400,15,0,38.8017,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,5120,25600,23,0,131.897,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,6400,5120,15,0,41.3468,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,12800,5120,23,0,74.3061,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,512,51200,5120,40,0,223.8159,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,1024,100,5120,34,0,11.6746,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1024,200,5120,34,0,12.0055,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,1024,800,5120,23,0,15.5235,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,5120,640,11,0,11.7704,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,1024,5120,1280,11,0,17.5871,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,1024,5120,3200,11,0,27.688,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,1024,5120,5120,23,0,55.7649,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,5120,6400,11,0,57.5786,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 +-256,1024,5120,25600,23,0,227.1849,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,1024,6400,5120,23,0,72.0506,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,1024,12800,5120,40,0,104.2861,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,1024,51200,5120,39,0,396.7395,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,2048,100,5120,34,0,11.6595,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,2048,200,5120,30,0,13.0876,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,2048,800,5120,23,0,24.3923,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,5120,640,10,0,17.6749,a8w8_rowwise_256x128x192x128_32x32_2x3_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,5120,1280,4,0,27.4475,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,5120,3200,4,0,44.9517,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,5120,5120,15,0,96.048,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,5120,6400,4,0,90.0028,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,2048,5120,25600,15,0,419.2679,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,2048,6400,5120,39,0,107.5421,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,2048,12800,5120,39,0,188.6903,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,2048,51200,5120,39,0,756.332,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,100,5120,34,0,16.1651,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 +-256,4096,200,5120,23,0,15.2341,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,800,5120,15,0,36.4711,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,5120,640,13,0,32.4583,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,5120,1280,13,0,50.6317,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,5120,3200,4,0,86.6115,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,4096,5120,5120,15,0,177.0814,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,5120,6400,4,0,162.2341,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,4096,5120,25600,15,0,782.4893,a8w8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,4096,6400,5120,39,0,188.523,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,12800,5120,39,0,356.6531,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,4096,51200,5120,1,0,1418.8126,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,8192,100,5120,29,0,27.2037,a8w8_rowwise_256x32x96x256_16x16_1x3_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3 +-256,8192,200,5120,21,0,25.2734,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
+-256,8192,800,5120,41,0,65.9722,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 +-256,8192,5120,640,42,0,55.1814,a8w8_rowwise_256x128x256x64_32x32_2x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1 +-256,8192,5120,1280,39,0,86.4091,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,5120,3200,40,0,149.3935,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,8192,5120,5120,39,0,277.9848,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,5120,6400,40,0,276.3348,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,8192,5120,25600,39,0,1513.1737,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,6400,5120,39,0,359.0195,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,12800,5120,39,0,683.3672,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,8192,51200,5120,1,0,2639.7599,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,100,5120,27,0,42.6746,a8w8_rowwise_256x32x160x256_16x16_1x5_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3 +-256,16384,200,5120,21,0,44.0772,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,800,5120,1,0,111.2384,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,5120,640,43,0,99.8836,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1 +-256,16384,5120,1280,1,0,153.6015,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,5120,3200,1,0,271.2735,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,5120,5120,1,0,518.1604,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,5120,6400,1,0,543.7765,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,5120,25600,1,0,2869.8056,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,16384,6400,5120,40,0,690.0605,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,16384,12800,5120,39,0,1303.8826,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,16384,51200,5120,1,0,5290.8019,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,100,5120,27,0,78.5325,a8w8_rowwise_256x32x160x256_16x16_1x5_16x16x1_16x16x1_1x32x1x8_4x4x1_1x1_intrawave_v3 +-256,32768,200,5120,21,0,82.0643,a8w8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,800,5120,1,0,216.0528,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,5120,640,43,0,191.0117,a8w8_rowwise_256x256x128x64_32x32_4x2_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_interwave_v1 +-256,32768,5120,1280,1,0,299.0856,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,5120,3200,1,0,538.4921,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,5120,5120,40,0,999.0598,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 
+-256,32768,5120,6400,1,0,1028.2651,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,5120,25600,1,0,5734.5832,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 +-256,32768,6400,5120,40,0,1274.1106,a8w8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 +-256,32768,12800,5120,39,0,2481.3683,a8w8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3 +-256,32768,51200,5120,1,0,10610.0611,a8w8_rowwise_256x256x256x128_32x32_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,1,768,768,76,0,4.9998,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,1,768,3072,34,0,9.9959,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 ++80,1,2304,768,79,0,5.1046,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,1,3072,768,79,0,5.283,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,2,768,768,76,0,5.0342,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,2,768,3072,34,0,10.0142,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 ++80,2,2304,768,79,0,5.1105,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,2,3072,768,79,0,5.297,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,4,768,768,76,0,5.0542,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,4,768,3072,34,0,10.0697,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 ++80,4,2304,768,79,0,5.1162,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,4,3072,768,79,0,5.4039,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,8,768,768,76,0,5.091,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,8,768,3072,34,0,10.2299,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 ++80,8,2304,768,79,0,5.146,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,8,3072,768,79,0,5.4091,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,16,768,768,79,0,4.6902,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,16,768,3072,34,0,9.039,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 ++80,16,2304,768,79,0,4.7277,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,16,3072,768,79,0,5.037,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,32,768,768,76,0,4.7434,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,32,768,3072,34,0,9.4536,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 ++80,32,2304,768,76,0,5.1442,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,32,3072,768,76,0,5.8581,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,64,768,768,79,0,5.1586,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,64,768,3072,34,0,9.5447,a8w8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3 
++80,64,2304,768,75,0,6.4882,a8w8_rowwise_128x64x16x128_16x16_2x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2 ++80,64,3072,768,57,0,7.5454,a8w8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_intrawave_v2 ++80,128,768,768,79,0,5.5334,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2 ++80,128,768,3072,30,0,11.5339,a8w8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3 ++80,128,2304,768,47,0,8.103,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,128,3072,768,55,0,10.0714,a8w8_rowwise_128x64x32x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 ++80,256,768,768,60,0,6.9792,a8w8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2 ++80,256,768,3072,23,0,16.3419,a8w8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,256,2304,768,47,0,10.1332,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,256,3072,768,47,0,12.9861,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,512,768,768,62,0,9.6114,a8w8_rowwise_128x32x64x128_32x32_1x1_8x16x1_8x16x1_1x16x1x8_8x8x1_1x1_intrawave_v2 ++80,512,768,3072,47,0,23.2341,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,512,2304,768,45,0,15.5051,a8w8_rowwise_256x128x64x128_32x32_2x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,512,3072,768,47,0,19.5125,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,1024,768,768,47,0,12.4938,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,1024,768,3072,47,0,34.4103,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,1024,2304,768,13,0,23.4274,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,1024,3072,768,47,0,32.6289,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,2048,768,768,47,0,19.2279,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,2048,768,3072,11,0,51.1168,a8w8_rowwise_256x128x160x128_32x32_1x5_8x32x1_8x32x1_1x64x1x4_8x8x1_1x1_intrawave_v3 ++80,2048,2304,768,13,0,43.4486,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,2048,3072,768,13,0,54.9992,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,4096,768,768,47,0,32.8076,a8w8_rowwise_256x64x64x128_32x32_1x1_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,4096,768,3072,4,0,86.8871,a8w8_rowwise_256x256x160x128_32x32_2x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3 ++80,4096,2304,768,13,0,81.0493,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,4096,3072,768,13,0,101.3405,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,8192,768,768,13,0,55.2194,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,8192,768,3072,13,0,150.2763,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,8192,2304,768,13,0,147.0913,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,8192,3072,768,13,0,192.9078,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 
++80,16384,768,768,13,0,101.4774,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,16384,768,3072,13,0,285.4741,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,16384,2304,768,13,0,277.867,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,16384,3072,768,13,0,366.0724,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,32768,768,768,13,0,193.6044,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,32768,768,3072,13,0,562.1274,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,32768,2304,768,13,0,540.5257,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 ++80,32768,3072,768,13,0,714.2505,a8w8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v3 diff --git a/3rdparty/aiter/0003-gemm_tune.patch b/3rdparty/aiter/gemm_blockscale.patch similarity index 100% rename from 3rdparty/aiter/0003-gemm_tune.patch rename to 3rdparty/aiter/gemm_blockscale.patch diff --git a/3rdparty/aiter/rtp-llm.patch b/3rdparty/aiter/rtp-llm.patch deleted file mode 100644 index 5c9ff9b21..000000000 --- a/3rdparty/aiter/rtp-llm.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- csrc/py_itfs_cu/asm_pa.cu -+++ csrc/py_itfs_cu/asm_pa.cu -@@ -97,7 +97,7 @@ torch::Tensor pa_fwd(torch::Tensor& Q, // [num_seqs, num_heads, head_size] - int num_heads = Q.size(1); - int head_size = Q.size(2); - int num_kv_heads = K.size(1); -- int block_size = K.size(3); -+ int block_size = K.size(2); - const int gqa_ratio = num_heads / num_kv_heads; - TORCH_CHECK(block_size == 16, __func__, " for now only support block_size == 16"); - diff --git a/BUILD.aiter b/BUILD.aiter index 2d93c71e5..bea9e06a3 100644 --- a/BUILD.aiter +++ b/BUILD.aiter @@ -305,6 +305,21 @@ cc_library( tags = ["rocm"], ) +cc_library( + name = "module_gemm_a8w8", + srcs = ["aiter/jit/libmodule_gemm_a8w8.so"], + hdrs = ["csrc/ck_gemm_a8w8/include/gemm_a8w8.h"], + deps = [ + ":aiter_so", + ":aiter_headers", + ], + copts = [], + linkopts = [], + strip_include_prefix = "aiter_meta/csrc/include/", + visibility = ["//visibility:public"], + tags = ["rocm"], +) + cc_library( name = "module_moe_ck2stages", srcs = ["aiter/jit/module_moe_ck2stages.so"], diff --git a/open_source/bazel/arch_select.bzl b/open_source/bazel/arch_select.bzl index 4988c38a1..6a331c118 100644 --- a/open_source/bazel/arch_select.bzl +++ b/open_source/bazel/arch_select.bzl @@ -62,7 +62,7 @@ def subscribe_deps(): def whl_deps(): return select({ "@//:using_cuda12": ["torch==2.6.0+cu126"], - "@//:using_rocm": ["pyrsmi", "amdsmi@https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis%2FAMD%2Famd_smi%2Fali%2Famd_smi.tar", "aiter@https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl"], + "@//:using_rocm": ["pyrsmi", "amdsmi@https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis%2FAMD%2Famd_smi%2Fali%2Famd_smi.tar", "aiter@https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252342-py3-none-any.whl"], "//conditions:default": ["torch==2.1.2"], }) diff --git a/open_source/deps/git.bzl b/open_source/deps/git.bzl index fa693ad6f..1c5201464 100644 --- a/open_source/deps/git.bzl +++ b/open_source/deps/git.bzl @@ -12,7 +12,7 @@ def git_deps(): remote = "https://github.com/ROCm/aiter.git", 
commit = "94934e7d7cd5e11d81a2ded2a54d35f9cec4374d", # update codegen.py (#880) recursive_init_submodules = True, - patches = ["//3rdparty/aiter:rtp-llm.patch", "//3rdparty/aiter:0003-gemm_tune.patch", "//3rdparty/aiter:aiter-fmha.patch", "//3rdparty/aiter:silu.patch"], + patches = ["//3rdparty/aiter:aiter.patch", "//3rdparty/aiter:gemm_blockscale.patch", "//3rdparty/aiter:gemm_a8w8.patch"], patch_cmds = [ "echo 'from aiter.jit.core import compile_ops, get_args_of_build, build_module, get_module' >> build_aiter_module.py", "echo 'from typing import Dict' >> build_aiter_module.py", @@ -66,6 +66,7 @@ def git_deps(): "echo ' build_aiter_module(\"module_attention_asm\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_activation\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_gemm_a8w8_bpreshuffle\")' >> build_aiter_module.py", + "echo ' build_aiter_module(\"module_gemm_a8w8\")' >> build_aiter_module.py", "echo ' build_aiter_module(\"module_moe_ck2stages\")' >> build_aiter_module.py", "echo 'echo \"building mla kernel\"' >> build_mla_kernel.sh", "echo 'so_file=\"./csrc/cpp_itfs/mla/asm_mla_decode_fwd_torch_lib.so\"' >> build_mla_kernel.sh", diff --git a/open_source/deps/http.bzl b/open_source/deps/http.bzl index 20029ca1e..8ebec92ee 100644 --- a/open_source/deps/http.bzl +++ b/open_source/deps/http.bzl @@ -60,9 +60,9 @@ def http_deps(): http_archive( name = "aiter", - sha256 = "08e90279560e2e066298e976b7a944d6de54e8b2559a207382b112cc60adcf58", + sha256 = "b89328e61855aba0ee3e96f800dbd9c1f8f400adee596978652d3891483bca51", urls = [ - "https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl", + "https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252342-py3-none-any.whl", ], type = "zip", build_file = clean_dep("//:BUILD.aiter"), diff --git a/open_source/deps/requirements_lock_rocm.txt b/open_source/deps/requirements_lock_rocm.txt index 78f8babe7..7ec61ed74 100644 --- a/open_source/deps/requirements_lock_rocm.txt +++ b/open_source/deps/requirements_lock_rocm.txt @@ -114,8 +114,8 @@ aiosignal==1.3.1 \ --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 # via aiohttp -aiter @ https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl \ - --hash=sha256:08e90279560e2e066298e976b7a944d6de54e8b2559a207382b112cc60adcf58 +aiter @ https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252342-py3-none-any.whl \ + --hash=sha256:b89328e61855aba0ee3e96f800dbd9c1f8f400adee596978652d3891483bca51 # via -r open_source/deps/requirements_rocm.txt aliyun-python-sdk-core==2.15.2 \ --hash=sha256:54f66a53e193c61c5e16ea4505a0cab43543f8ad2ef22833f69c4d5e5151c17d diff --git a/open_source/deps/requirements_rocm.txt b/open_source/deps/requirements_rocm.txt index 4dfd1dbad..2e9c076bc 100644 --- a/open_source/deps/requirements_rocm.txt +++ b/open_source/deps/requirements_rocm.txt @@ -4,5 +4,5 @@ https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torch-2.4.1%2Brocm6.4.1.gi https://repo.radeon.com/rocm/manylinux/rocm-rel-6.4.1/torchvision-0.19.0%2Brocm6.4.1.git4d41ad71-cp310-cp310-linux_x86_64.whl pyrsmi pyyaml 
-https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252339-py3-none-any.whl +https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis/AMD/RTP/aiter-0.1.0%2Bgit.94934e7d.date.202508252342-py3-none-any.whl https://sinian-metrics-platform.oss-cn-hangzhou.aliyuncs.com/kis%2FAMD%2Famd_smi%2Fali%2Famd_smi.tar diff --git a/rtp_llm/libs/BUILD b/rtp_llm/libs/BUILD index 2514666d2..0d1659fd3 100644 --- a/rtp_llm/libs/BUILD +++ b/rtp_llm/libs/BUILD @@ -18,8 +18,7 @@ filegroup( name = "copy_aiter", srcs = select({ "@//:using_aiter_src": [":aiter_src_copy",], - "@//:using_rocm": [":aiter_copy"], - "//conditions:default": [], + "//conditions:default": [":aiter_copy"], }), visibility = ["//visibility:public"], ) @@ -51,6 +50,7 @@ genrule( "libmodule_mha_fwd.so", "libmodule_norm.so", "libmodule_rmsnorm.so", + "libmodule_gemm_a8w8.so", "libmodule_moe_ck2stages.so" ], cmd = """ @@ -68,6 +68,7 @@ cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_attention_asm.s cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_mha_fwd.so $(location libmodule_mha_fwd.so); cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_norm.so $(location libmodule_norm.so); cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_rmsnorm.so $(location libmodule_rmsnorm.so); +cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_gemm_a8w8.so $(location libmodule_gemm_a8w8.so); cp ./bazel-out/k8-opt/bin/external/aiter_src/aiter/jit/libmodule_moe_ck2stages.so $(location libmodule_moe_ck2stages.so); """, tags = ["rocm", "local"], @@ -91,6 +92,7 @@ genrule( "module_mha_fwd.so", "module_norm.so", "module_rmsnorm.so", + "module_gemm_a8w8.so", "module_moe_ck2stages.so", ], cmd = """ @@ -108,6 +110,7 @@ genrule( cp external/aiter/aiter/jit/module_mha_fwd.so $(location module_mha_fwd.so); cp external/aiter/aiter/jit/module_norm.so $(location module_norm.so); cp external/aiter/aiter/jit/module_rmsnorm.so $(location module_rmsnorm.so); + cp external/aiter/aiter/jit/module_gemm_a8w8.so $(location module_gemm_a8w8.so); cp external/aiter/aiter/jit/module_moe_ck2stages.so $(location module_moe_ck2stages.so); """, tags = ["rocm", "local"], From df6736466346dca1473e653fff71caeaed52c902 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 11 Oct 2025 17:42:01 +0800 Subject: [PATCH 9/9] delete python asm prefill patch --- 3rdparty/aiter/aiter.patch | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/3rdparty/aiter/aiter.patch b/3rdparty/aiter/aiter.patch index f181f4a06..276413294 100644 --- a/3rdparty/aiter/aiter.patch +++ b/3rdparty/aiter/aiter.patch @@ -79,19 +79,6 @@ "CK_DIR", f"{AITER_META_DIR}/3rdparty/composable_kernel" ) - ---- aiter/ops/mha.py -+++ aiter/ops/mha.py -@@ -1014,7 +1014,7 @@ def _flash_attn_forward( - ret = ret and (not swa) - ret = ret and (q.dtype == dtypes.bf16) - ret = ret and ((return_lse and gfx == "gfx950") or (gfx == "gfx942")) -- return ret -+ return - - q, k, v = [maybe_contiguous(x) for x in (q, k, v)] - if can_impl_fmha_v3_fwd(): - --- csrc/kernels/activation_kernels.cu +++ csrc/kernels/activation_kernels.cu @@ -18,6 +18,7 @@